In [2]:
#Import dependencies and files
import pandas as pd
import numpy as np


## Separating the table crew by gender

In [3]:
#Import csv file as dataframe
data_df = pd.read_csv("Resources/lilliana_crew.csv")

data_df.head()

Unnamed: 0,movie_id,department,gender,job,name
0,862,Directing,2.0,Director,John Lasseter
1,862,Writing,2.0,Screenplay,Joss Whedon
2,862,Writing,2.0,Screenplay,Andrew Stanton
3,862,Writing,2.0,Screenplay,Joel Cohen
4,862,Writing,0.0,Screenplay,Alec Sokolow


During the data review it was found that gender=1 was used for women and gender=2 for men, however both men and women could be found with gender=0, we will match the names found under gender=0. 

In [4]:
# Separate into df where gender is 0
gender0 = data_df.loc[(data_df['gender'] == 0.0)]
gender0.head()


Unnamed: 0,movie_id,department,gender,job,name
4,862,Writing,0.0,Screenplay,Alec Sokolow
6,862,Production,0.0,Executive Producer,Ed Catmull
12,862,Sound,0.0,Foley Editor,Mary Helen Leasman
13,862,Visual Effects,0.0,Animation,Kim Blanchette
14,862,Sound,0.0,ADR Editor,Marilyn McCoppen


In [5]:
gender0['gender'].value_counts()

0.0    272319
Name: gender, dtype: int64

In [6]:
# Separate into df where gender is 1
gender1 = data_df.loc[(data_df['gender'] == 1)]
gender1.head()

Unnamed: 0,movie_id,department,gender,job,name
5,862,Production,1.0,Producer,Bonnie Arnold
70,862,Lighting,1.0,Lighting Supervisor,Sharon Calahan
101,862,Sound,1.0,Assistant Sound Editor,Susan Sanford
104,862,Production,1.0,Casting Consultant,Ruth Lambert
130,31357,Production,1.0,Producer,Deborah Schindler


In [7]:
gender1['gender'].value_counts()

1.0    31123
Name: gender, dtype: int64

In [8]:
# Separate into df where gender is 2
gender2 = data_df.loc[(data_df['gender'] == 2)]
gender2.head()

Unnamed: 0,movie_id,department,gender,job,name
0,862,Directing,2.0,Director,John Lasseter
1,862,Writing,2.0,Screenplay,Joss Whedon
2,862,Writing,2.0,Screenplay,Andrew Stanton
3,862,Writing,2.0,Screenplay,Joel Cohen
7,862,Production,2.0,Producer,Ralph Guggenheim


In [9]:
# Review of the number of rows
gender2['gender'].value_counts()

2.0    160872
Name: gender, dtype: int64

In [10]:
# Deleting the gender column from the gender0 table to facilitate the merge
gender0formerge = gender0.drop(columns='gender')
gender0formerge

Unnamed: 0,movie_id,department,job,name
4,862,Writing,Screenplay,Alec Sokolow
6,862,Production,Executive Producer,Ed Catmull
12,862,Sound,Foley Editor,Mary Helen Leasman
13,862,Visual Effects,Animation,Kim Blanchette
14,862,Sound,ADR Editor,Marilyn McCoppen
...,...,...,...,...
465075,111109,Editing,Editor,Lav Diaz
465076,111109,Crew,Cinematography,Lav Diaz
465080,67758,Sound,Original Music Composer,Richard McHugh
465082,227506,Directing,Director,Yakov Protazanov


In [11]:
gender1.columns

Index(['movie_id', 'department', 'gender', 'job', 'name'], dtype='object')

In [12]:
# Deleting the movie id, department and job columns from the gender1 table to facilitate the merge, avoid duplicates 
gender1formerge = gender1.drop(columns=['movie_id', 'department','job'])
gender1formerge

Unnamed: 0,gender,name
5,1.0,Bonnie Arnold
70,1.0,Sharon Calahan
101,1.0,Susan Sanford
104,1.0,Ruth Lambert
130,1.0,Deborah Schindler
...,...,...
464958,1.0,Gail Parent
465020,1.0,Kimberly Rach
465044,1.0,Ann Roth
465057,1.0,Sarah Radclyffe


In [13]:
# Creating dataframe from the intersection of the tables by merge on the name column
intersection_1 = pd.merge(gender0formerge, gender1formerge, how='inner', on='name')
intersection_1

Unnamed: 0,movie_id,department,job,name,gender
0,11017,Sound,Music Editor,Sally Boldt,1.0
1,11017,Sound,Music Editor,Sally Boldt,1.0
2,11017,Sound,Music Editor,Sally Boldt,1.0
3,754,Sound,Music Editor,Sally Boldt,1.0
4,754,Sound,Music Editor,Sally Boldt,1.0
...,...,...,...,...,...
614,347026,Writing,Book,Susan Williams,1.0
615,263115,Editing,Digital Intermediate,Carrie Oliver,1.0
616,264760,Writing,Writer,Jenna Mattison,1.0
617,21038,Production,Executive Producer,Ethel Winant,1.0


In [14]:
gender1.columns

Index(['movie_id', 'department', 'gender', 'job', 'name'], dtype='object')

In [15]:
intersection_1.columns

Index(['movie_id', 'department', 'job', 'name', 'gender'], dtype='object')

In [16]:
# Changing order column to match final table
column_order = ['movie_id', 'department', 'gender', 'job', 'name']
intersection_1 = intersection_1[column_order]
intersection_1.head()

Unnamed: 0,movie_id,department,gender,job,name
0,11017,Sound,1.0,Music Editor,Sally Boldt
1,11017,Sound,1.0,Music Editor,Sally Boldt
2,11017,Sound,1.0,Music Editor,Sally Boldt
3,754,Sound,1.0,Music Editor,Sally Boldt
4,754,Sound,1.0,Music Editor,Sally Boldt


In [17]:
#Appending rows from the intersection df to the gender1 dataframe
frames1 = [gender1, intersection_1]
clean_gender1 = pd.concat(frames1)
clean_gender1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31742 entries, 5 to 618
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movie_id    31742 non-null  int64  
 1   department  31742 non-null  object 
 2   gender      31742 non-null  float64
 3   job         31742 non-null  object 
 4   name        31742 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 1.5+ MB


Adding the rows from gender0 when name matches in gender1 records but mantaining all other information

In [18]:
gender2formerge = gender2.drop(columns=['movie_id', 'department','job'])
gender2formerge

Unnamed: 0,gender,name
0,2.0,John Lasseter
1,2.0,Joss Whedon
2,2.0,Andrew Stanton
3,2.0,Joel Cohen
7,2.0,Ralph Guggenheim
...,...,...
465077,2.0,Mark L. Lester
465078,2.0,C. Courtney Joyner
465079,2.0,Jeffrey Goldenberg
465081,2.0,João Fernandes


In [19]:
intersection_2 = pd.merge(gender0formerge, gender2formerge, how='inner', on='name')
intersection_2

Unnamed: 0,movie_id,department,job,name,gender
0,862,Visual Effects,Character Designer,Steve Johnson,2.0
1,862,Visual Effects,Character Designer,Steve Johnson,2.0
2,862,Visual Effects,Character Designer,Steve Johnson,2.0
3,862,Visual Effects,Character Designer,Steve Johnson,2.0
4,862,Visual Effects,Character Designer,Steve Johnson,2.0
...,...,...,...,...,...
17480,156310,Writing,Writer,Worth Keeter,2.0
17481,156310,Writing,Writer,Worth Keeter,2.0
17482,156310,Writing,Writer,Worth Keeter,2.0
17483,156310,Writing,Writer,Worth Keeter,2.0


In [20]:
intersection_2 = intersection_2[column_order]
intersection_2.head()

Unnamed: 0,movie_id,department,gender,job,name
0,862,Visual Effects,2.0,Character Designer,Steve Johnson
1,862,Visual Effects,2.0,Character Designer,Steve Johnson
2,862,Visual Effects,2.0,Character Designer,Steve Johnson
3,862,Visual Effects,2.0,Character Designer,Steve Johnson
4,862,Visual Effects,2.0,Character Designer,Steve Johnson


In [21]:
#Adding the reformatted dataframe to the gender2 df
frames2 = [gender2, intersection_2]
clean_gender2 = pd.concat(frames2)
clean_gender2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178357 entries, 0 to 17484
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   movie_id    178357 non-null  int64  
 1   department  178357 non-null  object 
 2   gender      178357 non-null  float64
 3   job         178357 non-null  object 
 4   name        178357 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 8.2+ MB


Adding the rows from gender0 when name matches in gender2 records but mantaining all other information

In [22]:
frames3 = [clean_gender1, clean_gender2]
clean_crew = pd.concat(frames3)


In [23]:
clean_crew =clean_crew.drop_duplicates()
clean_crew['name'] = clean_crew['name'].apply(lambda x: x.replace('"', ''))
clean_crew['name'] = clean_crew['name'].apply(lambda x: x.replace(',',''))

Changing value assign to men from 2 to 0

In [24]:
clean_crew['gender'] = np.where(clean_crew['gender'] == 2.0, 0, clean_crew['gender'])
clean_crew

Unnamed: 0,movie_id,department,gender,job,name
5,862,Production,1.0,Producer,Bonnie Arnold
70,862,Lighting,1.0,Lighting Supervisor,Sharon Calahan
101,862,Sound,1.0,Assistant Sound Editor,Susan Sanford
104,862,Production,1.0,Casting Consultant,Ruth Lambert
130,31357,Production,1.0,Producer,Deborah Schindler
...,...,...,...,...,...
17472,52039,Writing,0.0,Writer,Cruz Angeles
17473,411012,Production,0.0,Producer,John Scheinfeld
17479,353491,Crew,0.0,Stunts,Eric Brown
17480,156310,Writing,0.0,Writer,Worth Keeter


In [32]:
clean_crew

Unnamed: 0,movie_id,department,gender,job,name
5,862,Production,1.0,Producer,Bonnie Arnold
70,862,Lighting,1.0,Lighting Supervisor,Sharon Calahan
101,862,Sound,1.0,Assistant Sound Editor,Susan Sanford
104,862,Production,1.0,Casting Consultant,Ruth Lambert
130,31357,Production,1.0,Producer,Deborah Schindler
...,...,...,...,...,...
17472,52039,Writing,0.0,Writer,Cruz Angeles
17473,411012,Production,0.0,Producer,John Scheinfeld
17479,353491,Crew,0.0,Stunts,Eric Brown
17480,156310,Writing,0.0,Writer,Worth Keeter


In [25]:
# Creating dataframe with all deparments
clean_crew.to_csv("Resources/all_deparments.csv", encoding='utf8', index=False)

Creating dataframe for the Directing department only

In [26]:
directing_df = clean_crew[clean_crew["department"]=="Directing"]

directing_df.head()

Unnamed: 0,movie_id,department,gender,job,name
166,949,Directing,1.0,Script Supervisor,Cate Hardman
245,11860,Directing,1.0,Script Supervisor,Mary A. Kelly
400,5,Directing,1.0,Director,Allison Anders
682,9263,Directing,1.0,Director,Lesli Linka Glatter
1027,687,Directing,1.0,Script Supervisor,Eva Z. Cabrera


In [27]:
directing_df.to_csv("Resources/dept_directing.csv", encoding='utf8', index=False)

Creating dataframe for the Writing department only

In [28]:
writing_df = clean_crew[clean_crew["department"]=="Writing"]

writing_df

Unnamed: 0,movie_id,department,gender,job,name
138,11862,Writing,1.0,Screenplay,Nancy Meyers
215,11860,Writing,1.0,Screenplay,Barbara Benedek
272,9091,Writing,1.0,Screenplay,Karen Elise Baldwin
345,21032,Writing,1.0,Writer,Elana Lesser
390,4584,Writing,1.0,Novel,Jane Austen
...,...,...,...,...,...
17413,180850,Writing,0.0,Writer,Robert Nathan
17421,205908,Writing,0.0,Story,William Alexander
17472,52039,Writing,0.0,Writer,Cruz Angeles
17480,156310,Writing,0.0,Writer,Worth Keeter


In [29]:
writing_df.to_csv("Resources/dept_writing.csv", encoding='utf8', index=False)

Creating dataframe for the Production department only

In [30]:
production_df = clean_crew[clean_crew["department"]=="Production"]

production_df

Unnamed: 0,movie_id,department,gender,job,name
5,862,Production,1.0,Producer,Bonnie Arnold
104,862,Production,1.0,Casting Consultant,Ruth Lambert
130,31357,Production,1.0,Producer,Deborah Schindler
139,11862,Production,1.0,Producer,Nancy Meyers
153,949,Production,1.0,Casting,Bonnie Timmermann
...,...,...,...,...,...
17345,371743,Production,0.0,Associate Producer,Tim Smith
17382,54400,Production,0.0,Producer,David Blair
17419,205908,Production,0.0,Executive Producer,William Alexander
17446,134201,Production,0.0,Casting Associate,Michael Greer


In [31]:
production_df.to_csv("Resources/dept_production.csv", encoding='utf8', index=False)