In [1]:
# Import dependencies and files
import pandas as pd

In [2]:
# Load Resources/dept_writing.csv into a DataFrame and display it
df = pd.read_csv(filepath_or_buffer="Resources/dept_writing.csv")

df

Unnamed: 0,movie_id,department,gender,job,name
0,11862,Writing,1.0,Screenplay,Nancy Meyers
1,11860,Writing,1.0,Screenplay,Barbara Benedek
2,9091,Writing,1.0,Screenplay,Karen Elise Baldwin
3,21032,Writing,1.0,Writer,Elana Lesser
4,4584,Writing,1.0,Novel,Jane Austen
...,...,...,...,...,...
36836,180850,Writing,0.0,Writer,Robert Nathan
36837,205908,Writing,0.0,Story,William Alexander
36838,52039,Writing,0.0,Writer,Cruz Angeles
36839,156310,Writing,0.0,Writer,Worth Keeter


In [3]:
# Order the rows by ascending movie_id so each movie's rows sit together
sorted_df = df.sort_values(by="movie_id", ascending=True)

sorted_df

Unnamed: 0,movie_id,department,gender,job,name
3601,5,Writing,0.0,Writer,Robert Rodriguez
3602,5,Writing,0.0,Writer,Quentin Tarantino
6,5,Writing,1.0,Writer,Allison Anders
3600,5,Writing,0.0,Writer,Alexandre Rockwell
4115,6,Writing,0.0,Screenplay,Lewis Colick
...,...,...,...,...,...
3515,464111,Writing,1.0,Screenplay,Terri Tatchell
36306,464111,Writing,0.0,Screenplay,Neill Blomkamp
36328,467731,Writing,0.0,Writer,Reginald Rose
23358,469172,Writing,0.0,Writer,Raúl Ruiz


In [4]:
# Sum the gender column for each movie_id.
# "gender" is selected explicitly before aggregating: the frame also holds
# text columns (department, job, name), and pandas >= 2.0 no longer drops
# them silently in groupby().sum() -- it concatenates their strings instead,
# which would carry unwanted columns into the merge and the exported CSV.
# NOTE(review): the later "percent_female" column implies 1.0 marks a female
# entry -- confirm against the data source.
sum_gender = sorted_df.groupby("movie_id")[["gender"]].sum()

sum_gender

Unnamed: 0_level_0,gender
movie_id,Unnamed: 1_level_1
5,1.0
6,0.0
11,0.0
12,0.0
13,0.0
...,...
463800,0.0
463906,0.0
464111,1.0
467731,0.0


In [5]:
# Give the summed column its own name so it is not confused with the
# per-movie count that is also derived from the "gender" column
sum_gender = sum_gender.rename(columns={"gender": "sum_gender"})

sum_gender

Unnamed: 0_level_0,sum_gender
movie_id,Unnamed: 1_level_1
5,1.0
6,0.0
11,0.0
12,0.0
13,0.0
...,...
463800,0.0
463906,0.0
464111,1.0
467731,0.0


In [6]:
# Count the non-null entries of every column for each movie_id
total_gender = sorted_df.groupby(by="movie_id").count()

total_gender

Unnamed: 0_level_0,department,gender,job,name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,4,4,4,4
6,2,2,2,2
11,1,1,1,1
12,6,6,6,6
13,2,2,2,2
...,...,...,...,...
463800,1,1,1,1
463906,1,1,1,1
464111,2,2,2,2
467731,1,1,1,1


In [7]:
# Keep only the gender count; the other per-column counts are not needed.
# Reassign instead of dropping in place, and ignore labels that are already
# missing, so re-running this cell does not raise KeyError once the columns
# have been removed.
total_gender = total_gender.drop(
    columns=["department", "job", "name"], errors="ignore"
)

total_gender

Unnamed: 0_level_0,gender
movie_id,Unnamed: 1_level_1
5,4
6,2
11,1
12,6
13,2
...,...
463800,1
463906,1
464111,2
467731,1


In [8]:
# Label the count column so it is distinguishable from the summed gender column
total_gender = total_gender.rename(columns={"gender": "total_gender"})

total_gender

Unnamed: 0_level_0,total_gender
movie_id,Unnamed: 1_level_1
5,4
6,2
11,1
12,6
13,2
...,...
463800,1
463906,1
464111,2
467731,1


In [9]:
# Inner-join the per-movie sum and count on movie_id so each row holds both
joined_gender = sum_gender.merge(total_gender, on="movie_id", how="inner")

joined_gender

Unnamed: 0_level_0,sum_gender,total_gender
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5,1.0,4
6,0.0,2
11,0.0,1
12,0.0,6
13,0.0,2
...,...,...
463800,0.0,1
463906,0.0,1
464111,1.0,2
467731,0.0,1


In [10]:
# Add "percent_female" = sum_gender / total_gender for each movie.
# The value is stored as a fraction (e.g. 1 of 4 -> 0.25), not multiplied by 100.
joined_gender["percent_female"] = joined_gender["sum_gender"].div(
    joined_gender["total_gender"]
)

In [11]:
# Display the frame to check the percent_female column
joined_gender

Unnamed: 0_level_0,sum_gender,total_gender,percent_female
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,1.0,4,0.25
6,0.0,2,0.00
11,0.0,1,0.00
12,0.0,6,0.00
13,0.0,2,0.00
...,...,...,...
463800,0.0,1,0.00
463906,0.0,1,0.00
464111,1.0,2,0.50
467731,0.0,1,0.00


In [12]:
# Move movie_id out of the index into an ordinary column
# solution from https://stackoverflow.com/questions/20461165/how-to-convert-index-of-a-pandas-dataframe-into-a-column
joined_gender = (
    joined_gender
    .rename_axis(index="movie_id")
    .reset_index(drop=False)
)

In [13]:
# Spot-check 25 random rows; the fixed random_state keeps the displayed
# sample identical on every Restart & Run All.
joined_gender.sample(n=25, random_state=42)

Unnamed: 0,movie_id,sum_gender,total_gender,percent_female
7585,23134,0.0,2,0.0
8077,25502,0.0,2,0.0
10516,36349,0.0,2,0.0
15608,67221,1.0,4,0.25
16277,74753,0.0,2,0.0
20316,168114,0.0,1,0.0
23862,411009,0.0,1,0.0
8297,26326,0.0,3,0.0
13711,50541,0.0,1,0.0
5596,15135,0.0,2,0.0


In [14]:
# Write the per-movie table to CSV without the row index
joined_gender.to_csv(
    path_or_buf="Resources/joined_gender_writing.csv",
    index=False,
    encoding='utf8',
)