# Diversity in the film industry: data exploration - Movie scripts

## 1. Importing libraries and dataset

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

In [2]:
# Pre-aggregated gender metrics (per film, per year, overall).
# The first CSV column is used as the index; values are rounded to 2 decimals.
film_gender = pd.read_csv('./data/script/film_gender_clean.csv', index_col=0).round(2)
year_gender = pd.read_csv('./data/script/year_gender_clean.csv', index_col=0).round(2)
total_gender = pd.read_csv('./data/script/total_gender_clean.csv', index_col=0).round(2)

# Same three levels of aggregation, split by race instead of gender.
film_race = pd.read_csv('./data/script/film_race_clean.csv', index_col=0).round(2)
year_race = pd.read_csv('./data/script/year_race_clean.csv', index_col=0).round(2)
total_race = pd.read_csv('./data/script/total_race_clean.csv', index_col=0).round(2)

# Raw metrics table; the 'imdb' column is not used in this notebook.
script = pd.read_csv('./data/actor-metrics.csv').drop(columns='imdb')

## 2. Exploring gender

### 2.1. Totals 

In [3]:
# display the overall gender totals table loaded above
total_gender

Unnamed: 0,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
0,female,51119.0,8114.0,172653.0,26351.0,29.61,30.79,31.7
1,male,121534.0,18237.0,172653.0,26351.0,70.39,69.21,68.3


### 2.2. Analyzing by film

In [4]:
# Female rows of the per-film table, ranked by share of female cast (largest first)
(
    film_gender[film_gender['gender'] == 'female']
    .sort_values(by='cast_perc_gender', ascending=False)
)

Unnamed: 0,year,film,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
14,2015,Mad Max,female,2316,505,3130,694,73.99,72.77,75.0
12,2015,Brooklyn,female,7427,1077,10591,1469,70.13,73.32,72.22
32,2017,Lady Bird,female,5949,899,7664,1148,77.62,78.31,58.33
25,2017,Call me by your name,female,1383,226,6673,1048,20.73,21.56,50.0
4,1989,Driving Miss Daisy,female,4143,674,9675,1520,42.82,44.34,50.0
16,2015,Room,female,5243,868,8373,1414,62.62,61.39,50.0
34,2017,Phantom Thread,female,2743,372,5541,783,49.5,47.51,50.0
40,2017,Three Billboards Outside Ebbing Missouri,female,3403,444,9480,1195,35.9,37.15,38.89
6,1989,Field of Dreams,female,1848,285,8933,1299,20.69,21.94,37.5
30,2017,Get Out,female,2039,425,7094,1330,28.74,31.95,36.36


In [5]:
# Female rows of the per-film table, ranked by share of female sentences (largest first)
(
    film_gender[film_gender['gender'] == 'female']
    .sort_values(by='sentences_perc', ascending=False)
)

Unnamed: 0,year,film,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
32,2017,Lady Bird,female,5949,899,7664,1148,77.62,78.31,58.33
12,2015,Brooklyn,female,7427,1077,10591,1469,70.13,73.32,72.22
14,2015,Mad Max,female,2316,505,3130,694,73.99,72.77,75.0
16,2015,Room,female,5243,868,8373,1414,62.62,61.39,50.0
34,2017,Phantom Thread,female,2743,372,5541,783,49.5,47.51,50.0
8,1989,My Left Foot,female,2158,439,4749,980,45.44,44.8,33.33
4,1989,Driving Miss Daisy,female,4143,674,9675,1520,42.82,44.34,50.0
40,2017,Three Billboards Outside Ebbing Missouri,female,3403,444,9480,1195,35.9,37.15,38.89
36,2017,The Post,female,3972,551,11135,1528,35.67,36.06,31.25
30,2017,Get Out,female,2039,425,7094,1330,28.74,31.95,36.36


In [6]:
# Top 5 films by share of words spoken by female characters
(
    film_gender[film_gender['gender'] == 'female']
    .sort_values(by='words_perc', ascending=False)
    .head()
)

Unnamed: 0,year,film,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
32,2017,Lady Bird,female,5949,899,7664,1148,77.62,78.31,58.33
14,2015,Mad Max,female,2316,505,3130,694,73.99,72.77,75.0
12,2015,Brooklyn,female,7427,1077,10591,1469,70.13,73.32,72.22
16,2015,Room,female,5243,868,8373,1414,62.62,61.39,50.0
34,2017,Phantom Thread,female,2743,372,5541,783,49.5,47.51,50.0


### 2.3. Analyzing by year

In [7]:
# Female rows of the per-year table, ranked by share of female cast (top 5)
(
    year_gender[year_gender['gender'] == 'female']
    .sort_values(by='cast_perc_gender', ascending=False)
    .head()
)

Unnamed: 0,year,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
2,2015,female,20385.0,3295.0,72918.0,10741.0,27.96,30.68,34.19
4,2017,female,21828.0,3277.0,66010.0,9902.0,33.07,33.09,32.32
0,1989,female,8906.0,1542.0,33725.0,5708.0,26.41,27.01,24.49


In [8]:
# Female rows of the per-year table, ranked by share of female sentences (top 5)
(
    year_gender[year_gender['gender'] == 'female']
    .sort_values(by='sentences_perc', ascending=False)
    .head()
)

Unnamed: 0,year,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
4,2017,female,21828.0,3277.0,66010.0,9902.0,33.07,33.09,32.32
2,2015,female,20385.0,3295.0,72918.0,10741.0,27.96,30.68,34.19
0,1989,female,8906.0,1542.0,33725.0,5708.0,26.41,27.01,24.49


In [9]:
# Female rows of the per-year table, ranked by share of female words (top 5)
(
    year_gender[year_gender['gender'] == 'female']
    .sort_values(by='words_perc', ascending=False)
    .head()
)

Unnamed: 0,year,gender,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_gender
4,2017,female,21828.0,3277.0,66010.0,9902.0,33.07,33.09,32.32
2,2015,female,20385.0,3295.0,72918.0,10741.0,27.96,30.68,34.19
0,1989,female,8906.0,1542.0,33725.0,5708.0,26.41,27.01,24.49


## 3. Exploring race

### 3.1. Totals

In [10]:
# display the overall race totals table loaded above
total_race

Unnamed: 0,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
0,POC,15172.0,2377.0,172653.0,26351.0,8.79,9.02,11.32
1,White,157481.0,23974.0,172653.0,26351.0,91.21,90.98,88.68


### 3.2. Analyzing by film

In [11]:
# POC rows of the per-film table, ranked by share of POC cast (largest first)
(
    film_race[film_race['race_simple'] == 'POC']
    .sort_values(by='cast_perc_race', ascending=False)
)

Unnamed: 0,year,film,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
24,2017,Get Out,POC,3309,653,7094,1330,46.65,49.1,45.45
17,2015,The Martian,POC,2589,365,10550,1507,24.54,24.22,35.71
3,1989,Driving Miss Daisy,POC,3531,544,9675,1520,36.5,35.79,33.33
12,2015,Room,POC,616,92,8373,1414,7.36,6.51,25.0
10,2015,Mad Max,POC,320,64,3130,694,10.22,9.22,16.67
26,2017,Lady Bird,POC,292,49,7664,1148,3.81,4.27,16.67
33,2017,Three Billboards Outside Ebbing Missouri,POC,782,76,9480,1195,8.25,6.36,16.67
15,2015,The Big Short,POC,515,71,13241,1782,3.89,3.98,13.04
5,1989,Field of Dreams,POC,1369,207,8933,1299,15.33,15.94,12.5
21,2017,Darkest Hour,POC,147,27,8034,1077,1.83,2.51,9.09


In [12]:
# Top 5 films by share of sentences spoken by POC characters
(
    film_race[film_race['race_simple'] == 'POC']
    .sort_values(by='sentences_perc', ascending=False)
    .head()
)

Unnamed: 0,year,film,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
24,2017,Get Out,POC,3309,653,7094,1330,46.65,49.1,45.45
3,1989,Driving Miss Daisy,POC,3531,544,9675,1520,36.5,35.79,33.33
17,2015,The Martian,POC,2589,365,10550,1507,24.54,24.22,35.71
5,1989,Field of Dreams,POC,1369,207,8933,1299,15.33,15.94,12.5
31,2017,The Shape of Water,POC,1319,201,8036,1347,16.41,14.92,9.09


In [13]:
# Top 5 films by share of words spoken by POC characters
(
    film_race[film_race['race_simple'] == 'POC']
    .sort_values(by='words_perc', ascending=False)
    .head()
)

Unnamed: 0,year,film,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
24,2017,Get Out,POC,3309,653,7094,1330,46.65,49.1,45.45
3,1989,Driving Miss Daisy,POC,3531,544,9675,1520,36.5,35.79,33.33
17,2015,The Martian,POC,2589,365,10550,1507,24.54,24.22,35.71
31,2017,The Shape of Water,POC,1319,201,8036,1347,16.41,14.92,9.09
5,1989,Field of Dreams,POC,1369,207,8933,1299,15.33,15.94,12.5


### 3.3. Analyzing by year

In [14]:
# years with highest % of POC cast (per-year table, top 5)
(
    year_race[year_race['race_simple'] == 'POC']
    .sort_values(by='cast_perc_race', ascending=False)
    .head()
)

Unnamed: 0,year,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
4,2017,POC,6024.0,1019.0,66010.0,9902.0,9.13,10.29,13.13
2,2015,POC,4040.0,592.0,72918.0,10741.0,5.54,5.51,11.11
0,1989,POC,5108.0,766.0,33725.0,5708.0,15.15,13.42,8.16


In [15]:
# years with highest % of POC sentences (per-year table, top 5)
(
    year_race[year_race['race_simple'] == 'POC']
    .sort_values(by='sentences_perc', ascending=False)
    .head()
)

Unnamed: 0,year,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
0,1989,POC,5108.0,766.0,33725.0,5708.0,15.15,13.42,8.16
4,2017,POC,6024.0,1019.0,66010.0,9902.0,9.13,10.29,13.13
2,2015,POC,4040.0,592.0,72918.0,10741.0,5.54,5.51,11.11


In [16]:
# years with highest % of POC words (per-year table, top 5)
(
    year_race[year_race['race_simple'] == 'POC']
    .sort_values(by='words_perc', ascending=False)
    .head()
)

Unnamed: 0,year,race_simple,words,sentences,total_words,total_sentences,words_perc,sentences_perc,cast_perc_race
0,1989,POC,5108.0,766.0,33725.0,5708.0,15.15,13.42,8.16
4,2017,POC,6024.0,1019.0,66010.0,9902.0,9.13,10.29,13.13
2,2015,POC,4040.0,592.0,72918.0,10741.0,5.54,5.51,11.11


## 4. Looking into female POC

In [17]:
# Sentences attributed to female POC rows of 'script', summed per (year, film)
is_female_poc = (script['race_simple'] == 'POC') & (script['gender'] == 'female')
film_female_POC = (
    script.loc[is_female_poc, ['year', 'film', 'sentences']]
    .groupby(['year', 'film'], as_index=False)
    .sum()
)
film_female_POC

Unnamed: 0,year,film,sentences
0,1989,driving-miss-daisy,34.0
1,2015,mad-max,64.0
2,2015,room,63.0
3,2015,the-big-short,51.0
4,2017,get-out,48.0
5,2017,lady-bird,20.0
6,2017,the-post,13.0
7,2017,the-shape-of-water,201.0
8,2017,three-billboards-outside-ebbing-missouri,11.0


In [18]:
# {film: total sentences} lookup built from every row of 'script'
film_sentences = dict(script.groupby('film')['sentences'].sum())
film_sentences

{'born-on-the-fourth-of-july': 620.0,
 'bridge-of-spies': 1351.0,
 'brooklyn': 1469.0,
 'call-me-by-your-name': 1048.0,
 'darkest-hour': 1077.0,
 'dead-poets-society': 1289.0,
 'driving-miss-daisy': 1520.0,
 'dunkirk': 446.0,
 'field-of-dreams': 1299.0,
 'get-out': 1330.0,
 'lady-bird': 1148.0,
 'mad-max': 694.0,
 'my-left-foot': 980.0,
 'phantom-thread': 783.0,
 'room': 1414.0,
 'spotlight': 1975.0,
 'the-big-short': 1782.0,
 'the-martian': 1507.0,
 'the-post': 1528.0,
 'the-revenant': 549.0,
 'the-shape-of-water': 1347.0,
 'three-billboards-outside-ebbing-missouri': 1195.0}

In [19]:
# attach each film's overall sentence count via the film_sentences lookup
film_female_POC['total_sentences'] = film_female_POC['film'].map(film_sentences)

# female POC sentences as a percentage of all sentences in the film
film_female_POC['female_POC_perc'] = (
    film_female_POC['sentences']
    .div(film_female_POC['total_sentences'])
    .mul(100)
)

In [20]:
# One row per (year, film) with the total number of sentences.
# NOTE: this rebinds 'script' to the aggregated table, so only the
# 'year', 'film' and 'sentences' columns remain available afterwards.
script = script.groupby(['year', 'film'], as_index=False)['sentences'].sum()

In [21]:
# merging with 'script' to keep movies with 0% sentences by female POC
film_female_POC = pd.merge(script, film_female_POC, how='left', on=['film', 'year'])

# movies without any female POC rows have no match in the left merge and
# come back as NaN; make them an explicit 0% instead of relying on a
# groupby-sum to coerce NaN to 0 (which also summed the 'year' column)
film_female_POC['female_POC_perc'] = film_female_POC['female_POC_perc'].fillna(0)

# keep only the columns of interest, ordered alphabetically by film
film_female_POC = (
    film_female_POC[['film', 'year', 'female_POC_perc']]
    .sort_values(['film', 'year'])
    .reset_index(drop=True)
)

In [22]:
# write the per-film female POC percentages to disk
film_female_POC.to_csv(path_or_buf='./data/script/film_female_POC.csv')

In [23]:
# percentage of sentences by female POC characters, one row per movie
# (0.0 for movies with no female POC sentences)
film_female_POC

Unnamed: 0,film,year,female_POC_perc
0,born-on-the-fourth-of-july,1989,0.0
1,bridge-of-spies,2015,0.0
2,brooklyn,2015,0.0
3,call-me-by-your-name,2017,0.0
4,darkest-hour,2017,0.0
5,dead-poets-society,1989,0.0
6,driving-miss-daisy,1989,2.236842
7,dunkirk,2017,0.0
8,field-of-dreams,1989,0.0
9,get-out,2017,3.609023
