# Gender Equality in Film: The Bechdel Test

Based on the Bechdel Test criteria:
    - More than two female characters
    - Female characters talk to each other
    - Female characters talk about something other than men

In [144]:
import nltk
modules = ["punkt", "words", "stopwords", "averaged_perceptron_tagger", "maxent_ne_chunker"]
for module in modules:
    nltk.download(module)
    
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from datascience import *
import numpy as np

import locale
import re
import csv
import string
from collections import Counter

[nltk_data] Downloading package punkt to /home/vinitra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/vinitra/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vinitra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vinitra/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vinitra/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [145]:
bechdel = Table.read_table('datasets/2010_2014MoviesTested.csv')
passes_bechdel = bechdel.where(1, 'Yes')
fails_bechdel = bechdel.where(1, 'No')

bechdel.show(3)

Movie Title,Passes Bechdel Test?,Number of Criteria Passed,Clarity of Pass,Has at least two [named] women in it,The women talk to each other,The women talk about something besides a man,Genre,Director,Main Actors,Production Comapany,Plot Keywords (top 10),Year,Rating,Runtime (min),Plot,Language,Country,Image,IMDb Popularity Score,Award Popularity,IMDb rating,IMDb votes,Nominations,Wins,IMDB Link
[REC]3 Genesis (2012),Yes,3,Clear,Pass,Pass,Pass,Horror,Paco Plaza,"Leticia Dolera, Diego Martín, Ismael Martínez, Àlex Monner","Canal+ España, Filmax, Generalitat de Catalunya - Instit ...","wedding, infection, church, third part, female stockinge ...",2012,Restricted,80,A couple's wedding day turns horrific as some of the gue ...,"Spanish, Catalan, French",Spain,http://ia.media-imdb.com/images/M/MV5BNDM3OTUzMTA1OF5BMl ...,101266,21.0,5.1,19856,6.0,3.0,http://www.imdb.com/title/tt1649444/
+1 (2013),Yes,3,Dubious,Pass,Pass,Pass,"Sci-Fi, Thriller",Dennis Iliadis,"Rhys Wakefield, Logan Miller, Ashley Hinshaw, Natalie Hall","Process Films, Process Media","party, college, hero kills a woman, woman stabbed, nude ...",2013,Not Rated,95,"Three college friends hit the biggest party of the year, ...",English,United States,http://ia.media-imdb.com/images/M/MV5BMTQwOTA5Mzc3Ml5BMl ...,22693,,5.5,4126,,,http://www.imdb.com/title/tt2395385/
11 Flowers (2011),Yes,3,Dubious,Pass,Pass,Pass,Drama,Xiaoshuai Wang,"Gang Cao, Shiping Cao, Wenqing Liu, Yihao Lou",,"childhood friend, pregnant from rape, sentenced to death ...",2011,Not Rated,110,,"Mandarin, Shanghainese","China, France",http://ia.media-imdb.com/images/M/MV5BMTQwOTYyMTU1NF5BMl ...,2583,6.0,7.0,369,6.0,,http://www.imdb.com/title/tt2042432/


In [146]:
proportion_of_passes = passes_bechdel.num_rows/bechdel.num_rows
print(str(round(proportion_of_passes*100, 2)) + "% movies from our dataset pass the bechdel test")

63.32% movies from our dataset pass the bechdel test


## Genre Analysis

For the movies that don't pass, do a frequency count on the genres to compare the most common entries

In [147]:

def list_flattening(l):
    return [item for sublist in l for item in sublist]

def clean_words(table, col):
    return list_flattening([genre.split(", ") for genre in table[col]])
    

In [148]:
f_genres = clean_words(fails_bechdel, 'Genre')
f_genre_count = Table().with_column('Genre', f_genres).group('Genre').sort('count', descending=True)
f_genre_count = f_genre_count.with_column('f proportion', f_genre_count['count']/sum(f_genre_count['count']))
f_genre_count

Genre,count,f proportion
Drama,248,0.203112
Comedy,188,0.153972
Action,139,0.113841
Thriller,111,0.0909091
Crime,84,0.0687961
Adventure,83,0.0679771
Sci-Fi,58,0.047502
Romance,50,0.04095
Horror,39,0.031941
Animation,38,0.031122


In [149]:
p_genres = clean_genres(passes_bechdel)
p_genre_count = Table().with_column('Genre', p_genres).group('Genre').sort('count', descending=True)
p_genre_count = p_genre_count.with_column('p proportion', p_genre_count['count']/sum(p_genre_count['count']))
p_genre_count

Genre,count,p proportion
Drama,506,0.250495
Comedy,326,0.161386
Romance,149,0.0737624
Thriller,146,0.0722772
Horror,125,0.0618812
Action,119,0.0589109
Adventure,108,0.0534653
Crime,79,0.0391089
Sci-Fi,76,0.0376238
Mystery,63,0.0311881


Let's find the genres with the biggest disparities between passing and not passing!

In [222]:
# The genres listed below are the genres which have a higher proportion of failed bechdel
# movies than passed bechdel movies

genre_count = p_genre_count.join("Genre", f_genre_count)
genre_count.append_column('disparity', (genre_count['f proportion'] - genre_count['p proportion'])*1000)
genre_count.sort('disparity', descending=True)

Genre,count,p proportion,count_2,f proportion,disparity
Action,119,0.0589109,139,0.113841,54.9302
Crime,79,0.0391089,84,0.0687961,29.6872
Thriller,146,0.0722772,111,0.0909091,18.6319
Adventure,108,0.0534653,83,0.0679771,14.5117
Sci-Fi,76,0.0376238,58,0.047502,9.87829
War,7,0.00346535,11,0.00900901,5.54366
Western,1,0.00049505,7,0.00573301,5.23796
Biography,43,0.0212871,32,0.026208,4.9209
Sport,7,0.00346535,10,0.00819001,4.72466
Short,2,0.000990099,5,0.004095,3.10491


In [225]:
# Exporting to CSV
genre_list = genre_count.select('disparity', 'Genre')
clean = lambda x: int(np.round(x))
genre_list = genre_list.with_column('disparity', genre_list.apply(clean, "disparity"))
genre_list.sort('disparity', descending=True).to_df().to_csv('genre_list.csv', index = False)

#### Failing Bechdel Test Genres
<img src="word_cloud_genre_n.png">

#### Genres of Movies that Pass Bechdel Test
<img src="word_cloud_genre_p.png">

## Plot Keywords

What are the words in the top 10 plot keywords that most indicate a negative bechdel result
    - which words are the most distinctly negative words

In [185]:
# import the snowball stemmer from the nltk to make each word their root
from nltk import stem
lancaster = stem.lancaster.LancasterStemmer()
snowball = stem.snowball.EnglishStemmer()
porter = stem.porter.PorterStemmer()

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_punct_words(lst):
    for i in range(len(lst)):
        lst[i] = [j.lower() for j in lst[i].split()]
        lst[i] = [j.lower() for j in lst[i] if j.lower() not in stop_words]
        lst[i] = [snowball.stem(k) for k in lst[i] if k != "nan"]
    return lst


In [187]:
tables = [bechdel, passes_bechdel, fails_bechdel]
k_words = []
i = 0
for tbl in tables:
    k_words.append(remove_punct_words(clean_words(tables[i], 'Plot Keywords (top 10)')))
    i += 1

In [188]:
for i in range(len(k_words)):
    k_words[i] = list_flattening(k_words[i])

In [189]:
k_dicts = []
k_tables = []
for i in range(len(k_words)):
    dict_vals = Counter(k_words[i])
    dict_vals['titl'] = 0
    k_dicts.append(dict_vals)
    k_tables.append(Table().with_columns("Stemmed Words", dict_vals.keys(),
                                        "Count", dict_vals.values()))

In [190]:
passed_kw = k_tables[1]
failed_kw = k_tables[2]
failed_kw = failed_kw.sort('Count', descending=True).with_column('Proportion', \
                                                failed_kw['Count']/sum(failed_kw['Count']))
failed_kw

Stemmed Words,Count,Proportion
polic,55,0.000710328
death,52,0.000142066
nuditi,51,0.000142066
relationship,48,0.000710328
femal,47,0.00170479
love,40,0.000142066
refer,38,0.000284131
male,38,0.000142066
word,38,0.000142066
murder,37,0.000284131


In [191]:
passed_kw = passed_kw.sort('Count', descending=True).with_column('Proportion', \
                                                 passed_kw['Count']/sum(passed_kw['Count']))
passed_kw

Stemmed Words,Count,Proportion
relationship,160,0.000253507
femal,147,8.45023e-05
nuditi,88,0.00101403
woman,77,8.45023e-05
school,73,0.000169005
sex,71,0.00160554
refer,70,0.000676018
love,68,8.45023e-05
daughter,67,0.000169005
mother,64,8.45023e-05


### Which words are most distinctly negative?

In [206]:
joined_kw = k_tables[0].join("Stemmed Words", passed_kw).join("Stemmed Words", failed_kw)
joined_kw = joined_kw.with_column('Negative Emphasis', joined_kw['Count_3']/joined_kw['Count'])
joined_kw = joined_kw.with_column('Negative Disparity', (joined_kw['Proportion_2'] - joined_kw['Proportion'])*joined_kw['Count']*100)
joined_kw = joined_kw.sort('Negative Disparity', descending=True)
joined_kw

  from ipykernel import kernelapp as app


Stemmed Words,Count,Count_2,Proportion,Count_3,Proportion_2,Negative Emphasis,Negative Disparity
sister,53,47,8.45023e-05,6,0.00667708,0.113208,34.9407
femal,194,147,8.45023e-05,47,0.00170479,0.242268,31.4335
war,28,18,8.45023e-05,10,0.00355164,0.357143,9.70799
relationship,208,160,0.000253507,48,0.000710328,0.230769,9.50188
base,74,49,0.000253507,25,0.00142066,0.337838,8.63691
open,26,13,0.000253507,13,0.00355164,0.5,8.57515
attempt,26,20,0.000338009,6,0.00340958,0.230769,7.98607
polic,115,60,8.45023e-05,55,0.000710328,0.478261,7.197
night,16,9,8.45023e-05,7,0.00440403,0.4375,6.91125
blood,43,31,8.45023e-05,12,0.00156272,0.27907,6.35634


In [221]:
# Exporting to csv
word_list = joined_kw.select('Negative Disparity', 'Stemmed Words').take(range(50))
clean = lambda x: int(np.round(x))
word_list = word_list.with_column('Negative Disparity', word_list.apply(clean, "Negative Disparity"))
word_list.to_df().to_csv('word_list', index = False)

<img src="word_cloud_negative_disparity.png">

## Time Series Analysis

Compare the years of those that passed with those that didn't pass

## Conclusion of Analysis

This is what we think -- 