In [1]:
import os
import pickle
import re
from collections import defaultdict, Counter
from io import StringIO
from itertools import groupby

import gender_guesser.detector as genderDetector
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from nltk.tag import StanfordNERTagger

from IMSDB_Parser import IMSDB_Parser

import logging
# Handle on the root logger (getLogger() with no name returns the root).
logger = logging.getLogger()
# Send INFO-and-above records to parsing.log instead of the notebook output.
logging.basicConfig(filename="parsing.log", level=logging.INFO)


In [2]:
%load_ext watermark

In [3]:
%watermark -a 'Amr Mashlah' -d -u -v -p nltk,lxml,io,bs4,requests,gender_guesser,collections,re,itertools,logging,pickle,pandas

Amr Mashlah 
last updated: 2018-12-31 

CPython 3.5.6
IPython 6.5.0

nltk 3.3
lxml unknown
io unknown
bs4 4.6.3
requests 2.19.1
gender_guesser unknown
collections unknown
re 2.2.1
itertools unknown
logging 0.5.1.2
pickle unknown
pandas 0.23.4


In [4]:
# Base URL of the script site and the path of its "all scripts" index page.
IMSDB_URL = "https://www.imsdb.com"
ALL_SCRIPTS = "/all%20scripts/"

# Locations of the Stanford NER jar and classifier model. The defaults are the
# original machine-specific paths; set STANFORD_NER_JAR / STANFORD_NER_MODEL in
# the environment to run the notebook elsewhere without editing this cell.
NER_JAR = os.environ.get(
    'STANFORD_NER_JAR',
    '/Users/amr/stanford-ner-2018-10-16/stanford-ner.jar',
)
NLTK_MODEL = os.environ.get(
    'STANFORD_NER_MODEL',
    '/Users/amr/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
)

# Shared tagger instance used by find_first_names().
st = StanfordNERTagger(NLTK_MODEL, NER_JAR, encoding='utf8')

#### Load cached results

If available, load the cached gender-detector results and the results of Bechdel tests that were already computed.

In [24]:
def _load_cache(path):
    '''
    Best-effort load of a pickled cache file.

    Parameters
    ----------
    path: path of the pickle file

    Returns
    -------
    The unpickled object, or None when the file is missing or unreadable.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that this notebook wrote itself.
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        # No cache yet (first run): not an error.
        return None
    except Exception:
        # Corrupted/incompatible cache: keep going with an empty cache, but
        # leave a trace in the log instead of failing silently. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        logging.exception('could not load cache file %s, ignoring it', path)
        return None


# Cache of first name -> gender label, filled by get_gender().
cached_genders = _load_cache('gender_dict.pk')
gender_dict = cached_genders if isinstance(cached_genders, dict) else {}

# Per-movie results; movies already present here are skipped by the parsing loop.
movies_results = defaultdict(dict)

saved_movies_results = _load_cache('results.pk')
if isinstance(saved_movies_results, dict):
    movies_results.update(saved_movies_results)

## Bechdel test functions

In [6]:
def get_gender(x):
    '''
    Guess the gender associated with a first name, caching the answer.

    Parameters
    ----------
    x: first name

    Returns
    -------
    gender: string
        'male' or 'female' ('mostly_male' / 'mostly_female' are folded into
        these); any other label is returned exactly as the detector produced
        it (gender_guesser documents 'andy' and 'unknown').

    '''
    cached = gender_dict.get(x)
    if cached:
        return cached

    # Building a Detector is expensive, so create it once on first use and
    # keep it on the function object instead of rebuilding it for every
    # uncached name.
    detector = getattr(get_gender, '_detector', None)
    if detector is None:
        detector = get_gender._detector = genderDetector.Detector()

    # Names are normalised to "Capitalized" form before the lookup.
    gender = detector.get_gender(x.lower().capitalize())
    if gender == 'mostly_male':
        gender = 'male'
    elif gender == 'mostly_female':
        gender = 'female'

    # Cache under the name as given by the caller (not the normalised form).
    gender_dict[x] = gender

    return gender


def find_first_names(text):
    '''
    Collect the first token of every PERSON entity found in a text.

    Parameters
    ----------
    text: Plain text; it is split on whitespace before tagging

    Returns
    -------
    persons: list with the leading token of each consecutive run of
             tokens tagged "PERSON", in order of appearance

    '''
    tagged_tokens = st.tag(text.split())
    # groupby yields one run per stretch of identically tagged tokens;
    # the first (token, tag) pair of a PERSON run holds the first name.
    return [
        next(run)[0]
        for label, run in groupby(tagged_tokens, key=lambda pair: pair[1])
        if label == "PERSON"
    ]

def male_mentioned(text):
    '''
    Scans for male names in a body of text

    Parameters
    ----------
    text: plain text

    Returns
    -------
    Boolean:
    True if any male name was identified
    '''
    # One gender lookup per name, stopping at the first male name found.
    # 'mostly_male' is still accepted in case a previously saved gender
    # cache contains that unfolded label.
    return any(
        get_gender(name) in ('male', 'mostly_male')
        for name in find_first_names(text)
    )
        
    

def bechdel_condition(characters):
    '''
    Check whether a group of characters meets the basic Bechdel condition:
    at least two characters, none identified as male, and more than one
    identified as female.

    Parameters
    ----------
    characters: list of strings (first names)

    Returns
    -------
    Boolean: True if the condition is satisfied

    '''
    # Guard: a single character (or none) can never satisfy the condition.
    if len(characters) < 2:
        return False

    genders = [get_gender(name) for name in characters]

    # Any male character disqualifies the group.
    if 'male' in genders:
        return False

    return genders.count('female') > 1


def bechdel_test(movie_parser, scenes, characters_sequence, dialog_condition=False, dialog=''):
    '''
    Performs full Bechdel test with the option to include or exclude dialog condition

    Parameters
    ----------
    movie_parser: parser object
            must provide dialog_from_scene(scene); only used when
            dialog_condition is True
    scenes: list of lists
            list of scenes, each scene is a list of lines
    characters_sequence: list of lists
            contains actors appearing in each scene
    dialog_condition: boolean, optional,
            whether to also require a dialog in the scene that does not
            mention a male name
    dialog: string, optional
            unused; kept so existing callers remain valid

    Returns
    -------
    Boolean:
        Whether it passes the test or not
    '''
    # scenes[1:] pairs scenes[k + 1] with characters_sequence[k]; presumably
    # scenes[0] is a preamble without a character list -- TODO confirm against
    # IMSDB_Parser. The logged scene number is the index into this pairing.
    for i, (scene, characters) in enumerate(zip(scenes[1:], characters_sequence)):
        if not bechdel_condition(characters):
            continue

        logging.info('Basic Bechdel condition satisfied with: {}, scene number: {}'.\
                  format(" and ".join(characters), i))

        if not dialog_condition:
            return True

        dialog = movie_parser.dialog_from_scene(scene)
        # A scene only passes the dialog condition when it actually has a
        # dialog and that dialog mentions no male name. Previously an empty
        # dialog fell through to an unconditional `return True`.
        if not dialog or male_mentioned(dialog):
            continue

        logging.info('matching dialog: %s', dialog)
        return True

    return False

### Get the titles of all scripts

In [7]:
# Fetch the "all scripts" index page. The timeout keeps the cell from hanging
# on a stalled connection, and raise_for_status() stops the run on an HTTP
# error instead of parsing an error page into a wrong or empty title list.
r = requests.get(IMSDB_URL + ALL_SCRIPTS, timeout=30)
r.raise_for_status()
lxml_tree = etree.parse(StringIO(r.text), etree.HTMLParser())
# Titles are taken from the text of every <a> that is a direct child of a <p>.
movie_titles = [x.text for x in lxml_tree.xpath('''//p/a''')]

### Parse each movie script and run the Bechdel tests



In [26]:
for index, title in enumerate(movie_titles):

    # Results restored from the cache (or computed earlier) are not redone.
    if title in movies_results:
        continue

    logging.info('parsing movie number: {}, title: {}'.format(index, title))

    parser = IMSDB_Parser(title)

    # Nothing is recorded for a movie whose parser produced no lines.
    if not parser.lines:
        continue

    cast = parser.characters
    movie_scenes = parser.scenes
    cast_per_scene = parser.characters_sequence
    released = parser.release_date

    # Basic test: scene composition only; full test: also checks the dialog.
    passes_basic = bechdel_test(parser, movie_scenes, cast_per_scene)
    passes_full = bechdel_test(parser, movie_scenes, cast_per_scene, dialog_condition=True)

    entry = movies_results[title]
    entry['basic'] = passes_basic
    entry['full'] = passes_full
    entry['characters_seq'] = cast_per_scene
    entry['characters'] = cast
    entry['release_date'] = released
    

### Saving results and classified genders

In [27]:
# Store the per-movie results as a plain dict rather than the defaultdict.
with open('results.pk', 'wb') as results_file:
    pickle.dump(dict(movies_results), results_file)

# Store the name -> gender cache so later runs can skip known names.
with open('gender_dict.pk', 'wb') as genders_file:
    pickle.dump(gender_dict, genders_file)