In [39]:
# Read the male-name list, discarding the first 7 lines of the file
# (presumably a header/preamble -- confirm against the data file).
with open("../data/male_names.txt") as f:
    male_names = f.read().splitlines()[7:]

In [36]:
# Read the female-name list, discarding the first 7 lines of the file
# (presumably a header/preamble -- confirm against the data file).
with open("../data/female_names.txt") as f:
    female_names = f.read().splitlines()[7:]

In [283]:
import glob

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort
# them so script_paths -- and any index into it -- is stable across runs.
script_paths = sorted(glob.glob("../IMSDB_scripts/*.txt"))
len(script_paths)

1089

In [392]:
from random import randint

# randint includes BOTH endpoints, so the upper bound must be the last valid
# index: randint(0, len(script_paths)) can return len(script_paths), which
# raises IndexError on the lookup below.
i = randint(0, len(script_paths) - 1)
filepath = script_paths[i]
filepath

'../IMSDB_scripts/Dallas-Buyers-Club.txt'

In [393]:
# Load the selected script as a list of lines (line terminators removed).
with open(filepath) as f:
    data = f.read().splitlines()

In [394]:
# Number of lines in the loaded script.
len(data)

5356

In [395]:
# Indices of every line containing 'INT.' or 'EXT.'; those lines are treated
# as scene headings. enumerate already yields indices in ascending order.
headings = [
    i for i, line in enumerate(data)
    if any(marker in line for marker in ('INT.', 'EXT.'))
]
# Sentinel index so the last scene runs to the end of the script.
headings += [len(data)]

In [396]:
# Each scene is the run of lines strictly between one heading index and the
# next, so the heading line itself is not part of the scene.
scenes = [data[start + 1:stop] for start, stop in zip(headings, headings[1:])]

In [397]:
import re

from sklearn.feature_extraction.text import CountVectorizer

def parse_script(filepath):
    """Reads a script .txt file from the IMSDB scraped scripts folder
        and returns its content line by line
        Args:
            filepath : path to a *.txt script file
        Returns:
            list of str, one entry per line with line endings removed
    """
    with open(filepath) as script_file:
        return script_file.read().splitlines()

def break_scenes(script):
    """Breaks a script into scenes after identifying headings

        A heading is any line containing the substring 'INT.' or 'EXT.'
        (case-sensitive, anywhere in the line, so a word such as 'POINT.'
        also matches). Lines before the first heading are dropped, and the
        heading lines themselves are not included in the scenes.

        Args:
            script: list of str, a parsed script
        Returns:
            a list of scenes, each a list of str; empty when the script has
            no heading
    """
    # enumerate yields indices in ascending order, so no sort is needed.
    headings = [
        i for i, line in enumerate(script)
        if 'INT.' in line or 'EXT.' in line
    ]
    # Sentinel so the last scene extends to the end of the script.
    headings.append(len(script))
    # Slice the `script` argument itself; slicing the notebook-level `data`
    # would return lines from whichever script was loaded last.
    return [script[start + 1:stop] for start, stop in zip(headings, headings[1:])]

def extract_characters(script_slice, indent=15):
    """Extracts characters from a full script or a scene

        A line is taken as a character cue when it is not blank, starts with
        MORE than `indent` whitespace characters, and its first non-blank
        character is not '('. Text enclosed in (...) or [...] is removed from
        the cue. Any sufficiently indented line qualifies, so this relies on
        dialogue and action lines being indented less than the cues.

        Args:
            script_slice: list of str, a parsed script (or scene)
            indent : leading-whitespace threshold; a line needs strictly
                more than this many leading whitespace characters
        Returns:
            a list of unique, non-empty characters in order of first
            appearance
    """
    # Raw string: '\(' and '\[' are invalid escapes in a normal string literal.
    bracket_pattern = re.compile(r"\(|\)|\[|\]")
    characters = []
    seen = set()
    for line in script_slice:
        stripped = line.lstrip()
        if not stripped or stripped[0] == '(':
            continue
        if len(line) - len(stripped) <= indent:
            continue
        # Splitting on every bracket character leaves the text outside the
        # brackets at the even positions (for simple, non-nested pairs).
        pieces = bracket_pattern.split(line.strip())
        character = "".join(pieces[::2]).strip()
        # Skip cues that are empty once bracketed text is removed, and keep
        # first-seen order so the result is deterministic (a set is not).
        if character and character not in seen:
            seen.add(character)
            characters.append(character)
    return characters

def female_count(script_slice, female_names):
    """Counts the extracted characters whose title-cased name is a female name
        Args:
            script_slice: list of str, a parsed script (or scene)
            female_names : collection of female names to match against
        Returns:
            number of characters returned by extract_characters (indent=15)
            whose .title() form is in female_names
    """
    return sum(
        1
        for name in extract_characters(script_slice, indent=15)
        if name.title() in female_names
    )

def scene_tokenizer(script_slice):
    """Takes a scene and returns all the words present, minus character names

        The vocabulary is built with CountVectorizer using its English stop
        word list and without lowercasing. A token is dropped only when it is
        exactly equal to a name returned by extract_characters (indent=15).

        Args:
            script_slice: list of str
        Returns:
            a list of unique words
    """
    vectorizer = CountVectorizer(stop_words='english', lowercase=False)
    # Only the vocabulary is needed, so fit without building the (previously
    # unused) dense count matrix.
    vectorizer.fit(script_slice)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        tokens = list(vectorizer.get_feature_names_out())
    else:
        tokens = vectorizer.get_feature_names()
    characters = set(extract_characters(script_slice, indent=15))
    return [token for token in tokens if token not in characters]

In [398]:
def test_3(scenes):
    for i, scene in enumerate(scenes):
        test_passed = False
        if female_count(scene, female_names) > 1:
            test_passed = True
            tokens = scene_tokenizer(scene)
            for token in tokens:
                if token in male_names:
                    # we found a male name we should explore another scene'
                    test_passed = False
                    break
            if test_passed:
                # we went through all the tokens without setting test_passed to false
                return test_passed
    # we went through all the scenes without returing True
    return test_passed

In [399]:
# Run the check on the scenes of the currently loaded script.
test_3(scenes)

False

In [370]:
# Inspect one scene by index.
# NOTE(review): this cell's execution count (370) is lower than that of the
# cell that built `scenes` (396), so the output shown may belong to a
# different script -- re-run to confirm.
scenes[57]

['',
 'Rose is dressed for the day, and is in the middle of helping Ruth with her',
 "corset. The tight bindings do not inhibit Ruth's fury at all.",
 '',
 '                                   RUTH',
 '',
 'You are not to see that boy again, do you understand me Rose? I forbid it!',
 '',
 "Rose has her knee at the base of her mother's back and is pulling the",
 'corset strings with both hands.',
 '',
 '                                   ROSE',
 '',
 "Oh, stop it, Mother. You'll give yourself a nosebleed.",
 '',
 'Ruth pulls away from her, and crosses to the door, locking it. CLACK!',
 '',
 '                                   RUTH',
 '',
 '                             (wheeling on her)',
 '',
 "Rose, this is not a game! Our situation is precarious. You know the money's",
 'gone!',
 '',
 '                                   ROSE',
 '',
 "Of course I know it's gone. You remind me every day!",
 '',
 '                                   RUTH',
 '',
 'Your father left us nothing but a legacy of

In [250]:
# Show the tokens extracted from one scene.
# NOTE(review): executed (250) before the current `scenes` was built (396);
# the output shown may be stale -- re-run to confirm.
scene_tokenizer(scenes[144])

['Buzz',
 'Eve',
 'Something',
 'Wanda',
 'We',
 'crazy',
 'got',
 'gunshots',
 'heard',
 'run',
 'runs',
 'smack',
 'tell',
 've']

In [251]:
# Check whether the token 'Buzz' is present in the male-name list.
'Buzz' in male_names

False

In [400]:
def female_names_generator(filepath, header_lines=7):
    """Loads a list of names from a text file, skipping its leading lines
        Args:
            filepath : path to a text file with one entry per line
            header_lines : number of leading lines to discard; defaults to 7,
                the value that used to be hard-coded
        Returns:
            list of str, the remaining lines with line endings removed
            (despite the function name, a list and not a generator)
    """
    with open(filepath) as f:
        lines = f.read().splitlines()
    return lines[header_lines:]