In [39]:
# Load the male names file, dropping its first 7 lines
# (presumably a header block in the data file — TODO confirm).
with open("../data/male_names.txt") as f:
    male_names = f.read().splitlines()[7:]

In [36]:
# Load the female names file, dropping its first 7 lines
# (presumably a header block in the data file — TODO confirm).
with open("../data/female_names.txt") as f:
    female_names = f.read().splitlines()[7:]

In [283]:
# Collect the path of every .txt file directly under ../IMSDB_scripts.
import glob
script_paths = glob.glob("../IMSDB_scripts/*.txt")
# Display how many script paths were matched.
len(script_paths)

1089

'../IMSDB_scripts/Midnight-Express.txt'

In [407]:
from random import randint
# randint includes BOTH endpoints, so the upper bound must be the last valid
# index; len(script_paths) itself would be out of range for script_paths[i].
i = randint(0, len(script_paths) - 1)
# Uncomment the next line (and drop the pinned path) to work on a random script.
#filepath = script_paths[i]
filepath = '../IMSDB_scripts/127-Hours.txt'
filepath

'../IMSDB_scripts/127-Hours.txt'

In [408]:
# Read the selected script into a list of lines (line endings removed).
with open(filepath) as f:
    data = f.read().splitlines()

In [409]:
# Display the number of lines in the loaded script.
len(data)

5852

In [410]:
# Indices of every line containing a scene-heading marker ('INT.' or 'EXT.');
# enumerate yields them in ascending order already.
headings = [
    line_no
    for line_no, text in enumerate(data)
    if 'INT.' in text or 'EXT.' in text
]
# Sentinel so the last scene extends to the end of the script.
headings.append(len(data))

In [411]:
# Each scene is the run of lines strictly between one heading and the next
# (the heading line itself is excluded).
scenes = [data[start + 1:end] for start, end in zip(headings, headings[1:])]

In [412]:
from sklearn.feature_extraction.text import CountVectorizer

def parse_script(filepath):
    """Reads a scraped IMSDB script file into a list of lines
        Args:
            filepath : path to a *.txt script file
        Returns:
            a list of str, one entry per line with line endings removed
    """
    with open(filepath) as script_file:
        return script_file.read().splitlines()

def break_scenes(script):
    """Breaks a script into scenes after identifying headings

        A heading is any line containing the substring 'INT.' or 'EXT.'.
        Lines before the first heading are not part of any scene, and the
        heading lines themselves are excluded from the returned scenes.
        Args:
            script: list of str, a parsed script
        Returns:
            a list of scenes, each a list of str (empty list if the script
            has no headings)
    """
    # enumerate already yields indices in ascending order, so no sort is needed
    headings = [i for i, line in enumerate(script) if ('INT.' in line or 'EXT.' in line)]
    # Sentinel so the last scene runs to the end of the script
    headings.append(len(script))
    # Slice the argument itself (not a notebook-level global) between consecutive headings
    scenes = [script[start + 1:end] for start, end in zip(headings, headings[1:])]
    return scenes

def extract_characters(script_slice, indent=15):
    """Extracts characters from a full script or a scene

        A line is treated as a character cue when it is non-blank, has more
        than ``indent`` leading whitespace characters and does not start with
        '(' once stripped. Text inside (...) or [...] is removed from the cue.
        Args:
            script_slice: list of str, a parsed script (or scene)
            indent : number of spaces before character name
        Returns:
            a list of unique characters (order is not guaranteed)
    """
    # Local import: no `import re` is visible elsewhere in the notebook
    import re

    characters = set()
    for line in script_slice:
        stripped = line.lstrip()
        if not stripped:
            # blank or whitespace-only line
            continue
        leading = len(line) - len(stripped)
        if leading > indent and stripped[0] != '(':
            character = line.strip()
            # Splitting on brackets puts bracketed text at the odd positions;
            # keeping the even ones drops e.g. "(V.O.)" or "[CONT'D]"
            character = "".join(re.split(r"\(|\)|\[|\]", character)[::2]).strip()
            characters.add(character)
    return list(characters)

def female_count(script_slice, female_names):
    """Counts the extracted characters whose name is a known female name
        Args:
            script_slice: list of str, a parsed script (or scene)
            female_names : collection of female names, compared against the
                title-cased character name
        Returns:
            number of unique extracted characters found in female_names
    """
    names_in_slice = extract_characters(script_slice, indent=15)
    return sum(1 for name in names_in_slice if name.title() in female_names)

def scene_tokenizer(script_slice):
    """Takes a scene and returns all the words present
        Args:
            script_slice: list of str
        Returns:
            a list of unique words as found by CountVectorizer (English stop
            words removed, case preserved), minus any token that exactly
            equals an extracted character name
    """
    vectorizer = CountVectorizer(stop_words='english', lowercase=False)
    # Only the vocabulary is needed, so fit without building a dense count matrix
    vectorizer.fit(script_slice)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations
    if hasattr(vectorizer, "get_feature_names_out"):
        tokens = list(vectorizer.get_feature_names_out())
    else:
        tokens = list(vectorizer.get_feature_names())
    characters = set(extract_characters(script_slice, indent=15))
    # Vocabulary entries are unique, so filtering equals removing each match once
    return [token for token in tokens if token not in characters]

In [415]:
def test_3(scenes):
    for i, scene in enumerate(scenes):
        test_passed = False
        print(f"scene {i}")
        if female_count(scene, female_names) > 1:
            test_passed = True
            tokens = scene_tokenizer(scene)
            for token in tokens:
                if token in male_names:
                    # we found a male name we should explore another scene'
                    test_passed = False
                    break
            if test_passed:
                # we went through all the tokens without setting test_passed to false
                return test_passed
    # we went through all the scenes without returing True
    return test_passed

In [420]:
# Spot-check the character labels extracted from scene 26.
extract_characters(scenes[26], indent=15)

['KRISTI / MEGAN', 'KRISTI', 'MEGAN', 'ARON', 'CUT TO:']

In [421]:
# Count how many of scene 26's extracted labels appear in female_names.
female_count(scenes[26], female_names)

2

In [416]:
# Run the check over the scenes of the loaded script.
test_3(scenes)

scene 0
scene 1
scene 2
scene 3
scene 4
scene 5
scene 6
scene 7
scene 8
scene 9
scene 10
scene 11
scene 12
scene 13
scene 14
scene 15
scene 16
scene 17
scene 18
scene 19
scene 20
scene 21
scene 22
scene 23
scene 24
scene 25
scene 26


True

In [417]:
# Display the raw lines of scene 26.
scenes[26]

['',
 "          He slides/surfs down so he's on the same level as them,",
 '          arriving in a haze of dust, holding out his hand for the',
 '          shake. Big smile.',
 '',
 '                          KRISTI',
 '           (looking at Megan)',
 "           Sure, I'm Kristi.",
 '',
 '                          MEGAN',
 '           Megan.',
 '',
 '                          ARON',
 '           Nice to meet you. What a day.',
 '',
 '                          KRISTI',
 "           It's beautiful.",
 '',
 '                          ARON',
 '           Did you bike or come straight from',
 '           the trail head?',
 '',
 '                          KRISTI',
 '           We left the car there. Pretty',
 '           quiet.',
 '',
 '                          ARON',
 '           I left mine at the Horseshoe Canyon',
 '           and biked here.',
 '',
 '           7.',
 '',
 '                         ',
 '',
 '                         ',
 '',
 '                          MEGAN',
 "    

In [419]:
# Display the tokens kept for scene 26.
scene_tokenizer(scenes[26])

['17',
 '20',
 '50',
 'Big',
 'CONT',
 'CUT',
 'Canyon',
 'Cathedral',
 'Did',
 'HOCKEY',
 'He',
 'Horseshoe',
 'It',
 'Kristi',
 'Let',
 'MASK',
 'Megan',
 'Nice',
 'Pretty',
 'Sorry',
 'Sure',
 'TO',
 'That',
 'The',
 'They',
 'Wasn',
 'We',
 'What',
 'Where',
 'Yeah',
 'You',
 'arriving',
 'beautiful',
 'bike',
 'biked',
 'bit',
 'canyon',
 'car',
 'climb',
 'come',
 'crazy',
 'day',
 'desert',
 'disorientated',
 'dust',
 'exchange',
 'expecting',
 'girls',
 'glances',
 'got',
 'great',
 'guess',
 'guy',
 'hand',
 'harmless',
 'haze',
 'head',
 'holding',
 'isn',
 'kind',
 'know',
 'laugh',
 'left',
 'level',
 'like',
 'little',
 'lone',
 'looking',
 'lot',
 'map',
 'mean',
 'meet',
 'middle',
 'miles',
 'minute',
 'nervy',
 'paintings',
 'quiet',
 'right',
 'seeing',
 'shake',
 'slides',
 'smile',
 'sneaking',
 'straight',
 'suddenly',
 'surfs',
 'surprised',
 'today',
 'trail',
 'tricky',
 've',
 'wait',
 'walking',
 'wearing',
 'windy',
 'worth']

In [251]:
# Check whether 'Buzz' is present in the male names list.
'Buzz' in male_names

False

In [400]:
def female_names_generator(filepath, skip_lines=7):
    """Reads a names file and returns its lines after skipping the leading ones

        Despite the name, this returns a list, not a generator.
        Args:
            filepath : path to the names text file
            skip_lines : number of leading lines to drop (defaults to 7, the
                value that used to be hard-coded)
        Returns:
            a list of str, the remaining lines with line endings removed
    """
    with open(filepath) as f:
        female_names = f.read().splitlines()[skip_lines:]
    return female_names