# Import

In [1]:
import pdb
import spacy
nlp = spacy.load('en_core_web_md')
import re
import copy
from num2words import num2words
from nltk.corpus import stopwords

In [None]:
# Name of the movie whose subtitle/screenplay files are processed below
stimulus = 'hidden_figures'
# English stop words plus 'sir'; these words are ignored when word overlap is measured in find_match
stop_words = set(stopwords.words('english')) | {'sir'}

# Helper functions

## Extract subtitles from .srt file

In [4]:
def extract_subtitles(subtitles_path):
    """Read an .srt file and return its spoken text as a flat list of strings.

    Parsing rules (tabs and newlines are removed from every line first):
      * a line made only of digits is taken as a cue number and opens a new,
        initially empty, subtitle;
      * timestamp lines (containing "-->") and blank lines are skipped;
      * a line starting with "-" always begins a subtitle of its own (the
        dash and surrounding whitespace are dropped);
      * any other line is the text of the current subtitle, or is appended to
        it with a single space when the subtitle already has text.

    NOTE: a text line that consists only of digits is indistinguishable from
    a cue number here and will open a new subtitle.

    Args:
        subtitles_path: path of the .srt file to read.

    Returns:
        List of subtitle strings in file order. A cue number that is not
        followed by any text leaves an empty string in the list.
    """
    subtitles = []
    with open(subtitles_path, 'r') as subtitles_file:
        for line in subtitles_file:
            # A leading byte-order mark (U+FEFF) would make the very first cue
            # number fail isdigit(), so drop it along with tabs/newlines.
            line = line.replace('\n', '').replace('\t', '').lstrip('\ufeff')
            if line.isdigit():
                subtitles.append("")
            elif "-->" in line or line == "":
                # ignore timestamps and blank separator lines
                continue
            elif line[0] == "-":
                # Replace the empty placeholder opened by the cue number, if any
                if subtitles and subtitles[-1] == "":
                    del subtitles[-1]
                subtitles.append(line[1:].strip())
            elif not subtitles:
                # Text before any cue number: start the first subtitle instead
                # of indexing into an empty list
                subtitles.append(line)
            elif subtitles[-1] == "":
                subtitles[-1] = line
            else:
                subtitles[-1] += " " + line
    return subtitles

## Parse Characters & Sentences from (Already Parsed) Screenplay

In [7]:
def extract_parsed_screenplay_info(characters_path, sentences_path):
    """Load the pre-parsed screenplay as two lists of stripped lines.

    Args:
        characters_path: text file read one line per entry (character names).
        sentences_path: text file read one line per entry (dialogue sentences).

    Returns:
        (screenplay_chars, screenplay_sents): each a list holding every line of
        the corresponding file with surrounding whitespace removed. Elsewhere
        in this file both lists are indexed with the same position, so the
        two files are presumably line-aligned -- verify against the data.
    """
    with open(characters_path, 'r') as characters_file, \
            open(sentences_path, 'r') as sentences_file:
        screenplay_chars = [entry.strip() for entry in characters_file]
        screenplay_sents = [entry.strip() for entry in sentences_file]
    return screenplay_chars, screenplay_sents

## Match subtitles to screenplay sentences

In [10]:
def sublist(ps, qs):
    """Return True when every item of one list also occurs in the other.

    Despite the name this is a membership test in both directions: order and
    repetition of items are ignored. An empty list is trivially contained in
    anything.
    """
    ps_within_qs = all(p in qs for p in ps)
    qs_within_ps = all(q in ps for q in qs)
    return ps_within_qs or qs_within_ps

def num_to_word(w):
    """Spell out `w` with num2words when it consists only of decimal digits.

    Any other token (words, "3.5", the empty string, ...) is returned as is.
    """
    return num2words(w) if w.isdecimal() else w

def format_string(s):
    """Normalise a sentence so subtitle and screenplay text can be compared.

    Steps:
      1. runs of two or more dots (ellipses) become a word break;
      2. the text is lower-cased; commas, periods, question/exclamation marks
         and straight or curly quotes/apostrophes are removed; hyphens become
         spaces;
      3. tokens made only of digits are spelled out through num_to_word, and
         the spelled-out form is cleaned with the same rules, because
         num2words itself emits hyphens and commas ("twenty-one",
         "one thousand, ..."). Without that second pass "21" and
         "Twenty-one" would normalise to different word lists.

    Args:
        s: raw sentence.

    Returns:
        The normalised words joined by single spaces ('' when no word is left).
    """
    # '-' -> ' ', every character in the third argument is deleted
    cleanup = str.maketrans('-', ' ', ",.?!’‘'\"“”")

    s = re.sub(r'\.{2,}', ', ', s)
    s = s.lower().translate(cleanup)

    words = []
    for w in s.split():
        if w.isdecimal():
            # Only numeric tokens need conversion; scrub the punctuation that
            # num2words puts into its output so the result is pure words.
            w = ' '.join(num_to_word(w).lower().translate(cleanup).split())
        words.append(w)
    return ' '.join(words)

def spacy_similarity(s1, s2):
    """Return the similarity between two strings as computed by spaCy.

    Each string is run through the `nlp` pipeline loaded at the top of the
    file and the resulting objects are compared with their `similarity`
    method.
    """
    return nlp(s1).similarity(nlp(s2))

In [11]:
def find_match(sub, candidates):
    """Find the candidate sentence that best matches one subtitle.

    The subtitle and every candidate are normalised with format_string and
    then compared with three strategies, tried in this order:

      (a) exact equality of the normalised strings;
      (b) "one contains another": sublist() holds between the two word lists
          (only candidates with at least two words are considered);
      (c) the candidate with the highest spacy_similarity among those that
          share enough non-stop-words with the subtitle: at least 2 shared
          words when the subtitle has 4 or more non-stop-words, otherwise at
          least 1. No minimum similarity is required beyond being > 0.

    Args:
        sub: raw subtitle text.
        candidates: list of raw candidate sentences. Entries that are empty
            (or normalise to nothing) are treated as unavailable. The list is
            not modified.

    Returns:
        (match_idx, exact_match): match_idx is an index into `candidates`, or
        None when nothing qualifies; exact_match is True only for strategy (a).
    """
    sub = format_string(sub)
    # Normalise into a fresh list so the caller's candidates stay untouched
    formatted = [format_string(cand) for cand in candidates]

    # A subtitle with no words left (e.g. only punctuation) carries nothing to
    # match on. Left unchecked it would "equal" blanked candidates and be
    # "contained" in every candidate, which lets a caller that blanks rejected
    # candidates and retries receive the same blank slot over and over.
    if not sub:
        return None, False

    # (a) exact match
    if sub in formatted:
        return formatted.index(sub), True

    # (b) "one contains another" match
    sub_words = sub.split()
    for i, cand in enumerate(formatted):
        cand_words = cand.split()
        if len(cand_words) >= 2 and sublist(sub_words, cand_words):
            return i, False

    # (c) best similarity score among candidates whose overlap with the
    # subtitle, after removing stop words, satisfies the restrictions above
    best_score, match_idx = 0., None
    filtered_sub = [w for w in sub_words if w not in stop_words]
    sub_content = set(filtered_sub)
    required_overlap = 2 if len(filtered_sub) >= 4 else 1
    for i, cand in enumerate(formatted):
        if cand == '':
            continue

        shared = sub_content.intersection(w for w in cand.split() if w not in stop_words)
        if len(shared) < required_overlap:
            # Cheap lexical test first: skip the embedding computation for
            # candidates that could not be selected anyway
            continue

        cand_score = spacy_similarity(sub, cand)
        if best_score < cand_score:
            best_score, match_idx = cand_score, i
    return match_idx, False

## Save subtitles with character labels

In [15]:
def get_match_idx(sub, labeled_subs):
    """Look up and consume the first entry of `labeled_subs` whose text is `sub`.

    Args:
        sub: subtitle text to look for.
        labeled_subs: list of (match index, subtitle text) pairs. The entry
            that is found is deleted from this list in place.

    Returns:
        (match_idx, labeled_subs): the match index stored with the subtitle
        (may be None) and the same, now shortened, list object.

    Raises:
        Exception: when no entry carries the text `sub` (a message naming the
            subtitle is printed first).
    """
    position = next(
        (k for k, (_, text) in enumerate(labeled_subs) if text == sub),
        None,
    )
    if position is None:
        print("Found no match in get_match_idx for sub: {}".format(sub))
        raise Exception("This should never happen in get_match_idx")
    entry = labeled_subs.pop(position)
    return entry[0], labeled_subs

def _strip_layout(raw_line):
    """Return `raw_line` without newline and tab characters."""
    return raw_line.replace('\n', '').replace('\t', '')


def _collect_subtitle(first_piece, lines, start):
    """Join `first_piece` with the continuation lines that follow lines[start].

    Continuation stops at a blank line, at a line starting with "-", or at
    the end of the file.
    """
    text = first_piece
    nxt = start + 1
    while nxt < len(lines):
        continuation = _strip_layout(lines[nxt])
        if continuation == "" or continuation[0] == "-":
            break
        text += " " + continuation
        nxt += 1
    return text


def rewrite_subtitles_with_characters(scply_chars, labeled_subs, subtitles_path, out_path):
    """Copy an .srt file to `out_path`, prefixing each subtitle with its character.

    Cue numbers, timestamps, blank lines and continuation lines are copied
    unchanged. The first line of every subtitle is rewritten:
      * a line starting with "-" becomes "<CHARACTER> " + line;
      * any other first line becomes "<CHARACTER> - " + line.
    <CHARACTER> is scply_chars[match_idx] for the subtitle's match index, or
    the literal text "None" when the subtitle has no match.

    Args:
        scply_chars: list of character names, indexed by match index.
        labeled_subs: list of (match index, subtitle text) pairs in which each
            subtitle of the file is looked up via get_match_idx. The lookup
            runs on a private copy, so the caller's list is left intact.
        subtitles_path: path of the .srt file to read.
        out_path: path of the labelled file to write (overwritten).

    Raises:
        Exception: propagated from get_match_idx when a subtitle collected
            from the file is not present in `labeled_subs`.
    """
    # get_match_idx deletes every entry it returns; keep that away from the caller
    remaining = list(labeled_subs)

    with open(subtitles_path, 'r') as subtitles_file:
        subtitles_lines = subtitles_file.readlines()

    start_new_line = True  # True until the first text line after a cue number is seen
    with open(out_path, 'w') as out_file:
        for i, line in enumerate(subtitles_lines):
            formatted_line = _strip_layout(line)
            if formatted_line.isdigit():
                start_new_line = True
                out_line = line
            elif "-->" in formatted_line or formatted_line == "":
                out_line = line
            elif formatted_line[0] == "-" or start_new_line:
                starts_with_dash = formatted_line[0] == "-"
                first_piece = formatted_line[1:].strip() if starts_with_dash else formatted_line
                # collect full subtitle that might span multiple lines
                curr_sub = _collect_subtitle(first_piece, subtitles_lines, i)
                # get character
                match_idx, remaining = get_match_idx(curr_sub, remaining)
                matched_char = scply_chars[match_idx] if match_idx is not None else "None"
                # dash lines already carry their separator
                out_line = matched_char + (' ' if starts_with_dash else ' - ') + line
                start_new_line = False
            else:
                # continuation line of a subtitle that was already labelled
                out_line = line
            out_file.write(out_line)

# Testing this code with an example

In [None]:
# Input files (subtitles, screenplay characters, screenplay dialogues) and the
# output file, all derived from the stimulus name
subtitles_path, characters_path, sentences_path, out_path = [
    template.format(stimulus)
    for template in (
        'subtitles/{}_subtitles.srt',
        'screenplays/separated_characters_and_dialogues/{}_characters.txt',
        'screenplays/separated_characters_and_dialogues/{}_dialogues.txt',
        'subtitles_with_characters/{}_subtitles.txt',
    )
]

In [None]:
# Flat list of subtitle texts, in file order
subs = extract_subtitles(subtitles_path)
# Screenplay character names and dialogue sentences, one list entry per file line
scply_chars, scply_sents = extract_parsed_screenplay_info(
    characters_path=characters_path,
    sentences_path=sentences_path,
)

In [12]:
# After running this cell, labeled_subs contains two-element lists [i, s] where i is either None or the index of a 
# screenplay sentence that matched with the subtitle s.
labeled_subs = []
prev_matched = {} # screenplay sentence index -> index of the latest subtitle matched to it
shift = 15 # how many screenplay sentences on each side of the pivot are searched
max_dialogue_stretch = 5 # how far apart two subtitles matching to the same sentence can be

prev_match_idx, prev_pivot = None, 0
for i, sub in enumerate(subs):
    # generate candidates for this sub: a window around the sentence matched by the previous
    # subtitle, or around the previous pivot when that subtitle had no match
    pivot = prev_match_idx if (prev_match_idx is not None) else prev_pivot
    lo = max(pivot-shift, 0)
    hi = min(pivot+shift, len(scply_sents))
    candidates = scply_sents[lo:hi] # a slice copy, so blanking entries below never touches scply_sents
    
    # keep searching for matches until there's nothing left or we find an acceptable candidate.
    # find_match proposes a candidate that satisfies one of:
    # (a) is the exact same as our subtitle
    # (b) one's word list is contained within the other
    # (c) has the best similarity score among candidates whose word overlap, after removing stop words,
    #     satisfies certain restrictions
    # A proposal is rejected (blanked out and the search repeated) when its sentence was already matched
    # by a subtitle more than max_dialogue_stretch positions back, unless it is an exact match of 3+ words.
    match_rejected = True
    while match_rejected:
        match_idx, exact_match = find_match(sub, candidates)
        if (match_idx is not None) and candidates[match_idx] == '':
            # The proposed slot holds no text (it was blanked by an earlier rejection or is empty):
            # treat it as "no match". Blanking it again would change nothing and loop forever.
            match_idx = None
            match_rejected = False
        elif exact_match and len(sub.split())>=3:
            prev_matched[lo+match_idx] = i
            match_rejected = False
        elif (match_idx is not None) and (lo+match_idx) in prev_matched:
            i_prev = prev_matched[lo+match_idx]
            if i_prev+max_dialogue_stretch >= i:
                prev_matched[lo+match_idx] = i
                match_rejected = False # allow relatively close subtitles to be matched to same sentence
            else:
                candidates[match_idx] = ''
        else:
            if match_idx is not None:
                prev_matched[lo+match_idx] = i
            match_rejected = False
    
    # Dense printing here to be able to go over the matches at the end and see if they make sense
    if match_idx is None:
        print("{} -> None".format(sub))
    else:
        print("{} -> {}".format(sub, scply_sents[lo+match_idx]))
        match_idx = lo+match_idx # convert window-relative index to an index into scply_sents
        
    labeled_subs.append([match_idx, sub])
    prev_match_idx, prev_pivot = match_idx, pivot

Fourteen.. Fifteen.. -> 14, 15, 16...prime.
Sixteen. Prime. -> 14, 15, 16...prime.
Eighteen. Prime. -> 18, prime.
Twenty. -> 20, 21, 22, prime, 24, 25, 26...
Twenty-one. -> 20, 21, 22, prime, 24, 25, 26...
Twenty-two. -> 20, 21, 22, prime, 24, 25, 26...
West Virginia Collegiate Institute is the best school for Negros in the state. -> West Virginia Collegiate Institute is the best school for Negros in the state.
It's the only school, past the eighth grade, anywhere near here. -> It’s the only school, past the eighth grade, anywhere close to here.
Isosceles. Scalene -> Isosceles, scalene, obtuse, equilateral, rhombus...
Equilateral -> Isosceles, scalene, obtuse, equilateral, rhombus...
Rhombus -> Isosceles, scalene, obtuse, equilateral, rhombus...
Trapezoid -> Trapezoid, tetrahedron, octahedron, dodecahedron...
Katherine's in the sixth grade. -> Katherine’s in the sixth grade.
They want to take her early. -> They want to take her early.
Tetrahedron, -> Trapezoid, tetrahedron, octahedron,

Starting tomorrow, I'm riding the bus -> None
Show me -> None
No, exactly -> None
Ascent angle: 46.56. -> Ascend angle- 42.46.
Approaching 7400 mph -> Approximate speed- 17,400 mph.
118 seconds. -> None
Second stage. Success. -> None
Korabl-Sputnik-4 is orbital. -> None
Orbital entry is established. -> Orbital entry is established.
Russian orbital entry is established. -> Orbital entry is established.
Jim Webb. -> Paul Stafford...our Lead Engineer, Mr. Webb.
Yes, Mr. President -> Paul Stafford...our Lead Engineer, Mr. Webb.
We certainly are, sir. -> None
It looks like they've achieved at least one orbit, maybe two -> None
Zvezdochka is ready to go again. So is our comrade, Ivan. -> Zvezdochka is ready to go again.
A brave space traveler. -> Mother Russia will be first to put a human in space.
We have proven life can be sustained in space. -> None
Now, we will be first to send a human -> Mother Russia will be first to put a human in space.
A damn dog! -> A goddamn dog!
And a damn manneq

We need em for the Redstone test. -> We need ‘em for the Redstone test.
Mach 1 tunnel test, T-1 minute. -> Mach 2 tunnel test, T-minus one minute.
No shoe is worth your life -> No shoe is worth your life.
One.. One moment! -> Mach 2 tunnel test, T-minus one minute.
Shut it down. -> None
If we alter the exterior from smooth -> If we alter the exterior from smooth to corrugated, the capsule would have more stability, despite the air displacement.
to corrugated, the capsule might have more stability, -> If we alter the exterior from smooth to corrugated, the capsule would have more stability, despite the air displacement.
But the friction, it's under during reentry.. -> Coupled with rising temperatures on reentry, the contacts soften.
Most shield erosion occurs -> None
on the posterior side abutting the retro-boosters. -> None
Conclusion? -> None
The area closest to the boosters is closest to the heat. -> The area closest to the boosters is closest to the heat.
Coupled with rising tempera

I think that's a pretty reason we're giving. We're putting a human.. -> Do I need to remind everyone...that we are putting a human on top of a missile and shooting him into space?
On top of a missile and shooting him into space -> Do I need to remind everyone...that we are putting a human on top of a missile and shooting him into space?
And it's never been done before. -> It’s never been done before.
And because it's never been done. Everything we do -> And because it’s never been done...everything we do between now and then is going to matter- it’s going to matter to their wives, their kids, I believe it’s going to matter to the whole damn country.
between now and then is going to matter. -> And because it’s never been done...everything we do between now and then is going to matter- it’s going to matter to their wives, their kids, I believe it’s going to matter to the whole damn country.
It's going to matter to their wives. -> And because it’s never been done...everything we do betwee

Well, it's nothing you can't handle, Katherine. -> Nothing you can’t do, Katherine.
Thank you, mamma. -> None
Are they asleep yet? -> None
They're pretending to be. -> They’re pretending to.
They're not pretending well enough -> They’re pretending to.
Thank you, mamma. Love you. -> None
This is my bed! -> Whoever sleeps in that bed in, Joylette’s place, will also dry the dishes, take out the trash, and do all the rest of Joylette’s choirs.
What are you doing? -> What are you doing?
It's not fair how Joylette gets to sleep by herself cause she's the oldest. -> It’s not fair Joylette always gets to sleep by herself ‘cause she’s the oldest.
Yes it is fair -> It is too fair!
Constance, Kathy, sit. -> Constance, Kathy, come sit.
Now, I understand you want to be grown. And have your own space -> I understand you want to be grown.
So whoever sleeps in that bed, -> None
in Joylette's place, -> None
will also do the dishes, -> None
take out the trash, and do all of Joylette's chores -> None
Tha

Pastor mentioned you're computer at NASA -> Pastor mentioned you’re a “Computer” at NASA.
Yes -> None
What's that entail? -> What’s that entail?
We calculate the mathematics necessary to enable -> We calculate the mathematics necessary to enable launch and landing for the Space Program.
launch and landing for the Space Program. -> We calculate the mathematics necessary to enable launch and landing for the Space Program.
Pretty heady stuff! -> Pretty heady stuff.
Yes, it is. -> So, yes...they let women do some things over at NASA, Mr. Johnson.
They let women handle that sort of.. -> They let women handle that kind of-
That's.. Not what I mean -> That’s not what I mean.
What do you mean? -> What do you mean?
I am.. Just surprised something so.. Taxing -> I was just surprised something so...taxing-
Mr. Johnson -> So, yes...they let women do some things over at NASA, Mr. Johnson.
If I were you, I'd quit talking right now -> None
I don't mean any disrespect -> I’m not meaning any disrespect

The Mercury Capsule weight, we know. -> The Mercury Capsule weight is known.
And the speeds are there.. In the data. -> And the speeds are there in the data.
You did the math. -> You did the math.
Yes, sir. -> Yes, sir.
I looked beyond. -> I looked beyond.
And how do you know about the Atlas rocket? -> Then how did you know about the Atlas rocket?
That's not math. -> That’s not math.
That data's not here. Like he said, it's classified. -> That data’s not here.
I held it up to the light. -> I held it up to the light.
You held it up to the light? -> I held it up to the light.
Yes, sir. -> None
Yep. There it is. -> There it is.
Atlas -> None
What's your name? -> What’s your name?
Katherine Goble. -> Katherine Goble.
Are you a spy, Katherine? -> Are you a spy, Katherine?
Am I what? -> Am I what?
I'm saying, are you a Russian spy? -> Are you a Russian spy?
No, sir. -> No sir, I’m not Russian.
I'm not Russian. -> No sir, I’m not Russian.
She's not Russian, sir. -> She’s not Russian, sir.
All

Everybody smile. -> None
Make a wish! -> Can I make it while we dance?
Leonard, you need to get you a job now. -> None
Two girls! -> None
Well, look at that. -> Look at that.
Look at what? -> Look at that.
What a surprise. -> What a surprise.
I'll say. -> I’ll say.
Howard must have invited him. -> Howard must have invited him.
Mmm. Hmmm. -> Mmm.
Let me go help him with those flowers -> None
I believe I owe you an apology. -> I believe I owe you an apology.
Well, is that it? -> None
Can I make it while we dance? -> Can I make it while we dance?
I'm afraid I simply misspoke the other day, Mrs. Goble. -> I’m afraid I simply misspoke, Mrs. Goble.
I've been away for a long time. -> I’ve been away for a long time.
I imagine, I'm just out of practice. -> I imagine, I’m just out of practice.
Mm-hmm. -> Mm-hmm.
Yeah. I was hoping you would allow me to start over? -> And I was hoping you would allow me to start over?
Mm-hmm. -> Mm-hmm.
So, the thing is.. -> The thing is...I have an interest in g

There is no bathroom! -> There’s no bathroom?
There are no colored bathrooms in this building, -> There are no COLORED bathrooms in this building or ANY building outside the West Campus.
or any building outside the West Campus. -> There are no COLORED bathrooms in this building or ANY building outside the West Campus.
Which is half a mile away. Did you know that? -> Which is half a mile away!
I have to walk to Timbuktu just to relieve myself -> I have to walk to Timbuktu just to relieve myself!
And I can't use one of the handy bikes. -> And I can’t take one of the handy bikes.
Picture that, Mr. Harrison -> AL HARRISON At NASA we all...pee the same color!
My uniform... -> Picture that, with my uniform- skirt below the knees and my heels.
Skirt below my knees, my heels -> Picture that, with my uniform- skirt below the knees and my heels.
and simple string of pearls. Well, I don't own pearls. -> None
Lord knows you don't pay the coloreds enough to afford pearls! -> Lord knows you don’t pa

no other American had ever touched space. -> And before Alan Shepard sat on top of a rocket, no American had ever touched space.
And now, he will forever be remembered as the US Navy man from New Hampshire. -> He will forever be remembered as the Navy man from New Hampshire who was the first to touch the stars.
The first to touch the stars. -> He will forever be remembered as the Navy man from New Hampshire who was the first to touch the stars.
And I, sir. -> And I, sir, plan on being an engineer at NASA.
I plan on being an engineer at NASA. -> And I, sir, plan on being an engineer at NASA.
But I can't do that without taking them classes at that all-white high school. -> But I can’t do that without taking those classes at that all-white high school.
And I can't change the color of my skin. -> And I can’t change the color of my skin.
So, I have no choice.. But to be the first. -> So...I have no choice but to be the first.
Which I can't do without you, sir. -> Which I can’t do without yo

So, it means to move from an elliptical orbit.. To a parabolic orbit? -> So...it has to move from an elliptical orbit to a parabolic orbit?
Yes. That's the Go/ No Go. -> That’s the Go/ No Go.
Now, this point is a pin head. -> This point is a pin head.
We bring him in too soon.. -> We bring him in too soon...
He burns up on reentry. -> He burns up on reentry.
That's right. We bring him too late, and he's pushed out of Earth's gravity. -> Too late...and he’s pushed out of Earth’s gravity.
Any changes to mass, weight, speed, time, -> Any changes in mass, weight, speed, time, distance, friction...or a puff of wind...would alter the Go/No Go.
distance, friction... or a puff of wind.. Would alter the Go/No Go. -> Any changes in mass, weight, speed, time, distance, friction...or a puff of wind...would alter the Go/No Go.
And we start our calculations.. Over -> And we would have to start calculating all over.
Yes. -> Yes?
So, we need to be able to choose this re-entry point. -> We have to be a

When exactly is that gonna happen? -> When is that going to happen?
Katherine. -> Katherine.
Have a go at it? -> Have a go at it?
The Go point for re-entry -> The Go point for re-entry is 2,990 miles from where we want Colonel Glenn to land.
is 2,990 miles -> The Go point for re-entry is 2,990 miles from where we want Colonel Glenn to land.
from where we want Colonel Glenn to land. -> The Go point for re-entry is 2,990 miles from where we want Colonel Glenn to land.
If we assume that's The Bahamas. -> If we assume that’s The Bahamas...
At 17,544 miles per hour.. -> At 17,544 miles per hour upon reentry...
Upon reentry.. -> At 17,544 miles per hour upon reentry...
370.. -> At a descent angle of 46.56 degrees...
At a descent angle.. -> At a descent angle of 46.56 degrees...
Of 46.56 .. -> At a descent angle of 46.56 degrees...
Degrees.. Distance.. -> At a descent angle of 46.56 degrees...
OK! So, that puts the landing zone at... -> That puts the landing zone at...
25.0667 North, -> At a 

  sim = emb1.similarity(emb2)


As we can calculate launch, landing -> We can calculate launch, landing, but without this conversion...the capsule stays in orbit, we can’t bring it home.
but without this conversion, the capsule stays in orbit, we can't bring it back home. -> We can calculate launch, landing, but without this conversion...the capsule stays in orbit, we can’t bring it home.
Maybe we've been thinking about this all wrong. -> Maybe we’re thinking about this all wrong.
How's that? -> How’s that?
Maybe it's not new math at all. -> Maybe it’s not new math at all.
It could be old math. -> Maybe it’s old math.
Something that looks at the problem numerically, and not theoretically -> Something that looks at the problem numerically.
Math is always dependable. -> Math is always dependable.
For you it is. -> For you it is.
Euler's Method. -> Euler’s Method.
Euler's Method? -> Euler’s Method.
Yes. -> Yes.
That's ancient. -> That’s ancient.
But it works! -> But it works.
It works numerically. -> It works numericall

I'll be honest with you, Al When I fly, I fly the machine, -> To be honest with you, Al...when I fly, I fly the machine and now I feel like the machines are flying me.
Right now, it seems like this machine is flying me. -> To be honest with you, Al...when I fly, I fly the machine and now I feel like the machines are flying me.
We're on the same page, John. Our guys are on it. -> We’re on the same page.
Let's get the girl to check the numbers. -> Let’s get the girl to check the numbers.
The girl? -> The girl?
Yes, sir. -> Yes, sir.
You mean Katherine? -> HARRISON Sam...go find Katherine.
Yes, sir. The smart one. -> Yes, sir.
If she says they're good, I'm ready to go. -> If she says they’re good, I’m ready to go.
All right. We'll get into it. -> None
Roger! -> None
Sam! Go find Katherine Goble. -> HARRISON Sam...go find Katherine.
She needs to verify Glenn's Go/No Go or we're staying on the ground. -> She needs to verify Glenn’s Go/No Go or we’re staying on the ground.
Yes, sir. -> Yes, 

and bring the scope in. Over. -> None
He knows. -> None
Without a heat shield, there's no way a Colonel Glenn -> None
can get back through the Earth's atmosphere. -> None
The heat is simply too overpowering. A malfunction in the automatic control system -> None
is causing the spacecraft to yaw in a skid like fashion -> None
thus putting the Friendship 7's re-entry point in jeopardy -> None
even now, col. Glen is skidding towards the US -> None
and he must begin the retro sequence -> None
You're going to be using fly-by-wire for reentry. -> None
You'll need to maintain a zero angle. Over. -> None
Roger. I'm on fly-by-wire, back-it up with manual. Over. -> None
Keep talking to him. -> None
Roger. The weather in the recovery area: 3 foot waves, 10 miles visibility. -> None
Capcom, you're going out -> How do I look, Capcom?
He's crossing the communication black-out zone. -> None
There's a real fireball outside, and it's getting a little hot here! -> But that was a real fireball, boy.
Frien

In [13]:
# Count how many nones and how many total subtitles were seen
total_count = len(labeled_subs)
none_count = sum(1 for matched, _ in labeled_subs if matched is None)
print(none_count)
print(total_count)

343
1653


In [17]:
# The rewrite looks every subtitle up with get_match_idx, which deletes entries from the list it
# searches. Hand it a throw-away deep copy so labeled_subs stays intact even if the rewrite
# raises part-way through (restoring it afterwards would be skipped in that case).
original_labeled_subs = copy.deepcopy(labeled_subs) # pristine snapshot, kept under its original name
rewrite_subtitles_with_characters(scply_chars, copy.deepcopy(labeled_subs), subtitles_path, out_path)