In [1]:
# Import packages

import pandas as pd 
import numpy as np
import re

import nltk
from nltk import word_tokenize as w_tokenize
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
from nltk.corpus import stopwords
import string
stop_words = set(stopwords.words('english') + ["..."])

from sklearn.feature_extraction.text import TfidfVectorizer as tfv
from sklearn.decomposition import NMF as nmf

import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.high_level import extract_text
import PyPDF2
# !pip install PyPDF2

In [2]:
# extract text from list of pdf titles and create dataframe of text

path = './Scripts/'
end = '.pdf'
titles = ['1917', '20th_Century_Women', 'BridgeofSpies', 'ExMachina', 'First_Reformed',
         'Get_Out', 'GreenBook', 'Hell_or_HighWater', 'InsideOut', 'KnivesOut', 'LadyBird',
         'LaLaLand', 'Manchester_By_TheSea', 'MarriageStory', 'Parasite', 'Roma', 
          'ShapeofWater', 'Spotlight', 'StraightOuttaCompton', 'TheBigSick', 
         'TheFavourite', 'TheLobster', 'ThreeBillboards', 'Vice']

movie_scripts = []
for title in titles:
    script = extract_text(path + title + end)
    movie_scripts.append(script)
#     print (title)

# create dataframe of scripts/titles from lists    
movie_df = pd.DataFrame(list(zip(titles, movie_scripts)), columns = ['Title', 'Script'])

In [3]:
# add label for original screenplay win, best picture win, and year
orig_screen_win = [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
best_pic_win = ['Nom', None, 'Nom', None, None, 'Nom', 'Win', 'Nom', None, None, 'Nom', 'Nom', 'Nom', 'Nom', 'Win', 'Nom', 'Win', 'Win', None, None, 'Nom', None, 'Nom', 'Nom']
year = [2019, 2016, 2015, 2015, 2018, 2017, 2018, 2016, 2015, 2019, 2017, 2016, 2016, 2019, 2019, 2018, 2017, 2015, 2015, 2017, 2018, 2016, 2017, 2018]
reviewer = ["Shafer", "Shafer", "Shafer", "Shafer", "Ragan", "Shafer", "Ragan", "Ragan", "Shafer", "Shafer", "Ragan", "Ragan", "Shafer", "Ragan", "Shafer", "Ragan", "Ragan", "Shafer", "Shafer", "Ragan", "Ragan", "Shafer", "Ragan", "Ragan"]

# sanity check -- should return True
len(orig_screen_win) == len(year) == len(best_pic_win) ==  len(list(movie_df['Title'])) == len(reviewer)

movie_df['year'] = year
movie_df['orig_screen_win'] = orig_screen_win
movie_df['best_pic_win'] = best_pic_win
movie_df['reviewer'] = reviewer

In [4]:
# column of dialogue
# column of visual cues
# column of setting
# raw script column
# everything column


# corpus of just dialogue
# corpus of everything (dialogue, settings, visual cues, etc.) cleaned up
# corpus of visual cues

# so 3 x per model
    # sentiment analysis on dialogue (CV + TFIDF)
    # sentiment analysis on everything (CV + TFIDF)
    # sentimental analysis on visual cues
    # and so on forth

# POS tagging and keep only proper names and put rest into settings (when dialogue vs setting can both be caps) 

In [5]:
ar = movie_df[movie_df['reviewer'] == "Ragan"].copy()
ar['CleanedScript'] = ar['Script']

raw_scripts = list(ar['Script'])

### Script Cleaning

##### semi-manually remove extraneous beginning junk

varies by script, easier to do it manually than write a function with 10k if statements that may still not capture everything

In [30]:
def drop_start(idx, todrop):
    # return script without extraneous beginning stuff
    return raw_scripts[idx].replace(todrop, "")

extraneous = ["Scripts.com\n\nFirst Reformed\n\nBy Paul Schrader\n\nPage 1/57\n\n\x0c", 
              "GREEN BOOK \n\nWritten by \n\nNick Vallelonga & Brian Currie & Peter Farrelly \n\n\x0c", 
              "Scripts.com\n\nHell or High Water\n\nBy Taylor Sheridan\n\nPage 1/42\n\n\x0c", 
              " \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n\nLADY BIRD \n\nwritten by \n\nGreta Gerwig \n\n\x0c \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\nii. \n\n", 
              "LA LA LAND\n\nby\n\nDamien Chazelle\n\n\x0c", 
              "MARRIAGE STORY\n\nWritten and Directed by Noah Baumbach\n\n\x0cBlack.\n\n", 
              "IN ENGLISH\n\n\x0cROMA\n\nWritten and Directed by\n\nAlfonso Cuarón\n\nDates in RED are meant only as a tool for the different departments for \nthe specific historical accuracy of the scenes and are not intended to \nappear on screen. \n\n\x0cThursday, September 3rd, 1970\n\n", 
              "F O R   Y O U R   C O N S I D E R A T I O N\n\nB E S T   O R I G I N A L   S C R E E N P L A Y\n\nG U I L L E R M O   D E L   T O R O\n&\nV A N E S S A   T A Y L O R\n\n\x0c1                             \n\n1\n\n", 
              "THE BIG SICK\n\nby\nEmily V. Gordon & Kumail Nanjiani\n\n\x0c1\n\n", 
              "F O R   Y O U R   C O N S I D E R A T I O N\n\nW R I T T E N   B Y\n\nD E B O R A H   D A V I S   A N D   T O N Y   M C N A M A R A\n\n\x0cW R I T T E N   B Y\n\nD E B O R A H   D A V I S   A N D   T O N Y   M C N A M A R A\n\n\x0cTHE FAVOURITE\n\nWritten by\n\nDeborah Davis and Tony McNamara\n\nFINAL SHOOTING SCRIPT - 23rd MARCH 2017\n\nElement Pictures\n21 Mespil Road \nDublin 4\nIreland\n\nScarlet Films\n3 Oakley Studios\nUpper Cheyne Row\nLondon SW3 5JW, UK\n\n \n\n\x0c1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n", 
              "F O R   Y O U R   C O N S I D E R A T I O N\n\nB E S T   O R I G I N A L   S C R E E N P L A Y\n\nM A R T I N   M c D O N A G H\n\n\x0cTHREE BILLBOARDS OUTSIDE EBBING, MISSOURI\n\nYou Red Welby?\n\nby\nMartin McDonagh\n\n1\n\n", 
              "Written and Directed by\n\nVICE\n\nAdam McKay\n\n\x0c"]

for r in range(len(raw_scripts)):
    raw_scripts[r] = drop_start(r, extraneous[r])

# expand contractions
def expand_contractions(text):
    text = re.sub("\'s", " is", text) # it's --> it is
    text = re.sub("ain\'t", "are not", text) # ain't --> are not
    text = re.sub("n\'t", " not", text) # don't --> do not
    text = re.sub("\'re", " are", text) # you're --> you are
    text = re.sub("\'d", " would", text) # she'd --> she would
    text = re.sub("\'ll", " will", text) # he'll --> he will
    text = re.sub("\'ve", " have", text) # we've --> we have
    text = re.sub("\'m", " am", text) # I'm --> I am
    return text

for i in range(len(raw_scripts)):
    raw_scripts[i] = expand_contractions(raw_scripts[i])

# instantiate empty lists to be input into columns
master_visual_cues = []

master_dialogue = []

master_cleaned_everything = []

def clean_lists(lis):
    if type(lis) != list:
        lis = w_tokenize(lis)
    # strip punctuation
    lis = [t.lower() for t in lis if t not in set(string.punctuation)]
    # lemmatize + lowercase
    lis = [lemma.lemmatize(l) for l in lis]
    # drop stop words
    lis = [t for t in lis if t not in set(stop_words)]
    # strip new lines
    lis = [re.sub('\s+', ' ', l) for l in lis]
    return lis

##### process First Reformed

`raw_scripts[0]`

        Speaking: all caps NAME \n text	
        Setting: none, script is just dialogue and a few visuals	
        Character: bold all caps:	
        Visual: (ALL CAPS)
        Other Notes: some lines are randomly preceded by dashes and includes page numbers

In [7]:
# remove page numbers
raw_scripts[0] = re.sub("\\nPage \d+/57", " ", raw_scripts[0])

# visual cues
# tokenize
toked_fr = w_tokenize(raw_scripts[0])
orig_toked_fr = toked_fr.copy()
# find indexes with (
open_pars = [i for i, e in enumerate(toked_fr) if e == "("]
# find indexes with )
end_pars = [i for i, e in enumerate(toked_fr) if e == ")"]
# return words between ( and ) to extract visual cues
visual_cues_fr = [" ".join(toked_fr[open_pars[o]+1:end_pars[o]]) for o in range(len(open_pars))]

# drop visual cues from list
for o in range(len(open_pars)):
    del toked_fr[open_pars[o]:end_pars[0]+1]

# re-join : with character names and drop extra :
# find indices with :
char_idx = [i for i, item in enumerate(toked_fr) if item.endswith(':')]
# for every index with :, go to the index before and add : to re-join CHARACTER with :
for b in char_idx:
    toked_fr[b-1] = toked_fr[b-1] + toked_fr[b]
# drop extraneous colons
toked_fr.remove(":")

# append non-character names to dialogue list
dialogue_fr = [toked_fr[t] for t in range(len(toked_fr)) if toked_fr[t].endswith(":") == False]

# clean + join visual cues
visual_cues_fr = " ".join(clean_lists(visual_cues_fr))

# clean + join dialogue
dialogue_fr = " ".join(clean_lists(dialogue_fr))

# clean + join everything
cleaned_everything_fr = " ".join(clean_lists(orig_toked_fr))

# append to master lists
master_visual_cues.append(visual_cues_fr)
master_dialogue.append(dialogue_fr)
master_cleaned_everything.append(cleaned_everything_fr)

##### process Green Book

`raw_scripts[1]`

        Speaking: all caps NAME \n text
        Setting: bold and preceded by ext/int
        Character: centered, all caps	
        Visual: block of text with some words in all caps for emphasis	
        Other Notes: page numbers sandwiched by CONTINUED

In [39]:
raw_scripts[1]



In [40]:
# remove page numbers
raw_scripts[1] = re.sub("\\x0c                                   \\n\\n   \d+.", " ", raw_scripts[1])

In [43]:
# keep raw copy
raw_raw_gb = raw_scripts[1]

In [44]:
# split on combo of caps and new lines
split_cap = re.split("\\n[A-Z]*\s?\\n", raw_scripts[1])

In [46]:
vis_gb = split_cap.copy()
dia_gb = split_cap.copy()

In [50]:
"EXT. COPACABANA - 10 EAST 60TH ST. - NYC - NIGHT ".isupper()

True

In [45]:
split_cap

['FADE IN:',
 'TITLE CARD:          “New York City, 1962”',
 'EXT. COPACABANA - 10 EAST 60TH ST. - NYC - NIGHT ',
 'The famous red awning with COPACABANA in white block letters \nhangs over the entrance of the world-renowned nightclub. A \nlong line of well-dressed customers fills the sidewalk \nwaiting to get in. ',
 'OVER THIS - WE HEAR the opening horns of Bobby Rydell’s \nversion of “THAT OLD BLACK MAGIC”...',
 'INT. COPACABANA - MAIN ROOM - CONTINUOUS',
 'The vibe is electric. SIX HUNDRED PATRONS pack the tables in \nthe main room. Brazilian in theme, the Copa décor is art deco \nthroughout, filled with palm trees illuminated by blue and \npink hues. Pure elegance.',
 'Peppering the crowd... gorgeous COPA GIRLS, white-jacketed \nWAITERS, MAITRE D’S in Black Tuxedo Jackets, CAPTAINS in Blue \nTux Jackets, and BOUNCERS in Red Tux Jackets. ',
 'On stage in front of the Orchestra, tuxedo-clad BOBBY RYDELL. \nThe crowd APPLAUDS. ',
 'BOBBY RYDELL\nThank you very much!  Welcome to \nJul

In [47]:
# drop settings
settings = [v for v in range(len(vis_gb)) if vis_gb[v].startswith("INT") or vis_gb[v].startswith("EXT")]

In [49]:
settings

[2,
 5,
 18,
 47,
 55,
 66,
 75,
 87,
 112,
 138,
 140,
 142,
 151,
 197,
 234,
 239,
 277,
 283,
 308,
 313,
 319,
 322,
 334,
 355,
 470,
 472,
 532,
 534,
 544,
 566,
 609,
 725,
 745,
 747,
 749,
 786,
 788,
 825,
 870,
 880,
 885,
 887,
 894,
 941,
 963,
 1015,
 1020,
 1028,
 1030,
 1072,
 1074,
 1102,
 1167,
 1224,
 1251,
 1252,
 1281,
 1283,
 1287,
 1289,
 1298,
 1381,
 1394,
 1418,
 1436,
 1441,
 1491,
 1520,
 1525,
 1527,
 1545,
 1587,
 1602,
 1605,
 1636,
 1667,
 1696,
 1783,
 1810,
 1822,
 1850,
 1855,
 1860,
 1862,
 1869,
 1916,
 1944,
 1946,
 1948,
 1960,
 1998,
 2002,
 2006,
 2049,
 2073,
 2079,
 2132,
 2136,
 2145,
 2148,
 2211,
 2244,
 2312,
 2314,
 2363,
 2378,
 2389,
 2393,
 2440,
 2442,
 2447,
 2456,
 2462,
 2474,
 2560,
 2599,
 2612,
 2617,
 2618,
 2679,
 2696,
 2700,
 2715,
 2718,
 2720,
 2726,
 2728,
 2732,
 2734,
 2751,
 2761,
 2763,
 2778,
 2791,
 2794,
 2797,
 2816,
 2835,
 2847]

##### process Hell or High Water

`raw_scripts[2]`

        Speaking: a wall of text
        Setting: none, script is just dialogue
        Character: none, script is just dialogue
        Visual: none, script is just dialogue
        Other Notes: page numbers are marked (1/42, 2/42, etc.)

In [8]:
# remove page numbers
raw_scripts[2] = re.sub("\\nPage \d+/42", " ", raw_scripts[2])

# tokenize
toked_hhw = w_tokenize(raw_scripts[2])

# clean + join dialogue
dialogue_hhw = " ".join(clean_lists(toked_hhw))

# clean + join everything
cleaned_everything_hhw = " ".join(clean_lists(toked_hhw))

# append to master lists
master_visual_cues.append([None])
master_dialogue.append(dialogue_hhw)
master_cleaned_everything.append(cleaned_everything_hhw)

##### process Lady Bird

`raw_scripts[3]`

        Speaking: all caps NAME \n text
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps
        Visual: just words	
        Other Notes: very little extraneous text at beginning

In [51]:
raw_scripts[3]

'"Anybody who talks about California hedonism has never spent \na Christmas in Sacramento." \n\n- Joan Didion \n\n\x0c \n \n\n \n\n \n\n \n\nINT. MOTEL. CALIFORNIA. EARLY MORNING. \n\nOne of those anonymous trucker motels along-side I-5 through \ncentral California. It’s late summer, 2002. Two women sleep \ntogether in a bed. Christine, aka Lady Bird, 17 years old. \nHer Mom, Marion, the age of Lady Bird’s Mom. A modern-day \nromantic Mary Cassatt rip-off painting of motherhood. \n\n \nLADY BIRD (V.O.) \n\nDo you think I look like I’m from \nSacramento? \n\nNow they’re awake. Lady Bird stares at her re. Marion makes \nthe bed. \n\nCUT TO: \n\nMARION \n\nYou are from Sacramento. \n\nLADY BIRD \n(re: making the bed) \n\nYou don’t have to do that. \n\nMARION \n\nWell it’s nice to make things neat \nand clean. \n\nMarion is making hospital corners. Lady Bird sits on the \njust made bed. Marion sits beside her, moves the hair out of \nLady Bird’s eyes. \n\nMARION (CONT’D) \n\nReady to go ho

##### process La La Land

`raw_scripts[4]`

        Speaking: all caps NAME \n text
        Setting: words	
        Character: NAME
        Visual: words, sometimes italcized or underlined
        Other Notes: music titles are in brackets and bold

##### process Marriage Story

`raw_scripts[5]`

        Speaking: all caps NAME \n text
        Setting: preceded by ext/int in all caps	centered, all caps	
        Character: just words; pattern of character
        Visual: name \n dialogue \n free text not preceeded by character name \n and then character name \n or new setting	
        Other Notes: N/A

In [31]:
# remove page numbers
raw_scripts[5] = re.sub("\\x0c                                                 \d+.\\n", " ", raw_scripts[5])

# remove voiceover notes, not necessary and won't add to analysis
raw_scripts[5] = raw_scripts[5].replace("(V.O.)", "")

# keep copy of untouched (aside from VO and page numbers)
raw_raw_ms = raw_scripts[5]

# split on combo of caps and new lines
split_cap = re.split("\\n[A-Z]*\s?\\n", raw_scripts[5])
vis_ms = split_cap.copy()
dia_ms = split_cap.copy()

# if begins with INT or EXT then it's a setting and the next line is visual cue
settings = [v for v in range(len(vis_ms)) if vis_ms[v].startswith("INT") or vis_ms[v].startswith("EXT")]
# extract visual cues
visual_cues_ms = [vis_ms[s+1] for s in settings]

sets_cues_id = []
for s in settings:
    sets_cues_id.append(s)
    sets_cues_id.append(s+1)

# remove settings and visual cues from dialogue
dialogue_ms = [dia_ms[d] for d in range(len(dia_ms)) if d not in sets_cues_id]

# removing entries that are just all caps (i.e. CHARLIE)
dialogue_wo_upper = [d for d in dialogue_ms if d.isupper() == False]

# removing entries that don't have a character indicator
dialogue_wo_indicator = [t for t in dialogue_wo_upper if t[:3].isupper() == False]

# split out indicators from dialogue + flatten nested list
dialogue_split = [re.split("[A-Z]\s?\\n", t) for t in dialogue_wo_upper if t[:3].isupper()]

# drop indicator + flatten split list
dialogue_flatten = []
for ts in dialogue_split:
    ts.pop(0)
    dialogue_flatten.append(ts)
dialogue_flatten = [item for sublist in dialogue_flatten for item in sublist]

dialogue_ms = dialogue_flatten + dialogue_wo_indicator

# clean + join visual cues
visual_cues_ms = " ".join(clean_lists(visual_cues_ms))

# clean + join dialogue
dialogue_ms = " ".join(clean_lists(dialogue_ms))

# clean + join everything
cleaned_everything_ms = " ".join(clean_lists(raw_raw_ms))

# append to master lists
master_visual_cues.append(visual_cues_ms)
master_dialogue.append(dialogue_ms)
master_cleaned_everything.append(cleaned_everything_ms)

##### process Roma

`raw_scripts[6]`

        Speaking: all caps NAME \n text	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: block of text	
        Other Notes: includes dates in red for design department

##### process Shape of Water

`raw_scripts[7]`

        Speaking: all caps NAME \n text, sometimes surrounded by quotes when character Dialogue is speaking	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: text, sometimes italicized	
        Other Notes: extraneous details at the beginning "for your consideration", etc.

##### process The Big Sick

`raw_scripts[8]`

        Speaking: all caps NAME \n text	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: just words; pattern of character name \n dialogue \n free text not preceeded by character name \n and then character name \n or new setting	
        Other Notes: (V.O.) means voiceover

##### process The Favourite

`raw_scripts[9]`

        Speaking: all caps NAME \n text	
        Setting: preceded by ext/int in all caps and surrounded by numbers	
        Character: centered, all caps	
        Visual: preceded by setting and number	
        Other Notes: preceding extraneous details

##### process Three Billboards

`raw_scripts[10]`

        Speaking: all caps NAME \n text, sometimes words are randomly underlined	
        Setting: bold and preceded by ext/int	
        Character: bold centered, all caps	
        Visual: words	
        Other Notes: extraneous details at the beginning "for your consideration", etc.

##### process Vice

`raw_scripts[11]`

        Speaking: voiceovers are in quotes but regular dialogue is all caps NAME \n text	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: some visual cues are in all caps for emphasis "POLICE LIGHTS FLASH behind him as he drives" and stage directions i.e. (into his ear piece) are in ()
        Other Notes: (V.O.) means voiceover

##### return processed scripts to data frame

In [None]:
# # sanity checks
# len(master_dialogue) == 12
# len(master_visual_cues) == 12
# len(master_cleaned_everything) == 12

# # make column of dialogue
# ar['dialogue'] = master_dialogue

# # make column of visual cues
# ar['visual_cues'] = master_visual_cues

# # make everything column
# ar['cleaned_script'] = master_cleaned_everything