## Package Imports

In [1]:
# Import packages

import pandas as pd 
import numpy as np
import re

import nltk
from nltk import word_tokenize as w_tokenize
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
from nltk.corpus import stopwords
import string
stop_words = set(stopwords.words('english') + ["..."])

from sklearn.feature_extraction.text import TfidfVectorizer as tfv
from sklearn.decomposition import NMF as nmf

import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.high_level import extract_text
import PyPDF2
# !pip install PyPDF2

## Extract Text

In [2]:
# extract text from list of pdf titles and create dataframe of text

path = './Scripts/'
end = '.pdf'
titles = ['1917', '20th_Century_Women', 'BridgeofSpies', 'ExMachina', 'First_Reformed',
         'Get_Out', 'GreenBook', 'Hell_or_HighWater', 'InsideOut', 'KnivesOut', 'LadyBird',
         'LaLaLand', 'Manchester_By_TheSea', 'MarriageStory', 'Parasite', 'Roma', 
          'ShapeofWater', 'Spotlight', 'StraightOuttaCompton', 'TheBigSick', 
         'TheFavourite', 'TheLobster', 'ThreeBillboards', 'Vice']

movie_scripts = []
for title in titles:
    script = extract_text(path + title + end)
    movie_scripts.append(script)
    print (title)

# create dataframe of scripts/titles from lists    
movie_df = pd.DataFrame(list(zip(titles, movie_scripts)), columns = ['Title', 'Script'])

# add label for original screenplay win, best picture win, and year
orig_screen_win = [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
best_pic_win = ['Nom', None, 'Nom', None, None, 'Nom', 'Win', 'Nom', None, None, 'Nom', 'Nom', 'Nom', 'Nom', 'Win', 'Nom', 'Win', 'Win', None, None, 'Nom', None, 'Nom', 'Nom']
year = [2019, 2016, 2015, 2015, 2018, 2017, 2018, 2016, 2015, 2019, 2017, 2016, 2016, 2019, 2019, 2018, 2017, 2015, 2015, 2017, 2018, 2016, 2017, 2018]
reviewer = ["Shafer", "Shafer", "Shafer", "Shafer", "Ragan", "Shafer", "Ragan", "Ragan", "Shafer", "Shafer", "Ragan", "Ragan", "Shafer", "Ragan", "Shafer", "Ragan", "Ragan", "Shafer", "Shafer", "Ragan", "Ragan", "Shafer", "Ragan", "Ragan"]

# sanity check -- should return True
print(f"\n\nSanity check for lists: {len(orig_screen_win) == len(year) == len(best_pic_win) ==  len(list(movie_df['Title'])) == len(reviewer)}")

movie_df['year'] = year
movie_df['orig_screen_win'] = orig_screen_win
movie_df['best_pic_win'] = best_pic_win
movie_df['reviewer'] = reviewer

1917
20th_Century_Women
BridgeofSpies
ExMachina
First_Reformed
Get_Out
GreenBook
Hell_or_HighWater
InsideOut
KnivesOut
LadyBird
LaLaLand
Manchester_By_TheSea
MarriageStory
Parasite
Roma
ShapeofWater
Spotlight
StraightOuttaCompton
TheBigSick
TheFavourite
TheLobster
ThreeBillboards
Vice


Sanity check for lists: True


In [3]:
# column of dialogue
# column of visual cues
# column of setting
# raw script column
# everything column


# corpus of just dialogue
# corpus of everything (dialogue, settings, visual cues, etc.) cleaned up
# corpus of visual cues

# so 3 x per model
    # sentiment analysis on dialogue (CV + TFIDF)
    # sentiment analysis on everything (CV + TFIDF)
    # sentimental analysis on visual cues
    # and so on forth

# POS tagging and keep only proper names and put rest into settings (when dialogue vs setting can both be caps) 

## Data Cleaning

### AR

In [4]:
ar = movie_df[movie_df['reviewer'] == "Ragan"].copy()
ar['CleanedScript'] = ar['Script']

raw_scripts = list(ar['Script'])

##### semi-manually extract dialogue

varies by script, easier to do it manually than write a function with 10k if statements that may still not capture everything

In [5]:
def drop_start(idx, todrop):
    return raw_scripts[idx].replace(todrop, "")

extraneous = ["Scripts.com\n\nFirst Reformed\n\nBy Paul Schrader\n\nPage 1/57\n\n\x0c", 
              "GREEN BOOK \n\nWritten by \n\nNick Vallelonga & Brian Currie & Peter Farrelly \n\n\x0c", 
              "Scripts.com\n\nHell or High Water\n\nBy Taylor Sheridan\n\nPage 1/42\n\n\x0c", 
              " \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n\nLADY BIRD \n\nwritten by \n\nGreta Gerwig \n\n\x0c \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\nii. \n\n", 
              "LA LA LAND\n\nby\n\nDamien Chazelle\n\n\x0c", 
              "MARRIAGE STORY\n\nWritten and Directed by Noah Baumbach\n\n\x0cBlack.\n\n", 
              "IN ENGLISH\n\n\x0cROMA\n\nWritten and Directed by\n\nAlfonso Cuarón\n\nDates in RED are meant only as a tool for the different departments for \nthe specific historical accuracy of the scenes and are not intended to \nappear on screen. \n\n\x0cThursday, September 3rd, 1970\n\n", 
              "F O R   Y O U R   C O N S I D E R A T I O N\n\nB E S T   O R I G I N A L   S C R E E N P L A Y\n\nG U I L L E R M O   D E L   T O R O\n&\nV A N E S S A   T A Y L O R\n\n\x0c1                             \n\n1\n\n", 
              "THE BIG SICK\n\nby\nEmily V. Gordon & Kumail Nanjiani\n\n\x0c1\n\n", 
              "F O R   Y O U R   C O N S I D E R A T I O N\n\nW R I T T E N   B Y\n\nD E B O R A H   D A V I S   A N D   T O N Y   M C N A M A R A\n\n\x0cW R I T T E N   B Y\n\nD E B O R A H   D A V I S   A N D   T O N Y   M C N A M A R A\n\n\x0cTHE FAVOURITE\n\nWritten by\n\nDeborah Davis and Tony McNamara\n\nFINAL SHOOTING SCRIPT - 23rd MARCH 2017\n\nElement Pictures\n21 Mespil Road \nDublin 4\nIreland\n\nScarlet Films\n3 Oakley Studios\nUpper Cheyne Row\nLondon SW3 5JW, UK\n\n \n\n\x0c1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n", 
              "F O R   Y O U R   C O N S I D E R A T I O N\n\nB E S T   O R I G I N A L   S C R E E N P L A Y\n\nM A R T I N   M c D O N A G H\n\n\x0cTHREE BILLBOARDS OUTSIDE EBBING, MISSOURI\n\nYou Red Welby?\n\nby\nMartin McDonagh\n\n1\n\n", 
              "Written and Directed by\n\nVICE\n\nAdam McKay\n\n\x0c"]

for r in range(len(raw_scripts)):
    raw_scripts[r] = drop_start(r, extraneous[r])

# expand contractions
def expand_contractions(text):
    text = re.sub("\'s", " is", text) # it's --> it is
    text = re.sub("ain\'t", "are not", text) # ain't --> are not
    text = re.sub("n\'t", " not", text) # don't --> do not
    text = re.sub("\'re", " are", text) # you're --> you are
    text = re.sub("\'d", " would", text) # she'd --> she would
    text = re.sub("\'ll", " will", text) # he'll --> he will
    text = re.sub("\'ve", " have", text) # we've --> we have
    text = re.sub("\'m", " am", text) # I'm --> I am
    return text

for i in range(len(raw_scripts)):
    raw_scripts[i] = expand_contractions(raw_scripts[i])

# instantiate empty lists to become columns
master_dialogue = []
master_cleaned_everything = []

def clean_lists(lis):
    cust_punct = string.punctuation + "`" + "'" + "’" + "”" + "“" + "..."
    if type(lis) != list:
        lis = w_tokenize(lis)
    lis = [l.replace("'", "") for l in lis]
    # strip punctuation
    lis = [t for t in lis if t not in set(cust_punct)]
    # lemmatize + lowercase
    lis = [lemma.lemmatize(l) for l in lis]
    # drop stop words
    lis = [t for t in lis if t not in set(stop_words)]
    # strip new lines
    lis = [re.sub('\s+', ' ', l) for l in lis]
    # remove all non-alphanumeric -- fail safe code
    lis = [re.sub("[^a-zA-Z0-9\\s]", "", l) for l in lis]
    return lis

def split_dialogue_everything(script, pages):
    first_ref = False
    hhw = False
    other = False
    script = script.replace("'", "")
    if script == raw_scripts[0]:
        first_ref = True
    elif script == raw_scripts[2]:
        hhw = True
    else:
        other = True
    # remove page numbers
    script = re.sub(pages, " ", script)
    # remove voiceover --(V.O.)--from script
    script = script.replace("(V.O.)", "")
    # extract dialogue -- some require customization
    # First Reformed
    if first_ref:
        print('First Reformed')
        split_script = w_tokenize(script)
        orig_split_script = split_script.copy()
        # find indexes with (
        open_pars = [i for i, e in enumerate(split_script) if e == "("]
        # find indexes with )
        end_pars = [i for i, e in enumerate(split_script) if e == ")"]
        # return words between ( and ) to extract visual cues
        visual_cues_fr = [" ".join(split_script[open_pars[o]+1:end_pars[o]]) for o in range(len(open_pars))]
        # drop visual cues from list
        for o in range(len(open_pars)):
            del split_script[open_pars[o]:end_pars[0]+1]
        # re-join : with character names and drop extra :
        # find indices with :
        char_idx = [i for i, item in enumerate(split_script) if item.endswith(':')]
        # for every index with :, go to the index before and add : to re-join CHARACTER with :
        for b in char_idx:
            split_script[b-1] = split_script[b-1] + split_script[b]
        # drop extraneous colons
        if ":" in split_script:
            split_script.remove(":")
        # append non-character names to dialogue list
        dialogue = [split_script[t] for t in range(len(split_script)) if split_script[t].endswith(":") == False]
        script = orig_split_script
    # Hell or High Water -- script is literally just dialogue
    elif hhw:
        print('Hell or High Water')
        dialogue = script
    # All others
    else:
        print('Others')
        # split on combo of caps and new lines
        split_script = re.split("\\n[A-Z]*\s?\\n", script)
        visual_cues = []
        # if begins with INT or EXT then it's a setting and the next line is visual cue
        settings = [v for v in range(len(split_script)) if split_script[v].startswith("INT")
                    or split_script[v].startswith("EXT")
                    or split_script[v].startswith("VISUALS|VISUAL")]
        # extract visual cues
        visual_cue = [split_script[s+1] for s in settings]
        # remove word if uppercase
        visual_cue = [w for w in visual_cue if w.isupper() == False]
        visual_cues.append(visual_cue)
        sets_cues_id = []
        for s in settings:
            sets_cues_id.append(s)
            sets_cues_id.append(s+1)
        # remove settings and visual cues from dialogue
        dialogue_ms = [split_script[d] for d in range(len(split_script)) if d not in sets_cues_id]
        # removing entries that are just all caps (i.e. CHARLIE)
        dialogue_wo_upper = [d for d in dialogue_ms if d.isupper() == False]
        # removing entries that don't have a character indicator
        dialogue_wo_indicator = [t for t in dialogue_wo_upper if t[:3].isupper() == False]
        # split out indicators from dialogue + flatten nested list
        dialogue_split = [re.split("[A-Z]\s?\\n", t) for t in dialogue_wo_upper if t[:3].isupper()]
        # drop indicator + flatten split list
        dialogue_flatten = []
        for ts in dialogue_split:
            ts.pop(0)
            dialogue_flatten.append(ts)
        dialogue_flatten = [item for sublist in dialogue_flatten for item in sublist]
        dialogue = dialogue_flatten + dialogue_wo_indicator
    # clean + join lists
    everything_list = " ".join(clean_lists(script))
    dialogue_list = " ".join(clean_lists(dialogue))
    # return everything list and dialogue list
    return everything_list, dialogue_list

##### process First Reformed

`raw_scripts[0]`

        Speaking: all caps NAME \n text	
        Setting: none, script is just dialogue and a few visuals	
        Character: bold all caps:	
        Visual: (ALL CAPS)
        Other Notes: some lines are randomly preceded by dashes and includes page numbers

In [10]:
# extract and process dialogue
everything_list_fr, dialogue_list_fr = split_dialogue_everything(raw_scripts[0], 
                                                                 "\\nPage \d+/57")

# # append to master lists
# master_dialogue.append(dialogue_list_fr)
# master_cleaned_everything.append(everything_list_fr)

Others


##### process Green Book

`raw_scripts[1]`

        Speaking: all caps NAME \n text
        Setting: bold and preceded by ext/int
        Character: centered, all caps	
        Visual: block of text with some words in all caps for emphasis	
        Other Notes: page numbers sandwiched by CONTINUED

In [12]:
# extract and process dialogue
everything_list_gb, dialogue_list_gb = split_dialogue_everything(raw_scripts[1], 
                                                                 "\\x0c                                   \\n\\n   \d+\.")

# # append to master lists
# master_dialogue.append(dialogue_list_gb)
# master_cleaned_everything.append(everything_list_gb)

Others


##### process Hell or High Water

`raw_scripts[2]`

        Speaking: a wall of text
        Setting: none, script is just dialogue
        Character: none, script is just dialogue
        Visual: none, script is just dialogue
        Other Notes: page numbers are marked (1/42, 2/42, etc.)

In [14]:
# extract and process dialogue
everything_list_hhw, dialogue_list_hhw = split_dialogue_everything(raw_scripts[2], 
                                                                       "\\nPage \d+/42")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process Lady Bird

`raw_scripts[3]`

        Speaking: all caps NAME \n text
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps
        Visual: just words	
        Other Notes: very little extraneous text at beginning

In [16]:
# extract and process dialogue
everything_list_lb, dialogue_list_lb = split_dialogue_everything(raw_scripts[3], 
                                                                       "\\x0c\s?[\\n\\n ?\\n ?]+\d+\. ?\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process La La Land

`raw_scripts[4]`

        Speaking: all caps NAME \n text
        Setting: words	
        Character: NAME
        Visual: words, sometimes italcized or underlined
        Other Notes: music titles are in brackets and bold

In [18]:
# extraneous number lines/scene codes
raw_scripts[4] = re.sub("\\n\d+\\n", "", raw_scripts[4])
raw_scripts[4] = re.sub("\\n[A-Z]\d+\\n", "", raw_scripts[4])
raw_scripts[4] = re.sub("\[(.*?)\]", "", raw_scripts[4])

# extract and process dialogue
everything_list_lll, dialogue_list_lll = split_dialogue_everything(raw_scripts[4], 
                                                                       "Revision\\n\\n\d+\.\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process Marriage Story

`raw_scripts[5]`

        Speaking: all caps NAME \n text
        Setting: preceded by ext/int in all caps	centered, all caps	
        Character: just words; pattern of character
        Visual: name \n dialogue \n free text not preceeded by character name \n and then character name \n or new setting	
        Other Notes: N/A

In [20]:
# extract and process dialogue
everything_list_ms, dialogue_list_ms = split_dialogue_everything(raw_scripts[5], 
                                                                       "\\x0c                                                 \d+\.\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process Roma

`raw_scripts[6]`

        Speaking: all caps NAME \n text	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: block of text	
        Other Notes: includes dates in red for design department

In [23]:
# extract and process dialogue
everything_list_r, dialogue_list_r = split_dialogue_everything(raw_scripts[6], 
                                                                       "\\x0c\d+\.\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process Shape of Water

`raw_scripts[7]`

        Speaking: all caps NAME \n text, sometimes surrounded by quotes when character Dialogue is speaking	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: text, sometimes italicized	
        Other Notes: extraneous details at the beginning "for your consideration", etc.

In [25]:
# extraneous number lines/scene codes
raw_scripts[7] = raw_scripts[7].replace("\n\nTHE SHAPE OF WATER\nTHE SHAPE OF WATER\nWritten by\n\nGuillermo del Toro\nWritten by\n&\nVanessa Taylor\nGuillermo del Toro\n&\nVanessa Taylor\nStory by\nGuillermo del Toro\n\nStory by\nGuillermo del Toro\n\nFOX SEARCHLIGHT PICTURES, INC. \n10201 W. Pico Blvd.\nLos Angeles, CA 90035\n\nFOX SEARCHLIGHT PICTURES, INC. \n10201 W. Pico Blvd.\nLos Angeles, CA 90035\n\nALL RIGHTS RESERVED. COPYRIGHT ©2016 WILLOW AND OAK, INC. NO PORTION OF THIS SCRIPT MAY BE \nPERFORMED, PUBLISHED, REPRODUCED, SOLD OR DISTRIBUTED BY ANY MEANS, OR QUOTED OR PUBLISHED IN ANY \nMEDIUM, INCLUDING ANY WEB SITE, WITHOUT THE PRIOR WRITTEN CONSENT OF WILLOW AND OAK, INC. DISPOSAL \nOF THIS SCRIPT COPY DOES NOT ALTER ANY OF THE RESTRICTIONS SET FORTH ABOVE.\n\nALL RIGHTS RESERVED. COPYRIGHT ©2016 WILLOW AND OAK, INC. NO PORTION OF THIS SCRIPT MAY BE \nPERFORMED, PUBLISHED, REPRODUCED, SOLD OR DISTRIBUTED BY ANY MEANS, OR QUOTED OR PUBLISHED IN ANY \nMEDIUM, INCLUDING ANY WEB SITE, WITHOUT THE PRIOR WRITTEN CONSENT OF WILLOW AND OAK, INC. DISPOSAL \nOF THIS SCRIPT COPY DOES NOT ALTER ANY OF THE RESTRICTIONS SET FORTH ABOVE.", "")
raw_scripts[7] = re.sub("\\n\d+\\n", "", raw_scripts[7])
raw_scripts[7] = re.sub("\\n[A-Z]\d+\\n", "", raw_scripts[7])

# extract and process dialogue
everything_list_sow, dialogue_list_sow = split_dialogue_everything(raw_scripts[7], 
                                                                       "\\x0c\d+                             \\n\\n\d+")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process The Big Sick

`raw_scripts[8]`

        Speaking: all caps NAME \n text	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: just words; pattern of character name \n dialogue \n free text not preceeded by character name \n and then character name \n or new setting	
        Other Notes: (V.O.) means voiceover

In [27]:
# extract and process dialogue
everything_list_tbs, dialogue_list_tbs = split_dialogue_everything(raw_scripts[8], 
                                                                       "\\n\\x0c\d+\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process The Favourite

`raw_scripts[9]`

        Speaking: all caps NAME \n text	
        Setting: preceded by ext/int in all caps and surrounded by numbers	
        Character: centered, all caps	
        Visual: preceded by setting and number	
        Other Notes: preceding extraneous details

In [29]:
# extraneous number lines/scene codes
raw_scripts[9] = re.sub("\\n\d+\\n", "", raw_scripts[9])

# extract and process dialogue
everything_list_tf, dialogue_list_tf = split_dialogue_everything(raw_scripts[9], 
                                                                       "\\x0cTHE FAVOURITE SHOOTING SCRIPT 6 MAR 17   \d+.\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process Three Billboards

`raw_scripts[10]`

        Speaking: all caps NAME \n text, sometimes words are randomly underlined	
        Setting: bold and preceded by ext/int	
        Character: bold centered, all caps	
        Visual: words	
        Other Notes: extraneous details at the beginning "for your consideration", etc.

In [31]:
# extraneous number lines/scene codes
raw_scripts[10] = re.sub("\\n\d+\\n", "", raw_scripts[10])

# extract and process dialogue
everything_list_tb, dialogue_list_tb = split_dialogue_everything(raw_scripts[10], 
                                                                       "\\n\d+\.")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### process Vice

`raw_scripts[11]`

        Speaking: voiceovers are in quotes but regular dialogue is all caps NAME \n text	
        Setting: preceded by ext/int in all caps	
        Character: centered, all caps	
        Visual: some visual cues are in all caps for emphasis "POLICE LIGHTS FLASH behind him as he drives" and stage directions i.e. (into his ear piece) are in ()
        Other Notes: (V.O.) means voiceover

In [33]:
# extract and process dialogue
everything_list_v, dialogue_list_v = split_dialogue_everything(raw_scripts[11], 
                                                                       "\\x0c\d+\.\\n")

# # append to master lists
# master_dialogue.append(dialogue_list_MOVIE)
# master_cleaned_everything.append(everything_list_MOVIE)

Others


##### return processed scripts to data frame

In [35]:
# # sanity checks
# len(master_dialogue) == 12
# len(master_visual_cues) == 12
# len(master_cleaned_everything) == 12

# # make column of dialogue
# ar['dialogue'] = master_dialogue

# # make column of visual cues
# ar['visual_cues'] = master_visual_cues

# # make everything column
# ar['cleaned_script'] = master_cleaned_everything

## NLP transformations

CV + TFIDF

## Sentiment Analysis

maybe one person uses ANEW and one uses Harvard-IV??

or just split it up so one person does: 
- SA with dialogue CV 
- SA with dialogue TFIDF

and the other does:
- SA with everything CV
- SA with everything TFIDF

## Topic Modeling -- AS

## K Means Clustering  -- AR