In [1]:
# Import packages

# tools for extracting text from PDF
import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.high_level import extract_text
import PyPDF2

# tools for data manipulation
import pandas as pd 
import numpy as np
import re
import string

# tools for text cleaning
import nltk
from nltk import word_tokenize as w_tokenize
from nltk.corpus import stopwords

# tools for modeling text and analysis
from sklearn.feature_extraction.text import TfidfVectorizer as tfv
from sklearn.decomposition import NMF as nmf


In [2]:
# extract text from list of pdf titles and create dataframe of text

path = './Scripts/'
end = '.pdf'
titles = ['1917', '20th_Century_Women', 'BridgeofSpies', 'ExMachina', 'First_Reformed',
         'Get_Out', 'GreenBook', 'Hell_or_HighWater', 'InsideOut', 'KnivesOut', 'LadyBird',
         'LaLaLand', 'Manchester_By_TheSea', 'MarriageStory', 'Parasite', 'Roma', 
          'ShapeofWater', 'Spotlight', 'StraightOuttaCompton', 'TheBigSick', 
         'TheFavourite', 'TheLobster', 'ThreeBillboards', 'Vice']

movie_scripts = []
for title in titles:
    script = extract_text(path + title + end) 
    movie_scripts.append(script) # add script to list of scripts
    print (title) # prints title as script is extracted

# create dataframe of scripts/titles from lists  

movie_df = pd.DataFrame(list(zip(titles, movie_scripts)), columns = ['Title', 'Script'])

1917
20th_Century_Women
BridgeofSpies
ExMachina
First_Reformed
Get_Out
GreenBook
Hell_or_HighWater
InsideOut
KnivesOut
LadyBird
LaLaLand
Manchester_By_TheSea
MarriageStory
Parasite
Roma
ShapeofWater
Spotlight
StraightOuttaCompton
TheBigSick
TheFavourite
TheLobster
ThreeBillboards
Vice


In [3]:
# add label for original screenplay win, best picture win, and year
orig_screen_win = [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
best_pic_win = ['Nom', None, 'Nom', None, None, 'Nom', 'Win', 'Nom', None, None, 'Nom', 'Nom', 'Nom', 'Nom', 'Win', 'Nom', 'Win', 'Win', None, None, 'Nom', None, 'Nom', 'Nom']
year = [2019, 2016, 2015, 2015, 2018, 2017, 2018, 2016, 2015, 2019, 2017, 2016, 2016, 2019, 2019, 2018, 2017, 2015, 2015, 2017, 2018, 2016, 2017, 2018]
reviewer = ["Shafer", "Shafer", "Shafer", "Shafer", "Ragan", "Shafer", "Ragan", "Ragan", "Shafer", "Shafer", "Ragan", "Ragan", "Shafer", "Ragan", "Shafer", "Ragan", "Ragan", "Shafer", "Shafer", "Ragan", "Ragan", "Shafer", "Ragan", "Ragan"]

# ADD GENRE

# sanity check -- should return True
len(orig_screen_win) == len(year) == len(best_pic_win) ==  len(list(movie_df['Title'])) == len(reviewer)

movie_df['year'] = year
movie_df['orig_screen_win'] = orig_screen_win
movie_df['best_pic_win'] = best_pic_win
movie_df['reviewer'] = reviewer

In [4]:
# keep only scripts I am working on

ACS = movie_df[movie_df['reviewer'] == 'Shafer'].copy()

In [5]:
# Create function to remove extraneous information from title pages, etc.

ext_list = ['1917\n\nWritten by\n\nSam Mendes\n&\nKrysty Wilson-Cairns\n\n© 2018 Storyworks Productions Limited. All rights reserved.\n\n\x0cii\nii\n\n‘Life, to be sure,\n Is nothing much to lose,\n But young men think it is,\n And we were young.’\n\n-A.E. Housman\n\n‘We have so much to say, and we shall never say it.’\n\n-Erich Maria Remarque\nAll Quiet On The Western Front \n\n\x0ciii\niii\n\nThe following script takes place in real time, and - with the \nexception of one moment - is written and designed to be one single \ncontinuous shot. \n\nNOTE:\n\n\x0c                                                                1.\n',
             '20th Century Women\n\nby\n\nMike Mills\n\n\x0cEXT.',
             'Scripts.com\n\nBridge of Spies\n\nBy Matt Charman\n\nPage 1/124\n\n\x0cTITLE OVER BLACK: 1957.',
             ' \n\nEX MACHINA\n\nBy','\n\nAlex Garland\n\n','Alex Garland 2013\nc/o DNA Films Ltd.',
            '10 Amwell Street\nLondon EC1R 1UQ\nT:','\n\+44 (0)207 843 4410\n\n\x0c','\n\n\x0c'
             'Scripts.com\n\nGet Out\n\nBy Jordan Peele\n\nPage 1/94\n\n\x0c',
             'INSIDE OUT\n\noriginal story by\n\nPete Docter\nRonnie Del Carmen\n \n\nScreenplay by\n\nPete Docter\nMeg LeFauve\nJosh Cooley\n\n\x0cBLACK.',
             'KNIVES OUT\n\nA Murder Mystery by\n\nRian Johnson\n\nSCREEN SCRIPT\n\n\x0cEXT.',
             'MANCHESTER BY THE SEA\n\nWritten & Directed \n\nby\n\nKenneth Lonergan\n\n\x0cEXT.',
             'F O R   Y O U R   C O N S I D E R A T I O N\n\nOUTSTANDING ORIGINAL SCREENPLAY \nSCREENPLAY BY\nBONG JOON HO AND HAN JIN WON\nSTORY BY\nBONG JOON HO\n\n\x0c \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \nPARASITE\n \n \n \n\nScreenplay by\n\n \nStory by\n\nBong Joon Ho\n\nBong Joon Ho and Han Jin Won\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\x0c1\n\n2\n\nTITLE SEQUENCE OVER',
             'Scripts.com\n\nSpotlight\n\nBy Tom McCarthy\n\nPage 1/152\n\n\x0c1',
             'STRAIGHT OUTTA COMPTON\n\nBy\n\nLeigh Savidge and Alan Wenkus\n\nRevised 8/16/08',
            '\nXenon Pictures','\n(310)451-5510','\n\n\x0cEXT.',
            'Scripts.com\n\nGet Out\n\nBy Jordan Peele\n\nPage 1/94\n\n\x0c',
            'NEONratedAwards.com\n\n ',
            'THE LOBSTER\n\nWritten by',
            '\n\nYorgos Lanthimos', 
            '& Efthimis Filippou\n\n30 April 2013',
            '\n\nElement Pictures\nLimp\nScarlet', 
            'Films\n\n 1',
             'Scripts.com\n\nThe Lobster\n\nBy Yorgos Lanthimos\n\nPage 1/39\n\n\x0c',
            ' BLACK \nAccompanied by dark but curiously upbeat MUSIC. \nAt the end of the credits, the MAIN TITLE, in strange \ncalligraphy, fills the screen -- \n\n“PARASITE”\n\n1\n \nMUSIC FADES. ',
           'THE END.\n\nFOR LANCE CORPORAL ALFRED H. MENDESST BATTALION, KING’S ROYAL RIFLE CORPS\n\nWHO TOLD US THE STORIES\n\n\x0c"']

def drop_start(text, ext_list):
    new_text = text
    for match in ext_list:
        new_text = re.sub(match, '', new_text)
    return new_text


In [6]:
# apply function to do initial light clean on columns and make new fields

ACS['FirstClean'] = ACS['Script'].apply(drop_start, args = (ext_list, ))


In [8]:
# Create function to clean unneeded pieces from script. ## These used to be all one tool called "multiclean"

def remove_contractions(text):
    text = re.sub(r"n\'t", " not", text) # handle contractions
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'d", " had", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"ain\'t", "are not", text)
    return text

def remove_page_nums(text):
    text = re.sub('\n11/26/14', '', text)
    text = re.sub('\n\n\\x0c\s+\d+\.','', text)
    text = re.sub('\nPage \d+\/\d+', '', text)
    text = re.sub('\n\/Page', '', text)
    text = re.sub('\\x0c|\\x0c ', '', text)
    text = re.sub('\\x0c\d\.', ' ', text)
    text = re.sub('\n+\\x0c\d+','', text)
    text = re.sub('\n\n\\x0c\s+\d+\.|\n\n\\x0c\s+\d+\.|\\x0c','', text)
    return text

def remove_space_digs(text):
    text = re.sub('\n\*\n', '', text)
    text = re.sub('^\s+','',text)
    text = re.sub('\s+\d\.', '', text)
    text = re.sub('\d\.|\d+\.', '', text)
    text = re.sub('\n\s\d+\.\n|\n*\s+\\n+\d+\.', ' ', text)
    text = re.sub('\n\d\.| \d\.|\n \d+\.|\n\d+\.', '', text)
    text = re.sub('\n\d+|\n\d+\.| \d+\n| \d+\.', '', text)
    text = re.sub('\n\*\n', '', text)
    text = re.sub('^\n\(\)- ', '', text)
    text = re.sub('^\n\(d+\)d+\-d+ ', '', text)
    text = re.sub('\s+\d', '', text)
    text = re.sub('\(\d+\)', '', text)
    text = re.sub('\d+\n+\d+\w\n+\d|\n+\d+', '', text)
    text = re.sub('\n\d+\.', '', text)
    text = re.sub('\d', '', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('\n\d\.| \d\.|\n \d+\.|\n\d+\.| \d\n|\n\d|\n\d |\n\d\n', '', text)
    return text

def remove_words(text):
    text = re.sub('TITLES:', '', text)
    text = re.sub('\(V\.O\.\)|\(O\.S\.\)|\(O\.C\.\)| V.O.|O\.S\.|V\.O\.|\sV\.O\.', '', text)
    text = re.sub('^Films\n+ \d\n\n', '', text)
    text = re.sub("\(CONTINUED:\)|\(CONTINUED\)|CONTINUED:|COUNTINUOUS| CONTINUOUS|CONT'D|\(CONT'D\)", ' ', text)
    text = re.sub("\(CONT\’D\)| \(CONT\'D\)\s+|\(CONT\'D\)|\(CONT\'D\)|\s\(CONT\'D\)\s+| (CONT\’D)",'',text)
    text = re.sub("\(CONTINUED:\)|\s\(CONT\’D\)\n| \(CONT\’D\) \n|\(CONTINUED\)|CONTINUED:|COUNTINUOUS| CONTINUOUS|CONT'D|\(CONT'D\)| \(CONT'D\)|\(CONT..\d\;D\)", ' ', text)
    text = re.sub(r'14 FINAL SHOOTING SCRIPT  |FINAL SHOOTING SCRIPT|\n 12.17.14 FINAL SHOOTING SCRIPT \d+\.|\n 12.17.14 FINAL SHOOTING SCRIPT |\n+ \d+ FINAL SHOOTING SCRIPT  \n+', '', text)
    text = re.sub("\(CONT’D\)| \(CONT'D\)",'', text)
    text = re.sub("\(CONT’D\)| \(CONT'D\)| \(CONT\\'D\)",' ', text)
    text = re.sub('\(MORE\)', '', text)
    return text
    

In [10]:
# Apply cleaning functions to initially cleaned col (FirstClean)

ACS['SecondClean'] = ACS['FirstClean'].apply(remove_contractions) # does not seem to be working
ACS['SecondClean'] = ACS['SecondClean'].apply(remove_page_nums)
ACS['SecondClean'] = ACS['SecondClean'].apply(remove_space_digs)
ACS['SecondClean'] = ACS['SecondClean'].apply(remove_words)

In [13]:
# Create function to idenitfy speaking vs. visual cues

def speaking_parts(text):
    # split on combo of caps and new lines
    split_text = re.split("\\n[A-Z]*\s?\\n", text)

    visual_cues = []
    # if begins with INT or EXT then it's a setting and the next line is visual cue
    settings = [v for v in range(len(split_text)) if split_text[v].startswith("INT") 
                or split_text[v].startswith("EXT")
                or split_text[v].startswith("VISUALS|VISUAL")]

    # extract visual cues
    visual_cue = [split_text[s+1] for s in settings]

    # remove word if uppercase
    visual_cue = [w for w in visual_cue if w.isupper() == False]

    visual_cues.append(visual_cue)

    sets_cues_id = []
    for s in settings:
        sets_cues_id.append(s)
        sets_cues_id.append(s+1)

    # remove settings and visual cues from dialogue
    dialogue = [split_text[d] for d in range(len(split_text)) if d not in sets_cues_id]
    
    # removing entries that are just all caps (i.e. CHARLIE)
    dialogue = [d for d in dialogue if d.isupper() == False]

    return dialogue

In [18]:
# Apply function to create Dialogue column for further cleaning separation
# Should result in a list of strings for each script

ACS['Diag_Work'] = ACS['SecondClean'].apply(speaking_parts)

In [19]:
ACS['Diag_Work']

0     [, A figure lies against a tree, eyes closed -...
1     [High overhead shot looking down on the Pacifi...
2     [The height of the Cold War. The United States...
3     [© \n + \nMusic starts.,  main( ) {, }\na ‘hel...
5     [I appeal to you therefore, brothers, by the m...
8     [JOY \nDo you ever look at someone and \nwonde...
9     [The grounds of a New England manor. Pre-dawn ...
12    [A small commercial fishing boat heads out of ...
14    [Accompanied by dark but curiously upbeat MUSI...
17    [OLDER COP:\nThe mother’s bawling and the uncl...
18    [Emblazoned across the black screen is: DETROI...
21    [Films, I’m really sorry., (Pause), Does he we...
Name: Diag_Work, dtype: object

In [16]:
# Create function to prepare text for additional separation by visual cues and speaking parts

# import stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
punc = string.punctuation

def add_PoS(text):
    flat_list = []
    sentences = [w_tokenize(sent) for sent in text] # tokenize text
    for i in range(len(sentences)):
        sentences[i] = [w for w in sentences[i] if w not in punc] # remove punctuation
    for i in range(len(sentences)):
        sentences[i] = [w for w in sentences[i] if w not in stop_words] # remove stopwords
    pos_text = [nltk.pos_tag(speech) for speech in sentences] # assign PoS to words
    
    return pos_text

In [20]:
# Apply function to add PoS and remove stopwords
# Should result in a list of list of tuples for each 

ACS['D_Work'] = ACS['Diag_Work'].apply(add_PoS)

In [35]:
test = ACS['D_Work'][5]

In [36]:
test

[[('I', 'PRP'),
  ('appeal', 'VBP'),
  ('therefore', 'RB'),
  ('brothers', 'NNS'),
  ('mercies', 'NNS'),
  ('God', 'NNP'),
  ('topresent', 'NN'),
  ('bodies', 'NNS'),
  ('living', 'VBG'),
  ('sacrifice', 'NN'),
  ('holy', 'JJ'),
  ('acceptableto', 'NN'),
  ('God', 'NNP'),
  ('spiritual', 'JJ'),
  ('worship', 'NN'),
  ('Do', 'NNP'),
  ('conformedto', 'VB'),
  ('world', 'NN'),
  ('transformed', 'VBN'),
  ('renewal', 'NN'),
  ('mind', 'NN'),
  ('testing', 'NN'),
  ('may', 'MD'),
  ('discern', 'VB'),
  ('God', 'NNP'),
  ('whatis', 'NN'),
  ('good', 'JJ'),
  ('acceptable', 'JJ'),
  ('perfect', 'JJ'),
  ('-Romans', 'NNS'),
  ('EXT', 'VBP'),
  ('THE', 'NNP'),
  ('SHAW', 'NNP'),
  ('’', 'NNP'),
  ('S', 'NNP'),
  ('HOUSE', 'NNP'),
  ('FRONT', 'NNP'),
  ('LAWN', 'NNP'),
  ('NIGHT', 'NNP'),
  ('A', 'NNP'),
  ('perfect', 'JJ'),
  ('suburban', 'NN'),
  ('house', 'NN'),
  ('bay', 'NN'),
  ('windows', 'VBZ'),
  ('front', 'JJ'),
  ('lawn', 'NN'),
  ('The', 'DT'),
  ('SHAW', 'NNP'),
  ('family', 'NN'),

In [37]:
ACS['Diag_Work'][5]

['I appeal to you therefore, brothers, by the mercies of God, topresent your\nbodies as a living sacrifice, holy and acceptableto God, which is your\nspiritual worship. Do not be conformedto this world, but be transformed by\nthe renewal of your mind,\nthat by testing you may discern what is the will of God, whatis good and\nacceptable and perfect. -Romans:-\nEXT. THE SHAW’S HOUSE - FRONT LAWN - NIGHT\nA perfect suburban house with bay windows and a front lawn.\nThe SHAW family. Caucasian and warm - RICHARD,; NANCY,;\nJOSHUA,; and MAY, - eat dinner inside. Richard reads \nsomething on his tablet illuminating his face.',
 'JOSHUA :\nWhich one are we going to?',
 'RICHARD :\nThe one in Orlando.',
 'NANCY :\nDisney World.',
 'JOSHUA :\nTony said that Mickey is notreally Mickey; it’s someone elsein there.',
 'RICHARD :\nMickey’s Mickey.\nEXT. SUBURBAN STREET - NIGHT\nANDRE,, an African-American man runs down the sidewalk in \nsweats. He listens to jazz on his phone. The music stops. Hestop

In [38]:
# Manual Cleaning

#def Manual_Clean(text):
visuals = []
for t in range(len(test)):
    if t == 3:
        vis = test[0:5]
        visuals.append(vis)
    if t == 5:
        
        vis = test[0]
        visuals.append(vis)
        

print (visuals)
            

[[[('I', 'PRP'), ('appeal', 'VBP'), ('therefore', 'RB'), ('brothers', 'NNS'), ('mercies', 'NNS'), ('God', 'NNP'), ('topresent', 'NN'), ('bodies', 'NNS'), ('living', 'VBG'), ('sacrifice', 'NN'), ('holy', 'JJ'), ('acceptableto', 'NN'), ('God', 'NNP'), ('spiritual', 'JJ'), ('worship', 'NN'), ('Do', 'NNP'), ('conformedto', 'VB'), ('world', 'NN'), ('transformed', 'VBN'), ('renewal', 'NN'), ('mind', 'NN'), ('testing', 'NN'), ('may', 'MD'), ('discern', 'VB'), ('God', 'NNP'), ('whatis', 'NN'), ('good', 'JJ'), ('acceptable', 'JJ'), ('perfect', 'JJ'), ('-Romans', 'NNS'), ('EXT', 'VBP'), ('THE', 'NNP'), ('SHAW', 'NNP'), ('’', 'NNP'), ('S', 'NNP'), ('HOUSE', 'NNP'), ('FRONT', 'NNP'), ('LAWN', 'NNP'), ('NIGHT', 'NNP'), ('A', 'NNP'), ('perfect', 'JJ'), ('suburban', 'NN'), ('house', 'NN'), ('bay', 'NN'), ('windows', 'VBZ'), ('front', 'JJ'), ('lawn', 'NN'), ('The', 'DT'), ('SHAW', 'NNP'), ('family', 'NN'), ('Caucasian', 'NNP'), ('warm', 'NN'), ('RICHARD', 'NNP'), ('NANCY', 'NNP'), ('JOSHUA', 'NNP'), (

In [26]:
#def double_clean(test):
vis = []
dialogue = []
for t in (range(len(test))):
    if len(test[t]) > 2:
        if test[t][0][0].isupper() and test[t][0][1] == 'NNP' and  test[t][1][0].istitle() == False:
            vis.append(test[t])
        elif test[t][-1][0].isupper() and test[t][-1][1] == 'NNP':
            vis.append(test[t])
            
        elif test[t][0][0] == r'MOMENTS' and  test[t][1][0] == r'LATER':
            vis.append(test[t])
        elif test[t][0][0] == r'LATER':
            vis.append(test[t])
        elif test[t][0][0] == r'VISUALS':
            vis.append(test[t]) 
        
        elif test[t][0][0].isupper() and test[t][0][1] == 'RB' and  test[t][1][0].isupper() == True:
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'RB' and  test[t][1][1] == 'IN' and test[t][2][1] == 'PRP$':
            vis.append(test[t])
        elif test[t][0][1].istitle() and test[t][0][1] == 'NNP' and test[t][1][1] == 'NNS':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'NNP' and test[t][1][1] == 'VBD':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'NNP' and test[t][1][1] == 'VBZ':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'NNP' and test[t][1][1] == 'NNS':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'NNP' and test[t][1][1] == 'VBZ' and test[t][2][1] != 'VBN' and test[t][3][1] != 'TO':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'PRP' and test[t][1][1] == 'VBD' and test[t][1][1] == 'VBP' and test[t][2][1] != 'NNS' and  test[t][2][1] != 'RP' and test[t][2][1] != 'DT' and test[t][3][1] != 'VB':
            vis.append(test[t])
        
        
        elif test[t][0][0].istitle() and test[t][0][1] == 'PRP' and test[t][1][1] == 'VBP' and test[t][2][1] == 'VBG':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'PRP' and test[t][1][1] == 'DT' and test[t][2][1] == 'VBP':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'PRP' and test[t][1][1] == 'VBP' and test[t][2][1] == 'TO':
            vis.append(test[t])
        
        elif test[t][0][0].istitle() and (test[t][0][0] == r'She' or test[t][0][0] == r'He') and test[t][0][1] == 'PRP' and (test[t][1][1] == 'VBZ' or test[t][1][1] == 'VBD'):
            vis.append(test[t])
        elif test[t][0][0].istitle() and (test[t][0][0] == r'She' or test[t][0][0] == r'He') and test[t][0][1] == 'PRP' and (test[t][1][1] == 'VBZ' or test[t][1][1] == 'VBD') and test[t][2][1] == 'PDT':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][0] == r'Dorothea' and test[t][0][1] == 'NNP' and test[t][1][1] == ('NNS'):
            vis.append(test[t])
        elif test[t][0][0].istitle() and (test[t][0][0] == r'Dorothea' or test[t][0][0] == r'Julie') and test[t][0][1] == 'NNP' and test[t][1][1] == ('VBZ'):
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'NN' and test[t][1][1] == 'VBZ' and test[t][2][1] != 'JJ':
            vis.append(test[t])
        elif test[t][0][0].isupper() and test[t][0][1] == 'NN' and test[t][1][1] != ('PRP'):
            vis.append(test[t])
        
        
        elif test[t][0][0].istitle() and test[t][0][1] == 'DT' and test[t][1][1] == ('NN') and test[t][2][1] == ('VBZ'):
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'JJ' and test[t][1][1] == ('NN') and test[t][2][1] != 'PRP':
            vis.append(test[t])
       
        elif test[t][0][0].istitle() and test[t][0][1] == 'JJ' and test[2][1] != 'PRP' :
            vis.append(test[t])
        
        
        # Script 1
        elif test[t][0][0].istitle() and test[t][0][1] == 'PRP' and test[t][1][1] == 'VBP' and test[t][2][1] == 'NNS':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'PRP' and test[t][1][1] == 'VBZ':
            vis.append(test[t])
        elif test[t][0][0].istitle() and test[t][0][1] == 'NNP' and test[t][1][1] == 'VBZ':
            vis.append(test[t])
        elif test[t][0][1] == 'DT' and test[t][1][1] == 'NN' and test[t][2][1] == 'VBG':
            vis.append(test[t])
        elif test[t][0][0].isupper() and test[t][0][1] == 'NN' and test[t][1][0].isupper() and test[t][1][1] != 'NNP':
            vis.append(test[t])
        elif test[t][0][0].isupper() and test[t][0][1] == 'NNP' and test[t][1][0].isupper() and test[t][1][1] != 'NNP':
            vis.append(test[t])
        elif test[t][0][0].isupper() and test[t][1][0] == r'CONTD' and test[t][1][1] != 'NNP':
            vis.append(test[t])
        elif test[t][0][0].isupper() and test[t][1][0] == r'LIEUTENANT':
            dialogue.append(test[t])
        
        # Script 2
        
        
        else:
            dialogue.append(test[t])



dialogue



[[('©', 'JJ'), ('Music', 'NNP'), ('starts', 'NNS')],
 [('‘', 'NN'),
  ('hell', 'NN'),
  ('’', 'NNP'),
  ('b', 'NN'),
  ('‘', 'NNP'),
  ('w', 'NN'),
  ('’', 'NNP'),
  ('c', 'VBZ'),
  ('‘', 'NN')],
 [('Lines', 'NNS'), ('code', 'NN'), ('appear', 'VBP'), ('typed', 'VBN')],
 [('extrn', 'NN'),
  ('b', 'NN'),
  ('c', 'NN'),
  ('putchar', 'NN'),
  ('putchar', 'NN'),
  ('b', 'NN'),
  ('putchar', 'NN'),
  ('c', 'NN'),
  ('putchar', 'NN'),
  ('’', 'NNP'),
  ('’', 'NNP'),
  ('*n', 'NNP'),
  ('’', 'NN')],
 [('view', 'NN'),
  ('ultra-cool', 'JJ'),
  ('ultra-designed', 'JJ'),
  ('open-plan', 'JJ'),
  ('office', 'NN')],
 [('In', 'IN'),
  ('kitchen', 'NNP'),
  ('area', 'NN'),
  ('young', 'JJ'),
  ('men', 'NNS'),
  ('women', 'NNS'),
  ('mill', 'VBP'),
  ('chat', 'VBN'),
  ('Casually', 'NNP'),
  ('dressed', 'JJ'),
  ('Feels', 'NNS'),
  ('like', 'IN'),
  ('intelligent', 'JJ'),
  ('relaxed', 'NN'),
  ('environment', 'NN')],
 [('Behind', 'IN'),
  ('young', 'JJ'),
  ('men', 'NNS'),
  ('women', 'NNS'),
  ('si

In [None]:
## OLD CLEANER

titles = list(ACS['Title'])
def multi_clean(text, titles):
    for title in titles:
        if titles[0]:
            text = re.sub('\n\n\\x0c\s+\d+\.','', text)
            text = re.sub('^\s+','',text)
            text = re.sub('\s+\d\.', '', text)
            text = re.sub('\d\.', '', text)
            text = re.sub('\n\s\d+\.\n', ' ', text)
            text = re.sub("\(CONTINUED:\)|\(CONTINUED\)|CONTINUED:|COUNTINUOUS| CONTINUOUS|CONT'D|\(CONT'D\)", ' ', text)
            text = re.sub('\s+\d', '', text)
            text = re.sub('\(\d+\)', '', text)
            text = re.sub('\d', '', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(MORE\)', '', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)|\(O\.C\.\)| V.O.', '', text)
            text = re.sub("\(CONT’D\)|\(CONT'D\)|\(CONT'D\)|\s\(CONT'D\)\s+| (CONT’D)",'',text)
            text = re.sub('\n\n\\x0c\s+\d+\.','', text)
            text = re.sub('\\x0c','',text)
        elif titles[1]:
            text = re.sub("\(CONT’D\)|\(CONT'D\)|\(CONT'D\)|\s\(CONT'D\)\s+| (CONT’D)",'',text)
            text = re.sub('\n\n\\x0c\s+\d+\.','', text)
            text = re.sub('\\x0c','',text)
            text = re.sub('\n*\s+\\n+\d+\.', ' ', text)
            text = re.sub('\d+\.', '',text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)|O\.S\.|V\.O\.|\sV\.O\.', '', text)
            text = re.sub('[-/\]', '', text)
        elif titles[2]:
            text = re.sub(r'14 FINAL SHOOTING SCRIPT  |\n 12.17.14 FINAL SHOOTING SCRIPT \d+\.|\n 12.17.14 FINAL SHOOTING SCRIPT |\n+ \d+ FINAL SHOOTING SCRIPT  \n+', '', text)
            text = re.sub("\(CONTINUED:\)|\s\(CONT’D\)\n| \(CONT’D\) \n|\(CONTINUED\)|CONTINUED:|COUNTINUOUS| CONTINUOUS|CONT'D|\(CONT'D\)| \(CONT'D\)|\(CONT..\d\;D\)", ' ', text)
            text = re.sub('\nPage \d+\/124', '', text)
            text = re.sub('\n\/Page', '', text)
            text = re.sub('\\x0c|\\x0c ', '', text)
            text = re.sub('\d+\.', '',text)
            text = re.sub('TITLES:', '', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[3]:
            text = re.sub('\d+\n+\d+\w\n+\d|\n+\d+', '', text)
            text = re.sub('\n+\\x0c\d+','', text)
            text = re.sub("\(CONT’D\)| \(CONT'D\)",'', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[4]:
            text = re.sub('\nPage \d+\/94', '', text)
            text = re.sub('\\x0c', '', text)
            text = re.sub('\n\d+\.', '', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[5]:
            text = re.sub("\(CONT’D\)| \(CONT'D\)",'', text)
            text = re.sub('\\x0c', ' ', text)
            text = re.sub('\n\d+\.', ' ', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[6]:
            text = re.sub('^\s', '', text)
            text = re.sub('\\x0c', ' ', text)
            text = re.sub('\n\d+\.|\n \d+\.', ' ', text)
            text = re.sub('\(MORE\)', '', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[7]:
            text  = re.sub('\n\*\n', '', text)
            text = re.sub('\\x0c', ' ', text)
            text = re.sub('\n +', ' ', text)
            text = re.sub("\(CONT’D\)| \(CONT'D\)| \(CONT\\'D\)",'', text)
            text = re.sub('\n\d+\.|\n \d+\.', ' ', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[8]:
            text = re.sub('^\s', '', text)
            text = re.sub('\\x0c\d\.', ' ', text)
            text = re.sub('\\x0c', ' ', text)
            text = re.sub("\(CONT’D\)| \(CONT'D\)| \(CONT\\'D\)",'', text)
            text = re.sub('\n\d+|\n\d+\.| \d+\n| \d+\.', '', text)
            text = re.sub(' +', ' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[9]:
            text = re.sub('\nPage \d+\/152', '', text)
            text = re.sub('\\x0c', ' ', text)
            text = re.sub('\n11/26/14', '', text)
            text = re.sub('\d+\.|\d+', '', text)
            text = re.sub("\(CONT\’D\)| \(CONT\'D\)| \(CONT\'D\)\s+| \(CONT\\'D\)",' ', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)|\(O\.C\.\)', '', text)
        elif titles[10]:
            text = re.sub('^\n\(\)- ', '', text)
            text = re.sub('^\n\(d+\)d+\-d+ ', '', text)
            text = re.sub('\\x0c', ' ', text)
            text = re.sub('\n\d\.| \d\.|\n \d+\.|\n\d+\.', '', text)
            text = re.sub("\(CONT’D\)| \(CONT'D\)| \(CONT\\'D\)",' ', text)
            text = re.sub('\(MORE\)', '', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        elif titles[11]:
            text = re.sub('\\x0c', ' ', text)
            text = re.sub('^\s+', '', text)
            text = re.sub('^Films\n+ \d\n\n', '', text)
            text = re.sub("\(CONT’D\)| \(CONT'D\)| \(CONT\\'D\)",' ', text)
            text = re.sub('\n\d\.| \d\.|\n \d+\.|\n\d+\.| \d\n|\n\d|\n\d |\n\d\n', '', text)
            text = re.sub('\(V\.O\.\)|\(O\.S\.\)', '', text)
        text = expand_contractions(text)
        return text
    

## Attempts at LookArounds
    
def speaking_parts(text):
    speaking = []
    visual = []
    # split on combo of caps and new lines
    #text = re.split("\\n[A-Z]*\s?\\n", text)
    #for line in text:
        #if line.isupper() == True:
    pattern = re.compile('[A-Z]+.+[A-Z] \n\n(\w.+\.)(?= \n\n)')
    pattern2 = re.compile('[A-Z]+.+[A-Z]()(?= \n\n)')
    pattern3 = re.compile('(?<=\n\nVISUALS\s)([A-Z]+.+)(?=\n\n)')
    pattern4 = re.compile('(?<=\n\nVISUALS\s)([A-Z]+.+)(?=\n)')
    pattern5 = re.compile('(?<=\n\nVISUALS\s)([A-Z]+.+)(?=\n)')
    pattern6 = re.compile('(?<=\n\n[A-Z] \n\n)(\w+.+\n+\w.+)(?=\.|\?|\!)')
    pattern7 = re.compile('(?<=\n\n\DOROTHEA\s\n\n)\w.*\w+')
    text = re.sub(' \n', ' ', text)
    text = re.sub('\n', '', text)
    text = re.sub('[)/\-;(] ','', text)
    # gets all capital words
    vis = re.findall(pattern7, text)
    
    #pattern_a = re.compile('(?<=\n\n[A-Z] \n)(\w.+\w.)')
    #vis2 = re.findall()
    

    return text