# Data Cleaning of Scripts



In [259]:
# import libraries
import requests
import pandas as pd
import re
import numpy as np 
import os
import docx2txt

In [260]:
# functions for saving in pickle format 

import pickle
def pickle_dataframe(x, filename):
    """Serialize ``x`` with pickle and write it to ``filename`` (overwriting it)."""
    with open(filename, mode="wb") as handle:
        pickle.dump(x, handle)
        
def open_pickled_dataframe(filename):
    """Read ``filename`` and return the object that was pickled into it.

    Only use this on files you created yourself: unpickling runs arbitrary
    code embedded in the file.
    """
    with open(filename, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded

In [261]:
# Set working directory
# NOTE(review): absolute, machine-specific path; every relative file name used
# below (pickles, Excel export) is resolved against this directory.
os.getcwd()  # return value is not stored or used
os.chdir('/Users/Sofie/Disney Thesis')

In [262]:
# import scripts
# Loads the pickled object stored in 'Scraped_scripts.pickle' (relative to the
# working directory). Later cells read its 'Movie' and 'Script' columns.
movies = open_pickled_dataframe('Scraped_scripts.pickle')

In [263]:
# import snowwhite script collected after the other data set
snowwhite_movie = open_pickled_dataframe('snowwhite_script.pickle')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. ignore_index=True renumbers the rows 0..n-1, which
# replaces the separate reset_index(drop=True) step.
# NOTE: re-running this cell without reloading `movies` adds the rows again.
movies = pd.concat([movies, snowwhite_movie], ignore_index=True)

In [264]:
# Types of html markup found in the transcripts; each group of movies is cleaned
# by its own loop further down.
p_tags = ['Cinderella', 'Bolt', 'Tangled', 'Frozen 2', 'The Incredibles', 'Finding Nemo',
          'Finding Dory', 'Toy Story', 'Toy Story 4', 'Toy Story 2', 'Toy Story 3', 
          'Incredibles 2', 'Monsters University', 'Monsters, Inc.', 'Cars', 'Cars 3']

b_tags = ['Pinocchio', 'The Great Mouse Detective',
          'Dumbo', 'The Emperors New Groove', 'Lilo & Stitch', 'Brother Bear']

n_split = ['Hercules', 'The Black Cauldron', 'The Rescuers', 'The Aristocats', 
           'The Jungle Book', 'Fun and Fancy Free']

#'Beauty and the Beast'

p_split = ['Alice in Wonderland', 'The Little Mermaid',  
           'The Rescuers Down Under', 'Mulan',  'Oliver & Company', 'Sleeping Beauty',
          'Snow White and the Seven Dwarfs']

p_hunch = ['The Hunchback of Notre Dame']
p_fox = ['The Fox and the Hound']

test = ['One Hundred and One Dalmatians', 'The Lion King']

b_split = ['Pocahontas']

title_p = ['Zootopia']
title_b = ['Moana']

# (titles, label) pairs, listed in the order in which a label takes precedence.
html_groups = [
    (p_tags, 'p-tag'),
    (b_tags, 'b-tag'),
    (n_split, 'n-split'),
    (test, 'test'),
    (title_p, 'title_p'),
    (title_b, 'title_b'),
    (p_fox, 'p_fox'),
    (p_split, 'p_split'),
    (p_hunch, 'p_hunch'),
    (b_split, 'b_split'),
]

# Label every movie with its markup group; movies in no group get NaN.
html_style = []
for movie in movies['Movie']:
    html = next((label for titles, label in html_groups if movie in titles), np.nan)
    html_style.append(html)

movies['html'] = html_style

# Clean scripts and create datasets for each movie

In [265]:
# convert scripts to strings
# The cells below call str methods (find, split, `in`) and re.sub on every script,
# so each value in the 'Script' column has to be a plain string first.
movies['Script'] = movies['Script'].astype(str)

In [266]:
# function for slicing text out before a character 
def slicer(my_str,sub):
    """Return ``my_str`` from the first occurrence of ``sub`` to the end.

    The matched ``sub`` itself is kept at the start of the result.
    Raises ``Exception`` when ``sub`` does not occur in ``my_str``.
    """
    start = my_str.find(sub)
    if start == -1:
        raise Exception('Sub string not found!')
    return my_str[start:]

In [267]:
# remove text and html-code before the transcript
# OBS: the start markers were found manually. The keys are row labels of `movies`
# after reset_index, so they are only valid for this exact row order.
script_starts = {
    0: 'Jiminy Cricket:',                                   # Pinocchio
    1: '<p><b>Pongo</b>:',                                  # One Hundred and One Dalmatians
    35: 'Bagheera:',                                        # The Jungle Book
    33: '{Sunrise on African',                              # The Lion King
    42: '(An ocean.',                                       # The Little Mermaid
    11: '<p>Flynn:',                                        # Tangled
    18: '<b>Narrator</b>:',                                 # Dumbo
    15: '\nChorus:',                                        # Fun and Fancy Free
    16: '<p>Narrator:',                                     # Cinderella
    31: '<p>Chorus:',                                       # Alice in Wonderland
    36: '\n<dt> Narrator:',                                 # Sleeping Beauty
    14: 'Eddie:',                                           # The Rescuers
    2: '[<i>The',                                           # The Fox and the Hound
    45: 'Prologue',                                         # The Black Cauldron
    3: '(<i>The camera',                                    # The Great Mouse Detective
    4: '<p><i>(The city',                                   # Oliver & Company
    43: '(opening: ',                                       # The Rescuers Down Under
    37: 'NARRATOR:',                                        # Beauty and the Beast
    39: 'PEDDLER:',                                         # Aladdin
    12: '</p><p>ENGLISH CHORUS<br/>',                       # Pocahontas
    13: 'Clopin</a>:',                                      # The Hunchback of Notre Dame
    32: 'Narrator:',                                        # Hercules
    41: 'Guard [yelling]:',                                 # Mulan
    5: '<dl><dd><b>Kuzco</b>:',                             # The Emperors New Groove
    6: 'li><b>Grand',                                       # Lilo & Stitch
    7: '<dd><b>Man:</b> ',                                  # Brother Bear
    8: '</p><p>Penny:',                                     # Bolt
    40: '<b>                            YOUNG ANNA',        # Frozen
    10: '\n<h2><span class="mw-headline" id="Scene_1:',     # Zootopia
    17: 'title="Gramma Tala',                               # Moana
    9: '<p>Agnarr (O.S.):',                                 # Frozen 2
    19: '<dl><dd><b>Lightning McQueen</b>:',                # Cars
    20: '<b>Mother</b>:',                                   # Monsters, Inc.
    21: '<p><b>Kids</b>:',                                  # Monsters University
    22: '<b>Utility Belt Buzz</b>:',                        # Toy Story 2
    23: '<b>One Eyed Bart:</b>',                            # Toy Story 3
    24: '<p><b>Jessie</b>:',                                # Toy Story 4
    25: '<p><b>Andy</b>:',                                  # Toy Story
    26: '<p><b>Marlin</b>:',                                # Finding Nemo
    27: '<p>Young Dory:',                                   # Finding Dory
    28: '<p>Rick Dicker:',                                  # Incredibles 2
    29: '<p><b>Mr. Incredible:</b>',                        # The Incredibles
    30: '<b>Mack:</b>',                                     # Cars 3
    38: '\nCHOIR:',                                         # Peter Pan
    # The original comment said "Peter Pan" here as well; row 46 is presumably the
    # Snow White script that was appended last -- TODO confirm.
    46: '<li>Narrator:',
}

# Write back with .loc: the chained form movies['Script'][i] = ... assigns into a
# temporary Series, which triggers SettingWithCopyWarning and does not update
# `movies` at all when pandas copy-on-write is active.
for row_label, start_marker in script_starts.items():
    movies.loc[row_label, 'Script'] = slicer(movies.loc[row_label, 'Script'], start_marker)

In [268]:
# Cut every script at the first end-of-transcript marker it contains. The markers
# are tried in a fixed order and only the first one that is present is applied.
scripts = []

for _, movie_row in movies.iterrows():
    script = movie_row['Script']
    title = movie_row['Movie']

    # The Incredibles is excluded from the "THE END" rule because its script
    # (movies['Script'][29]) contains THE END.
    if "THE END" in script and title != "The Incredibles":
        print("THE END", title)
        cut_at = "THE END"
    elif "The End" in script:
        cut_at = "The End"
    elif "CLOSING TITLES" in script:
        cut_at = "CLOSING TITLES"
    elif "NewPP limit report" in script:
        cut_at = "NewPP limit report"
    elif "The end" in script:
        # Only reported, deliberately not cut: the script is kept whole.
        print("found")
        cut_at = None
    elif "Part 29: End Credits" in script:
        cut_at = "Part 29: End Credits"
    elif "Credits\\n" in script:
        # "Credits" followed by a literal backslash + n (not a real newline)
        cut_at = "Credits\\n"
    else:
        cut_at = None

    scripts.append(script if cut_at is None else script.split(cut_at, 1)[0])

movies['Script'] = scripts
        
        
        

THE END One Hundred and One Dalmatians
THE END Dumbo
THE END Toy Story
found
THE END The Aristocats
THE END The Jungle Book
found
THE END Peter Pan
found
THE END Frozen
THE END The Little Mermaid
THE END The Rescuers Down Under
THE END Cars 2


In [269]:
# Spot check: show the script stored under row label 13 after slicing/truncation
# (the start-marker cell above labels row 13 as The Hunchback of Notre Dame).
movies['Script'][13]

'Clopin</a>:</b> MORNING IN PARIS, THE CITY AWAKES<br/>\nTO THE BELLS OF NOTRE DAME<br/>\nTHE FISHERMAN FISHES, THE BAKERMAN BAKES<br/>\nTO THE BELLS OF NOTRE DAME<br/>\n</p><p>TO THE BIG BELLS AS LOUD AS THE THUNDER<br/>\nTO THE LITTLE BELLS SOFT AS A PSALM<br/>\nAND SOME SAY THE SOUL OF THE CITY\'S THE TOLL OF THE BELLS<br/>\nTHE BELLS OF NOTRE DAME<br/>\n</p><p>Listen, they\'re beautiful, no? So many colors of sounds, so many changing moods. Because, you know, they don\'t ring all by themselves.\n</p><p><b>Puppet:</b> They don\'t?!?\n</p><p><b>Clopin:</b> No, silly boy. Up there, high, high in the dark bell tower, lives the mysterious bell ringer. Who is this creature?\n</p><p><b>Puppet:</b> Who?\n</p><p><b>Clopin:</b> What is he?\n</p><p><b>Puppet:</b> What?\n</p><p><b>Clopin:</b> How did he come to be there?\n</p><p><b>Puppet:</b> How?\n</p><p><b>Clopin:</b> Hush!\n</p><p><b>Puppet:</b> Ohhh...\n</p><p><b>Clopin:</b> And Clopin will tell you. It is a tale, a tale of a man and a mo

In [270]:
#Save total dataframe
# Written relative to the current working directory, without the index column
# and with a header row.
movies.to_excel (r'Clean_scraped_scripts_03mar.xlsx', index = False, header=True)

## From string to dataframe

In [271]:
#Function for cleaning text 
def cleaner(text):
    """Strip stage directions, HTML tags and newlines from a piece of script text.

    Steps, in order:
    1. remove anything between an opening ``(`` or ``[`` and the next ``)`` or ``]``
       on the same line (``.`` does not match a newline, so directions that span
       several lines are left in place);
    2. remove HTML tags (``<...>``);
    3. replace every newline with a single space.

    Returns the cleaned string.
    """
    # Raw strings: "\(" and "\[" in a normal string literal are invalid escape
    # sequences and emit a SyntaxWarning on current Python versions.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    text = re.sub(r"<[^>]*>", "", text)
    text = re.sub(r"\n", " ", text)
    # NOTE(review): the original had a further re.sub("\\n", "", text). As a regex
    # that pattern also means "newline", so it could never match after the line
    # above. If the intent was to remove a literal backslash-n, use r"\\n".
    return text

In [272]:
#Cleaning scripts with p-tags <p> and b-tags <b>

def _parse_p_tag_script(script):
    """Split a <p>-delimited transcript into a Character / Line / Song DataFrame.

    Each ``<p>`` chunk is split on ':'. Chunks with a speaker yield that speaker
    and their line; chunks without any ':' are kept with a NaN character and are
    flagged as Song = 1. Empty lines are dropped.
    """
    characters = []
    lines = []

    for paragraph in script.split('<p>'):
        # drop stage directions in () or [] before looking for "Character: line"
        paragraph = re.sub(r"[\(\[].*?[\)\]]", "", paragraph)
        parts = paragraph.split(':')

        if len(parts) > 3:
            # several "<b>Name</b>: line" pairs inside one paragraph
            for i in range(len(parts) - 1):
                name_bits = parts[i].split("<b>")
                if len(name_bits) < 2:
                    # no <b> marker, so this part does not introduce a speaker
                    continue
                characters.append(re.sub("</b>", "", name_bits[1]))
                lines.append(cleaner(parts[i + 1].split("<b>")[0]))
            continue

        if len(parts) == 3:
            # a ':' inside the spoken line: glue the two halves back together
            # (the ':' itself is not restored)
            character = cleaner(parts[0])
            line = cleaner(parts[1] + parts[2])
        elif len(parts) == 2:
            character = cleaner(parts[0])
            line = cleaner(parts[1])
        else:
            character = np.nan
            line = parts[0]
            # The original test `if sub_h3 or sub_h2 in line` was always true
            # (a non-empty string is truthy), so every speaker-less paragraph
            # was blanked and dropped. Only section headings are discarded now.
            if '<h3>' in line or '<h2>' in line:
                line = ""
            else:
                line = cleaner(line)
                if not line.strip():
                    # markup-only chunks clean down to whitespace; keep dropping them
                    line = ""

        characters.append(character)
        lines.append(line)

    parsed = pd.DataFrame()
    parsed['Character'] = characters
    parsed['Line'] = lines

    # drop rows with no lines and the MediaWiki footer comment
    unwanted = ['', '<!-- NewPP limit reportCached time']
    parsed = parsed[~parsed['Line'].isin(unwanted)].reset_index(drop=True)

    # a line without a character is treated as a song
    parsed['Song'] = [1 if pd.isna(name) else 0 for name in parsed['Character']]
    return parsed


def _parse_b_tag_script(script):
    """Split a <b>-delimited transcript into a Total / Character / Line / Song DataFrame.

    Each ``<b>`` chunk is split on ':'; the text before the first ':' is the
    character and the text between the first and second ':' is the line (anything
    after a second ':' is not used). A line is flagged as a song when it contains
    '♪' or '>singing<', or when the chunk has no character.
    """
    chunks = script.split('<b>')
    characters = []
    lines = []
    sung = []

    for chunk in chunks:
        parts = chunk.split(':')
        if len(parts) >= 2:
            character = cleaner(parts[0])
            line = parts[1]
            if '♪' in line or '>singing<' in line:
                is_song = 1
                # sung text sits inside <i>...</i>: keep the text, drop the tags
                line = re.sub('</i>', '', line)
                line = re.sub('<i>', '', line)
            else:
                is_song = 0
                # italic text in spoken lines is a stage direction: drop it entirely
                line = re.sub("<[i][^>]*>(.+?)</[i]>", '', line)
            line = cleaner(line)
        else:
            character = np.nan
            line = cleaner(parts[0])
            is_song = 0

        characters.append(character)
        lines.append(line)
        sung.append(is_song)

    parsed = pd.DataFrame()
    parsed['Total'] = chunks
    parsed['Character'] = characters
    parsed['Line'] = lines
    parsed['Song'] = [1 if flag == 1 or pd.isna(name) else 0
                      for flag, name in zip(sung, characters)]
    return parsed


clean_script  = []

for _, movie_row in movies.iterrows():
    if movie_row['html'] == 'p-tag':
        print(movie_row['Movie'])
        dataframe = _parse_p_tag_script(movie_row['Script'])
    elif movie_row['html'] == 'b-tag':
        print(movie_row['Movie'])
        dataframe = _parse_b_tag_script(movie_row['Script'])
    else:
        # movies with another markup style are cleaned in later cells
        dataframe = np.nan
    clean_script.append(dataframe)

movies['Clean_Script'] = clean_script

# `lenght_` (sic) used to leak out of this cell as a module-level variable and a
# later cell may still read it, so keep the name defined.
lenght_ = 0


Pinocchio
The Great Mouse Detective
The Emperors New Groove
Lilo & Stitch
Brother Bear
Bolt
Frozen 2
Tangled
Cinderella
Dumbo
Cars
Monsters, Inc.
Monsters University
Toy Story 2
Toy Story 3
Toy Story 4
Toy Story
Finding Nemo
Finding Dory
Incredibles 2
The Incredibles
Cars 3


In [273]:
# Spot check: print the cleaned lines of row 18 from row label 140 onwards
# (the start-marker cell above labels row 18 as Dumbo).
for line in movies['Clean_Script'][18]['Line'][140:]:
    print(line)

 That windbag! Why doesn't he come to the point?! 
 Hundred pounds! And now, I present the world's smallest little elephant... who will spring from this springboard in one spring to the top of this pyramid, waving his little flag for a grand climax! Ladies and gentlemen, I give you... Dumbo! Timothy Q. Mouse
 Look out, look out Pink elephants on parade Here they come Hippity-hoppity They're here and there Pink elephants everywhere Look out, look out They're walking around the bed on their head Clippity-cloppity Arrayed in braid Pink elephants on parade What'll I do What'll I do What an unusual view I can stand the sight of worms and look at microscopic germs But Technicolor pachyderms is really too much for me I am not the type to faint When things are odd or things are quaint But seein'things you knowthere ain't Can certainly give you an awful fright What a sight Chase 'em away, chase 'em away I'm afraid, need your aid Pink elephants on parade! Ade, ade, ade... Pink elephants Pink ele

## Clean Zootopia and Moana

In [274]:
# CLEANING OF ZOOTOPIA AND MOANA
# Both transcripts are split into chunks ('<p>' for title_p, '<b>' for title_b).
# A chunk is kept only when it splits on ':' into exactly a character and a line;
# every other chunk (no ':' or more than one) is dropped.

clean_script  = []
movies_title_p = []

for _, movie_row in movies.iterrows():
    style = movie_row['html']
    if style not in ('title_p', 'title_b'):
        continue

    movies_title_p.append(movie_row['Movie'])

    # remove stage directions in () or []
    script = re.sub(r"[\(\[].*?[\)\]]", "", movie_row['Script'])

    if style == 'title_p':
        # Remove the <h2> scene headings. The original pattern was PHP-style
        # ("|...|isU'"): in Python its leading '|' is an empty alternative that
        # matches everywhere, so nothing was ever removed. The headings contain
        # ':' and therefore caused the dialogue chunk they sit in to be dropped.
        script = re.sub(r"<h2[^>]*>.*?</h2>", "", script,
                        flags=re.IGNORECASE | re.DOTALL)
        script = re.sub(r"<a[^>]*>", "", script)
        total_text = script.split('<p>')
    else:
        total_text = script.split('<b>')

    characters = []
    lines = []
    songs = []

    for chunk in total_text:
        parts = chunk.split(':')
        if len(parts) == 2:
            character = cleaner(parts[0])
            line = cleaner(parts[1])
            song = 1 if '♫' in line else 0
        else:
            character = np.nan
            line = parts[0]
            song = 0

        characters.append(character)
        lines.append(line)
        songs.append(song)

    dataframe = pd.DataFrame()
    dataframe['Total'] = total_text
    dataframe['Character'] = characters
    dataframe['Line'] = lines
    if style == 'title_b':
        # only the title_b (Moana) frame carries a Song column
        dataframe['Song'] = songs

    # Keep only rows with a character. A boolean mask replaces dropping rows in
    # place while iterating; the original index labels are preserved.
    dataframe = dataframe[dataframe['Character'].notna()].copy()

    clean_script.append(dataframe)

movies_test = pd.DataFrame()
movies_test['Clean_Script'] = clean_script 
movies_test['Movie'] = movies_title_p 

In [275]:
# convert docs to text files (coco, snowwhite, insideout etc )
# NOTE(review): absolute, machine-specific path. The .docx files are read from it,
# the *_script.txt files are written to it, and later cells read files relative to it.

os.chdir('/Users/Sofie/Disney Thesis/scripts')

word_scripts = ['Coco', 'Inside Out', 'Luca', 
                'Raya and the Last Dragon', 'Onwards', 'Up']

# marker inserted in front of every character cue so the text can be split on it
LINE_DELIM = "__LINE_DELIM__"

clean_scripts_docs = []

for movie in word_scripts:
    name_word = movie + ".docx"
    # Passing docx file to process function
    text = docx2txt.process(name_word)
    # Saving content of the docx file into <movie>_script.txt. The encoding is
    # explicit so writing and reading back do not depend on the platform default.
    name_txt = movie +"_script.txt"
    with open(name_txt, "w", encoding="utf-8") as text_file:
        print(text, file=text_file)

    # read script.
    with open(name_txt, "r", encoding="utf-8") as f:
        text_in = f.read()

    # remove every character from U+00BF upwards.
    # NOTE(review): this also strips accented letters, not just non-printable
    # characters -- confirm that this is intended.
    text_0 = re.sub(r"[\u00bf-\uffff]", "", text_in)
    # remove things in brackets 
    text_0 = re.sub(r"[\(\[].*?[\)\]]", "", text_0)

    # add delimiters which can be used to split on: "__LINE_DELIM__" is inserted
    # in front of every run of two or more capitals (up to the end of that line),
    # which is taken to be a character name.
    text_1 = re.sub(r"([A-Z]{2,}.*\n)", fr"{LINE_DELIM}\1", text_0)

    # split on the added delimiters and remove empty results.
    text_2 = [s for s in text_1.split(LINE_DELIM) if len(s) > 0]

    # split each chunk into the character and their line on the first newline.
    # A chunk without any newline (only possible for text in front of the first
    # cue) has no line and is skipped; it used to crash the unpacking below.
    text_3 = [s.split("\n", 1) for s in text_2 if "\n" in s]

    # extract characters and lines, collapsing runs of whitespace in the lines.
    characters = [character for character, _ in text_3]
    lines      = [re.sub(r"\s+", " ", line) for _, line in text_3]

    movie_script = pd.DataFrame(columns = ["Character", "Line"])
    movie_script["Character"] = characters
    movie_script["Line"]      = lines

    # delete rows whose line is missing or a single space (index labels are kept)
    is_blank = movie_script['Line'].isna() | (movie_script['Line'] == ' ')
    movie_script = movie_script[~is_blank].copy()

    clean_scripts_docs.append(movie_script)
    
movies_docs = pd.DataFrame()
movies_docs['Clean_Script'] = clean_scripts_docs
movies_docs['Movie'] = word_scripts

In [276]:
# merge movies_test + movies_docs
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index=True
# renumbers the rows, which covers the former reset_index(drop=True).
df_res = pd.concat([movies_test, movies_docs], ignore_index=True)

In [277]:
# read in the scripts that were transcribed by hand into Excel files and add them to df_res
# Each <movie>.xlsx is read from the current working directory and needs the
# columns 'Character' and 'Line'; a 'Song' column is optional.
xlsx_movies = ['Bambi', 'Robin Hood', 'Lady and the Tramp', 'Encanto', 'Brave', 
               'The Princess and the Frog', 'Home on the Range', 'Atlantis']

clean_scripts = []

for movie in xlsx_movies:
    #xlsx name 
    movie_name = movie +'.xlsx'
    text = pd.read_excel(movie_name)

    df = pd.DataFrame()
    # str() turns empty cells (NaN) into the string 'nan'; they are filtered below
    df['Character'] = [cleaner(str(value)) for value in text['Character']]
    df['Line'] = [cleaner(str(value)) for value in text['Line']]
    df['Song'] = text['Song'].tolist() if 'Song' in text.columns else np.nan

    # Delete rows without a line or without a character. The original tested
    # pd.isna(Line) after the value had already been converted to a string, so
    # empty lines (now 'nan') were never removed; they are matched explicitly here.
    is_empty = df['Line'].isin([' ', 'nan']) | (df['Character'] == 'nan')
    df = df[~is_empty].copy()

    clean_scripts.append(df)

movies_xlsx = pd.DataFrame()
movies_xlsx['Movie'] = xlsx_movies
movies_xlsx['Clean_Script'] = clean_scripts

# pd.concat replaces DataFrame.append (removed in pandas 2.0)
df_res = pd.concat([df_res, movies_xlsx], ignore_index=True)
    

In [278]:
#Frozen
# The Frozen script is a screenplay: blocks are separated by blank lines
# ('\r\n\r\n') and the blocks of interest start with a <b> tag.
frozen_test = movies.loc[40, 'Script'].split('\r\n\r\n')

# keep only the blocks that start with a bold tag
clean_2 = [block for block in frozen_test if block.startswith((" <b>", "<b>"))]

frozen_df = pd.DataFrame()
frozen_df['Total'] = clean_2

characters = []
lines = []

for block in frozen_df['Total']:
    speech = block.split('</b>')
    character = cleaner(str(speech[0]))

    if len(speech) == 2:
        # "<b>NAME</b> line": text before </b> is the character, the rest the line
        character = re.sub(r'\r', '', character)
        line = speech[1]
    else:
        # no </b> or several: the line is built from the later parts that
        # themselves start with <b>
        line = ' '.join(part for part in speech[1:] if part.startswith('<b>'))

    line = re.sub(r'\r', '', cleaner(str(line)))

    characters.append(character)
    lines.append(line)

frozen = pd.DataFrame()
frozen['Character'] = characters
frozen['Line'] = lines

# Drop scene headings / transitions and rows without a line. The markers are
# matched as literal substrings: str.contains("INT.") treated '.' as a regex
# wildcard and therefore also removed any name containing "INT" or "EXT"
# followed by another character.
scene_markers = ("INT.", "TO:", "EXT.", "MONTAGE:")
is_scene_heading = frozen["Character"].map(
    lambda name: any(marker in name for marker in scene_markers)
).astype(bool)
frozen = frozen[~is_scene_heading & (frozen["Line"] != "")].reset_index(drop=True)

df3 = {'Clean_Script': frozen, 'Movie': 'Frozen'}

# DataFrame.append(dict) no longer exists in pandas 2.0: wrap the entry in a
# one-row frame and concatenate instead.
frozen_entry = pd.DataFrame()
frozen_entry['Clean_Script'] = [df3['Clean_Script']]
frozen_entry['Movie'] = df3['Movie']

df_res = pd.concat([df_res, frozen_entry], ignore_index=True)

In [279]:
#Aladdin
# Character cues in this script are written in capitals and end with ':'.

aladin = movies.loc[39, 'Script']
aladin = re.sub('_x000D_', "", aladin)

index_first_character = re.search(r"[A-Z]{2,}.*?:", aladin).start()

# extract everything starting from the first character name.
aladin = aladin[index_first_character:]

# Splitting on the cue pattern with a capture group gives
# ['', NAME, line, NAME, line, ...]; only the run of capitals is captured as
# NAME, anything between it and the ':' is discarded.
aladin_split = re.split(r"([A-Z]{2,}).*?:", aladin)[1:]

# character names sit at the even positions, their lines at the odd positions.
characters = aladin_split[0::2]
lines      = aladin_split[1::2]

clean_lines = []
for line in lines:
    line = cleaner(line)
    # second pass: directions that spanned a newline are on one line only after
    # cleaner() has replaced the newlines
    line = re.sub(r"[\(\[].*?[\)\]]", "", line)
    line = re.sub(r"\r", "", line)
    clean_lines.append(line)

aladin_df = pd.DataFrame()
aladin_df['Character'] = characters
aladin_df['Line']      = clean_lines

clean_list = [aladin_df]

df10 = pd.DataFrame()
df10['Clean_Script'] = clean_list
df10['Movie'] = 'Aladdin'

# pd.concat replaces DataFrame.append (removed in pandas 2.0)
df_res = pd.concat([df_res, df10], ignore_index=True)

In [280]:
# Cleaning of p_split
# These transcripts are reduced to plain text first and then split on
# "Name ...:" cues (a capital followed by at least two lower-case letters).

clean_script  = []
movie_name = []

for _, movie_row in movies.iterrows():
    if movie_row['html'] != 'p_split':
        continue

    movie_name.append(movie_row['Movie'])

    script = movie_row['Script']
    script = re.sub(r"_x000D_", "", script)
    script = re.sub(r"[\(\[].*?[\)\]]", "", script)       # stage directions
    script = re.sub(r' {[^}]*}', '', script)               # " {...}" annotations
    script = re.sub(r"<[i][^>]*>(.+?)</[i]>", '', script)  # italic text incl. content
    script = re.sub(r"<[^>]*>", "", script)                # remaining tags
    script = re.sub("&amp;", "&", script)

    index_first_character = re.search(r"[A-Z][a-z]{2,}.*?:", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    # ['', Name, line, Name, line, ...] -> names at even, lines at odd positions
    script_split = re.split(r"([A-Z][a-z]{2,}).*?:", script)[1:]
    characters = script_split[0::2]
    lines      = script_split[1::2]

    clean_lines = []
    for line in lines:
        line = cleaner(line)
        # second pass: catches directions that spanned a newline before cleaner()
        line = re.sub(r"[\(\[].*?[\)\]]", "", line)
        line = re.sub(r"\t", "", line)
        line = re.sub(r"\r", "", line)
        clean_lines.append(line)

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line']      = clean_lines

    clean_script.append(new_df)

df4 = pd.DataFrame()
df4['Movie'] = movie_name
df4['Clean_Script'] = clean_script

# pd.concat replaces DataFrame.append (removed in pandas 2.0)
df_res = pd.concat([df_res, df4], ignore_index=True)

In [281]:
# Cleaning of the fox and the hound
# Same cue-splitting approach as for the p_split movies, but the tags are left in
# until after the split so that each line can be cut at the next '<p>'.

clean_script  = []
movie_name = []

for _, movie_row in movies.iterrows():
    if movie_row['html'] != 'p_fox':
        continue

    movie_name.append(movie_row['Movie'])

    script = movie_row['Script']
    script = re.sub(r"<[i][^>]*>(.+?)</[i]>", '', script)  # italic text incl. content
    script = re.sub(r"[\(\[].*?[\)\]]", "", script)       # stage directions
    script = re.sub(r' {[^}]*}', '', script)               # " {...}" annotations
    script = re.sub("&amp;", "&", script)

    index_first_character = re.search(r"[A-Z][a-z]{2,}.*?:", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    # ['', Name, line, Name, line, ...] -> names at even, lines at odd positions
    script_split = re.split(r"([A-Z][a-z]{2,}).*?:", script)[1:]
    characters = script_split[0::2]
    lines      = script_split[1::2]

    # debugging peek at a few raw lines before cleaning
    print(lines[14:20])

    clean_lines = []
    for line in lines:
        # anything after the next paragraph start is narration, not speech
        line = line.split('<p>')[0]
        line = cleaner(line)
        line = re.sub(r"[\(\[].*?[\)\]]", "", line)
        line = re.sub(r"\t", "", line)
        line = re.sub(r"\r", "", line)
        clean_lines.append(line)

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line']      = clean_lines

    clean_script.append(new_df)

df4_test = pd.DataFrame()
df4_test['Movie'] = movie_name
df4_test['Clean_Script'] = clean_script

# pd.concat replaces DataFrame.append (removed in pandas 2.0)
df_res = pd.concat([df_res, df4_test], ignore_index=True)

[' Well. I was sure I heard someone knocking.\n</p><p>\n</p><p><b>', ' Oh dear! My laundry!\n</p><p>\n</p><p>\n</p><p><b>', ' Oh! Here! Stop it! Oh you pesky birds! Will you come here?!\n</p><p>\n</p><p><b>', ' Oh! Stop!\n</p><p>They drop the clothing on the fox pup.\n</p><p><b>', ' Well! I wonder what got into those birds?\n</p><p>She reaches down and picks up her laundry. Upon spotting the fox, she gasps and drops the laundry. She reaches down and picks it up again, looking underneath.\n</p><p><b>', " Well bless my soul! Why it's. . . it's a baby fox.\n</p><p>She looks at it adoringly.\n</p><p><b>"]


In [282]:
# Cleaning of The Hunchback of Notre Dame
clean_script  = []
movie_name = []

for _, movie_row in movies.iterrows():
    if movie_row['html'] != 'p_hunch':
        continue

    movie_name.append(movie_row['Movie'])

    script = movie_row['Script']
    # everything from this closing stage direction onwards is discarded
    script = script.split("<p>(We continue to pull out,",1 )[0]
    script = re.sub(r"[\(\[].*?[\)\]]", "", script)       # stage directions
    script = re.sub(r' {[^}]*}', '', script)               # " {...}" annotations
    script = re.sub(r"<[i][^>]*>(.+?)</[i]>", '', script)  # italic text incl. content
    script = re.sub(r"<[^>]*>", "", script)                # remaining tags
    script = re.sub("&amp;", "&", script)

    # The original assigned this to the misspelled name `ndex_first_character`,
    # so the slice below used whatever offset an earlier cell had left behind in
    # `index_first_character`.
    index_first_character = re.search(r"[A-Z][a-z]{2,}.*?:", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    # ['', Name, line, Name, line, ...] -> names at even, lines at odd positions
    script_split = re.split(r"([A-Z][a-z]{2,}).*?:", script)[1:]
    characters = script_split[0::2]
    lines      = script_split[1::2]

    clean_lines = []
    for line in lines:
        line = cleaner(line)
        # second pass: catches directions that spanned a newline before cleaner()
        line = re.sub(r"[\(\[].*?[\)\]]", "", line)
        line = re.sub(r"\t", "", line)
        line = re.sub(r"\r", "", line)
        clean_lines.append(line)

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line']      = clean_lines

    clean_script.append(new_df)

df4_extra = pd.DataFrame()
df4_extra['Movie'] = movie_name
df4_extra['Clean_Script'] = clean_script

# pd.concat replaces DataFrame.append (removed in pandas 2.0)
df_res = pd.concat([df_res, df4_extra], ignore_index=True)

In [283]:
#n-splits
# Clean the scripts whose 'html' tag is 'n-split' and add them to df_res as
# one (Movie, Clean_Script) row per film. Clean_Script holds a DataFrame with
# the columns 'Character' and 'Line'.
clean_script = []
movie_name = []

for index, row in movies.iterrows():
    if row['html'] != 'n-split':
        continue

    movie = row['Movie']
    print(movie)
    movie_name.append(movie)

    script = row['Script']
    script = re.sub(r"[\(\[].*?[\)\]]", "", script)
    script = re.sub(r' {[^}]*}', '', script)
    # The pattern matches a literal backslash followed by "n" (escaped
    # newline text), not a real newline character.
    script = re.sub(r'\\n', ' ', script)
    script = re.sub("<[^>]*>", "", script)
    script = re.sub(" &amp;", " &", script)

    # Speaker labels look like "Name:" or "Name 2:".
    index_first_character = re.search(r"[A-Z][a-z]{2,} ?[0-9]?:", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    # With one capture group re.split alternates speaker / spoken text.
    script_split = re.split(r"([A-Z][a-z]{2,} ?[0-9]?):", script)[1:]

    characters = script_split[0::2]
    lines = script_split[1::2]
    clean_lines = []
    for line in lines:
        line = cleaner(line)
        line = re.sub(r"[\(\[].*?[\)\]]", "", line)
        # Real carriage-return characters.
        line = line.replace("\r", "")
        # Literal backslash + "r". The original passed "\\r" to re.sub, which
        # the regex engine reads as a carriage return again, so the escaped
        # form survived and the backslash strip below left a stray "r".
        line = line.replace("\\r", "")
        line = line.replace("\\", "")
        clean_lines.append(line)

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line'] = clean_lines

    clean_script.append(new_df)

df14 = pd.DataFrame()
df14['Movie'] = movie_name
df14['Clean_Script'] = clean_script

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_res = pd.concat([df_res, df14], ignore_index=True)
        
        

The Rescuers
Fun and Fancy Free
Hercules
The Aristocats
The Jungle Book
The Black Cauldron


In [284]:
# clean wreck-it ralph
# Convert the .docx script to text, split it on "NAME:" speaker labels and add
# the result to df_res as one (Movie, Clean_Script) row.
os.chdir('/Users/Sofie/Disney Thesis/scripts')

word_scripts = ['Wreck-It Ralph']

clean_script = []
movie_name = []

for movie in word_scripts:
    movie_name.append(movie)
    name_word = movie + ".docx"
    # Extract the plain text from the docx file.
    text = docx2txt.process(name_word)
    # Keep a text copy of the script next to the docx. The encoding is pinned
    # so writing and re-reading do not depend on the platform default.
    name_txt = movie + "_script.txt"
    with open(name_txt, "w", encoding="utf-8") as text_file:
        print(text, file=text_file)

    # read script.
    with open(name_txt, "r", encoding="utf-8") as f:
        text_in = f.read()

    # Speaker labels are runs of two or more capitals followed by a colon.
    index_first_character = re.search(r"[A-Z]{2,}:", text_in).start()

    # extract everything starting from the first character name.
    script = text_in[index_first_character:]
    # With one capture group re.split alternates speaker / spoken text.
    script_split = re.split(r"([A-Z]{2,}):", script)[1:]

    characters = script_split[0::2]
    lines = script_split[1::2]
    clean_lines = [cleaner(line) for line in lines]

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line'] = clean_lines

    clean_script.append(new_df)

df8 = pd.DataFrame()
df8['Movie'] = movie_name
df8['Clean_Script'] = clean_script

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_res = pd.concat([df_res, df8], ignore_index=True)

In [285]:
# A Bug's Life
# Convert the .docx script to text, split it on "Name:" / "Name 2:" speaker
# labels and add the result to df_res as one (Movie, Clean_Script) row.
os.chdir('/Users/Sofie/Disney Thesis/scripts')

word_scripts = ['A Bug\'s life']

clean_script = []
movie_name = []

for movie in word_scripts:
    movie_name.append(movie)
    name_word = movie + ".docx"
    # Extract the plain text from the docx file.
    text = docx2txt.process(name_word)
    # Keep a text copy of the script next to the docx. The encoding is pinned
    # so writing and re-reading do not depend on the platform default.
    name_txt = movie + "_script.txt"
    with open(name_txt, "w", encoding="utf-8") as text_file:
        print(text, file=text_file)

    # read script.
    with open(name_txt, "r", encoding="utf-8") as f:
        text_in = f.read()

    index_first_character = re.search(r"[A-Z][a-z]{2,} ?[0-9]?:", text_in).start()

    # extract everything starting from the first character name.
    script = text_in[index_first_character:]
    # With one capture group re.split alternates speaker / spoken text.
    script_split = re.split(r"([A-Z][a-z]{2,} ?[0-9]?):", script)[1:]

    characters = script_split[0::2]
    lines = script_split[1::2]
    clean_lines = [cleaner(line) for line in lines]

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line'] = clean_lines

    clean_script.append(new_df)

df9 = pd.DataFrame()
df9['Movie'] = movie_name
df9['Clean_Script'] = clean_script

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_res = pd.concat([df_res, df9], ignore_index=True)
 

In [286]:
# Ratatouille and The Good Dinosaur
# Convert each .docx script to text, split it on runs of capital letters
# (speaker labels carry no colon here) and add the results to df_res as one
# (Movie, Clean_Script) row per film.
os.chdir('/Users/Sofie/Disney Thesis/scripts')

word_scripts = ['Ratatouille', 'The Good Dinosaur']

clean_script = []
movie_name = []

for movie in word_scripts:
    movie_name.append(movie)
    name_word = movie + ".docx"
    # Extract the plain text from the docx file.
    text = docx2txt.process(name_word)
    # Keep a text copy of the script next to the docx. The encoding is pinned
    # so writing and re-reading do not depend on the platform default.
    name_txt = movie + "_script.txt"
    with open(name_txt, "w", encoding="utf-8") as text_file:
        print(text, file=text_file)

    # read script.
    with open(name_txt, "r", encoding="utf-8") as f:
        text_in = f.read()

    # Drop (...) and [...] asides before looking for speakers.
    script = re.sub(r"[\(\[].*?[\)\]]", "", text_in)
    index_first_character = re.search(r"[A-Z]{2,}", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    script_split = re.split(r"([A-Z]{2,})", script)[1:]

    # extract characters and lines. if our assumptions about the input text
    # were correct, then each character name will reside at an even index,
    # while the corresponding line will reside at the immediately following
    # index (ie. odd indices).
    characters = script_split[0::2]
    lines = script_split[1::2]
    clean_lines = [cleaner(line) for line in lines]

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line'] = clean_lines

    clean_script.append(new_df)


df11 = pd.DataFrame()
df11['Clean_Script'] = clean_script
df11['Movie'] = movie_name

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_res = pd.concat([df_res, df11], ignore_index=True)

In [287]:
# b_split (pocahontas)
# Clean the scripts whose 'html' tag is 'b_split' and add them to df_res as
# one (Movie, Clean_Script) row per film. Clean_Script holds a DataFrame with
# the columns 'Character' and 'Line'.
clean_script = []
movie_name = []

for index, row in movies.iterrows():
    if row['html'] != 'b_split':
        continue

    movie = row['Movie']
    movie_name.append(movie)
    print(movie)

    script = row['Script']
    script = re.sub(r"[\(\[].*?[\)\]]", "", script)

    # A speaker label is a run of capitals followed (lazily) by "<br/>".
    index_first_character = re.search(r"[A-Z]{2,}.*?<br/>", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    # With one capture group re.split alternates speaker / spoken text.
    script_split = re.split(r"([A-Z]{2,}).*?<br/>", script)[1:]

    characters = script_split[0::2]
    lines = script_split[1::2]
    clean_lines = [cleaner(line) for line in lines]

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line'] = clean_lines

    clean_script.append(new_df)


movies_b = pd.DataFrame()
movies_b['Movie'] = movie_name
movies_b['Clean_Script'] = clean_script

# DataFrame.append was removed in pandas 2.0. Concatenating with
# ignore_index=True gives the same frame as append + reset_index(drop=True).
df_res = pd.concat([df_res, movies_b], ignore_index=True)


Pocahontas


In [288]:
# Cleaning of the scripts whose 'html' tag is 'test'
# Adds one (Movie, Clean_Script) row per film to df_res. Clean_Script holds a
# DataFrame with the columns 'Character' and 'Line'.
clean_script = []
movie_name = []

for index, row in movies.iterrows():
    if row['html'] != 'test':
        continue

    movie = row['Movie']
    movie_name.append(movie)

    script = row['Script']
    # Drop (...) / [...] asides and all HTML tags.
    script = re.sub(r"[\(\[].*?[\)\]]", "", script)
    script = re.sub("<[^>]*>", "", script)

    # Speaker labels are a capitalised word directly followed by a colon.
    index_first_character = re.search(r"[A-Z][a-z]{1,}:", script).start()

    # extract everything starting from the first character name.
    script = script[index_first_character:]

    # With one capture group re.split alternates speaker / spoken text.
    script_split = re.split(r"([A-Z][a-z]{1,}):", script)[1:]

    characters = script_split[0::2]
    lines = script_split[1::2]
    clean_lines = []
    for line in lines:
        line = cleaner(line)
        line = re.sub(r"[\(\[].*?[\)\]]", "", line)
        line = re.sub(r' {[^}]*}', '', line)
        # Literal backslash + "n" text, not a real newline character.
        line = re.sub(r'\\n', '', line)
        clean_lines.append(line)

    new_df = pd.DataFrame()
    new_df['Character'] = characters
    new_df['Line'] = clean_lines

    clean_script.append(new_df)

df6 = pd.DataFrame()
df6['Movie'] = movie_name
df6['Clean_Script'] = clean_script

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_res = pd.concat([df_res, df6], ignore_index=True)

In [289]:
#Peter Pan
# Split the Peter Pan script into (Character, Line) rows and add it to df_res.

# NOTE(review): the script is picked by position; this assumes row 38 of
# `movies` is Peter Pan - confirm whenever the input pickle changes.
script = movies['Script'][38]
script = script.title()
# The speaker pattern below needs a capital plus at least two lowercase
# letters, so "Mr." cannot be captured as a name. Collapse both labels into
# single words first. Plain str.replace is used because the "." in these
# labels is a literal full stop, not a regex wildcard.
script = script.replace("Mr. Darling:", "Mrdarling:")
script = script.replace("Mrs. Darling:", "Mrsdarling:")
index_first_character = re.search(r"[A-Z][a-z]{2,}.*?:", script).start()

# extract everything starting from the first character name.
script = script[index_first_character:]

script_split = re.split(r"([A-Z][a-z]{2,}).*?:", script)[1:]

# extract characters and lines. if our assumptions about the input text were
# correct, then each character name will reside at an even index, while the
# corresponding line will reside at the immediately following index (ie. odd
# indices).
characters = script_split[0::2]
lines = script_split[1::2]
clean_lines = []
for line in lines:
    line = cleaner(line)
    line = re.sub(r"[\(\[].*?[\)\]]", "", line)
    clean_lines.append(line)

df = pd.DataFrame()
df['Character'] = characters
df['Line'] = clean_lines

clean_list = [df]

df12 = pd.DataFrame()
df12['Clean_Script'] = clean_list
df12['Movie'] = 'Peter Pan'

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_res = pd.concat([df_res, df12], ignore_index=True)

In [290]:
# Checkpoint: switch back to the project folder and pickle both the collected
# clean scripts (df_res) and the scraped frame (movies).
os.getcwd()
os.chdir('/Users/Sofie/Disney Thesis')

for frame, target in ((df_res, "df_res_03mar.pickle"),
                      (movies, "movies_03mar.pickle")):
    pickle_dataframe(frame, target)

In [291]:
# Left-join the cleaned scripts collected in df_res onto movies. The join
# produces 'Clean_Script_x' (from movies) and 'Clean_Script_y' (from df_res).
movies_final = movies.merge(df_res, on='Movie', how="left")

# Per row, take the df_res value only when the movies value is a float
# (the missing-value marker); otherwise keep the movies value.
script = [
    from_res if type(from_movies) == float else from_movies
    for from_movies, from_res in zip(movies_final['Clean_Script_x'],
                                     movies_final['Clean_Script_y'])
]

movies_final['final_clean_script'] = script
movies_final = (
    movies_final
    .drop(columns=['Clean_Script_x', 'Clean_Script_y'])
    .rename(columns={"final_clean_script": "Clean_Script"})
)

In [292]:
# Collect the df_res rows whose movie is not yet present in movies_final and
# add them to movies_final.
small_df_res = pd.DataFrame()
movie_name = []
clean_script = []

# Looked up once rather than on every iteration.
known_movies = movies_final['Movie'].values

for index, row in df_res.iterrows():
    if row['Movie'] not in known_movies:
        movie_name.append(row['Movie'])
        clean_script.append(row['Clean_Script'])

small_df_res['Movie'] = movie_name
small_df_res['Clean_Script'] = clean_script

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
movies_final = pd.concat([movies_final, small_df_res], ignore_index=True)

In [293]:
# Save the combined frame; the relative filename resolves against the current
# working directory.
pickle_dataframe(movies_final, "Clean_Scripts_03mar.pickle")

In [294]:
# Reload the pickle written above. This rebinds `movies`, so from here on the
# name refers to the frame saved as Clean_Scripts_03mar.pickle, not the frame
# loaded at the top of the notebook.
movies = open_pickled_dataframe('Clean_Scripts_03mar.pickle')