# Data Cleaning for The Simpsons 

In [1]:
# Import package for creating data frame
import pandas as pd

In [2]:
# Import .csv file from folder and view top 5 entries
simpsons_script = pd.read_csv("The Simpsons Data/simpsons_script_lines.csv")
simpsons_script.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


In [3]:
# Sort values by Line ID
simpsons_script = simpsons_script.sort_values(by = ['id'])
simpsons_script.head()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
148761,1,1,0,(Street: ext. street - establishing - night),8000,False,,1.0,,Street,,,
148762,2,1,1,(Car: int. car - night),8000,False,,2.0,,Car,,,
148763,3,1,2,"Marge Simpson: Ooo, careful, Homer.",8000,True,1.0,2.0,Marge Simpson,Car,"Ooo, careful, Homer.",ooo careful homer,3.0
148764,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2.0,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6.0
148765,5,1,4,Homer Simpson: We're late.,10000,True,2.0,2.0,Homer Simpson,Car,We're late.,were late,2.0


In [4]:
# Reset indexing so lines are chronological
simpsons_script = simpsons_script.reset_index()
simpsons_script.head()

Unnamed: 0,index,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,148761,1,1,0,(Street: ext. street - establishing - night),8000,False,,1.0,,Street,,,
1,148762,2,1,1,(Car: int. car - night),8000,False,,2.0,,Car,,,
2,148763,3,1,2,"Marge Simpson: Ooo, careful, Homer.",8000,True,1.0,2.0,Marge Simpson,Car,"Ooo, careful, Homer.",ooo careful homer,3.0
3,148764,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2.0,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6.0
4,148765,5,1,4,Homer Simpson: We're late.,10000,True,2.0,2.0,Homer Simpson,Car,We're late.,were late,2.0


In [5]:
# Calculate total number of lines
len(simpsons_script)

158271

In [6]:
# View all columns and their types
simpsons_script.dtypes

index                   int64
id                      int64
episode_id              int64
number                  int64
raw_text               object
timestamp_in_ms        object
speaking_line          object
character_id           object
location_id           float64
raw_character_text     object
raw_location_text      object
spoken_words           object
normalized_text        object
word_count             object
dtype: object

In [7]:
# Find classifications for speaking line, i.e., combined usage of string and Boolean
simpsons_script['speaking_line'].unique()

array(['false', 'true', True, False,
       'Guess what. I also play Frankenstein!'], dtype=object)

In [8]:
# Filter data frame by spoken lines only
s_spoken = simpsons_script[simpsons_script.speaking_line.isin(["true", True])]
s_spoken.head()

Unnamed: 0,index,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
2,148763,3,1,2,"Marge Simpson: Ooo, careful, Homer.",8000,True,1,2.0,Marge Simpson,Car,"Ooo, careful, Homer.",ooo careful homer,3
3,148764,4,1,3,Homer Simpson: There's no time to be careful.,10000,True,2,2.0,Homer Simpson,Car,There's no time to be careful.,theres no time to be careful,6
4,148765,5,1,4,Homer Simpson: We're late.,10000,True,2,2.0,Homer Simpson,Car,We're late.,were late,2
7,148768,8,1,7,"Marge Simpson: (HUSHED VOICE) Sorry, Excuse us...",24000,True,1,4.0,Marge Simpson,Auditorium,"Sorry, Excuse us. Pardon me...",sorry excuse us pardon me,5
8,148769,9,1,8,"Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. H...",26000,True,2,4.0,Homer Simpson,Auditorium,"Hey, Norman. How's it going? So you got dragge...",hey norman hows it going so you got dragged do...,21


In [9]:
# Calculate number of spoken lines
len(s_spoken)

132112

In [10]:
# Isolate raw text after resetting the index
s_spoken = s_spoken.reset_index()
s_spoken = s_spoken.raw_text
s_spoken.head()

0                  Marge Simpson: Ooo, careful, Homer.
1        Homer Simpson: There's no time to be careful.
2                           Homer Simpson: We're late.
3    Marge Simpson: (HUSHED VOICE) Sorry, Excuse us...
4    Homer Simpson: (SIMULTANEOUSLY) Hey, Norman. H...
Name: raw_text, dtype: object

In [11]:
# Import packages for strings and substitution
import string
import re

In [12]:
# Iterate through each row of data frame to clean
for i in range(len(s_spoken)):
    # Convert to string
    s_spoken[i] = str(s_spoken[i])
    # Convert to lowercase
    s_spoken[i] = s_spoken[i].lower()
    # Remove speaker designation (everything until colon)
    s_spoken[i] = re.sub(r'^.*?:','', s_spoken[i])
    # Remove leading whitespace
    s_spoken[i].lstrip()
    # Remove all other grammar
    s_spoken[i] = re.sub(r'[^\w\s]','',s_spoken[i])
s_spoken.head()    

0                                    ooo careful homer
1                         theres no time to be careful
2                                            were late
3               hushed voice sorry excuse us pardon me
4     simultaneously hey norman hows it going so yo...
Name: raw_text, dtype: object

In [17]:
# Export as csv file
s_spoken.to_csv("simpsons_df_fixed.csv")

In [25]:
# Convert to list and then string for output of raw text as text file
spoken_words = str(list(s_spoken))

In [27]:
# Remove grammar again from list, i.e., square brackets and apostrophes
spoken_words = re.sub(r'[^\w\s]','',spoken_words)

In [28]:
# Export as text file
with open("simpsons_script_fixed.txt", "w") as output:
    output.write(spoken_words)