# Data Cleaning

## Scraping and pickling transcripts

In [47]:
# Web scraping, pickle imports
import requests, re
from bs4 import BeautifulSoup
import pickle

# Scrape transcripts
def url_to_transcript(url):
    '''Return transcript data'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [br.next_sibling.text for br in soup.find_all("br") if br.next_sibling]
    return text

# URLs of transcripts
urls = ['https://subslikescript.com/movie/Trainspotting-117951',
        'https://subslikescript.com/movie/The_Wrestler-1125849',
        'https://subslikescript.com/movie/Whiplash-2582802',
        'https://subslikescript.com/movie/Coco-2380307',
        'https://subslikescript.com/movie/Rocky-75148',
        'https://subslikescript.com/movie/Oldboy-364569']

# Movies
movies = ['Trainspotting', 'The_Wrestler', 'Whiplash', 'Coco', 'Rocky', 'Oldboy']

In [48]:
# Request all transcripts
transcripts = [url_to_transcript(u) for u in urls]
print(transcripts)



In [51]:
# Pickle it for later
!mkdir movies
data = {}
for i, c in enumerate(movies):
    with open("movies/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

Џ®¤Ї ЇЄ  Ё«Ё д ©« movies г¦Ґ бгйҐбвўгҐв.


In [53]:
# Load pickled files
data = {}
for i, c in enumerate(movies):
    with open("movies/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [56]:
# Check if loaded correctly
data.keys()

dict_keys(['Trainspotting', 'The_Wrestler', 'Whiplash', 'Coco', 'Rocky', 'Oldboy'])

In [57]:
data

{'Trainspotting': ['',
  'Choose a career. Choose a family.',
  '',
  'Choose a fucking big television.',
  '',
  'Choose washing machines, cars,',
  'compact disc players, electrical tin openers.',
  '',
  'Choose good health,',
  'low cholesterol and dental insurance.',
  '',
  'Choose fixed-interest mortgage repayments.',
  '',
  'Choose a starter home.',
  '',
  'Choose your friends.',
  '',
  'Choose leisure wear and matching luggage.',
  '',
  'Choose a three-piece suite on',
  'hire purchase in a range of fucking fabrics.',
  '',
  'Choose DIY and wondering who the fuck',
  'you are on a Sunday morning.',
  '',
  '',
  '',
  'Choose sitting on that couch',
  '',
  'watching mind-numbing,',
  'spirit-crushing game shows',
  '',
  'stuffing fucking junk food into your mouth.',
  '',
  'Tommy, go!',
  '',
  'Choose rotting away',
  'at the end of it all,',
  '',
  'pissing your last in a miserable home,',
  '',
  'nothing more than an embarrassment',
  'to the selfish, fucked-up br

# Time to clean data

In [58]:
# Combining text
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [59]:
# Combine it!
data_combined = {key: [combine_text(values)] for (key, values) in data.items()}

In [60]:
# Put it in pandas dataframe
import pandas as pd
pd.set_option('max_colwidth', 150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df

Unnamed: 0,transcript
Trainspotting,"Choose a career. Choose a family. Choose a fucking big television. Choose washing machines, cars, compact disc players, electrical tin openers...."
The_Wrestler,"and a true American, the people's hero, Randy ""The Ram""... Robinson! And the Ram is up immediately, throwing haymakers and a pile driver! He..."
Whiplash,"No. Stay. What's your name? Andrew Neiman, sir. What year are you? I'm first year. - Do you know who I am? - Yes, sir. So you know I'm look..."
Coco,"Because of something that happened\nbefore I was even born.\n See, a long time ago,\nthere was this family.\n The papa, he was a musician.\n He an..."
Rocky,You're waltzin'. Give the sucker some action. You're fightin' like a bum. Want some advice? Water! Should l bet the fight don't go three round...
Oldboy,has been digitally remastered to celebrate its 10th anniversary. What? I said I want to tell you my story. What the hell? What's with the way...


In [61]:
# First round of text cleaning
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctutation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [63]:
# Clean with first time
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
Trainspotting,choose a career choose a family choose a fucking big television choose washing machines cars compact disc players electrical tin openers choos...
The_Wrestler,and a true american the peoples hero randy the ram robinson and the ram is up immediately throwing haymakers and a pile driver here tonight ...
Whiplash,no stay whats your name andrew neiman sir what year are you im first year do you know who i am yes sir so you know im looking for players...
Coco,because of something that happened\nbefore i was even born\n see a long time ago\nthere was this family\n the papa he was a musician\n he and his ...
Rocky,youre waltzin give the sucker some action youre fightin like a bum want some advice water should l bet the fight dont go three rounds you fe...
Oldboy,has been digitally remastered to celebrate its anniversary what i said i want to tell you my story what the hell whats with the way you fuck...


In [64]:
# Second round of cleaning
def clean_text_round2(text):
    '''Get rid of additional stuff'''
    text = re.sub('♪', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [66]:
# Apply second round
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
Trainspotting,choose a career choose a family choose a fucking big television choose washing machines cars compact disc players electrical tin openers choos...
The_Wrestler,and a true american the peoples hero randy the ram robinson and the ram is up immediately throwing haymakers and a pile driver here tonight ...
Whiplash,no stay whats your name andrew neiman sir what year are you im first year do you know who i am yes sir so you know im looking for players...
Coco,because of something that happenedbefore i was even born see a long time agothere was this family the papa he was a musician he and his family wou...
Rocky,youre waltzin give the sucker some action youre fightin like a bum want some advice water should l bet the fight dont go three rounds you fe...
Oldboy,has been digitally remastered to celebrate its anniversary what i said i want to tell you my story what the hell whats with the way you fuck...


# Corpus

In [67]:
# Add movies column
data_df['movies'] = movies
data_df

Unnamed: 0,transcript,movies
Trainspotting,"Choose a career. Choose a family. Choose a fucking big television. Choose washing machines, cars, compact disc players, electrical tin openers....",Trainspotting
The_Wrestler,"and a true American, the people's hero, Randy ""The Ram""... Robinson! And the Ram is up immediately, throwing haymakers and a pile driver! He...",The_Wrestler
Whiplash,"No. Stay. What's your name? Andrew Neiman, sir. What year are you? I'm first year. - Do you know who I am? - Yes, sir. So you know I'm look...",Whiplash
Coco,"Because of something that happened\nbefore I was even born.\n See, a long time ago,\nthere was this family.\n The papa, he was a musician.\n He an...",Coco
Rocky,You're waltzin'. Give the sucker some action. You're fightin' like a bum. Want some advice? Water! Should l bet the fight don't go three round...,Rocky
Oldboy,has been digitally remastered to celebrate its 10th anniversary. What? I said I want to tell you my story. What the hell? What's with the way...,Oldboy


In [68]:
# Pickle it for later use
data_df.to_pickle("corpus.pkl")

# Document-Term Matrix

In [73]:
# Create document-term matrix
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,ab,abandoned,abbey,ability,able,aboutany,aboutim,abscess,absolute,absolutely,...,zacks,zane,ziggy,zip,zoo,ﬁesta,ﬁne,ﬁreworks,ﬂoor,ﬂy
Trainspotting,0,0,0,0,0,0,0,1,0,2,...,0,0,1,0,0,0,0,0,0,0
The_Wrestler,0,0,0,2,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Whiplash,0,0,0,0,3,0,0,0,2,1,...,0,1,0,0,0,0,0,0,0,0
Coco,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,1,1,1,1,1
Rocky,0,0,0,0,0,0,0,0,0,6,...,1,0,0,0,3,0,0,0,0,0
Oldboy,1,0,1,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Pickle document-term matrix
data_dtm.to_pickle("dtm.pkl")

In [None]:
# Pickle cleaned data and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv"))