In [3]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript paragraphs from a transcript web page (the URLs used below are on rev.com)
def url_to_transcript(url, timeout=30):
    '''Download a transcript page and return its paragraphs as a list of strings.

    The transcript body is located through the CSS class "fl-callout-text";
    every <p> element inside that container contributes one list entry.
    The URLs used in this notebook are rev.com transcript pages.

    Parameters:
        url: address of the transcript page.
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        A list with the text of each paragraph, in page order.

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no element with class "fl-callout-text".
    '''
    # Without a timeout a stalled connection would hang the notebook indefinitely
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find(class_="fl-callout-text")
    if container is None:
        # soup.find returns None when nothing matches; report that explicitly
        raise ValueError('No element with class "fl-callout-text" found at ' + url)

    text = [p.text for p in container.find_all('p')]
    # Progress trace: one line per page scraped
    print(url)
    return text

# Transcript pages to scrape; the order must match the names in `comedians` below,
# because the two lists are paired by position when the data is loaded
urls = [
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-hickory-north-carolina-november-1',
    'https://www.rev.com/blog/transcripts/joe-biden-campaign-event-speech-transcript-philadelphia-november-1',
    'https://www.rev.com/blog/transcripts/barack-obama-campaign-speech-for-joe-biden-transcript-orlando-october-27',
    'https://www.rev.com/blog/transcripts/mike-pence-faith-in-leadership-speech-transcript-october-1',
    'https://www.rev.com/blog/transcripts/kamala-harris-campaign-speech-transcript-mcallen-texas-october-30',
]

# Speaker names, used as dictionary keys / dataframe index labels
# (the list is still called `comedians` because later cells refer to it by that name)
comedians = [
    'President Donald J. Trump',
    'Joe Biden',
    'Barack Obama',
    'Vice President Mike Pence',
    'Kamala Harris',
]

In [4]:
# Scrape every transcript and store its paragraph list under the matching speaker name.
# Each iteration performs one HTTP request through url_to_transcript.
data = {}
# The two lists are paired by position, so a length mismatch would silently
# drop or mislabel a transcript; stop early instead
if len(comedians) != len(urls):
    raise ValueError("comedians and urls must have the same number of entries")
for c, url in zip(comedians, urls):
    data[c] = url_to_transcript(url)

https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-hickory-north-carolina-november-1
https://www.rev.com/blog/transcripts/joe-biden-campaign-event-speech-transcript-philadelphia-november-1
https://www.rev.com/blog/transcripts/barack-obama-campaign-speech-for-joe-biden-transcript-orlando-october-27
https://www.rev.com/blog/transcripts/mike-pence-faith-in-leadership-speech-transcript-october-1
https://www.rev.com/blog/transcripts/kamala-harris-campaign-speech-transcript-mcallen-texas-october-30


In [5]:
# Sanity check: the dictionary should contain one key per name listed in `comedians`
data.keys()

dict_keys(['President Donald J. Trump', 'Joe Biden', 'Barack Obama', 'Vice President Mike Pence', 'Kamala Harris'])

In [6]:
# More checks: peek at the first two scraped paragraphs of one transcript
data['President Donald J. Trump'][:2]

['President Donald J. Trump: (02:08)\nThank you very much. Wow. This is a very big crowd. Wow look at this. Hello North Carolina, hello. Two days from now we are going to win this great State just like we did last time. You were the one, right? You were the one that put us over that hump and we’re going to win four more years in the white house, our beautiful white house. With your vote we will continue to cut your taxes, cut regulations, support our great police, protect our Second Amendment, defend religious liberty and ensure more products are stamped with that beautiful phrase made in the USA. And next year will be the greatest economic year in the history of our country. Under my leadership our economy grew at the fastest rate ever recorded, 33.1% just announced. We created a record 11.4 million jobs in the last five months while foreign nations are in a free fall, we’re creating the world’s greatest economic powerhouse. A recent Gallup Poll just came out, found that 56% of Americ

In [7]:
# Look at the first key of the dictionary (a speaker name)
next(iter(data.keys()))

'President Donald J. Trump'

In [8]:
# Look at the first value: the dictionary currently maps
# key: person -> value: list of paragraph strings
next(iter(data.values()))

['President Donald J. Trump: (02:08)\nThank you very much. Wow. This is a very big crowd. Wow look at this. Hello North Carolina, hello. Two days from now we are going to win this great State just like we did last time. You were the one, right? You were the one that put us over that hump and we’re going to win four more years in the white house, our beautiful white house. With your vote we will continue to cut your taxes, cut regulations, support our great police, protect our Second Amendment, defend religious liberty and ensure more products are stamped with that beautiful phrase made in the USA. And next year will be the greatest economic year in the history of our country. Under my leadership our economy grew at the fastest rate ever recorded, 33.1% just announced. We created a record 11.4 million jobs in the last five months while foreign nations are in a free fall, we’re creating the world’s greatest economic powerhouse. A recent Gallup Poll just came out, found that 56% of Americ

In [9]:
# We are going to change this to key: person, value: string format
def combine_text(list_of_text):
    '''Join the given text pieces into a single string, separated by single spaces.'''
    return ' '.join(list_of_text)

In [10]:
# Collapse each speaker's paragraph list into one string; the string is wrapped
# in a single-element list so that the value stays a list
data_combined = {
    speaker: [combine_text(paragraphs)]
    for speaker, paragraphs in data.items()
}

In [11]:
# The combined text could stay in a dictionary; here it is loaded into a pandas dataframe
import pandas as pd

# Show up to 150 characters per cell when a dataframe is displayed
pd.set_option('max_colwidth', 150)

# One row per speaker, a single 'transcript' column, rows ordered by speaker name
data_df = pd.DataFrame.from_dict(
    data_combined, orient='index', columns=['transcript']
).sort_index()
data_df

Unnamed: 0,transcript
Barack Obama,"Barack Obama: (00:00)\nHello, Orlando! Barack Obama: (00:10)\nOh, this is a good looking crowd here! Thank you so much. Can everybody please give ..."
Joe Biden,"Joe Biden: (00:24)\nHello, hello, hello. Hello, Philadelphia. It’s great to these everyone. Thank you, thank you, thank you. I want to thank Bisho..."
Kamala Harris,Kamala Harris: (00:00)\n… Working two jobs yet receiving no healthcare because private insurance system makes it so we are dependent on the genero...
President Donald J. Trump,"President Donald J. Trump: (02:08)\nThank you very much. Wow. This is a very big crowd. Wow look at this. Hello North Carolina, hello. Two days fr..."
Vice President Mike Pence,"Ben Carson: (00:01)\n… but a great friend of America, our Vice President, the 48th Vice President of the United States. And I’ll tell you somethin..."


In [12]:
# Let's take a look at the full transcript for Barack Obama
data_df.transcript.loc['Barack Obama']



In [13]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over a transcript string.

    Steps, in order:
      1. lowercase everything;
      2. drop text enclosed in square brackets, e.g. "[applause]";
      3. drop text enclosed in parentheses, e.g. the "(02:08)" timestamps;
      4. drop every ASCII punctuation character (string.punctuation);
      5. drop every word that contains a digit.

    Only the matched characters are removed; surrounding whitespace is left
    untouched, so runs of spaces may remain. The bracket/parenthesis patterns
    do not span line breaks because "." does not match a newline.

    Returns the cleaned string.
    '''
    text = text.lower()
    # Raw strings keep the backslashes for the regex engine; the non-raw form
    # ('\[', '\(', '\w') is an invalid string escape and warns on recent Pythons
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # re.escape makes every punctuation character safe inside the character class
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias kept so existing `.apply(round1)` calls continue to work
round1 = clean_text_round1

In [14]:
# Run the first cleaning pass over every transcript and keep the result
# in its own one-column dataframe (data_df itself is left unchanged)
data_clean = pd.DataFrame(
    data_df['transcript'].apply(round1)
)
data_clean

Unnamed: 0,transcript
Barack Obama,barack obama \nhello orlando barack obama \noh this is a good looking crowd here thank you so much can everybody please give a big round of appla...
Joe Biden,joe biden \nhello hello hello hello philadelphia it’s great to these everyone thank you thank you thank you i want to thank bishop reid i really m...
Kamala Harris,kamala harris \n… working two jobs yet receiving no healthcare because private insurance system makes it so we are dependent on the generosity of ...
President Donald J. Trump,president donald j trump \nthank you very much wow this is a very big crowd wow look at this hello north carolina hello two days from now we are g...
Vice President Mike Pence,ben carson \n… but a great friend of america our vice president the vice president of the united states and i’ll tell you something interesting a...


In [15]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: handle characters the first pass does not cover.

    Removes typographic quotes and the ellipsis character (‘ ’ “ ” …), which
    are not part of string.punctuation, and turns every newline into a space.

    Returns the cleaned string.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # Replace (rather than delete) line breaks so that the last word of one
    # line and the first word of the next are not fused into a single token
    text = re.sub('\n', ' ', text)
    return text

# Alias kept so existing `.apply(round2)` calls continue to work
round2 = clean_text_round2

In [16]:
# Run the second cleaning pass over the output of the first pass
data_clean = pd.DataFrame(
    data_clean['transcript'].apply(round2)
)
data_clean

Unnamed: 0,transcript
Barack Obama,barack obama hello orlando barack obama oh this is a good looking crowd here thank you so much can everybody please give a big round of applause ...
Joe Biden,joe biden hello hello hello hello philadelphia its great to these everyone thank you thank you thank you i want to thank bishop reid i really mean...
Kamala Harris,kamala harris working two jobs yet receiving no healthcare because private insurance system makes it so we are dependent on the generosity of our...
President Donald J. Trump,president donald j trump thank you very much wow this is a very big crowd wow look at this hello north carolina hello two days from now we are goi...
Vice President Mike Pence,ben carson but a great friend of america our vice president the vice president of the united states and ill tell you something interesting about...


In [17]:
# Display the original dataframe again; the cleaning passes wrote their
# results to data_clean, so data_df still holds the uncleaned transcripts
data_df

Unnamed: 0,transcript
Barack Obama,"Barack Obama: (00:00)\nHello, Orlando! Barack Obama: (00:10)\nOh, this is a good looking crowd here! Thank you so much. Can everybody please give ..."
Joe Biden,"Joe Biden: (00:24)\nHello, hello, hello. Hello, Philadelphia. It’s great to these everyone. Thank you, thank you, thank you. I want to thank Bisho..."
Kamala Harris,Kamala Harris: (00:00)\n… Working two jobs yet receiving no healthcare because private insurance system makes it so we are dependent on the genero...
President Donald J. Trump,"President Donald J. Trump: (02:08)\nThank you very much. Wow. This is a very big crowd. Wow look at this. Hello North Carolina, hello. Two days fr..."
Vice President Mike Pence,"Ben Carson: (00:01)\n… but a great friend of America, our Vice President, the 48th Vice President of the United States. And I’ll tell you somethin..."


In [18]:
# Add each speaker's plain full name (without titles) as a second column.
# The names are looked up by index label rather than assigned by position, so the
# result stays correct even if the row order of data_df changes; an index label
# without an entry here raises KeyError instead of being silently mislabelled.
full_name_by_speaker = {
    'Barack Obama': 'Barack Obama',
    'Joe Biden': 'Joe Biden',
    'Kamala Harris': 'Kamala Harris',
    'President Donald J. Trump': 'Donald Trump',
    'Vice President Mike Pence': 'Mike Pence',
}
full_names = [full_name_by_speaker[speaker] for speaker in data_df.index]

data_df['full_name'] = full_names
data_df

Unnamed: 0,transcript,full_name
Barack Obama,"Barack Obama: (00:00)\nHello, Orlando! Barack Obama: (00:10)\nOh, this is a good looking crowd here! Thank you so much. Can everybody please give ...",Barack Obama
Joe Biden,"Joe Biden: (00:24)\nHello, hello, hello. Hello, Philadelphia. It’s great to these everyone. Thank you, thank you, thank you. I want to thank Bisho...",Joe Biden
Kamala Harris,Kamala Harris: (00:00)\n… Working two jobs yet receiving no healthcare because private insurance system makes it so we are dependent on the genero...,Kamala Harris
President Donald J. Trump,"President Donald J. Trump: (02:08)\nThank you very much. Wow. This is a very big crowd. Wow look at this. Hello North Carolina, hello. Two days fr...",Donald Trump
Vice President Mike Pence,"Ben Carson: (00:01)\n… but a great friend of America, our Vice President, the 48th Vice President of the United States. And I’ll tell you somethin...",Mike Pence


In [19]:
# Save the corpus dataframe (data_df) to disk as corpus.pkl for later use
data_df.to_pickle("corpus.pkl")

In [20]:
# Build a document-term matrix with CountVectorizer, excluding common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older installations
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per transcript (same index as data_clean), one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,abandon,abe,abiding,ability,able,abolish,abolition,abortion,abortions,abraham,...,york,youd,youll,young,youre,youve,zero,zeta,zone,zones
Barack Obama,3,1,0,2,4,0,0,0,0,0,...,0,1,2,3,8,12,1,0,1,0
Joe Biden,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
Kamala Harris,0,0,0,1,4,0,0,0,0,0,...,0,0,0,2,1,1,0,1,0,0
President Donald J. Trump,0,0,1,0,1,1,0,0,0,0,...,2,0,1,3,7,3,0,0,0,1
Vice President Mike Pence,0,0,0,0,3,0,1,3,1,2,...,0,0,2,5,1,5,0,0,0,1


In [21]:
# Save the document-term matrix (data_dtm) to disk as usa.pkl for later use
data_dtm.to_pickle("usa.pkl")

In [22]:
# Also save the cleaned text (before it was turned into a document-term matrix)
# and the fitted CountVectorizer object
data_clean.to_pickle('usa_clean.pkl')
# A context manager guarantees the file is flushed and closed once the dump finishes
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)