In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
        'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']

# Comedian names
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

In [2]:
# Load pickled files
data = {}
for i, c in enumerate(comedians):
    with open("Transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [3]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe'])

In [4]:
data[comedians[1]][:2]

['This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the alchemist’s fire that transforms fear and tragedy into levity and livelihood. Dave calls that look “the trance.” ♪ Play me ♪ ♪ Buy me ♪ ♪ Workinonit ♪ ♪ Tune up ♪ ♪ Tune ♪ ♪ Oh ♪ ♪ Fade me ♪ ♪ Ah-ah, ah-ah, ah-ah ♪ ♪ In every ghetto ♪ ♪ Ah-ah, ah-ah, ah-ah ♪ ♪ In every ghetto ♪ ♪ Ah-ah, ah-ah, ah-ah ♪ ♪ In every ghetto ♪ ♪ Ah-ah, ah-ah, ah-ah ♪ ♪ In every ghetto ♪ ♪ Ah-ah, ah-ah, ah-ah ♪ ♪ In every ghetto ♪ ♪ Ah-ah, ah-ah, ah-ah ♪ ♪ In every ghetto ♪ ♪ Ah-ah, ah-ah, ah-ah ♪',
 'Thank you! Thank you very much! Thank you all. Oh, wow. That was exciting, wasn’t it? Thank you, guys. Have a seat, feel comfortable, relax. I want to thank everyone in LA for a wonderful week. It’s been great here. You know what? It’s been ten years since the last time I played Los Angeles, if you can imagine. I know! I know, I’ve been gone for a very long time. And

### Cleaning The Data

When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

Common data cleaning steps on all text:

* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Tokenize text
* Remove stop words

More data cleaning steps after tokenization:

* Stemming / lemmatization
* Parts of speech tagging
* Create bi-grams or tri-grams
* Deal with typos
* And more...



In [5]:
combined_lists = []
[combined_lists.append(' '.join(data[comedian]))
 for comedian in comedians]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [6]:
import pandas as pd

In [7]:
corpus_df = pd.DataFrame()
corpus_df['comedian'] = pd.Series(comedians)
corpus_df['transcripts'] = pd.Series(combined_lists)
corpus_df.index = corpus_df['comedian']
corpus_df = corpus_df.drop(labels = ['comedian'], axis = 1)
corpus_df

Unnamed: 0_level_0,transcripts
comedian,Unnamed: 1_level_1
louis,Intro\nFade the music out. Let’s roll. Hold th...
dave,This is Dave. He tells dirty jokes for a livin...
ricky,Hello. Hello! How you doing? Great. Thank you....
bo,Bo What? Old MacDonald had a farm E I E I O An...
bill,"[cheers and applause] All right, thank you! Th..."
jim,[Car horn honks] [Audience cheering] [Announce...
john,"All right, Petunia. Wish me luck out there. Yo..."
hasan,[theme music: orchestral hip-hop] [crowd roars...
ali,"Ladies and gentlemen, please welcome to the st..."
anthony,"Thank you. Thank you. Thank you, San Francisco..."


In [8]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = text.lower()
    return text

In [9]:
cleaned_df = pd.DataFrame(corpus_df.transcripts.apply(lambda x: clean_text_round1(x)))
cleaned_df

Unnamed: 0_level_0,transcripts
comedian,Unnamed: 1_level_1
louis,intro\nfade the music out let’s roll hold ther...
dave,this is dave he tells dirty jokes for a living...
ricky,hello hello how you doing great thank you wow ...
bo,bo what old macdonald had a farm e i e i o and...
bill,all right thank you thank you very much than...
jim,ladies and gentlemen please welcome to t...
john,all right petunia wish me luck out there you w...
hasan,what’s up davis what’s up i’m home i had t...
ali,ladies and gentlemen please welcome to the sta...
anthony,thank you thank you thank you san francisco th...


In [10]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', ' ', text)
    return text

In [11]:
cleaned_df = pd.DataFrame(cleaned_df.transcripts.apply(lambda x: clean_text_round2(x)))
cleaned_df

Unnamed: 0_level_0,transcripts
comedian,Unnamed: 1_level_1
louis,intro fade the music out lets roll hold there ...
dave,this is dave he tells dirty jokes for a living...
ricky,hello hello how you doing great thank you wow ...
bo,bo what old macdonald had a farm e i e i o and...
bill,all right thank you thank you very much than...
jim,ladies and gentlemen please welcome to t...
john,all right petunia wish me luck out there you w...
hasan,whats up davis whats up im home i had to b...
ali,ladies and gentlemen please welcome to the sta...
anthony,thank you thank you thank you san francisco th...


In [12]:
# pickling to save the dataframe for later use
cleaned_df.to_pickle('cleaned_corpus.pkl')

In [13]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(cleaned_df.transcripts)
data_dtm = pd.DataFrame(data_cv.toarray(), 
                        columns = cv.get_feature_names())
data_dtm.index = cleaned_df.index
data_dtm

Unnamed: 0_level_0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
comedian,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
hasan,0,0,0,0,0,0,0,0,0,0,...,2,1,0,1,0,0,0,0,0,0
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [15]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
pickle.dump(cv, open("cv.pkl", "wb"))