# Natural Language Processing with Python.

## Gathering the data with BeautifulSoup.

In [2]:
# importing libraries.

import os                      # operating-system helpers (directory creation).
import pickle                  # a library for data stream format.

import requests                # an HTTP library for Python.
from bs4 import BeautifulSoup  # a web-scraping library.

In [3]:
# defining a function to get the data from the URLs.

def urlToTranscript(url):
    '''Return transcript data from the web.

    Downloads the page at ``url``, parses it with the lxml parser and
    collects the text of every <p> tag found inside the first element
    whose class is 'post-content'. The URL is printed once the page has
    been scraped, as a simple progress indicator.

    Parameters:
        url: address of the transcript page.

    Returns:
        A list of strings, one per <p> tag.

    Raises:
        requests.RequestException: on connection failure, timeout or an
            HTTP error status.
        ValueError: if the page has no 'post-content' element.
    '''
    # A timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(url, timeout = 30)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    content = soup.find(class_ = 'post-content')
    # find() returns None when nothing matches; report that explicitly
    # rather than failing with an opaque AttributeError.
    if content is None:
        raise ValueError("no 'post-content' element found at " + url)
    text = [p.text for p in content.find_all('p')]
    print(url)
    return text

In [4]:
# URLs of transcripts.

# every transcript lives on the same site, so only the post paths differ.
urls = ['http://scrapsfromtheloft.com/' + path for path in (
    '2017/05/06/louis-ck-oh-my-god-full-transcript/',
    '2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
    '2018/03/15/ricky-gervais-humanity-transcript/',
    '2017/08/07/bo-burnham-2013-full-transcript/',
    '2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
    '2017/04/21/jim-jefferies-bare-2014-full-transcript/',
    '2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
    '2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
    '2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
    '2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
    '2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
    '2017/08/19/joe-rogan-triggered-2016-full-transcript/',
)]


In [5]:
# comedians names.

# first names, in the same order as the transcript URLs.
comedians = 'louis dave ricky bo bill jim john hasan ali anthony mike joe'.split()



In [6]:
# requesting transcripts.

# scrape each URL in order; the result is one list of paragraphs per URL.
transcripts = list(map(urlToTranscript, urls))

http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/
http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/
http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/
http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/
http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/
http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/
http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/
http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/
http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/
http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/
http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/
http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-

In [9]:
# making a new directory to hold the transcriptions.

# The folder name must match the lowercase 'transcripts/' prefix used when
# the files are opened (a differently-cased folder breaks on case-sensitive
# filesystems), and exist_ok lets this cell be re-run without an error.
os.makedirs('transcripts', exist_ok = True)

# NOTE: the files carry a .txt extension but hold pickled (binary) data.
for c, transcript in zip(comedians, transcripts):
    with open('transcripts/' + c + '.txt', 'wb') as file:
        pickle.dump(transcript, file)

In [10]:
# loading pickle file.

data = {}

# Only the names are needed to locate the files, so iterate over them
# directly instead of enumerating with an unused index.
for c in comedians:
    with open('transcripts/' + c + '.txt', 'rb') as file:
        # pickle.load can execute arbitrary code: only load files that
        # this notebook wrote itself.
        data[c] = pickle.load(file)

In [11]:
# the keys are the comedian names whose transcripts were loaded.
data.keys()

dict_keys(['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe'])

In [13]:
# one-element slice: the first entry stored for 'louis'.
data['louis'][:1]

['Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily agree with you, but I appreciate very much. Well, this is a nice place. This is easily the nicest place For many miles in every direction. That’s how you compliment a building And shit on a town with one sentence. It is odd around here, as I was driving here. There doesn’t seem to be any difference Between the sidewalk and the street for pedestrians here. People just kind of walk in the middle of the road. I love traveling And seeing all the different parts of the country. I live in New York. I live in a– There’s no value to your doing that at all.']

## Cleaning the data

In [14]:
def combineText(listOfText):
    '''Merge a list of text fragments into a single string.

    The fragments are concatenated in order with one space between
    neighbours; an empty list yields an empty string.

    Return the combined text.'''

    separator = ' '
    combined = separator.join(listOfText)
    return combined

In [15]:
# collapse each comedian's list of paragraphs into a single string, kept in
# a one-element list so every dictionary value has the same length.

dataCombined = dict(
    (name, [combineText(paragraphs)]) for name, paragraphs in data.items()
)

In [16]:
# converting data from a dictionary to a 
# Pandas dataframe to preprocess the data.

import pandas as pd
pd.set_option('max_colwidth', 150)

# one row per comedian, a single 'Transcript' column, rows sorted by name.
dataDf = (
    pd.DataFrame.from_dict(dataCombined)
    .transpose()
    .rename(columns = {0: 'Transcript'})
    .sort_index()
)

dataDf.head()

Unnamed: 0,Transcript
ali,"Ladies and gentlemen, please welcome to the st..."
anthony,"Thank you. Thank you. Thank you, San Francisco..."
bill,"[cheers and applause] All right, thank you! Th..."
bo,Bo What? Old MacDonald had a farm E I E I O An...
dave,This is Dave. He tells dirty jokes for a livin...


### Preprocessing the data.

In [17]:
import re       # stand for regular expression library.
import string

In [18]:
def cleanTextRe(text):
    '''Normalise a raw transcript string for bag-of-words analysis.

    Steps, in order:
      1. lowercase the text;
      2. drop anything inside square brackets, e.g. "[laughs]";
      3. drop every ASCII punctuation character;
      4. drop words that contain a digit;
      5. drop curly quotes and the ellipsis character, which are not part
         of string.punctuation;
      6. replace newlines with a space so that the words on either side
         of a line break are not fused together.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string.
    '''

    text = text.lower()
    # raw strings keep the regex escapes away from Python's own escaping.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Written as escapes on purpose: left/right single quote, left/right
    # double quote and ellipsis. Typing straight quotes here makes Python
    # concatenate adjacent literals and silently drops them from the class.
    text = re.sub('[\u2018\u2019\u201c\u201d\u2026]', '', text)
    text = re.sub(r'\n', ' ', text)
    return text

# kept under its original name for the cells that apply it to the corpus.
step1 = cleanTextRe

In [21]:
# let's clean the data: run every transcript through step1, and pickle the
# original (uncleaned) frame as the corpus.

dataClean = dataDf.Transcript.apply(step1).to_frame()
dataDf.to_pickle('corpus.pkl')
print('Data after cleaning regular expressions')


Data after cleaning regular expressions


In [22]:
# preview the first rows of the cleaned transcripts.
dataClean.head()

Unnamed: 0,Transcript
ali,ladies and gentlemen please welcome to the sta...
anthony,thank you thank you thank you san francisco th...
bill,all right thank you thank you very much thank...
bo,bo what old macdonald had a farm e i e i o and...
dave,this is dave he tells dirty jokes for a living...


### Organizing the data.

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# build the document-term matrix: one row per comedian, one column per
# word, using scikit-learn's built-in English stop word list.
cv = CountVectorizer(stop_words = 'english')
dataCv = cv.fit_transform(dataClean.Transcript)

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back to the old name only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    featureNames = cv.get_feature_names_out()
else:
    featureNames = cv.get_feature_names()

dataDtm = pd.DataFrame(dataCv.toarray(), columns = featureNames)
dataDtm.index = dataClean.index

dataDtm.head()

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Saving the data.

In [24]:
# saving the data for later use.

dataDtm.to_pickle('dtm.pkl')
dataClean.to_pickle('dataClean.pkl')
# a context manager guarantees the file is flushed and closed, which a
# bare open() passed straight to pickle.dump does not.
with open('cv.pkl', 'wb') as file:
    pickle.dump(cv, file)

## Exploratory data analysis.

In [25]:
# reload the data.

# flip the matrix so that words are rows and comedians are columns.
df = pd.read_pickle('dtm.pkl').T
df.head()

Unnamed: 0,ali,anthony,bill,bo,dave,hasan,jim,joe,john,louis,mike,ricky
aaaaah,0,0,1,0,0,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,0,0,0,1,0,0,0,0,0,0,0


### Most common words.

In [27]:
# collect the 25 most frequent words of each comedian as (word, count) pairs.

topDict = {}

for person, column in df.items():
    topWord = column.sort_values(ascending = False).iloc[:25]
    topDict[person] = [(word, count) for word, count in zip(topWord.index, topWord.values)]
    

### Stop Words.

Words with very little meaning.

In [30]:
# looking at the most common top words and
# adding them to the stop word list.

from collections import Counter

# flatten every comedian's top-word list into one list of words
# (the counts are dropped; a word appears once per comedian who has it).
words = [word
         for comedian in df.columns
         for word, _count in topDict[comedian]]

print(words[:10])

['like', 'know', 'just', 'don', 'shit', 'gonna', 'ok', 'lot', 'wanna', 'gotta']


In [31]:
# Let's aggregate this list and count how many times each word occurs
# in it, most frequent first.

print(Counter(words).most_common())

[('like', 12), ('know', 12), ('just', 12), ('don', 12), ('right', 12), ('people', 12), ('got', 10), ('time', 8), ('think', 8), ('gonna', 7), ('said', 7), ('cause', 6), ('yeah', 6), ('ve', 6), ('say', 6), ('fucking', 6), ('shit', 5), ('oh', 5), ('day', 5), ('thing', 5), ('good', 5), ('fuck', 5), ('did', 4), ('ll', 4), ('want', 4), ('didn', 4), ('going', 4), ('really', 4), ('dad', 3), ('guy', 3), ('man', 3), ('life', 3), ('went', 3), ('ok', 2), ('lot', 2), ('women', 2), ('tell', 2), ('joke', 2), ('guys', 2), ('dude', 2), ('make', 2), ('come', 2), ('love', 2), ('mom', 2), ('hey', 2), ('white', 2), ('goes', 2), ('kids', 2), ('little', 2), ('old', 2), ('mean', 2), ('wanna', 1), ('gotta', 1), ('husband', 1), ('pregnant', 1), ('god', 1), ('need', 1), ('work', 1), ('anthony', 1), ('grandma', 1), ('school', 1), ('baby', 1), ('let', 1), ('doing', 1), ('bo', 1), ('stuff', 1), ('repeat', 1), ('cos', 1), ('eye', 1), ('prolonged', 1), ('contact', 1), ('um', 1), ('sluts', 1), ('ahah', 1), ('black', 1

In [32]:
# if more than half of the comedians have a word among their top words,
# add it to the list of extra stop words.

# derive the threshold from the number of comedians (the columns of df)
# instead of hard-coding 6, so it stays correct if the line-up changes.
majority = len(df.columns) / 2
addStopWords = [word for word, count in Counter(words).most_common() if count > majority]

print(addStopWords)

['like', 'know', 'just', 'don', 'right', 'people', 'got', 'time', 'think', 'gonna', 'said']


In [34]:
# let's update the document-term matrix with the new list of stop words.
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# read the clean data.
dataClean = pd.read_pickle('dataClean.pkl')

# add new stop words.
stopWords = text.ENGLISH_STOP_WORDS.union(addStopWords)

# recreate the matrix.
# CountVectorizer documents stop_words as 'english', a list or None, and
# recent scikit-learn releases reject other containers, so the frozenset
# is converted to a list at the call site (stopWords itself is unchanged).
cv = CountVectorizer(stop_words = list(stopWords))
dataCv = cv.fit_transform(dataClean.Transcript)

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back to the old name only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    featureNames = cv.get_feature_names_out()
else:
    featureNames = cv.get_feature_names()

dataStop = pd.DataFrame(dataCv.toarray(),
                        columns = featureNames)
dataStop.index = dataClean.index

# pickle it for later; the context manager closes the file handle.
with open('cvStop.pkl', 'wb') as file:
    pickle.dump(cv, file)
dataStop.to_pickle('dtmStop.pkl')