In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)
# Load the debate transcripts; the first CSV column is used as the index.
all_df = pd.read_csv('debate_transcripts_v3_2020-02-26.csv', index_col = 0, encoding='latin1')
# Speakers to analyse; later cells iterate over this list in this order.
candidates = ['Bernie Sanders', 'Andrew Yang', 'Joe Biden', 'Amy Klobuchar', 
              'Pete Buttigieg', 'Elizabeth Warren', 'Tom Steyer', 'Michael Bloomberg']

# Print column names, dtypes and non-null counts of the loaded frame.
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5911 entries, 02-25-2020 to 06-26-2019
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   debate_name            5911 non-null   object 
 1   debate_section         5911 non-null   object 
 2   speaker                5911 non-null   object 
 3   speech                 5911 non-null   object 
 4   speaking_time_seconds  5395 non-null   float64
dtypes: float64(1), object(4)
memory usage: 277.1+ KB


In [2]:
def combine_all(df, names=None):
    """Concatenate every speech of each selected speaker into one string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``speaker`` column and a ``speech`` column of strings.
    names : iterable of str, optional
        Speakers to keep, in output order. Defaults to the notebook-level
        ``candidates`` list.

    Returns
    -------
    pandas.DataFrame
        One row per name with columns ``speaker`` and ``speech``. ``speech``
        holds that speaker's utterances joined by single spaces, or an empty
        string when the speaker never appears in ``df``.
    """
    if names is None:
        names = candidates
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {'speaker': name, 'speech': ' '.join(df.loc[df.speaker == name, 'speech'])}
        for name in names
    ]
    return pd.DataFrame(records, columns=['speaker', 'speech'])

In [3]:
# Collapse the transcript into one row per candidate with all speeches joined.
combined_data = combine_all(all_df)
# Last expression of the cell: render the combined frame.
combined_data

Unnamed: 0,speaker,speech
0,Bernie Sanders,"Well, youre right, the economy is doing reall..."
1,Andrew Yang,"First, let me say America, its great to be ba..."
2,Joe Biden,"We talk about progressive, lets talk about be..."
3,Amy Klobuchar,"Yes, and I think that what we need to do inste..."
4,Pete Buttigieg,We know what the President what Russia wants...
5,Elizabeth Warren,"Look, the way I see this is that Bernie is win..."
6,Tom Steyer,Bernie Sanders analysis is right. The differe...
7,Michael Bloomberg,Senator- I think that Donald Trump thinks it w...


In [4]:
def text_cleaning(speech):
    """Normalise one transcript string for bag-of-words analysis.

    Steps, in order:
    1. drop every non-ASCII character,
    2. lower-case the text,
    3. remove square-bracketed annotations such as ``[applause]``,
    4. remove all characters in ``string.punctuation``,
    5. remove every token that contains a digit.

    Whitespace is left untouched, so removed spans can leave double spaces.

    Parameters
    ----------
    speech : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    speech = speech.encode('ascii', errors='ignore').decode()
    speech = speech.lower()
    # The previous pattern '\[.*?\'"]' only matched text ending in '"]', so
    # bracketed annotations were never removed. Raw strings also avoid the
    # invalid-escape warnings for \[ , \w and \d.
    speech = re.sub(r'\[.*?\]', '', speech)
    speech = re.sub('[%s]' % re.escape(string.punctuation), '', speech)
    speech = re.sub(r'\w*\d\w*', '', speech)
    return speech


# Short alias used with Series.apply in the following cell.
clean = text_cleaning

In [5]:
# Apply the cleaning function to every candidate's combined speech.
data_clean = pd.DataFrame({'speaker' : combined_data.speaker , 'speech' : combined_data.speech.apply(clean)})
# Save the cleaned corpus to a pickle file.
data_clean.to_pickle('corpus_clean.pkl')
# Last expression of the cell: render the cleaned frame.
data_clean

Unnamed: 0,speaker,speech
0,Bernie Sanders,well youre right the economy is doing really g...
1,Andrew Yang,first let me say america its great to be back ...
2,Joe Biden,we talk about progressive lets talk about bein...
3,Amy Klobuchar,yes and i think that what we need to do instea...
4,Pete Buttigieg,we know what the president what russia wants ...
5,Elizabeth Warren,look the way i see this is that bernie is winn...
6,Tom Steyer,bernie sanders analysis is right the differenc...
7,Michael Bloomberg,senator i think that donald trump thinks it wo...


In [6]:
def _wordnet_pos(tag):
    """Map a POS tag from ``pos_tag`` to the WordNet code used for lemmatizing."""
    if tag.startswith('VB'):
        return 'v'
    if tag.startswith('JJ'):
        return 'a'
    # Nouns and every other tag use 'n', the lemmatizer's default.
    return 'n'


def tokenize_lemmatize(data_clean):
    """Tokenize and lemmatize each speaker's cleaned speech.

    For every row of ``data_clean`` the speech is tokenized, POS-tagged, and
    each token is lemmatized as a verb, adjective or noun according to its tag.

    Side effects: prints the row position as a progress indicator and writes
    ``vocabulary.pkl``, a frame with columns ``speaker`` and ``vocabulary``
    (the number of distinct tokens before lemmatization).

    Parameters
    ----------
    data_clean : pandas.DataFrame
        Frame with ``speaker`` and ``speech`` columns, one row per speaker.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker`` and ``speech``; ``speech`` is the space-joined
        lemmas of the corresponding input row.
    """
    lemmatizer = WordNetLemmatizer()

    lemma_rows = []
    vocabulary_rows = []
    # Speaker labels come from the frame itself, so they cannot drift out of
    # step with a separately maintained list of names.
    for i, (speaker, speech) in enumerate(zip(data_clean.speaker, data_clean.speech)):
        words = word_tokenize(speech)
        print(i)  # progress indicator
        # Tag once (the tagger was previously run twice per speaker) and join
        # once after the loop instead of re-joining on every token.
        lemma_words = [
            lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
            for word, tag in pos_tag(words)
        ]
        lemma_rows.append({'speaker': speaker, 'speech': ' '.join(lemma_words)})
        vocabulary_rows.append({'speaker': speaker, 'vocabulary': len(set(words))})

    # Build both frames in one go: DataFrame.append was removed in pandas 2.0.
    vocabulary = pd.DataFrame(vocabulary_rows, columns=['speaker', 'vocabulary'])
    vocabulary.to_pickle('vocabulary.pkl')
    return pd.DataFrame(lemma_rows, columns=['speaker', 'speech'])

In [7]:
# Lemmatize the cleaned corpus (this call also writes vocabulary.pkl).
data_lemma = tokenize_lemmatize(data_clean)
# Save the lemmatized corpus to a pickle file.
data_lemma.to_pickle('corpus_lemma.pkl')
# Last expression of the cell: render the lemmatized frame.
data_lemma

0
1
2
3
4
5
6
7


Unnamed: 0,speaker,speech
0,Bernie Sanders,well youre right the economy be do really grea...
1,Andrew Yang,first let me say america it great to be back o...
2,Joe Biden,we talk about progressive let talk about be pr...
3,Amy Klobuchar,yes and i think that what we need to do instea...
4,Pete Buttigieg,we know what the president what russia want it...
5,Elizabeth Warren,look the way i see this be that bernie be win ...
6,Tom Steyer,bernie sander analysis be right the difference...
7,Michael Bloomberg,senator i think that donald trump think it wou...


In [8]:
def create_dtm(data):
    """Build a document-term matrix from a speaker/speech frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``speaker`` and ``speech`` columns; each ``speech`` value
        is treated as one document.

    Returns
    -------
    pandas.DataFrame
        Term counts with one row per document (indexed by ``data.speaker``)
        and one column per vocabulary term, English stop words excluded.
    """
    cv = CountVectorizer(stop_words='english')
    data_cv = cv.fit_transform(data.speech)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and fall back only on releases that predate get_feature_names_out().
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out()
    else:
        terms = cv.get_feature_names()
    dtm = pd.DataFrame(data_cv.toarray(), columns=terms)
    dtm.index = data.speaker
    return dtm

In [9]:
# Document-term matrices for the cleaned and the lemmatized corpus.
data_clean_dtm = create_dtm(data_clean)
data_lemma_dtm = create_dtm(data_lemma)

In [10]:
# Render the document-term matrix built from the cleaned corpus.
data_clean_dtm

Unnamed: 0_level_0,aa,aapi,abandoned,abated,abc,aberration,abhorrent,abide,ability,able,...,youre,youth,youtube,youve,zealand,zero,zeroed,zip,zone,zuckerberg
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bernie Sanders,0,0,0,0,1,0,0,0,2,13,...,36,1,2,6,0,3,0,1,0,0
Andrew Yang,0,0,0,0,0,0,0,0,1,5,...,12,0,0,5,0,11,0,1,0,1
Joe Biden,0,0,0,1,0,1,1,0,6,88,...,22,0,0,8,0,2,1,1,3,0
Amy Klobuchar,1,0,0,0,0,0,0,0,0,24,...,16,0,0,10,0,0,0,0,0,0
Pete Buttigieg,0,0,1,0,0,1,0,0,3,18,...,42,0,0,6,1,5,0,0,2,0
Elizabeth Warren,0,0,0,0,0,0,0,1,2,19,...,17,0,0,15,0,7,0,0,0,0
Tom Steyer,0,1,1,0,0,0,0,0,2,3,...,7,1,0,1,0,0,0,0,0,0
Michael Bloomberg,0,0,0,0,0,0,0,0,1,3,...,8,0,0,0,0,1,0,0,0,0


In [11]:
# Render the document-term matrix built from the lemmatized corpus.
data_lemma_dtm

Unnamed: 0_level_0,aa,aapi,abandon,abate,abc,aberration,abhorrent,abide,ability,able,...,young,youre,youth,youtube,youve,zealand,zero,zip,zone,zuckerberg
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bernie Sanders,0,0,0,0,1,0,0,0,2,13,...,15,36,1,2,6,0,3,1,0,0
Andrew Yang,0,0,0,0,0,0,0,0,1,5,...,9,12,0,0,5,0,11,1,0,1
Joe Biden,0,0,0,1,0,1,1,0,6,88,...,9,22,0,0,8,0,3,1,3,0
Amy Klobuchar,1,0,0,0,0,0,0,0,0,24,...,0,16,0,0,10,0,0,0,0,0
Pete Buttigieg,0,0,1,0,0,1,0,0,3,18,...,2,42,0,0,6,1,5,0,2,0
Elizabeth Warren,0,0,0,0,0,0,0,1,2,19,...,7,17,0,0,15,0,7,0,0,0
Tom Steyer,0,1,1,0,0,0,0,0,2,3,...,4,7,1,0,1,0,0,0,0,0
Michael Bloomberg,0,0,0,0,0,0,0,0,1,3,...,1,8,0,0,0,0,1,0,0,0


In [12]:
# Save both document-term matrices to pickle files.
data_clean_dtm.to_pickle('data_clean_dtm.pkl')
data_lemma_dtm.to_pickle('data_lemma_dtm.pkl')