In [1]:
import pandas as pd
import unicodedata
import string
import re
import numpy as np
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from statistics import mean
plt.style.use('ggplot')


## Data Pipeline
The code below shows the following steps:
1. Importing the data from .txt files
1. Splitting the data into title, date, text as well as type of speech
1. Splitting debate transcripts by speaker
1. Scraping additional debates from Debates.org & splitting them by speaker

In [2]:
### Creates initial dataframe from .txt files
parties = ['republican', 'democrat', 'democrat', 'republican', 'democrat', 'republican', 'republican', 'republican', 'democrat', 'democrat', 'republican', 'democrat', 'republican', 'democrat']
pres = ['bush', 'carter', 'clinton', 'eisenhower', 'fdroosevelt', 'ford', 'gwbush', 'hoover', 'kennedy', 'lbjohnson', 'nixon', 'obama', 'reagan', 'truman']
war = ['war', 'nw', 'nw', 'nw', 'war', 'nw', 'war', 'nw', 'war', 'war', 'war', 'war', 'nw', 'war']
pres_dict = {}
for i, v in enumerate(pres):
    pres_dict[v]= {}
    pres_dict[v]['party'] = parties[i]

## imports .txt files and puts them into a dictionary
import os
data = []
for i, p in enumerate(pres):
    folder = 'corpus/' + p + '/'
    for filename in os.listdir(folder): 
        indiv = []
        indiv.append(p)
        indiv.append(parties[i])
        indiv.append(war[i])
        path = folder + filename
        if os.path.isfile(path) and filename.endswith(".txt"): 
            with open(path, "r") as file: 
                indiv.append(file.read())
            data.append(indiv)

df = pd.DataFrame (data, columns = ['name', 'party', 'war','speech'])

### splits the speech into title, date, text
def get_text(x):
    idx = x.find('>', x.find('>')+1, 100)
    return x[idx+1:]

df['title'] = df['speech'].apply(lambda x: x.split('"')[1])
df['date'] = df['speech'].apply(lambda x: x.split('"')[3])
df['text'] = df['speech'].apply(get_text)


### Classifies speeches as interview, debate, inaugural address, state of the union or press conference
def is_speech_type(string, speech_type):
    if string.lower().find(speech_type)>=0:
        return True
    else:
        return False


speech_type = ['interview', 'debate', 'inaugural address', 'state of the union', 'press conference']
for i in speech_type:
    df[i] = df.title.apply(lambda x: is_speech_type(x, i))

## Use this function to find parts of the debates that belong to each speaker
def find_between( s, first, last ):
    result = ''
    while True:
        try:
            start = s.index(first) + len(first)
            end = s.index( last, start )
            result += s[start:end] + ' '
            s = s[end:]
        except ValueError:
            return result

In [3]:
df.head()

Unnamed: 0,name,party,war,speech,title,date,text,interview,debate,inaugural address,state of the union,press conference
0,bush,republican,war,"<title=""Address at West Point"">\n<date=""Januar...",Address at West Point,"January 5, 1993",\nThank you all very much. Good luck. Please b...,False,False,False,False,False
1,bush,republican,war,"<title=""Address on Somalia"">\n<date=""December ...",Address on Somalia,"December 4, 1992",\nI want to talk to you today about the traged...,False,False,False,False,False
2,bush,republican,war,"<title=""Address on Iraq's Invasion of Kuwait"">...",Address on Iraq's Invasion of Kuwait,"August 8, 1990","\nIn the life of a nation, we're called upon t...",False,False,False,False,False
3,bush,republican,war,"<title=""Address Before a Joint Session of Cong...",Address Before a Joint Session of Congress,"September 11, 1990",\nMr. President and Mr. Speaker and Members of...,False,False,False,False,False
4,bush,republican,war,"<title=""Remarks at Texas A and M University"">\...",Remarks at Texas A and M University,"December 15, 1992",\nThank you all for that welcome back. Thank y...,False,False,False,False,False


In [4]:
df[df['debate']==True]

Unnamed: 0,name,party,war,speech,title,date,text,interview,debate,inaugural address,state of the union,press conference
5,bush,republican,war,"<title=""Debate with Bill Clinton and Ross Pero...",Debate with Bill Clinton and Ross Perot,"October 11, 1992","\n\n<Jim Lehrer. Good evening, and welcome to ...",False,True,False,False,False
13,bush,republican,war,"<title=""Debate with Michael Dukakis"">\n<date=""...",Debate with Michael Dukakis,"September 25, 1988",\n\n<BUSH:> I think we've seen a deterioration...,False,True,False,False,False
23,carter,democrat,nw,"<title=""Debate with President Gerald Ford"">\n<...",Debate with President Gerald Ford,"October 6, 1976",\n\n<Good evening. I am Pauline Frederick of N...,False,True,False,False,False
26,carter,democrat,nw,"<title=""Debate with President Gerald Ford"">\n<...",Debate with President Gerald Ford,"October 22, 1976","\n\n<Good evening, I am Barbara Walters, moder...",False,True,False,False,False
28,carter,democrat,nw,"<title=""Debate with President Gerald Ford"">\n<...",Debate with President Gerald Ford,"September 23, 1976","\n\n<I am Edwin Newman, moderator of this firs...",False,True,False,False,False
41,carter,democrat,nw,"<title=""Debate with Ronald Reagan"">\n<date=""Oc...",Debate with Ronald Reagan,"October 28, 1980",\n\n<MR. SMITH. The League of Women Voters is ...,False,True,False,False,False
60,clinton,democrat,nw,"<title=""Presidential Debate with Senator Bob D...",Presidential Debate with Senator Bob Dole,"October 6, 1996",\n\n<JIM LEHRER: Good evening from the Bushnel...,False,True,False,False,False
252,kennedy,democrat,war,"<title=""Debate with Richard Nixon in Washingto...","Debate with Richard Nixon in Washington, D. C.","October 7, 1960","\n\n<FRANK McGEE, MODERATOR: Good evening. Thi...",False,True,False,False,False
253,kennedy,democrat,war,"<title=""Debate with Richard Nixon in New York ...",Debate with Richard Nixon in New York and Los ...,"October 13, 1960","\n\n<BILL SHADEL, MODERATOR: Good evening. I'm...",False,True,False,False,False
260,kennedy,democrat,war,"<title=""Debate with Richard Nixon in New York""...",Debate with Richard Nixon in New York,"October 21, 1960","\n\n<QUINCY HOWE, MODERATOR: I am Quincy Howe ...",False,True,False,False,False


In [5]:
### Splits existing debate transcripts by speaker
idx = [5 ,13, 23, 26, 28, 41,60, 252, 253, 260, 265, 449, 453]
opponent = [['clinton', 'democrat'], ['dukakis', 'democrat'], ['ford', 'republican'], ['ford', 'republican'],
            ['ford', 'republican'], ['reagan', 'republican'], ['dole', 'republican'], ['nixon', 'republican'], ['nixon', 'republican'],
            ['nixon', 'republican'], ['nixon', 'republican'], ['mondale','democrat'], ['mondale','democrat']]
        
delimiters = [['Bush.>', '>', '<Governor Clinton.', '>'], ['BUSH:>', '<', '<DUKAKIS:','>'], 
              ['MR. CARTER.>', '<', '<THE PRESIDENT.', '>'], ['MR. CARTER.>', '<', '<THE PRESIDENT.', '>'], 
              ['MR. CARTER.>', '<', '<THE PRESIDENT.', '>'],['THE PRESIDENT.>', '<', '<GOVERNOR REAGAN.', '>'],
             ['<PRESIDENT CLINTON:>', '<', '<SENATOR DOLE:', '<'],['<MR. KENNEDY:>', '<', '<MR. NIXON:', '>'],
              ['<MR. KENNEDY:>', '<', '<MR. NIXON:', '>'], ['<MR. KENNEDY:>', '<', '<MR. NIXON:', '>'], 
              ['<MR. KENNEDY:>', '<', '<MR. NIXON:', '>'], ['<The President.>', '<', '<Mr. Mondale.', '>'],
              ['<PRESIDENT REAGAN:>', '<', '<MR. MONDALE:', '>']]
opponents = []
for i in range(len(idx)):
    text = df.loc[idx[i], 'text']
    df.loc[idx[i],'text'] = find_between(df.loc[idx[i],'text'], delimiters[i][0], delimiters[i][1])
    opponent_text = find_between(text, delimiters[i][2], delimiters[i][3])
    #print(f'for {idx[i]}, delimited opponent {opponent[i]} between {delimiters[i][2]} and {delimiters[i][3]}')
    opponents.append(opponent[i] + [0, 0, 0, 0] + [opponent_text] + [False, True, False, False, False])
opponents = pd.DataFrame(data = opponents, columns = df.columns)
df = pd.concat([df, opponents])

In [6]:
### scrapes debate transcripts from Debates.org and splits by speaker
import pprint
# Requests sends and recieves HTTP requests.
import requests

# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

url = 'https://www.debates.org/voter-education/debate-transcripts/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')

classes = []
class_num = [i for i in range(100, 141)]
for v in [138, 139, 133, 140, 135, 136, 137, 134, 130, 132, 129, 127, 121, 122, 118, 101]:
    class_num.remove(v)

for i in class_num:
    classes.append("page_item page-item-"+str(i))

debate_names = [['kaine', 'democrat', 'pence', 'republican'], ['obama', 'democrat', 'romney', 'republican'],
                ['obama', 'democrat', 'romney', 'republican'], ['biden', 'democrat', 'ryan', 'republican'],
               ['obama', 'democrat', 'romney', 'republican'], ['mccain', 'republican', 'obama', 'democrat'],
               ['biden', 'democrat', 'palin', 'republican'], ['mccain', 'republican', 'obama', 'democrat'],
               ['mccain', 'republican', 'obama', 'democrat'], ['bush', 'republican', 'kerry', 'democrat'],
               ['bush','republican','kerry','democrat'], ['cheney', 'republican', 'edwards', 'democrat'],
               ['bush', 'republican', 'kerry', 'democrat'], ['gore', 'democrat', 'bush', 'republican'],
                ['lieberman', 'democrat', 'cheney', 'republican'], ['gore', 'democrat', 'bush', 'republican'],
                ['gore', 'democrat', 'bush', 'republican'], ['gore', 'democrat', 'kemp', 'republican'],
                ['clinton', 'democrat', 'dole', 'republican'], ['gore', 'democrat', 'quayle', 'republican'],
                ['clinton', 'democrat', 'bush', 'republican'], ['clinton', 'democrat', 'bush', 'republican'],
                ['bush', 'republican', 'dukakis', 'democrat'], ['bentsen', 'democrat', 'quayle', 'republican'],
                ['bush', 'republican','ferraro', 'democrat']]

### Gets text from each speech
links = []
for i in classes:
    x = soup.find_all(class_= i)
    links.append('http://debates.org/'+ str(x).split('"')[3])


results = []
for i, link in enumerate(links):
    req = requests.get(link)
    soup_link = BeautifulSoup(req.content, 'html.parser')
    text = soup_link.get_text()
    text = text.lower()
    for v in [0, 2]:
        text_1 = find_between(text, '\n'+debate_names[i][v]+":", ':')
        results.append([debate_names[i][v], debate_names[i][v+1]] + [0, 0, 0, 0] + [text_1] + [False, True, False, False, False])
for i in [18, 20, 24, 27, 31, 33]:
     results[i][0]=results[i][0].replace("bush", "gwbush")
more_debates = pd.DataFrame(data = results, columns = df.columns)
df = pd.concat([df, more_debates])


KeyboardInterrupt: 

In [None]:
df = df.reset_index()

In [None]:
pd.index

## EDA!

In [None]:
df.head()

In [None]:
df.count()

The 549 texts included 16 Democrats (7 presidents, 9 debate candidates) and 15 Republicans (7 presidents, 8 debate candidates).

In [None]:
print('Speeches by president')
print(df.name.value_counts())
print('')
print('speeches by party')
print(df.party.value_counts())

Speeches by speaker

In [None]:
df['name'].value_counts().plot(kind='bar')

LBJ had the most speeches, which lead me to wonder if these were disproportionately press conferences due to the Vietnam War.  It turns out that yes, he had the most press conferences in the corpus, 23 in total.

In [None]:
df[df['press conference']== True]['name'].value_counts().plot(kind='bar')

The table below shows the breakdown of speech type by speaker.

In [None]:
df.groupby('name').sum()

In [None]:
df['text length'] = df.text.apply(lambda x: len(x.split(' ')))

The median speech length was 3,176 words, which assuming 150 words/minute means 21.17 minutes.

In [None]:
df['text length'].median()/150

In [None]:
df[df['party'] == 'democrat']['text length'].sum()

In [None]:
df[df['party'] == 'republican']['text length'].sum()

In [None]:
df.sort_values(by='text length', ascending=True)

The shortest text was FDR's declaration of war on Germany and Italy.

In [None]:
df.iloc[120].text

In [None]:
vocab = set()
total = []
df['text'].str.lower().str.split().apply(vocab.update)
df['text'].str.lower().str.split().apply(total.append)
total = [item for sublist in total for item in sublist]
print(f'The vocabulary of the corpus is {len(vocab)} unique words & {len(total)} total words.')

In [None]:
fig, ax = plt.subplots()
df[df['party'] == 'democrat']['text length'].plot(kind = 'hist', bins = 20, density = 'true', alpha = .5, label = 'democrat')
df[df['party'] == 'republican']['text length'].plot(kind = 'hist', bins = 20, density = 'true', alpha = .5, label = 'republican')
ax.legend()
ax.set_title('Word Count by Party')

In [None]:
df[df['text length']==0]

# Testing and Feature Engineering

This section includes:
1. The initial Multinomial Bayes Model and the feature engineering used in optimization
1. The Multinomial Naive Bayes Model used to predict whether a text is from a Democrat or a Republican
1. The top tri-grams unique to each party (by log probability)
1. The optimized Random Forest Classifier that ultimately was not used

In [None]:
sw = stopwords.words('english')

new_sw = ['also', '000', 'see', 'come', 'day','say', 'us', 'well', 'le', "'m", 'made', 'applause', 'mr', 'men', 'candidate', 'instead', 'q', 'part', "'re", "'le", 'get', 'man', "'ve", 'let', 'every', 'may', 'upon', 'shall', 'going','ve','year', '--', '``', "'s", "''", 'way', 'cannot', 'two', 'long', 'said', 'like', 'years', 'think','would',"n't", 'must','u','many','one','first','last',"'re'", 'today','', 'ha', 'wa']
for i in new_sw:
    sw.append(i)



In [None]:
## returns the top features for dems republications (by log probability), as well as the features that are unique to 
#each party in the top num_features
def get_top_features(feature_logs, labels, num_features):
    zero_all = np.array(labels)[np.argsort(feature_logs[0])[-1*num_features:]]
    one_all = np.array(labels)[np.argsort(feature_logs[1])[-1*num_features:]]
    zero_unique = [x for x in zero_all if x not in one_all]
    one_unique = [x for x in one_all if x not in zero_all]
    return zero_all, one_all, zero_unique, one_unique


In [None]:
## Initial model has 25% recall for republicans and a cross validation score of .66
## Naive Bayes Model
def run_model_initial(X, y, max_feat = 200, stop_words = sw):
    tfidf = TfidfVectorizer(stop_words = sw, ngram_range = (1,3), max_features = max_feat)
    X = tfidf.fit_transform(X)
    labels = tfidf.get_feature_names()
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 5)
# %
    clf = MultinomialNB()
    clf = clf.fit(X_train, y_train)
    cvs = np.mean(cross_val_score(clf, X_train, y_train))
    return clf, cvs, X_test, y_test, labels, X, tfidf
    
clf, cvs, X_test, y_test, labels, X, tfidf = run_model_initial(df.text, df.party, stop_words = sw)

td, fr, fd, tr = confusion_matrix(y_test, clf.predict(X_test)).ravel()
print(f'True Dems: {td}, False Republicans: {fr}, False Dems: {fd}, True Republicans: {tr}')
print(f'Recall for Democrats: {td/(td + fr)}')
print(f'Recall for Republicans: {tr/(fd + tr)}')
print(f'Cross-validation score: {cvs}')

zero, one, zero_unique, one_unique = get_top_features(clf.feature_log_prob_, labels, 50)

print(" ")
print(f'Dem unique: {zero_unique}')
print(f'Rep unique: {one_unique}')

I looked at the tradeoff between accuracy and recall for each party, ultimately deciding to work within a range of 1000 to 1500 features

In [None]:
feat = np.arange(500, 10000, 250)
acc = []
recall_dems = []
recall_rep = []
for i in feat:
    clf, cvs, X_test, y_test, _, _, _ = run_model_initial(df.text, df.party, max_feat = i, stop_words = sw)
    td, fr, fd, tr = confusion_matrix(y_test, clf.predict(X_test)).ravel()
    recall_d = td/(td + fr)
    recall_r = tr/(fd + tr)
    acc.append(cvs)
    recall_dems.append(recall_d)
    recall_rep.append(recall_r)


In [None]:
fig, ax = plt.subplots(3,1)

ax[0].plot(feat, acc)
ax[0].set_xlabel('Features')
ax[0].set_ylabel('Accuracy')
ax[1].plot(feat, recall_dems)
ax[1].set_xlabel('Features')
ax[1].set_ylabel('Recall (Dem)')
#ax[0].scatter([feat[np.argsort(acc)[-5:]]], acc[np.argsort(acc)[-5:]])
#ax[0].title('Features vs Accuracy')
ax[2].plot(feat, recall_rep)
ax[2].set_xlabel('Features')
ax[2].set_ylabel('Recall (Rep)')
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

As the features grew, the model began to classify more people as Democrats.

The initial ROC curve showed an AUC of .870

In [None]:
clf, cvs, X_test, y_test, _, _, _ = run_model_initial(df.text, df.party, max_feat = 1500, stop_words = sw)

In [None]:
td, fr, fd, tr = confusion_matrix(y_test, clf.predict(X_test)).ravel()
print(f'True Dems: {td}, False Republicans: {fr}, False Dems: {fd}, True Republicans: {tr}')
print(f'Recall for Democrats: {td/(td + fr)}')
print(f'Recall for Republicans: {tr/(fd + tr)}')

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# generate a no skill prediction (majority class)
y_test_coded = []
for i in y_test:
    if i == 'democrat':
        y_test_coded.append(1)
    else:
        y_test_coded.append(0)
    
    
ns_probs = [0 for _ in range(len(y_test))]
mnb_probs = clf.predict_proba(X_test)
# keep probabilities for being a republican only (column 1), democrat (column 0)
mnb_probs = mnb_probs[:, 0]
# calculate scores
ns_auc = roc_auc_score(y_test_coded, ns_probs)
mnb_auc = roc_auc_score(y_test_coded, mnb_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (mnb_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test_coded, ns_probs)
mnb_fpr, mnb_tpr, _ = roc_curve(y_test_coded, mnb_probs)
# plot the roc curve for the model
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(mnb_fpr, mnb_tpr, marker='.', label='Multinomial Bayes')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

Credit for the ROC code goes to https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

Next I graphed accuracy as a function of different thresholds

In [None]:
y_test_array = y_test.to_numpy()

In [None]:
from statistics import mean
dem_prob = clf.predict_proba(X_test)[:,0]
thresh = np.arange(0,1, .02)
y_test
thresh_acc =[]

for t in thresh:
    correct = []
    for i,v in enumerate(dem_prob):
        if v >= t and y_test_array[i] =='democrat':
            correct.append(1)
        elif v<t and y_test_array[i] == 'republican':
            correct.append(1)
        else:
            correct.append(0)
    thresh_acc.append(mean(correct))

fig, ax = plt.subplots()
ax.plot(thresh, thresh_acc)
ax.set_xlabel('Probability Threshold for Classifying as Democrat')
ax.set_ylabel('Accuracy')
    

It appears that the optimal threshold is between .56 and .64; however, once I changed from 1-3grams to just trigrams in my model, I found that the best threshold was .54.

This is my main model with 1500 features, trigrams only, special stopwords, and a .54 threshold for classification as Democrat.

In [None]:
## Naive Bayes Model
def run_model(X, y, max_feat = 1500, stop_words = sw):
    tfidf = TfidfVectorizer(stop_words = sw, ngram_range = (3,3), max_features = max_feat)
    X = tfidf.fit_transform(X)
    labels = tfidf.get_feature_names()
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 5)
# %
    clf = MultinomialNB()
    clf = clf.fit(X_train, y_train)
    cvs = np.mean(cross_val_score(clf, X_train, y_train))
    return clf, cvs, X_test, y_test, labels, X, tfidf
    
clf, cvs, X_test, y_test, labels, X, tfidf = run_model(df.text, df.party, stop_words = sw)

The confusion matrix is broken down below:

In [None]:
td, fr, fd, tr = confusion_matrix(y_test, clf.predict(X_test)).ravel()
print(f'True Dems: {td}, False Republicans: {fr}, False Dems: {fd}, True Republicans: {tr}')
print(f'Recall for Democrats: {td/(td + fr)}')
print(f'Recall for Republicans: {tr/(fd + tr)}')
print(f'Cross-validation score (using .5 threshold): {cvs}')

In [None]:
### Evaluates accuracy using the .54 threshold
dem_prob = clf.predict_proba(X_test)[:,0]
correct = []
idx = list(y_test.index)
t = .54
print(y_test.shape)

y_test_array = y_test.to_numpy()
for i,v in enumerate(dem_prob):
    if v >= t and y_test_array[i] =='democrat':
        correct.append(1)
        df.loc[idx[i], 'accurate'] = True
    elif v<t and y_test_array[i] == 'republican':
        correct.append(1)
        df.loc[idx[i], 'accurate'] = True
    else:
        correct.append(0)
        df.loc[idx[i], 'accurate'] = False
print(f'Accuracy using threshold = {t}: {mean(correct)}')

In [None]:
## Final ROC curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# generate a no skill prediction (majority class)
y_test_coded = []
for i in y_test:
    if i == 'democrat':
        y_test_coded.append(1)
    else:
        y_test_coded.append(0)
    
    
ns_probs = [0 for _ in range(len(y_test))]
mnb_probs = clf.predict_proba(X_test)
# keep probabilities for being a republican only (column 1), democrat (column 0)
mnb_probs = mnb_probs[:, 0]
# calculate scores
ns_auc = roc_auc_score(y_test_coded, ns_probs)
mnb_auc = roc_auc_score(y_test_coded, mnb_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (mnb_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test_coded, ns_probs)
mnb_fpr, mnb_tpr, _ = roc_curve(y_test_coded, mnb_probs)
# plot the roc curve for the model
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(mnb_fpr, mnb_tpr, marker='.', label='Multinomial Bayes')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

## Random Forest

In [None]:
### fitted with grid search results
from sklearn.ensemble import RandomForestClassifier
overall_acc = []
# for i in range(10):
#     X_train, y_train, X_test, y_test = train_test_split(df.text, df.party, stratify = df.party, random_state = i)
#     rf = RandomForestClassifier()
#     rf = rf.fit(X_train, y_train)
#     acc = mean(cross_val_score(rf, X_train, y_train))
#     overall_acc.apend(acc)

# print(mean(overall_acc))

tfidf = TfidfVectorizer(stop_words = sw, ngram_range = (1,3), max_features = 1500)
X = tfidf.fit_transform(df.text)
labels = tfidf.get_feature_names()
X_train, X_test, y_train, y_test = train_test_split(X, df.party, stratify = df.party, random_state = 5)
rf = RandomForestClassifier(n_estimators = 600, min_samples_split = 5, min_samples_leaf = 1, max_features = 'sqrt',
                           max_depth = 115, bootstrap = False)
rf = rf.fit(X_train, y_train)
acc = mean(cross_val_score(rf, X_train, y_train))

pred = rf.predict(X_test)
dem_prob_rf = rf.predict_proba(X_test)


In [None]:
## Finding the best threshold for random forest
y_test_array = y_test.to_numpy()
dem_prob = rf.predict_proba(X_test)[:,0]
thresh = np.arange(0,1, .02)
thresh_acc =[]

for t in thresh:
    correct = []
    for i,v in enumerate(dem_prob):
        if v >= t and y_test_array[i] =='democrat':
            correct.append(1)
        elif v<t and y_test_array[i] == 'republican':
            correct.append(1)
        else:
            correct.append(0)

    thresh_acc.append(mean(correct))

fig, ax = plt.subplots()
ax.plot(thresh, thresh_acc)
print(f'The best threshold is {thresh[np.argsort(thresh_acc)[-1:]]}')

In [None]:
## Accuracy using .54 threshold
y_test_array = y_test.to_numpy()
correct = []
t = .56
for i,v in enumerate(dem_prob_rf[:, 0]):
    if v >= t and y_test_array[i] =='democrat':
        correct.append(1)
    elif v<t and y_test_array[i] == 'republican':
        correct.append(1)
    else:
        correct.append(0)
print(f'Accuracy: {mean(correct)}')

In [None]:
important = rf.feature_importances_
idx = np.argsort(important)[-20:]

np.array(labels)[idx]

In [None]:
## using randomized search to find best hyperparameters(used above)
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 220, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

Code from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

## Results

In this section I examine:
1. The top Democratic and Republican trigrams
1. The most Republican and most Democratic speeches in the test set by log probabilities
2. The breakdown of the incorrectly/correctly identified speeches in the test set

In [None]:
zero, one, zero_unique, one_unique = get_top_features(clf.feature_log_prob_, labels, 50)

print(f'Top Democratic Trigrams: {zero_unique}')
print(" ")
print(f'Top Republican Trigrams: {one_unique}')

In [None]:
prob_dem_list = []
sorted_idx = np.argsort(clf.predict_proba(X_test)[:, 0])
                        
for i,v in enumerate(list(clf.predict_proba(X_test))):
    prob = v[0]
    idx = list(y_test.index)[i]
    prob_dem_list.append([prob, idx])

The top ten most Republican speeches

In [None]:
df.iloc[np.array(prob_dem_list)[sorted_idx][:10,1]]

Top ten most Democratic speeches

In [None]:
df.iloc[np.array(prob_dem_list)[sorted_idx][-10:,1]]

Looking at correct / Incorrect data

In [None]:
df['accurate'] = 'na'
pred = []
idx = list(y_test.index)
for i, v in enumerate(list(y_test)):
    if v == clf.predict(X_test)[i]:
        df.loc[idx[i], 'accurate'] = True
    else:
        df.loc[idx[i], 'accurate'] = False

In [None]:
## Updates DF with whether the speech was accurately tagged

In [None]:
incorrect = df[df['accurate']==False]
incorrect.party.value_counts()

In [None]:
incorrect.debate.value_counts()

In [None]:
correct = df[df['accurate']==True]

In [None]:
correct.debate.value_counts()

In [None]:
df.debate.value_counts()

In [None]:
incorrect.loc[:,['name', 'title']]

## Insights

In this section I:

1. look at log probabilities to find the most "democratic" and "republican" speeches
2. Examine the speeches that were incorrectly classified

In [None]:
def get_most_partisan(clf, X_test, y_test):
    prob_dem_list = []
    sorted_idx = np.argsort(clf.predict_proba(X_test)[:, 0])
                        
    for i,v in enumerate(list(clf.predict_proba(X_test))):
        prob = v[0]
        idx = list(y_test.index)[i]
        prob_dem_list.append([prob, idx])
    most_rep_idx = np.array(prob_repub_list)[sorted_idx][:5,1]
    most_dem_idx = np.array(prob_dem_list)[sorted_idx][-5:,1]
    return most_rep_idx, most_dem_idx

most_rep, most_dem = get_most_partisan(clf, X_test, y_test)
print('Most Republican')
print(df.loc[most_rep, ['name', 'title']])
print('Most Democrat')
print(df.loc[most_dem, ['name', 'title']])

LBJ appears in 4 of the top 5 spots.

Republicans: Reagan, GWBush (x2), Hoover, Bush Senior.

In [None]:
zero, one, zero_unique, one_unique = get_top_features(clf.feature_log_prob_, labels, 100)

print(f'zero: {zero}')
print(f'one: {one}')
print(f'zero unique: {zero_unique}')
print(f'one unique: {one_unique}')

In [None]:
sw = stopwords.words('english')

new_sw = ['also', 'day','two', 'even', 'dont',"'ll", 'say', 'want', 'time', 'come', 'make', 'thing', 'le', "'m", 'go', 'thats', 'could', 'like', 'made', 'applause', 'mr', 'men', 'candidate', 'instead', 'q', 'part', "'re", "'le", 'get', 'man', "'ve", 'let', 'every', 'may', 'upon', 'shall', 'going','ve','year', '--', '``', "'s", "''", 'way', 'would',"n't", 'must','u','many','one','first','last',"'re'", 'today','', 'ha', 'wa']
for i in new_sw:
    sw.append(i)


In [None]:
### Build confusion matrix:

In [None]:
print(clf.predict(X_test))
print(y_test)

## Looking at correct/incorrect data

In [None]:
df['accurate'] = 'na'
pred = []
idx = list(y_test.index)
for i, v in enumerate(list(y_test)):
    if v == clf.predict(X_test)[i]:
        df.loc[idx[i], 'accurate'] = True
    else:
        df.loc[idx[i], 'accurate'] = False

In [None]:
## Updates DF with whether the speech was accurately tagged

In [None]:
incorrect = df[df['accurate']==False]
incorrect.party.value_counts()

In [None]:
incorrect.debate.value_counts()

In [None]:
correct.debate.value_counts()

In [None]:
correct.party.value_counts()

In [None]:
df.debate.value_counts()

In [None]:
correct = df[df['accurate']==True]

In [None]:
counts_wrong = incorrect['name'].value_counts()

In [None]:
counts_right = correct['name'].value_counts()

In [None]:
right_wrong = pd.concat([counts_wrong, counts_right], axis = 1)
right_wrong = right_wrong.fillna(0)
right_wrong['correct %'] = (1 - right_wrong.iloc[:, 0]/(right_wrong.iloc[:, 0] + right_wrong.iloc[:, 1]))*100


Below is the percent correct for each speaker

In [None]:
right_wrong

Below are the incorrectly identified speeches

In [None]:
incorrect

## Analyizing Trump's speeches

In [None]:
## loading Trump's 2016 campaign speeches
with open('corpus/trump_speech/trump.txt', 'r') as file:
    data = file.read().replace('\n', '')
trump_speeches = data.split("SPEECH")

In [None]:
### loading Trump's 2020 Republican National Convention Speech

politico = 'https://www.politico.com/story/2016/07/full-transcript-donald-trump-nomination-acceptance-speech-at-rnc-225974'

req = requests.get(politico)
soup_link = BeautifulSoup(req.content, 'html.parser')
text = soup_link.get_text()
text = text.lower()

speech= text.split("politico thursday afternoon")

trump_speeches.append(speech[2].split("follow @ politico")[0])

In [None]:
trump_series = pd.Series(trump_speeches)

In [None]:
trump_vector = tfidf.fit_transform(trump_series)

print(clf.predict_proba(trump_vector)[:,0])

Using a .54 threshold, Trump's speeches are classified as Republican only 6 out of 13 times.