In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.deprecated.doc2vec import LabeledSentence
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn import metrics
#from sklearn.grid_search import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.multiclass import unique_labels

from collections import Counter
import matplotlib.pyplot as plt
import string
import re
import random
import glob
import itertools

import json

# Seed NumPy's global RNG and the stdlib `random` module with the same fixed value.
np.random.seed(1234)
random.seed(1234)

# Stemmer instance that is passed to `text_cleaner` when the speeches are cleaned.
porter = PorterStemmer()
# Translation table for `str.translate`: every ASCII punctuation character is
# deleted (mapped to None), except the hyphen, which becomes an underscore.
punctuation_dictionary = dict.fromkeys(string.punctuation)
punctuation_dictionary["-"] = "_"
punctuation_translator = str.maketrans(punctuation_dictionary)

def text_cleaner(text, punctuation_translator, stemmer):
    """Normalise one document: strip punctuation, lowercase, then stem.

    Parameters
    ----------
    text : object
        Raw document; it is converted with ``str()`` before processing.
    punctuation_translator : dict
        Table suitable for ``str.translate``.
    stemmer : object
        Any object exposing ``stem(str) -> str``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).translate(punctuation_translator)
    text = text.lower()
    # Use the stemmer supplied by the caller; the previous version ignored
    # this argument and silently relied on a module-level `porter` object.
    # NOTE(review): `stem` receives the whole document as ONE string. If
    # per-token stemming is intended, confirm against the stemmer's API.
    text = stemmer.stem(text)
    return text

  from numpy.core.umath_tests import inner1d


## Load Speech & Clean

In [40]:
# Read the coded sub-speech table from CSV and preview its first rows.
SubSpeech = pd.read_csv('SubSpeech_coded.csv')
SubSpeech.head()

Unnamed: 0.1,Unnamed: 0,Speech_index,SubContent,Subspeech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day,Pop
0,0,Speech0,"['', ""PRESIDENT DONALD TRUMP: Thank you, thank...",Speech0_0:10,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,
1,1,Speech0,['The unemployment rate just hit the lowest le...,Speech0_10:20,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,
2,2,Speech0,"[""And I'll tell you, a little—a little tricky ...",Speech0_20:30,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,
3,3,Speech0,"['You are here, he is here to help elect Cindy...",Speech0_30:40,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,
4,4,Speech0,"[""What a great crowd we have tonight for you. ...",Speech0_40:50,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,


In [41]:
# Clean every sub-speech; the translator table and stemmer are forwarded to
# `text_cleaner` as its extra positional arguments.
SubSpeech["clean_text"] = SubSpeech["SubContent"].apply(
    text_cleaner, args=(punctuation_translator, porter)
)

In [16]:
#SubSpeech.head()

In [42]:
# Whitespace-tokenise the cleaned documents once and reuse the token lists.
tokenised_docs = [doc.split() for doc in SubSpeech["clean_text"].tolist()]

# Two stacked phrase models: the second is trained on the output of the first.
phrases1 = Phrases(tokenised_docs)
phrases2 = Phrases(phrases1[tokenised_docs])

# Apply both models to each document and store the result as a single string.
SubSpeech["phrased_text"] = [
    " ".join(phrases2[phrases1[tokens]]) for tokens in tokenised_docs
]

## Doc2Vec Model

In [43]:
# (phrased text, sub-speech tag) pairs, in row order.
docs = [
    (phrased, tag)
    for phrased, tag in zip(SubSpeech["phrased_text"].tolist(),
                            SubSpeech["Subspeech_index"].tolist())
]


## Define an iterator to feed documents and tags to Doc2Vec
class Sentences(object):
  """Re-iterable wrapper that turns (text, tag) records into TaggedDocuments.

  Each record's first element is converted with ``str()`` and split on
  whitespace to form the words; its second element becomes the single tag.
  """

  def __init__(self, docs):
    # Keep a reference to the records; nothing is copied or converted here.
    self.docs = docs

  def __iter__(self):
    # A fresh generator is produced on every call, so the corpus can be
    # iterated more than once.
    for record in self.docs:
      tokens = str(record[0]).split()
      tag = record[1]
      yield TaggedDocument(words=tokens, tags=[tag])

## Train the Doc2Vec model on the tagged documents.
## dm=0 with dbow_words=1 is passed, so word vectors are requested alongside
## the document vectors.
## NOTE(review): no `seed`/`workers` arguments are passed, so training may not
## be repeatable run-to-run -- confirm against the gensim documentation.
model = Doc2Vec(
    Sentences(docs),
    vector_size=100,
    window=10,
    min_count=5,
    negative=10,
    epochs=20,
    dm=0,
    dbow_words=1,
)

In [44]:
# Export the model's word vectors to a text (non-binary) word2vec-format file.
model.wv.save_word2vec_format("doc2vec_wordvecs.txt", binary=False)

In [48]:
# Sanity check: nearest neighbours of 'washington' among the word vectors.
model.wv.most_similar('washington')

[('washington_dc', 0.668540358543396),
 ('game_playing', 0.6560088992118835),
 ('washington_insiders', 0.6413161754608154),
 ('influence_peddling', 0.6388410329818726),
 ('politics', 0.6355743408203125),
 ('politicians', 0.634049654006958),
 ('special_interests', 0.6225845813751221),
 ('change', 0.620872974395752),
 ('wont_do', 0.6165003776550293),
 ('lobbyists', 0.6142523288726807)]

In [52]:
# Most similar phrases to one hand-coded populist ("accept") document.
# sorted() pins the order of the tags: the iteration order of a set of strings
# changes between interpreter sessions, so indexing into list(set(...)) picked
# a different document on every fresh kernel.
populist_index = sorted(set(SubSpeech.loc[SubSpeech['Pop'] == "accept", "Subspeech_index"].tolist()))
print(len(populist_index))
i = 3

# Show the phrased text of the selected document ...
value = SubSpeech.loc[SubSpeech['Subspeech_index'] == populist_index[i], "phrased_text"].tolist()
print(value)
# ... and query the word vectors with that document's own vector.
model.wv.most_similar([model.docvecs[populist_index[i]]])

42
['we will repeal and replace_disastrous_obamacare president_obama promised his plan would reduce premiums_by_2500 dollars instead they surged 5000 our replacement plan includes expanded access to healthcare savings_accounts with support for those_who need it it includes allowing americans to buy health_insurance across state lines in all 50_states creating a dynamic and competitive new market – they will be competing for your business were also going to block_grant medicaid so states can develop innovative_solutions to make_sure no citizen in poverty ever falls through the cracks high_risk pools will also help to ensure that those with_pre_existing_conditions will always get the quality coverage they need on trade we are going to end the international abuse the foreign_cheating and the one_sided rules that govern nafta and the world_trade_organization right_now america eliminates its tariffs but then other_countries tax our goods with backdoor tariffs and close their markets our mas

[('global_special_interests', 0.5951030850410461),
 ('allow_medicare', 0.5717523097991943),
 ('foreign_cheating', 0.55609130859375),
 ('empowering', 0.5533382892608643),
 ('instantly', 0.541082501411438),
 ('access', 0.527299165725708),
 ('lower_income', 0.5212934613227844),
 ('timely_access', 0.5169598460197449),
 ('their_prescription_drugs', 0.5145695805549622),
 ('healthcare', 0.5127156972885132)]

## Classification

In [53]:
# Re-read the coded table from disk. `SubSpeech` is rebound here, so the
# `clean_text` / `phrased_text` columns added earlier are no longer on it.
SubSpeech = pd.read_csv('SubSpeech_coded.csv')

In [None]:
# The Doc2Vec `model` trained earlier in this notebook is reused by the
# classification cells below, so nothing is loaded here.
# The statement that used to live in this cell (`model = Word2vec.load()`)
# could not run: `Word2vec` is not a defined name (the imported class is
# `Word2Vec`) and `load()` was called without a path.
# NOTE(review): this notebook only exports word vectors as text; if the full
# model should be restored from disk, save it first and load it with a path.

In [54]:
# Recode the "accept" label as 1. The result is assigned back to the column:
# calling replace(..., inplace=True) on `SubSpeech.Pop` is chained assignment
# and does not update the frame when pandas Copy-on-Write is active.
SubSpeech["Pop"] = SubSpeech["Pop"].replace('accept', 1)

In [55]:
# Recode the "reject" label as 0. The result is assigned back to the column:
# calling replace(..., inplace=True) on `SubSpeech.Pop` is chained assignment
# and does not update the frame when pandas Copy-on-Write is active.
SubSpeech["Pop"] = SubSpeech["Pop"].replace('reject', 0)

In [63]:
# Count how many rows carry each label value after recoding.
SubSpeech.Pop.value_counts()

0.0    226
1.0     42
Name: Pop, dtype: int64

In [56]:
# Keep only the rows whose label is 0 or 1 and reduce them to one (mean)
# label per sub-speech tag.
has_label = SubSpeech['Pop'].isin([0,1])
labelled_data = (
    SubSpeech.loc[has_label, ["Subspeech_index","Pop"]]
    .groupby(["Subspeech_index"])
    .mean()
)
print(labelled_data.head())
print(labelled_data.shape)

# Features: the document vector stored in the model under each tag.
# Targets: the per-tag label cast to integer.
labelled_tags = labelled_data.index.tolist()
X = np.asarray([model.docvecs[tag] for tag in labelled_tags])
Y = np.asarray(labelled_data['Pop'].tolist(), dtype="int")

# ## Get the words most closely associated with all of the "populist" articles
# for k, v in model.most_similar(model.docvecs[populist_indices], topn=50):
#   print(k) 

# Created here but not referenced by any later cell visible in this notebook.
kf = KFold(n_splits=5)
print(X.shape)
print(Y)

                   Pop
Subspeech_index       
Speech110_0:10     0.0
Speech110_100:120  0.0
Speech110_10:20    0.0
Speech110_20:30    1.0
Speech110_30:40    1.0
(268, 1)
(268, 100)
[0 0 0 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0
 0 0 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0]


In [60]:
# Fit a calibrated random forest on a random train/test split of the labelled
# document vectors, draw the ROC curve on the held-out rows and report
# AUC / accuracy. Later cells reuse `gbc` and `test_set`.
auc_scores_d2v = []
accuracy_scores_d2v = []

# Re-seed so the random split below is the same whenever this cell is re-run.
np.random.seed(1234) 
random.seed(1234)

# Style settings are applied before the figure is created so that the
# figure-level ones take effect on it.
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['savefig.transparent'] = 'false'

plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
# gca() returns the axes the limits were just set on. A bare plt.axes() can
# add a brand-new axes on top of it, discarding those limits.
plt.gca().set_aspect('equal')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
#plt.title('ROC curve', fontsize=16)
for i in range(1):
    ## Create a test and train set (80 random held-out rows, the rest for training)
    test_size = 80
    test_set = random.sample(range(0,len(Y)), test_size)
    train_set = list(set(list(range(0,len(Y)))) - set(test_set))
    # Index arrays are built once and reused for every slice below.
    test_idx = np.asarray(test_set, dtype="int")
    train_idx = np.asarray(train_set, dtype="int")
    ## Initialize a random forest classifier (the variable keeps its
    ## historical name `gbc` because later cells refer to it)
    gbc = RandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0, class_weight="balanced")
    # gbc = GradientBoostingClassifier(loss="deviance",
    #   learning_rate=0.1,
    #   n_estimators=20000,
    #   subsample=1.0,
    #   min_samples_split=2,
    #   min_samples_leaf=1,
    #   max_depth=4,
    #   init=None,
    #   random_state=None,
    #   max_features=None,
    #   verbose=0)
    ## Wrap the forest in isotonic probability calibration (2-fold)
    gbc = CalibratedClassifierCV(gbc, cv=2, method="isotonic")
    ## Fit the model to the training set
    gbc.fit(X[train_idx], Y[train_idx])
    ## Predict out-of-sample on the test set and compute AUC (once)
    preds = gbc.predict_proba(X[test_idx])
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(Y[test_idx], preds[:,1], pos_label=1)
    auc_d2v = metrics.auc(fpr_d2v, tpr_d2v)
    auc_scores_d2v.append(auc_d2v)
    plt.plot(fpr_d2v, tpr_d2v, lw=2, linestyle='--', label="AUC:" + str(auc_d2v)[0:4],color='#fd8d3c')
    print("AUC: "+str(auc_d2v))
    accuracy_d2v = metrics.accuracy_score(Y[test_idx], gbc.predict(X[test_idx]), normalize=True)
    accuracy_scores_d2v.append(accuracy_d2v)
    print("Accuracy: " + str(accuracy_d2v))
leg = plt.legend(framealpha = 0,loc='lower right', fontsize=13)
for text in leg.get_texts():
    plt.setp(text, color = 'black')
# NOTE(review): savefig does not create the 'images' directory -- it is
# assumed to exist already.
plt.savefig('images/d2v_rocs.png')
# vocab = model.wv.vocab.keys()
# vectors = [model[v] for v in vocab]
# vectors = np.asarray(vectors)

# word_preds = gbc.predict_proba(vectors)
# word_scores = dict(zip(vocab, word_preds[:,1].tolist()))
# sorted_keys = sorted(word_scores, key=word_scores.get, reverse=True)
# for r in sorted_keys[0:100]:
#     print(str(r))
print("Mean AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))

AUC: 0.899540757749713
Accuracy: 0.8375
Mean AUC: 0.899540757749713
Mean Accuracy: 0.8375


In [64]:
# Hard predictions and confusion matrix for the held-out rows in `test_set`.
held_out = np.asarray(test_set, dtype="int")
predicted = gbc.predict(X[held_out])
confusion = confusion_matrix(Y[held_out], predicted)
print(confusion)

[[67  0]
 [13  0]]


In [73]:
def plot_confusion_matrix(y_true, y_pred,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix of `y_pred` against `y_true` and draw it as an
    annotated heat map.

    With `normalize=True` each row is divided by its row total before
    printing and plotting. When `title` is empty or None, a default title
    describing the mode is used. Returns the matplotlib Axes that was drawn on.
    """
    # The same text serves as the default title and as the printed banner.
    banner = ('Normalized confusion matrix' if normalize
              else 'Confusion matrix, without normalization')
    title = title or banner

    # Compute the matrix and the labels that actually occur in the data.
    cm = confusion_matrix(y_true, y_pred)
    classes = unique_labels(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
    print(banner)

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # One tick per class on each axis, labelled with the class values.
    n_rows, n_cols = cm.shape
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Slant the x tick labels so they do not collide.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value on top of it, in white on the darker cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in np.ndindex(n_rows, n_cols):
        text_color = "white" if cm[i, j] > thresh else "black"
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color=text_color)
    fig.tight_layout()
    return ax

In [74]:
# Plot the confusion matrix for the held-out rows, using the `predicted`
# labels computed in the previous cell.
plot_confusion_matrix(Y[np.asarray(test_set, dtype="int")], predicted, 
                      title='Confusion matrix, without normalization')

Confusion matrix, without normalization
[[67  0]
 [13  0]]


<matplotlib.axes._subplots.AxesSubplot at 0x11ec65f28>

In [67]:
# Look up the document vector of every row once, then score the whole table
# with the fitted classifier (class probabilities and hard predictions).
all_doc_vectors = np.asarray([model.docvecs[tag] for tag in SubSpeech["Subspeech_index"]])
scores = gbc.predict_proba(all_doc_vectors)
predictions = gbc.predict(all_doc_vectors)

In [68]:
# Attach the hard prediction and the second column of the predicted
# probabilities to every row.
SubSpeech["Pop_class"] = predictions.tolist()
SubSpeech["Pop_prob"] = scores[:,1].tolist()

In [69]:
# Summary statistics of the predicted probabilities.
SubSpeech["Pop_prob"].describe()

count    11929.000000
mean         0.109605
std          0.078837
min          0.047170
25%          0.047170
50%          0.092624
75%          0.147170
max          0.458333
Name: Pop_prob, dtype: float64

In [77]:
#SubSpeech.sort_values(by=['Pop_prob']).tail()

In [124]:
# Overwrite the hard class using a 0.4 cut-off on the predicted probability
# (1 when the probability is at least 0.4, otherwise 0).
SubSpeech["Pop_class"] = (SubSpeech["Pop_prob"] >= 0.4).astype("int64")

In [125]:
# Share of rows whose class is 1 after thresholding.
np.mean(SubSpeech["Pop_class"])

0.02288540531477911

In [126]:
# Confusion matrix of the thresholded class against the hand-coded label,
# over every row whose label is 0 or 1.
# NOTE(review): the classifier was fitted on a subset of these same labelled
# rows, so this is not a purely out-of-sample evaluation.
labelled_rows = SubSpeech.loc[SubSpeech['Pop'].isin([0,1])]
plot_confusion_matrix(labelled_rows["Pop"],
                      labelled_rows["Pop_class"])

Confusion matrix, without normalization
[[220   6]
 [ 20  22]]


<matplotlib.axes._subplots.AxesSubplot at 0x1217bdef0>

In [127]:
# Write the table, including the prediction columns, to disk without the index.
SubSpeech.to_csv('SubSpeech_predicted.csv', index=False)

## Blind Test (prodigy)

In [128]:
# Read back the predictions file written by the previous section.
SubSpeech_predicted = pd.read_csv('SubSpeech_predicted.csv')

In [128]:
#SubSpeech_predicted.head()

In [129]:
# Rows that were never hand-coded (label is neither 0 nor 1).
# `~` is the boolean-negation operator for a mask; unary minus on booleans is
# not a supported way to invert one.
virgintext = SubSpeech.loc[~SubSpeech['Pop'].isin([0,1])]

In [130]:
# Drop every row whose candidate is 'Donald J. Trump'.
virgintext = virgintext[virgintext.Candidate != 'Donald J. Trump']

In [132]:
# Number of predicted-positive rows per candidate among the unlabelled rows.
virgintext[virgintext.Pop_class==1].Candidate.value_counts()

Barack Obama             12
Bernie Sanders            9
John Edwards              5
Robert Dole               5
Hillary Clinton           5
Ted Cruz                  3
Rick Perry                3
Mitt Romney               3
John F. Kerry             3
Bill Richardson           2
John McCain               2
Scott Walker              2
Mike Pence                1
Newt Gingrich             1
Albert Gore, Jr.          1
Franklin D. Roosevelt     1
John F. Kennedy           1
Mike Huckabee             1
Fred Thompson             1
Rudy Giuliani             1
Jon Huntsman              1
Name: Candidate, dtype: int64

In [139]:
#virgintext.Subspeech_index[virgintext.Pop_class==1]

In [133]:
# Draw, without replacement, 10 tags predicted positive followed by 5 tags
# predicted negative from the unlabelled rows.
positive_tags = virgintext.Subspeech_index[virgintext.Pop_class==1]
negative_tags = virgintext.Subspeech_index[virgintext.Pop_class==0]
sampled_positive = np.random.choice(positive_tags, 10, replace=False)
sampled_negative = np.random.choice(negative_tags, 5, replace=False)
blind_index = [*sampled_positive, *sampled_negative]

In [134]:
# Keep the part of each sampled tag that precedes the first underscore.
blindspeech_index = [tag.partition('_')[0] for tag in blind_index]

In [135]:
# All rows belonging to the sampled speeches, restricted to three columns.
# .copy() makes this an independent frame: a column is added to it in the
# next cell, which would otherwise trigger pandas' SettingWithCopy warning.
blindspeech_sample = SubSpeech.loc[
    SubSpeech.Speech_index.isin(blindspeech_index),
    ['Speech_index', 'Subspeech_index', 'SubContent']].copy()

In [136]:
# Annotation text: "<tag>## <content>". Built with one vectorised string
# concatenation instead of a per-row label lookup; the former ''.join(...)
# around SubContent was a no-op on a string and has been dropped.
blindspeech_sample['text'] = (
    blindspeech_sample['Subspeech_index'] + '## ' + blindspeech_sample['SubContent']
)

In [137]:
# Number of rows in the blind sample.
len(blindspeech_sample.Subspeech_index)

117

In [138]:
# Write the blind sample to disk.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the row index is written as an extra column -- confirm
# this is intended.
blindspeech_sample.to_csv('blindspeech_sample2.csv')

In [11]:
# Reload the blind sample from disk.
blindspeech_sample = pd.read_csv("blindspeech_sample2.csv")

In [152]:
# Row counts per year for the speeches included in the blind sample.
SubSpeech.loc[
    SubSpeech.Speech_index.isin(blindspeech_index)].Year.value_counts()

2008    50
1996    24
2011    12
2015    10
2016     9
2007     6
2004     6
Name: Year, dtype: int64

In [20]:
#SubSpeech_predicted.Candidate[SubSpeech_predicted.Speech_index.isin(blindspeech_sample.Speech_index)]

In [117]:
# Number of distinct speeches that have at least one row labelled 0 or 1.
len(SubSpeech.Speech_index.loc[SubSpeech['Pop'].isin([0,1])].value_counts())

47

In [162]:
# Inspect the predicted rows of a single speech.
SubSpeech_predicted.loc[SubSpeech_predicted.Speech_index == "Speech1754"]

Unnamed: 0.1,Unnamed: 0,Speech_index,SubContent,Subspeech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day,Pop,Pop_class,Pop_prob
9480,9480,Speech1754,"['', 'A few weeks ago, President Clinton looke...",Speech1754_0:10,Robert Dole,\nRemarks at Hudson Chamber of Commerce\n,"\nFebruary 22, 1996","\nA few weeks ago, President Clinton looked Am...",Campaign Documents,/documents/remarks-hudson-chamber-commerce,1996,February,22,,0,0.04717
9481,9481,Speech1754,['A mother of two who is worried her job may b...,Speech1754_10:20,Robert Dole,\nRemarks at Hudson Chamber of Commerce\n,"\nFebruary 22, 1996","\nA few weeks ago, President Clinton looked Am...",Campaign Documents,/documents/remarks-hudson-chamber-commerce,1996,February,22,,0,0.14717
9482,9482,Speech1754,['Let me briefly touch upon each of these four...,Speech1754_20:30,Robert Dole,\nRemarks at Hudson Chamber of Commerce\n,"\nFebruary 22, 1996","\nA few weeks ago, President Clinton looked Am...",Campaign Documents,/documents/remarks-hudson-chamber-commerce,1996,February,22,,0,0.083333
9483,9483,Speech1754,"['If you go to any Main Street in America, and...",Speech1754_30:40,Robert Dole,\nRemarks at Hudson Chamber of Commerce\n,"\nFebruary 22, 1996","\nA few weeks ago, President Clinton looked Am...",Campaign Documents,/documents/remarks-hudson-chamber-commerce,1996,February,22,,1,0.438596
9484,9484,Speech1754,['In designing a new tax system I will ensure ...,Speech1754_40:50,Robert Dole,\nRemarks at Hudson Chamber of Commerce\n,"\nFebruary 22, 1996","\nA few weeks ago, President Clinton looked Am...",Campaign Documents,/documents/remarks-hudson-chamber-commerce,1996,February,22,,0,0.15855
9485,9485,Speech1754,"['As President, I would direct every Departmen...",Speech1754_50:70,Robert Dole,\nRemarks at Hudson Chamber of Commerce\n,"\nFebruary 22, 1996","\nA few weeks ago, President Clinton looked Am...",Campaign Documents,/documents/remarks-hudson-chamber-commerce,1996,February,22,,0,0.128788


### Merge Blind Test Annotation and Test

In [23]:
# Read the blind-test annotation file; each element is one raw line.
with open('Annotations/blind_test1117.jsonl', 'r') as annotation_jsonl:
    annotation_list = annotation_jsonl.readlines()

In [3]:
# Number of annotation lines read.
len(annotation_list)

68

In [6]:
# Map each annotation's key (the part of its text before the first '##') to
# its answer; a repeated key keeps the answer of the last line seen.
coded = {
    record['text'].split('##')[0]: record['answer']
    for record in map(json.loads, annotation_list)
}

In [7]:
# Number of distinct annotation keys.
len(coded)

68

In [8]:
# One row per annotation key (keys become the row index).
blindtest_coded = pd.DataFrame.from_dict(coded, orient='index')

In [10]:
# Name the value column (labelled 0) 'Pop'.
blindtest_coded = blindtest_coded.rename(columns={0:'Pop'})

In [12]:
# Expose the row index as a regular column so it can be used as a merge key.
blindtest_coded['Subspeech_index'] = blindtest_coded.index

In [13]:
# Re-read the coded sub-speech table from disk.
SubSpeech = pd.read_csv('SubSpeech_coded.csv')

In [16]:
# Outer-join the blind-test answers onto the sub-speech table by tag.
blindtest = SubSpeech.merge(
    blindtest_coded,
    how='outer',
    on='Subspeech_index',
)

In [18]:
#blindtest

### Update training data with all annotations

In [24]:
# Append the raw lines of the test annotation file to the running list.
with open('Annotations/test_annotation.jsonl', 'r') as annotation_jsonl:
    annotation_list.extend(annotation_jsonl)

In [25]:
# Append the raw lines of the pop_code1117 annotation file to the running list.
with open('Annotations/pop_code1117.jsonl', 'r') as annotation_jsonl:
    annotation_list.extend(annotation_jsonl)

In [26]:
# Total number of annotation lines collected from the three files.
len(annotation_list)

268

In [27]:
# Rebuild the key -> answer mapping from every collected annotation line.
# The key is the part of the annotation text that precedes the first '##'.
coded = {}
for raw_line in annotation_list:
    record = json.loads(raw_line)
    key = record['text'].partition('##')[0]
    coded[key] = record['answer']

# One row per key, answer column named 'Pop', key also kept as a column.
coded_data = (
    pd.DataFrame.from_dict(coded, orient='index')
    .rename(columns={0:'Pop'})
)
coded_data['Subspeech_index'] = coded_data.index

In [29]:
# Number of coded rows.
len(coded_data.Pop)

268

In [30]:
# Read the sub-speech table from 'SubSpeech.csv'.
Sub_Speech = pd.read_csv('SubSpeech.csv')

In [31]:
# Outer-join the annotation answers onto the sub-speech table by tag.
SubSpeech_coded = Sub_Speech.merge(
    coded_data,
    how='outer',
    on='Subspeech_index',
)

In [34]:
#SubSpeech_coded.head()

In [35]:
# Write the merged table without the index. 'SubSpeech_coded.csv' is the file
# that the modelling cells earlier in this notebook read.
SubSpeech_coded.to_csv("SubSpeech_coded.csv", index=False)