## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
from nltk import word_tokenize
from gensim.models import word2vec
import os

In [46]:
lvl1 = 'Corpus of Presential Speeches/'

speech_dict = {'president': [], 'text': []}

for folder in os.listdir(lvl1):
    if folder != '.DS_Store':
        for file in os.listdir(lvl1 + folder):
            with open(f'{lvl1}{folder}/{file}', 'r') as f:
                data = f.read().replace('\n\n', '\n')
                paragraphs = data.split('\n')
    #             if 'date=' in data:
                for paragraph in paragraphs[2:]:
                    speech_dict['president'].append(folder)
                    speech_dict['text'].append(paragraph)
                
df = pd.DataFrame.from_dict(speech_dict)
print(df.shape)
df.head()

(36501, 2)


Unnamed: 0,president,text
0,coolidge,We have come here to dedicate a cornerstone th...
1,coolidge,It is but natural that such a design should be...
2,coolidge,"Next to him will come Thomas Jefferson, whose ..."
3,coolidge,"After our country had been established, enlarg..."
4,coolidge,That the principles for which these three men ...


In [3]:
df['title'] = df['raw_text'].apply(lambda x: x.split('> ')[0].split('<title="')[1][:-1])
# df['date']  = df['raw_text'].apply(lambda x: x.split('> ')[1].split('date="')[1][:-1])
df['text']  = df['raw_text'].apply(lambda x: x.split('> ')[2])
df = df.drop('raw_text', axis=1)
df.head()

Unnamed: 0,president,title,text
0,coolidge,Address at the Opening of Work on Mount Rushmo...,We have come here to dedicate a cornerstone th...
1,coolidge,Third Annual Message,Members of the Congress: In meeting the consti...
2,coolidge,First Annual Message,Since the close of the last Congress the Natio...
3,coolidge,Message Regarding Relationship of Church and S...,"Mr. Moderator, Members Of The Council: It is m..."
4,coolidge,Inaugural Address,My Countrymen: No one can contemplate curren...


In [58]:
df['length'] = df['text'].apply(lambda x: len(x.split(' ')))
df = df[df.length > 3]
df[~df.text.str.contains("<")].sort_values(by=['length'])

Unnamed: 0,president,text,length
18400,fdroosevelt,Certain facts are self-evident.,4
7995,carter,Thank you very much.,4
25413,nixon,Thank you and goodnight.,4
19362,hayes,"Excess of expenditures $4,854,180.82",4
23255,johnson,"DEPARTMENT OF THE INTERIOR,",4
652,tyler,"department, are herewith communicated.",4
24647,cleveland,(f) Chiefs of divisions.,4
2268,pierce,Secretary of State .,4
631,tyler,"J. B. BUCKNALL ESTCOURT,",4
626,tyler,"J. B. BUCKNALL ESTCOURT,",4


In [59]:
target = df.president
data = df['text'].map(word_tokenize).values

In [60]:
total_vocabulary = set(word.lower() for text in data for word in text)
print(f'Total Words in Vocabulary: {len(total_vocabulary)}')

Total Words in Vocabulary: 44169


In [61]:
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [62]:
n = np.random.randint(0, len(total_vocabulary)-10)

for k in list(glove.keys())[n:n+10]:
    print(f'{k}: {glove[k][:5]}')
    print(f'Vector Length: {len(glove[k])}\n')

cheered: [-0.36736 -0.75968  0.58122 -0.97144  0.67499]
Vector Length: 50

ruined: [ 0.82673  -0.20167  -0.10983  -0.25694   0.026495]
Vector Length: 50

approves: [ 0.17652  -0.52094   0.20237  -0.010022 -0.104   ]
Vector Length: 50

tutsi: [ 0.74322 -1.711    0.27562 -1.3226   0.66681]
Vector Length: 50

consult: [ 0.80756 -0.2623  -0.24342 -0.27774 -0.04636]
Vector Length: 50

bust: [ 0.35826   0.16594   0.3303   -0.50035  -0.080152]
Vector Length: 50

unlimited: [ 0.10005  0.75968  0.54155 -0.21724 -0.22628]
Vector Length: 50

explicitly: [ 0.0058917 -0.28056   -0.27778   -0.6508     0.27178  ]
Vector Length: 50

premiums: [ 0.16995  0.08008  1.0474  -1.6206  -0.54574]
Vector Length: 50

escorted: [ 0.86319 -0.48385  0.13338 -0.88502  0.44473]
Vector Length: 50



In [63]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf  = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr  = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ('Logistic Regression', LogisticRegression())])

In [65]:
models = [('Random Forest', rf),
          ("Support Vector Machine", svc),
          ("Logistic Regression", lr)]

In [68]:
import warnings
warnings.filterwarnings('ignore')

scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   15.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   15.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished


In [69]:
scores

[('Random Forest', 0.2600893874176554),
 ('Support Vector Machine', 0.19063518718284356),
 ('Logistic Regression', 0.22400822299117315)]

In [70]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

In [71]:
y = pd.get_dummies(target).values

In [72]:
tokenizer = text.Tokenizer(num_words=30000)
tokenizer.fit_on_texts(list(df.text))
list_tokenized_headlines = tokenizer.texts_to_sequences(df.text)
X_t = sequence.pad_sequences(list_tokenized_headlines, maxlen=10000)

In [73]:
len(np.unique(target))

44

In [79]:
np.unique(target)

array(['Trump', 'adams', 'arthur', 'bharrison', 'buchanan', 'bush',
       'carter', 'cleveland', 'clinton', 'coolidge', 'eisenhower',
       'fdroosevelt', 'fillmore', 'ford', 'garfield', 'grant', 'gwbush',
       'harding', 'harrison', 'hayes', 'hoover', 'jackson', 'jefferson',
       'johnson', 'jqadams', 'kennedy', 'lbjohnson', 'lincoln', 'madison',
       'mckinley', 'monroe', 'nixon', 'obama', 'pierce', 'polk', 'reagan',
       'roosevelt', 'taft', 'taylor', 'truman', 'tyler', 'vanburen',
       'washington', 'wilson'], dtype=object)

In [80]:
embedding_size = 128
input_ = Input(shape=(10000,))
x = Embedding(30000, embedding_size)(input_)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(.25)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(.25)(x)
x = Dense(44, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [81]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [82]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 10000)             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 10000, 128)        3840000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 10000, 100)        71600     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 100)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_8 (Dropout)          (None, 50)                0         
__________

In [83]:
model.fit(X_t, y, epochs=5, batch_size=8, validation_split=0.1)

Train on 30530 samples, validate on 3393 samples
Epoch 1/5
   80/30530 [..............................] - ETA: 7:54:17 - loss: 3.7806 - acc: 0.0125

KeyboardInterrupt: 