In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
import re
from nltk.corpus import gutenberg, stopwords, inaugural
import spacy
from collections import Counter
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('inaugural')

[nltk_data] Downloading package inaugural to
[nltk_data]     /home/brandoncsteed/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/brandoncsteed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
inaugural

<PlaintextCorpusReader in '/home/brandoncsteed/nltk_data/corpora/inaugural'>

In [5]:
#Checking for available files
inaugural.fileids()

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1

In [6]:
#Viewing first part of raw address text
inaugural.raw('1789-Washington.txt')[:500]

'Fellow-Citizens of the Senate and of the House of Representatives:\n\nAmong the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the 14th day of the present month. On the one hand, I was summoned by my Country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable dec'

In [7]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub(r'-',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
wash1 = inaugural.raw('1789-Washington.txt')
jeff1 = inaugural.raw('1801-Jefferson.txt')

wash1 = text_cleaner(wash1)
jeff1 = text_cleaner(jeff1)

In [8]:
#Looking at cleaned Washington address
wash1[:200]

'Fellow Citizens of the Senate and of the House of Representatives: Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was '

In [9]:
#Lookign at cleaned Jefferson address
jeff1[:200]

'Friends and Fellow Citizens: Called upon to undertake the duties of the first executive office of our country, I avail myself of the presence of that portion of my fellow citizens which is here assemb'

In [10]:
# Parsing the cleaned novels
nlp = spacy.load('en')
wash1_doc = nlp(wash1)
jeff1_doc = nlp(jeff1)

In [11]:
# Grouping into sentences
wash1_sents = [[sent, "Washington"] for sent in wash1_doc.sents]
jeff1_sents = [[sent, "Jefferson"] for sent in jeff1_doc.sents]

# Combining the sentences from the two novels into one data frame
sentences = pd.DataFrame(wash1_sents + jeff1_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Fellow, Citizens, of, the, Senate, and, of, t...",Washington
1,"(Among, the, vicissitudes, incident, to, life,...",Washington
2,"(On, the, one, hand, ,, I, was, summoned, by, ...",Washington
3,"(On, the, other, hand, ,, the, magnitude, and,...",Washington
4,"(In, this, conflict, of, emotions, all, I, dar...",Washington


# Bag of Words

In [12]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
wash1words = bag_of_words(wash1_doc)
jeff1words = bag_of_words(jeff1_doc)

# Combine bags to create a set of unique words.
common_words = set(wash1words + jeff1words)

In [13]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0


Unnamed: 0,asylum,department,character,consultation,preserve,united,year,pursuit,throe,light,...,independent,refer,error,strong,mankind,objection,far,conspicuous,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Fellow, Citizens, of, the, Senate, and, of, t...",Washington
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Among, the, vicissitudes, incident, to, life,...",Washington
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,"(On, the, one, hand, ,, I, was, summoned, by, ...",Washington
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(On, the, other, hand, ,, the, magnitude, and,...",Washington
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, this, conflict, of, emotions, all, I, dar...",Washington


In [14]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 1.0

Test set score: 0.7407407407407407


In [15]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(40, 850) (40,)
Training set score: 1.0

Test set score: 0.7037037037037037


In [16]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 1.0

Test set score: 0.7037037037037037


In [17]:
#Using SVM as a modeling technique
svm = SVC(C=250)
svm.fit(X_train, y_train)

print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))

Training set score: 1.0

Test set score: 0.6666666666666666


In [18]:
bnb = BernoulliNB()
train = bnb.fit(X_train, y_train)

print('Training set score:', bnb.score(X_train, y_train))
print('\nTest set score:', bnb.score(X_test, y_test))

Training set score: 0.8

Test set score: 0.6666666666666666


In [19]:
#reading in the data, this time in the form of paragraphs
was = [[sent, "Washington"] for sent in inaugural.paras('1789-Washington.txt')]
jef = [[sent, "Jefferson"] for sent in inaugural.sents('1801-Jefferson.txt')]
#was=inaugural.sents('1789-Washington.txt')
#jef = inaugural.sents('1801-Jefferson.txt')
inaugural1 = pd.DataFrame(was + jef)



# TF_IDF

In [20]:
import nltk
from nltk.corpus import inaugural
nltk.download('inaugural')
import re
from sklearn.model_selection import train_test_split

#reading in the data, this time in the form of paragraphs
wash=inaugural.sents('1789-Washington.txt')
#processing
wash_sents=[]
for sentence in wash:
    #removing the double-dash from all words
    sentence=[re.sub(r'-','',word) for word in sentence]
    #Forming each paragraph into a string and adding it to the list of strings.
    wash_sents.append(' '.join(sentence))

print(wash_sents[0:4])

[nltk_data] Downloading package inaugural to
[nltk_data]     /home/brandoncsteed/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
['Fellow  Citizens of the Senate and of the House of Representatives :', 'Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order , and received on the 14th day of the present month .', 'On the one hand , I was summoned by my Country , whose voice I can never hear but with veneration and love , from a retreat which I had chosen with the fondest predilection , and , in my flattering hopes , with an immutable decision , as the asylum of my declining years  a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination , and of frequent interruptions in my health to the gradual waste committed on it by time .', 'On the other hand , the magnitude and difficulty of the trust to which th

In [21]:
#reading in the data, this time in the form of paragraphs
jeff=inaugural.sents('1801-Jefferson.txt')
#processing
jeff_sents=[]
for sentence in jeff:
    #removing the double-dash from all words
    sentence=[re.sub(r'-','',word) for word in sentence]
    #Forming each paragraph into a string and adding it to the list of strings.
    jeff_sents.append(' '.join(sentence))

print(jeff_sents[0:4])

['Friends and Fellow Citizens :', 'Called upon to undertake the duties of the first executive office of our country , I avail myself of the presence of that portion of my fellow citizens which is here assembled to express my grateful thanks for the favor with which they have been pleased to look toward me , to declare a sincere consciousness that the task is above my talents , and that I approach it with those anxious and awful presentiments which the greatness of the charge and the weakness of my powers so justly inspire .', 'A rising nation , spread over a wide and fruitful land , traversing all the seas with the rich productions of their industry , engaged in commerce with nations who feel power and forget right , advancing rapidly to destinies beyond the reach of mortal eye  when I contemplate these transcendent objects , and see the honor , the happiness , and the hopes of this beloved country committed to the issue , and the auspices of this day , I shrink from the contemplation 

In [22]:
# Grouping into sentences
wash2_sents = [[sent, "Washington"] for sent in wash_sents]
jeff2_sents = [[sent, "Jefferson"] for sent in jeff_sents]

# Combining the sentences from the two novels into one data frame
inaugural_sents = pd.DataFrame(wash2_sents + jeff2_sents)
inaugural_sents.head()

Unnamed: 0,0,1
0,Fellow Citizens of the Senate and of the Hous...,Washington
1,Among the vicissitudes incident to life no eve...,Washington
2,"On the one hand , I was summoned by my Country...",Washington
3,"On the other hand , the magnitude and difficul...",Washington
4,In this conflict of emotions all I dare aver i...,Washington


In [23]:
inaugural_sents[0]

0     Fellow  Citizens of the Senate and of the Hous...
1     Among the vicissitudes incident to life no eve...
2     On the one hand , I was summoned by my Country...
3     On the other hand , the magnitude and difficul...
4     In this conflict of emotions all I dare aver i...
5     All I dare hope is that if , in executing this...
6     Such being the impressions under which I have ...
7     In tendering this homage to the Great Author o...
8     No people can be bound to acknowledge and ador...
9     Every step by which they have advanced to the ...
10    These reflections , arising out of the present...
11    You will join with me , I trust , in thinking ...
12    By the article establishing the executive depa...
13    The circumstances under which I now meet you w...
14    It will be more consistent with those circumst...
15    In these honorable qualifications I behold the...
16    I dwell on this prospect with every satisfacti...
17    Besides the ordinary objects submitted to 

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(inaugural_sents, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=3, # only use words that appear at least three times
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
inaugural_sents_tfidf=vectorizer.fit_transform(inaugural_sents[0])
print("Number of features: %d" % inaugural_sents_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(inaugural_sents_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[0])
print('Tf_idf vector:', tfidf_bypara[0])

Number of features: 67
Original sentence: 56    They should be the creed of our political fait...
42    I believe it the only one where every man , at...
43    Sometimes it is said that man can not be trust...
60    I shall often go wrong through defect of judgm...
18    Instead of undertaking particular recommendati...
62    I ask your indulgence for my own errors , whic...
15    In these honorable qualifications I behold the...
5     All I dare hope is that if , in executing this...
16    I dwell on this prospect with every satisfacti...
20    It concerns myself , and will therefore be as ...
57    I repair , then , fellow  citizens , to the po...
8     No people can be bound to acknowledge and ador...
13    The circumstances under which I now meet you w...
25    Called upon to undertake the duties of the fir...
37    We are all Republicans , we are all Federalists .
17    Besides the ordinary objects submitted to your...
49    Still one thing more , fellow citizens  a wise...
52    

In [25]:
Y = inaugural_sents[1]

X_train, X_test, Y_train, Y_test = train_test_split(inaugural_sents_tfidf, Y, test_size=0.3)

In [26]:
X_train

<46x67 sparse matrix of type '<class 'numpy.float64'>'
	with 207 stored elements in Compressed Sparse Row format>

In [27]:
print(X_train)

  (1, 24)	0.3908418767234323
  (1, 4)	0.6986789262497043
  (1, 0)	0.36800145032593373
  (1, 31)	0.36800145032593373
  (1, 6)	0.2970526056608228
  (2, 44)	0.5167068779727263
  (2, 1)	0.17223562599090878
  (2, 46)	0.17223562599090878
  (2, 49)	0.30789280626248505
  (2, 58)	0.15394640313124253
  (2, 61)	0.17223562599090878
  (2, 55)	0.17223562599090878
  (2, 57)	0.17223562599090878
  (2, 54)	0.15394640313124253
  (2, 25)	0.34447125198181755
  (2, 10)	0.16217034032742758
  (2, 53)	0.17223562599090878
  (2, 34)	0.16217034032742758
  (2, 20)	0.10696977891894652
  (2, 45)	0.14699316330628157
  (2, 37)	0.17223562599090878
  (2, 51)	0.42290997900679844
  (3, 62)	1.0
  (4, 61)	1.0
  (5, 49)	0.6664121293958076
  :	:
  (44, 27)	0.3143708714720647
  (44, 59)	0.7719110253160831
  (44, 17)	0.2809887015762525
  (44, 34)	0.2959993376651688
  (44, 20)	0.1952452196022345
  (44, 3)	0.21733154416716483
  (44, 15)	0.23108520070751393
  (45, 52)	0.2449118618970982
  (45, 30)	0.2449118618970982
  (45, 57)	0.2

In [28]:
X_test

<20x67 sparse matrix of type '<class 'numpy.float64'>'
	with 89 stored elements in Compressed Sparse Row format>

In [29]:
len(Y_train)

46

In [30]:
Y_test

15    Washington
60     Jefferson
23    Washington
61     Jefferson
12    Washington
34     Jefferson
0     Washington
17    Washington
41     Jefferson
9     Washington
54     Jefferson
31     Jefferson
28     Jefferson
27     Jefferson
2     Washington
18    Washington
35     Jefferson
50     Jefferson
52     Jefferson
38     Jefferson
Name: 1, dtype: object

In [31]:
rfc.fit(X_train, Y_train)

print('Training set score:', rfc.score(X_train, Y_train))
print('\nTest set score:', rfc.score(X_test, Y_test))

Training set score: 0.9347826086956522

Test set score: 0.8


In [32]:
bnb.fit(X_train, Y_train)

print('Training set score:', bnb.score(X_train, Y_train))
print('\nTest set score:', bnb.score(X_test, Y_test))

Training set score: 0.8695652173913043

Test set score: 0.85


In [33]:
svm.fit(X_train, Y_train)

print('Training set score:', svm.score(X_train, Y_train))
print('\nTest set score:', svm.score(X_test, Y_test))

Training set score: 0.9565217391304348

Test set score: 0.8


In [34]:
lr.fit(X_train, Y_train)

print('Training set score:', lr.score(X_train, Y_train))
print('\nTest set score:', lr.score(X_test, Y_test))

Training set score: 0.782608695652174

Test set score: 0.6


In [35]:
clf.fit(X_train, Y_train)

print('Training set score:', clf.score(X_train, Y_train))
print('\nTest set score:', clf.score(X_test, Y_test))

Training set score: 0.9565217391304348

Test set score: 0.85


# Refining Model

In [36]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
inaugural_sents_tfidf=vectorizer.fit_transform(inaugural_sents[0])
print("Number of features: %d" % inaugural_sents_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(inaugural_sents_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('\nOriginal sentence:', X_train[0])
print('\nTf_idf vector:', tfidf_bypara[0])

clf.fit(X_train, Y_train)

print('\nTraining set score:', clf.score(X_train, Y_train))
print('\nTest set score:', clf.score(X_test, Y_test))

Number of features: 206

Original sentence: 

Tf_idf vector: {'services': 0.32620100630785664, 'steps': 0.32620100630785664, 'faith': 0.32620100630785664, 'moments': 0.32620100630785664, 'peace': 0.30334656266038423, 'political': 0.30334656266038423, 'let': 0.24828058985748414, 'safety': 0.27113503350495666, 'liberty': 0.2856192789428273, 'error': 0.30334656266038423, 'trust': 0.2856192789428273}

Training set score: 0.9565217391304348

Test set score: 0.85
