In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

# Predicting Author From Text

I want to make a model that can predict who has written a story based on its text features.  To accomplish this, I will use data from the NLTK 'Gutenberg' corpus.

In [2]:
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import genesis
from nltk.corpus import webtext

In [3]:
import en_core_web_sm

# Data cleaning / processing / language parsing

I need to make sure my data is clean and workable when I make my features.

In [4]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [5]:
# Load and clean the data.
persuasion_raw = gutenberg.raw('austen-persuasion.txt')
alice_raw = gutenberg.raw('carroll-alice.txt')

alice = re.sub(r'CHAPTER [A-Z].','', alice_raw)
alice = text_cleaner(alice)

persuasion = re.sub(r'Chapter \d+', '', persuasion_raw)
persuasion = text_cleaner(persuasion)

In [6]:
# take a sample of each text, to prevent memory errors
alicetr = alice[:30000]
persuasiontr = persuasion[:30000]

# Creating Features

I must extract features from the text, through a process called NLP, Natural Language Processing. Though there are many ways to accomplish this, I will use 2: Bag of Words and Term-Frequency with Inverse Document Frequency, or TF IDF.

## Bag of Words

I will use the bag of words technique first. To use this, I will need to process the texts down to sentences. From there, I will extract information on each sentences verbosity and punctuation use. I will use this information to characterize each sentence.

In [7]:
nlp = en_core_web_sm.load()

In [8]:
alice_doc = nlp(alicetr)

In [9]:
persuasion_doc = nlp(persuasiontr)

In [10]:
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

In [11]:
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Down, the, Rabbit, -, Hole, Alice, was, begin...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [12]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_author'] = sentences[1]
    df.loc[:, common_words] = 0
    df['punctuation'] = 0
    df['other punctuation'] = 0
    df.loc[:, '.'] = 0
    df.loc[:, '?'] = 0
    df.loc[:, '!'] = 0
    df.loc[:, ','] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation, stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # Get number of punctuation in a sentence
        puncs = [token.lemma_
                 for token in sentence
                 if (
                     token.is_punct
                 )]
        # Increase punctuation count by how many were use
        for punc in puncs:
            df.loc[i,'punctuation'] += 1
            try:
                df.loc[i,punc] += 1
            except:
                df.loc[i,'other punctuation'] += 1
                
            
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [13]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [14]:
# Convert to data frame for the BOW
alice_sents = pd.DataFrame(alice_sents)
# Get BOW features
alice_word_counts = bow_features(alice_sents, common_words)
alice_word_counts.head()

Processing row 0


Unnamed: 0,earth,15,tell,importance,model,noise,row,give,disrespectfully,spite,...,rabbit,totally,text_sentence,text_author,punctuation,other punctuation,.,?,!,","
0,0,0,0,0,0,0,0,0,0,0,...,1,0,"(Down, the, Rabbit, -, Hole, Alice, was, begin...",Carroll,11,6,0,1,0,4
1,0,0,0,0,0,0,0,0,0,0,...,1,0,"(So, she, was, considering, in, her, own, mind...",Carroll,7,3,1,0,0,3
2,0,0,0,0,0,0,0,0,0,0,...,1,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,4,2,0,0,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,"(Oh, dear, !)",Carroll,1,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,"(I, shall, be, late, !, ')",Carroll,2,1,0,0,1,0


In [15]:
# Convert to data frame for the BOW
persuasion_sents = pd.DataFrame(persuasion_sents)
# Get BOW features
persuasion_word_counts = bow_features(persuasion_sents, common_words)
persuasion_word_counts.head()

Processing row 0


Unnamed: 0,earth,15,tell,importance,model,noise,row,give,disrespectfully,spite,...,rabbit,totally,text_sentence,text_author,punctuation,other punctuation,.,?,!,","
0,0,0,0,0,0,0,0,0,0,0,...,0,0,"(Sir, Walter, Elliot, ,, of, Kellynch, Hall, ,...",Austen,15,4,1,0,0,10
1,0,0,0,0,0,0,0,0,0,0,...,0,0,"(This, was, the, page, at, which, the, favouri...",Austen,4,3,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,"(Walter, Elliot, ,, born, March, 1, ,, 1760, ,...",Austen,9,0,1,0,0,8
3,0,0,0,0,0,0,0,0,0,0,...,0,0,"(of, South, Park, ,, in, the, county, of, Glou...",Austen,17,6,1,0,0,10
4,0,0,0,0,0,0,0,0,0,0,...,0,0,"("", Precisely, such, had, the, paragraph, orig...",Austen,12,3,1,0,0,8


In [16]:
total_word_counts = pd.concat([alice_word_counts,persuasion_word_counts])

## Tf idf

The next technique I will use is TF IDF. Unlike the Bag of Words technique, this one is unsupervised, meaning that it extracts the feature information on its own. TF IDF works by placing weights on works based on how many times they appear in the document.  From there, meaning is ascribed to each sentence or paragraph based on the words that are used within it.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [18]:
alice = gutenberg.paras('carroll-alice.txt')
#processing
alice_paras=[]
for paragraph in alice:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    alice_paras.append(' '.join(para))

In [19]:
X_train, X_test = train_test_split(alice_paras, test_size=0.5, random_state=42)

#Applying the vectorizer
alice_paras_tfidf=vectorizer.fit_transform(alice_paras)
print("Number of features: %d" % alice_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(alice_paras_tfidf, test_size=0.5, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()
X_test_tfidf_csr = X_test_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

Number of features: 789


In [20]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(200)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 85.65388303012395
Component 0:
' I didn ' t know it was YOUR table ,' said Alice ; ' it ' s laid for a great many more than three .'                                                                                                                                                                                                                                                                                                                                                 0.866940
' Hold your tongue !'                                                                                                                                                                                                                                                                                                                                                                                                                                 0.866940
' Well , I should like to be a

Name: 2, dtype: float64
Component 3:
' Wake up , Alice dear !'                                                                                                                                                                  0.744423
The Caterpillar was the first to speak .                                                                                                                                                   0.509376
' It isn ' t mine ,' said the Hatter .                                                                                                                                                     0.505852
' Hush !                                                                                                                                                                                   0.496242
This speech caused a remarkable sensation among the party .                                                                                                                        

In [21]:
alice_train_lsa = pd.DataFrame(X_train_lsa)
alice_train_lsa['text_author'] = 'Carroll'
alice_train_lsa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,text_author
0,0.390229,-0.08151,-0.092912,-0.115203,-0.001856,0.025481,-0.080522,-1e-05,0.033934,-0.025451,...,0.054166,-0.020628,-0.013088,-0.029342,0.033374,0.012718,-0.011593,-0.030338,0.040863,Carroll
1,0.010002,0.00875,0.002558,-0.01252,0.029321,0.020522,-0.031095,0.007145,0.022515,0.015506,...,0.086814,0.006658,0.066839,0.006971,0.040019,-0.005285,0.002384,-0.047362,-0.062244,Carroll
2,0.161407,0.08305,0.174341,-0.101325,0.202759,0.189029,0.007689,0.036803,-0.047261,0.104901,...,0.005538,-0.005295,0.019985,0.079596,0.033792,0.01885,0.039365,-0.06242,-0.01533,Carroll
3,0.0001,-0.03544,-0.030696,0.049149,-0.035796,-0.064699,0.007019,-0.008836,-0.082575,-0.057086,...,0.067636,0.0056,-0.048086,0.053531,0.158641,0.042247,0.038193,0.038286,-0.055755,Carroll
4,0.018448,-0.007904,-0.023741,0.059271,-0.102216,0.124459,0.000258,0.022856,0.029614,0.011213,...,-0.018457,-0.003023,-0.005513,-0.006433,0.006509,-0.015058,-0.021465,-0.035362,0.017654,Carroll


In [22]:
alice_test_lsa = pd.DataFrame(X_test_lsa)
alice_test_lsa['text_author'] = 'Carroll'
alice_test_lsa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,text_author
0,0.011476,0.00092,-0.010022,0.001329,0.02119,-0.00027,-0.025158,-0.003606,0.016723,-0.020666,...,-0.0045,0.049011,0.009077,-0.014243,0.005189,-0.018279,0.014966,0.000666,-0.03649,Carroll
1,0.23958,-0.031796,-0.201286,-0.04294,0.080991,0.038739,-0.072529,0.035249,-0.012633,0.067849,...,-0.008668,0.052009,-0.086604,0.117077,-0.097697,-0.007316,0.021518,-0.009194,0.06445,Carroll
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carroll
3,0.242169,0.017704,-0.214895,-0.098039,0.149758,0.050317,-0.157665,0.000482,0.090418,-0.045324,...,0.101755,-0.028113,-0.107991,-9.4e-05,0.022031,-0.122684,-0.08145,-0.050979,-0.089174,Carroll
4,0.265044,0.034239,0.055453,-0.07671,0.050174,0.07099,-0.084321,0.013203,0.058421,-0.024091,...,-0.006524,-0.048066,-0.103454,0.027076,-0.022362,-0.02008,0.108345,-0.062217,-0.022074,Carroll


In [23]:
persuasion = gutenberg.paras('austen-persuasion.txt')
#processing
persuasion_paras=[]
for paragraph in persuasion:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    persuasion_paras.append(' '.join(para))

In [24]:
X_train, X_test = train_test_split(persuasion_paras, test_size=0.5, random_state=42)

#Applying the vectorizer
persuasion_paras_tfidf=vectorizer.fit_transform(persuasion_paras)
print("Number of features: %d" % persuasion_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(persuasion_paras_tfidf, test_size=0.5, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()
X_test_tfidf_csr = X_test_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

Number of features: 1232


In [25]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(200)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

Percent variance captured by all components: 75.02858580538904


In [26]:
persuasion_train_lsa = pd.DataFrame(X_train_lsa)
persuasion_train_lsa['text_author'] = 'Austen'
persuasion_train_lsa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,text_author
0,0.000176,0.000503,0.028985,0.001404,-0.009916,-0.003262,-0.001059,-0.00044,0.077001,-0.030192,...,0.077017,-0.018907,-0.042665,0.036893,-0.026422,-0.037499,0.008267,-0.122735,-0.059599,Austen
1,0.001887,0.004941,0.300561,0.215157,-0.253179,-0.277246,0.218477,0.074005,-0.026749,0.040911,...,0.029487,-0.037367,0.03146,0.043598,0.008129,0.027335,0.004534,-0.016467,-0.067264,Austen
2,0.001159,0.002691,0.161865,0.034679,-0.026227,0.170084,-0.084102,-0.144933,-0.068405,0.392112,...,-0.018358,0.023596,-0.011063,0.002261,-0.047888,-0.009171,-0.050181,0.000178,-0.066271,Austen
3,0.003516,0.005157,0.308129,0.020142,-0.069271,0.13408,-0.206011,-0.154493,0.037228,-0.273081,...,0.01954,-0.057349,0.041422,0.071873,-0.015806,0.047313,0.123111,0.018489,-0.017349,Austen
4,0.001251,0.001968,0.117662,0.00164,-0.02127,0.059162,-0.090198,-0.035715,-0.00948,-0.090004,...,0.067903,0.053023,0.009821,-0.008757,-0.052598,-0.094452,0.008935,0.003565,-0.045557,Austen


In [27]:
persuasion_test_lsa = pd.DataFrame(X_test_lsa)
persuasion_test_lsa['text_author'] = 'Austen'
persuasion_test_lsa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,text_author
0,0.000489,3.5e-05,0.056061,0.012687,0.016804,0.000325,-0.026184,0.038055,0.024201,-0.061916,...,0.038275,-0.055809,0.031991,-0.044238,0.060893,-0.052125,-0.005878,0.02958,-0.033452,Austen
1,0.002219,0.000247,0.239841,-0.037842,-0.046157,0.110368,-0.042889,0.033511,0.154253,0.141901,...,0.040582,-0.000318,-0.017065,-0.08285,0.02039,-0.061947,-0.084601,0.003528,-0.041646,Austen
2,0.001349,8.1e-05,0.130648,-0.056647,-0.002846,-0.051495,0.133355,0.016297,0.018107,-0.036465,...,0.092918,-0.127945,-0.04863,0.144823,0.008685,0.139263,0.022872,0.011605,0.077892,Austen
3,0.003221,4.2e-05,0.073068,-0.034655,0.072297,0.036818,-0.054865,-0.001599,0.018144,0.151855,...,0.016082,0.036431,0.015157,-0.010048,-0.015901,-0.000557,-0.000324,-0.02496,0.036742,Austen
4,0.000954,5.2e-05,0.085027,0.018744,-0.044631,-0.071234,-0.071327,-0.103052,0.022128,0.046562,...,-0.037153,-0.066149,0.063862,-0.04334,-0.027701,0.018389,0.149017,0.042943,-0.035014,Austen


In [28]:
train_lsa = pd.concat([alice_train_lsa,persuasion_train_lsa])
test_lsa = pd.concat([alice_test_lsa,persuasion_test_lsa])

# Predicting Author Using Various Models and Feature Sets

I will now test my ability to predict author from text. There are many different types of models with many different usesm but I will try 4 different ones here: Logistic Regression, Random Forest, Grandient-Boosted Decision Trees, and Support Vector Classifier. 

Additionally, I will be modeling with both of my features sets.

I will check each models cross validation score to check the overall health of the model

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
# Bag of Words Feature Set
X_bow = total_word_counts.drop(['text_author','text_sentence'],1)
y_bow = total_word_counts.text_author
Xtrain_bow, Xtest_bow, ytrain_bow, ytest_bow = train_test_split(X_bow,y_bow,test_size=0.5,random_state=42)

In [31]:
Xtrain_bow.shape

(244, 1635)

In [32]:
# LSA Reduced Feature Set
Xtrain_lsa = train_lsa.drop('text_author',1)
ytrain_lsa = train_lsa.text_author
Xtest_lsa = test_lsa.drop('text_author',1)
ytest_lsa = test_lsa.text_author

In [33]:
Xtrain_lsa.shape

(924, 200)

## Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

### BOW Feature Set

In [35]:
lr_bow = LogisticRegression(random_state=42)
lr_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', lr_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', lr_bow.score(Xtest_bow, ytest_bow))

Training set score: 0.9877049180327869

Test set score: 0.9020408163265307


In [36]:
lr_bow_cv = cross_val_score(lr_bow, Xtest_bow, ytest_bow, cv=10)

In [37]:
print('Logistic Regression Cross Validation\n')
display(lr_bow_cv)
print('\nMean and Standard Error:')
print(round(lr_bow_cv.mean(),3),' +/- ',round(lr_bow_cv.std()*2,3))

Logistic Regression Cross Validation



array([0.96      , 0.88      , 0.92      , 0.8       , 0.8       ,
       0.84      , 0.76      , 0.83333333, 0.95652174, 0.86956522])


Mean and Standard Error:
0.862  +/-  0.129


### TF IDF Feature Set

In [38]:
lr_lsa = LogisticRegression(random_state=42)
lr_lsa.fit(Xtrain_lsa, ytrain_lsa)
print('Training set score:', lr_lsa.score(Xtrain_lsa, ytrain_lsa))
print('\nTest set score:', lr_lsa.score(Xtest_lsa, ytest_lsa))

Training set score: 0.8398268398268398

Test set score: 0.787027027027027


In [39]:
lr_lsa_cv = cross_val_score(lr_lsa, Xtest_lsa, ytest_lsa, cv=10)

In [40]:
print('Logistic Regression Cross Validation\n')
display(lr_lsa_cv)
print('\nMean and Standard Error:')
print(round(lr_lsa_cv.mean(),3),' +/- ',round(lr_lsa_cv.std()*2,3))

Logistic Regression Cross Validation



array([0.91397849, 0.87096774, 0.83870968, 0.84946237, 0.74193548,
       0.86021505, 0.86956522, 0.82608696, 0.88043478, 0.81318681])


Mean and Standard Error:
0.846  +/-  0.088


## Gradient-Boosted Decision Trees

In [41]:
from sklearn.ensemble import GradientBoostingClassifier

### BOW Feature Set

In [42]:
clf_bow = GradientBoostingClassifier(random_state=42)
clf_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', clf_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', clf_bow.score(Xtest_bow, ytest_bow))

Training set score: 0.9959016393442623

Test set score: 0.8653061224489796


In [43]:
clf_bow_cv = cross_val_score(clf_bow, Xtest_bow, ytest_bow, cv=10)

In [44]:
print('Gradient-Boosted Tree Cross Validation\n')
display(clf_bow_cv)
print('\nMean and Standard Error:')
print(round(clf_bow_cv.mean(),3),' +/- ',round(clf_bow_cv.std()*2,3))

Gradient-Boosted Tree Cross Validation



array([0.88      , 0.92      , 0.8       , 0.8       , 0.84      ,
       0.92      , 0.8       , 0.79166667, 0.7826087 , 0.86956522])


Mean and Standard Error:
0.84  +/-  0.101


### TF IDF Feature Set

In [45]:
clf_lsa = GradientBoostingClassifier(random_state=42)
clf_lsa.fit(Xtrain_lsa, ytrain_lsa)
print('Training set score:', clf_lsa.score(Xtrain_lsa, ytrain_lsa))
print('\nTest set score:', clf_lsa.score(Xtest_lsa, ytest_lsa))

Training set score: 0.9902597402597403

Test set score: 0.9567567567567568


In [46]:
clf_lsa_cv = cross_val_score(clf_lsa, Xtest_lsa, ytest_lsa, cv=10)

In [47]:
print('Gradient-Boosted Tree Cross Validation\n')
display(clf_lsa_cv)
print('\nMean and Standard Error:')
print(round(clf_lsa_cv.mean(),3),' +/- ',round(clf_lsa_cv.std()*2,3))

Gradient-Boosted Tree Cross Validation



array([0.98924731, 1.        , 1.        , 1.        , 0.98924731,
       0.96774194, 0.9673913 , 0.94565217, 0.98913043, 0.96703297])


Mean and Standard Error:
0.982  +/-  0.035


## Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

### BOW Feature Set

In [49]:
rfc_bow = RandomForestClassifier(max_depth=3,random_state=42)
rfc_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', rfc_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', rfc_bow.score(Xtest_bow, ytest_bow))

Training set score: 0.7254098360655737

Test set score: 0.689795918367347


In [50]:
rfc_bow_cv = cross_val_score(rfc_bow, Xtest_bow, ytest_bow, cv=10)

In [51]:
print('Random Forest Cross Validation\n')
display(rfc_bow_cv)
print('\nMean and Standard Error:')
print(round(rfc_bow_cv.mean(),3),' +/- ',round(rfc_bow_cv.std()*2,3))

Random Forest Cross Validation



array([0.68      , 0.68      , 0.64      , 0.72      , 0.68      ,
       0.64      , 0.72      , 0.66666667, 0.65217391, 0.69565217])


Mean and Standard Error:
0.677  +/-  0.055


### TF IDF Feature Set

In [52]:
rfc_lsa = RandomForestClassifier(max_depth=3,random_state=42)
rfc_lsa.fit(Xtrain_lsa, ytrain_lsa)
print('Training set score:', rfc_lsa.score(Xtrain_lsa, ytrain_lsa))
print('\nTest set score:', rfc_lsa.score(Xtest_lsa, ytest_lsa))

Training set score: 0.9285714285714286

Test set score: 0.8789189189189189


In [53]:
rfc_lsa_cv = cross_val_score(rfc_lsa, Xtest_lsa, ytest_lsa, cv=10)

In [54]:
print('Random Forest Cross Validation\n')
display(rfc_lsa_cv)
print('\nMean and Standard Error:')
print(round(rfc_lsa_cv.mean(),3),' +/- ',round(rfc_lsa_cv.std()*2,3))

Random Forest Cross Validation



array([0.92473118, 0.96774194, 0.94623656, 0.97849462, 0.97849462,
       0.95698925, 0.85869565, 0.94565217, 0.94565217, 0.94505495])


Mean and Standard Error:
0.945  +/-  0.066


## Support Vector Classifier

In [55]:
from sklearn.svm import SVC

### BOW Feature Set

In [56]:
svc_bow = SVC(random_state=42)
svc_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', svc_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', svc_bow.score(Xtest_bow, ytest_bow))

Training set score: 0.6639344262295082

Test set score: 0.6448979591836734


In [57]:
svc_bow_cv = cross_val_score(svc_bow, Xtest_bow, ytest_bow, cv=10)

In [58]:
print('Support Vector Machine Cross Validation\n')
display(svc_bow_cv)
print('\nMean and Standard Error:')
print(round(svc_bow_cv.mean(),3),' +/- ',round(svc_bow_cv.std()*2,3))

Support Vector Machine Cross Validation



array([0.64      , 0.64      , 0.64      , 0.64      , 0.64      ,
       0.64      , 0.64      , 0.66666667, 0.65217391, 0.65217391])


Mean and Standard Error:
0.645  +/-  0.017


### TF IDF Feature Set

In [59]:
svc_lsa = SVC(random_state=42)
svc_lsa.fit(Xtrain_lsa, ytrain_lsa)
print('Training set score:', svc_lsa.score(Xtrain_lsa, ytrain_lsa))
print('\nTest set score:', svc_lsa.score(Xtest_lsa, ytest_lsa))

Training set score: 0.5584415584415584

Test set score: 0.5578378378378378


In [60]:
svc_lsa_cv = cross_val_score(svc_lsa, Xtest_bow, ytest_bow, cv=10)

In [61]:
print('Support Vector Machine Cross Validation\n')
display(svc_lsa_cv)
print('\nMean and Standard Error:')
print(round(svc_lsa_cv.mean(),3),' +/- ',round(svc_lsa_cv.std()*2,3))

Support Vector Machine Cross Validation



array([0.64      , 0.64      , 0.64      , 0.64      , 0.64      ,
       0.64      , 0.64      , 0.66666667, 0.65217391, 0.65217391])


Mean and Standard Error:
0.645  +/-  0.017


## Modeling Conclusion

The best model to model the BOW feature set was Logistic Regression, having the highest accuracy score and a fairly stable cross valiadation score.

The best model to predict with the TF IDF feature set was the Random Forest Classifier. 

Though Logistic Regression using BOW Features had the best testing score, the Random Forest using the TF IDF features showed a more stable cross validation score. Because of that, I will treat both of them as the most effective and try to improve on each of them.

# Making the Best Models Better

I want to squeeze even more accuracy out of my models by tweaking their hyper parameters to better fit the data.

## Logistic Regression using BOW Feature Set

In [62]:
constants = [.01,.1,1,5,7,8,9,10,11,12,15,20,30,50,70,100,150,200]
bestc = 0
bestscore = 0
for c in constants:
    lr2 = LogisticRegression(C=c,random_state=42)
    lr2.fit(Xtrain_bow, ytrain_bow)
    score = lr2.score(Xtest_bow, ytest_bow)
    if score > bestscore:
        bestc = c
        bestscore = score
        print('Best C value is ',c)

Best C value is  0.01
Best C value is  0.1
Best C value is  1
Best C value is  5
Best C value is  8


In [63]:
lr2_bow = LogisticRegression(C=bestc,random_state=42)

In [64]:
lr2_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', lr2_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', lr2_bow.score(Xtest_bow, ytest_bow))

Training set score: 0.9959016393442623

Test set score: 0.926530612244898


In [65]:
lr2_bow_cv = cross_val_score(lr2_bow, Xtest_bow, ytest_bow, cv=10)

In [66]:
print('Logistic Regression Cross Validation\n')
display(lr2_bow_cv)
print('\nMean and Standard Error:')
print(round(lr2_bow_cv.mean(),3),' +/- ',round(lr2_bow_cv.std()*2,3))

Logistic Regression Cross Validation



array([0.96      , 0.88      , 0.92      , 0.84      , 0.8       ,
       0.84      , 0.76      , 0.79166667, 0.95652174, 0.86956522])


Mean and Standard Error:
0.862  +/-  0.13


## Random Forest using TF IDF Feature Set

In [67]:
rfc2_lsa = RandomForestClassifier(n_estimators=100,max_depth=3,random_state=42)
rfc2_lsa.fit(Xtrain_lsa, ytrain_lsa)
print('Training set score:', rfc2_lsa.score(Xtrain_lsa, ytrain_lsa))
print('\nTest set score:', rfc2_lsa.score(Xtest_lsa, ytest_lsa))

Training set score: 0.9664502164502164

Test set score: 0.918918918918919


In [68]:
rfc2_lsa_cv = cross_val_score(rfc2_lsa, Xtest_lsa, ytest_lsa, cv=10)

In [69]:
print('Random Forest Cross Validation\n')
display(rfc2_lsa_cv)
print('\nMean and Standard Error:')
print(round(rfc2_lsa_cv.mean(),3),' +/- ',round(rfc2_lsa_cv.std()*2,3))

Random Forest Cross Validation



array([1.        , 1.        , 1.        , 1.        , 0.98924731,
       0.97849462, 0.90217391, 0.95652174, 0.98913043, 0.96703297])


Mean and Standard Error:
0.978  +/-  0.058


## Model Improvement Conclusion

I was able to imporve the Logistic Regression test score by 2.5%, as well as making the cross validation far more stable.

I was able to imporve the Random Forest Classifier test score by 2.4%, as well as making the cross valiation mean stightly more stable.

In the end, Logistic Regression benefitted from the optimization slightly more, and is the slightly better model overall. 