In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

# Predicting Author From Text

I want to make a model that can predict who has written a novel or other text based on features that I can parse form the text.  To accomplish this, I will use data from the Gutenberg corpus from NLTK, or the Natural Language Toolkit.  This is a collection of many free text corpra for those looking to analyze texts without having to pay for a corpra to analyze.

I will begin by importing my data.

I will use ten different texts from ten different authors: _Persuasion_ by Jane Austion, the poems of William Blake, the stories of William Cullen Bryant, _Buster Bear_ by Thornton Burgess, _Alice in Wonderland_ by Lewis Carroll, _The Man Who Became Thursday_ by G. K. Chesterton, _The Parent's Assistant_ by Maria Edgeworth, _Moby Dick_ by Herman Melville, _Hamelt_ by William Shakespeare, and _Leaves of Grass_ by Walt Whitman. 

This gives me quite a diverse set of texts and authors, having 4 novels, 2 collections of poems, 3 collections of short stories, and 1 play. 

In [2]:
import nltk
from nltk.corpus import gutenberg

In [3]:
import en_core_web_sm

# Importing and Cleaning

I will create features using Term Frequency and Inverse Document Frequency analysis. This will be best for my data because different authors structure their writing differently and express different themes, something TDIDF can learn and model.

I will import I text for each author.

In [45]:
persuasion = gutenberg.paras('austen-persuasion.txt')
#processing
persuasion_paras=[]
for paragraph in persuasion:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    persuasion_paras.append(' '.join(para))
    
persuasion_len = len(persuasion_paras)

In [46]:
blake = gutenberg.paras('blake-poems.txt')
#processing
blake_paras=[]
for paragraph in blake:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    blake_paras.append(' '.join(para))
    
blake_len = len(blake_paras)

In [47]:
bryant = gutenberg.paras('bryant-stories.txt')
#processing
bryant_paras=[]
for paragraph in bryant:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    bryant_paras.append(' '.join(para))
    
bryant_len = len(bryant_paras)

In [48]:
buster = gutenberg.paras('burgess-busterbrown.txt')
#processing
buster_paras=[]
for paragraph in buster:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    buster_paras.append(' '.join(para))
    
buster_len = len(buster_paras)

In [49]:
alice = gutenberg.paras('carroll-alice.txt')
#processing
alice_paras=[]
for paragraph in alice:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    alice_paras.append(' '.join(para))
    
alice_len = len(alice_paras)

In [50]:
thursday = gutenberg.paras('chesterton-thursday.txt')
#processing
thursday_paras=[]
for paragraph in thursday:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    thursday_paras.append(' '.join(para))
    
thursday_len = len(thursday_paras)

In [51]:
parents = gutenberg.paras('edgeworth-parents.txt')
#processing
parents_paras=[]
for paragraph in parents:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    parents_paras.append(' '.join(para))
    
parents_len = len(parents_paras)

In [52]:
moby = gutenberg.paras('melville-moby_dick.txt')
#processing
moby_paras=[]
for paragraph in moby:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    moby_paras.append(' '.join(para))
    
moby_len = len(moby_paras)

In [53]:
hamlet = gutenberg.paras('shakespeare-hamlet.txt')

#processing
shake_paras=[]
for paragraph in hamlet:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    para=[re.sub(r'Actus [A-Z][a-z]*\.','',word) for word in para] # Remove act numbers
    para=[re.sub(r'Scoena [A-Z][a-z]*\.','',word) for word in para] # Remove scene numbers
    para=[re.sub(r'[A-Z][a-z]*\s\.','',word) for word in para] # Remove charactr's names before their lines
    para=[re.sub(r'[A-Z][a-z]*\s\.','',word) for word in para] # Remove charactr's names before their lines
    #Forming each paragraph into a string and adding it to the list of strings.
    shake_paras.append(' '.join(para))
    

    
shake_len = len(shake_paras)

In [54]:
leaves = gutenberg.paras('whitman-leaves.txt')
#processing
leaves_paras=[]
for paragraph in leaves:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    leaves_paras.append(' '.join(para))
    
leaves_len = len(leaves_paras)

In [55]:
all_paras = persuasion_paras + blake_paras + bryant_paras + buster_paras + alice_paras + thursday_paras + parents_paras + moby_paras + shake_paras + leaves_paras

## Creating TFIDF Features

Now that all my data has been imported and stored, I can transform it into TFIDF features.

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [57]:
# create author labels for text
labels = []
for i in range(persuasion_len):
    labels.append('Austen')
for i in range(blake_len):
    labels.append('Blake')
for i in range(bryant_len):
    labels.append('Bryant')
for i in range(buster_len):
    labels.append('Burgess')
for i in range(alice_len):
    labels.append('Carroll')
for i in range(thursday_len):
    labels.append('Chesterton')
for i in range(parents_len):
    labels.append('Edgeworth')
for i in range(moby_len):
    labels.append('Melville')
for i in range(shake_len):
    labels.append('Shakespeare')
for i in range(leaves_len):
    labels.append('Whitman')

In [58]:
#splitting into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(all_paras, labels, test_size=0.5, random_state=0)

#Applying the vectorizer
X_train_tfidf=vectorizer.fit_transform(X_train)
X_test_tfidf=vectorizer.transform(X_test)
print("Number of features: %d" % X_train_tfidf.get_shape()[1])


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()
X_test_tfidf_csr = X_test_tfidf.tocsr()

Number of features: 6986


In [59]:
#number of paragraphs
n = X_train_tfidf_csr.shape[0]

#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()

#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]
    
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Original sentence: Supper over , the company went back to the bar - room , when , knowing not what else to do with myself , I resolved to spend the rest of the evening as a looker on .
Tf_idf vector: {'supper': 0.3190147090269327, 'company': 0.306504947382207, 'went': 0.23746727833213563, 'bar': 0.3458769414258709, 'room': 0.2666127080621043, 'knowing': 0.36319005599692344, 'resolved': 0.3537421348086776, 'spend': 0.38228005006490146, 'rest': 0.26938301991861796, 'evening': 0.28536759951639357}


In [60]:
#Our SVD data reducer.  We are going to reduce the feature space from 6986 to 400
svd= TruncatedSVD(400, random_state=0)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

Percent variance captured by all components: 46.00092644260595


In [61]:
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('\nComponent {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Component 0:
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Name: 0, dtype: float64

Component 1:
CHAPTER 64     0.999719
Chapter 2      0.999719
Chapter 5      0.999719
Chapter 9      0.999719
CHAPTER 96     0.999719
CHAPTER 132    0.999719
CHAPTER 114    0.999719
CHAPTER 77     0.999719
CHAPTER 103    0.999719
CHAPTER 94     0.999719
Name: 1, dtype: float64

Component 2:
" Oh , at Harrow ," said the policeman                                                         0.735348
" Now you do that ," said he .                                                                 0.671664
" I have done it ," he said hoarsely .                                                         0.662345
Whoever degrades another degrades me , And whatever is done or said returns at last to me .    0.655795
" They have done us ," he said , with brief military irony .                                   0.648707
" It is jolly to get

Here, I have shown the first 5 components by the entries that best exemplify them.  The left hand value is the sentence and the right hand value is how strongly the sentence correlates with that particular component.

It seems that compenent 0 targets the Hamlet's line prompts.

Component 1 targets Chapter titles which start with the word 'chapter'.

Component 2 seems to target sentences where one persons talks about the effect of another character's actions.

Component 3 targets exclamatory sentences begining with 'Oh'.

And component 4 targets the begining of the King's line prompts.

# Creating Clusters

I will now cluster my data and see if it will naturally group by author.

I will use the KMeans clustering technique, as it is the most flexible and least computationally intensive.

In [62]:
km1 = KMeans(n_clusters=10, random_state=0)
y_pred1 = km1.fit_predict(X_train_lsa)

In [63]:
km2 = KMeans(n_clusters=10, random_state=42)
y_pred2 = km2.fit_predict(X_train_lsa)

In [64]:
km3 = KMeans(n_clusters=10, random_state=1337)
y_pred3 = km3.fit_predict(X_train_lsa)

In [65]:
Y_train = np.array(Y_train)

In [66]:
print('KMeans 1 Inertia: ',km1.inertia_)
print('\nComparing the K-Means 1 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred1))

KMeans 1 Inertia:  6588.094229689489

Comparing the K-Means 1 clusters to the actual author groupings:

col_0         0    1    2   3    4     5   6   7    8   9
row_0                                                    
Austen       50   25   19  10   28   374  15  15    0   0
Blake         0   31   14   4    2    77   0   4    0   0
Bryant        2   65  132   3   70   274   0  24    0  24
Burgess       2   10   28   0    4    83   0   3    0   0
Carroll       0   31   27   4  116   226   4   4    0  27
Chesterton    5   44   17  11  163   358   9  49    0   0
Edgeworth    91  101   72  33  283  1186   4  68    0   0
Melville      7  427   26  13   66   706  72  78    0   3
Shakespeare  26   30    0   1    0   216   0  25  141  43
Whitman       0  474   19  47    3   575   0  91    0   0


In [67]:
print('KMeans 2 Inertia: ',km2.inertia_)
print('\nComparing the K-Means 2 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred2))

KMeans 2 Inertia:  6572.738657087793

Comparing the K-Means 2 clusters to the actual author groupings:

col_0          0   1    2    3   4   5   6    7     8   9
row_0                                                    
Austen        20  15   30   22  30   0  15    0   389  15
Blake         14   1    2   48   2   0   5    0    60   0
Bryant       128  14   69   57  13  24  13    0   276   0
Burgess       28   1    4    6   5   0   0    0    86   0
Carroll       25   1  103   20  36  27   9    0   214   4
Chesterton    15  41  163   41  22   0  11    0   354   9
Edgeworth     72  45  291   68  97   0  82    0  1179   4
Melville      23  61   66  450  31   3  18    0   674  72
Shakespeare    0  46    0   46   2  43   1  141   203   0
Whitman       19  47    3  637  36   0   0    0   467   0


In [68]:
print('KMeans 3 Inertia: ',km3.inertia_)
print('\nComparing the K-Means 3 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred3))

KMeans 3 Inertia:  6586.917000775612

Comparing the K-Means 3 clusters to the actual author groupings:

col_0         0    1    2    3   4   5    6    7     8   9
row_0                                                     
Austen        0   21    0    0  14  19   49   24   394  15
Blake         0   14    0    0   1   0    2   66    49   0
Bryant       22  127    2    0  14  13   76   71   269   0
Burgess       0   28    0    0   1   3    4   14    80   0
Carroll      27   28    0    0   1   2  121   26   230   4
Chesterton    0   15    0    0  42  11  173   59   347   9
Edgeworth     0   66    0    0  43  73  325   94  1233   4
Melville      2   21  172    0  57  27   73  285   690  71
Shakespeare  43    0   26  141   0  30    1    5   236   0
Whitman       0   17    0    0  41  20    3  741   387   0


My clusters are certainly not grouping by author.  This indicates that the difference between author may not be instantly apparent from the feature set I have created.

However, my groupings to seem to be somewhat consistent.  And if the groups are consitently being formed, they must be meaningful in some way.  There is always 8 certain groupings of clusters (given with their cluster number in each KMeans run):

__(8, 7, 3)__: a cluster containing exactly 141 Shakespeare entries and nothing else. This is the only cluster that is exactly the same between KMeans runs.

__(9, 5, 0)__: a sparsely populated cluster containing only Bryant, Carroll, and Shakespeare.  This cluster is almost exactly the same between KMeans runs, only have a difference of 2 or 3 texts total.

__(6, 9, 9)__: A low populated cluster with plurality Melville and Austin as the other author of significant representation. This cluster was extremely similar in count between KMeans runs, though not exactly the same.

__(5, 8, 8)__: A heavily populated cluster dominating by Edgeworth but containing a every other author in similar ratios.  This was consistently the highest populated cluster by far.

__(2, 0, 1)__: a medium population cluster dominated by Byant but containing all authors.

__(4, 2, 6)__: a medium population cluster dominated by Edgeworth followed by Chesterton and Carroll, though it usually contains most authors. Also marked by it's consistently low amount of Blake, Burgess, and Skakespeare. 

__(7, 1, 4)__: a sparsely populated cluster with pluraity Melville, followed by Edgeworth and Chesterton, though it usually contains all authors.  This cluster is also marked by its consistently low amount of Blake, Burgess, and Carroll.

__(1, 3, 7)__: a medium to largely populated cluster dominated by Whitman and Melville though containing all authors. This is the most variable of the recognizable clusters.

Though there are some vague similarities between the remaining 2 clusters in each iteration, there are no trends that stay consistent across all runs. These clusters rely on the groupings of the other clusters to form than from an inherently strong grouping amoung the data.

## Taking a Closer Look at Clusters

I want to determine how the groups are being formed. To do this, I will look at the most important components behind each one.

In [69]:
paras_by_component_with_label=pd.DataFrame(X_train_lsa,index=X_train)
paras_by_component_with_label['Author'] = Y_train
paras_by_component_with_label['Predict1'] = y_pred1
paras_by_component_with_label['Predict2'] = y_pred2
paras_by_component_with_label['Predict3'] = y_pred3

In [177]:
def find_top_components(df, ind):
    top_components = []
    top_tfidf = []
    for i in range(ind):
        max_tfidf = max(df.loc[:,i])
        if (max_tfidf) > 0:
            if len(top_components) == 3:
                if max_tfidf > min(top_tfidf):
                    del top_components[np.argmin(top_tfidf)]
                    del top_tfidf[np.argmin(top_tfidf)]
                    top_components.append(i)
                    top_tfidf.append(max_tfidf)
            else:
                top_components.append(i)
                top_tfidf.append(max_tfidf) 
    return top_components

In [176]:
def show_top_factors(index, df):
    for i in index:
        print('\nComponent {}:'.format(i))
        print(df.loc[:,i].sort_values(ascending=False)[0:5])

In [72]:
cluster_10 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==0]
cluster_11 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==1]
cluster_12 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==2]
cluster_13 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==3]
cluster_14 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==4]
cluster_15 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==5]
cluster_16 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==6]
cluster_17 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==7]
cluster_18 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==8]
cluster_19 = paras_by_component_with_label[paras_by_component_with_label['Predict1']==9]

In [73]:
cluster_20 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==0]
cluster_21 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==1]
cluster_22 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==2]
cluster_23 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==3]
cluster_24 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==4]
cluster_25 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==5]
cluster_26 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==6]
cluster_27 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==7]
cluster_28 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==8]
cluster_29 = paras_by_component_with_label[paras_by_component_with_label['Predict2']==9]

In [74]:
cluster_30 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==0]
cluster_31 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==1]
cluster_32 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==2]
cluster_33 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==3]
cluster_34 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==4]
cluster_35 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==5]
cluster_36 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==6]
cluster_37 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==7]
cluster_38 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==8]
cluster_39 = paras_by_component_with_label[paras_by_component_with_label['Predict3']==9]

In [178]:
comp_10 = find_top_components(cluster_10, 300)
comp_11 = find_top_components(cluster_11, 300)
comp_12 = find_top_components(cluster_12, 300)
comp_13 = find_top_components(cluster_13, 300)
comp_14 = find_top_components(cluster_14, 300)
comp_15 = find_top_components(cluster_15, 300)
comp_16 = find_top_components(cluster_16, 300)
comp_17 = find_top_components(cluster_17, 300)
comp_18 = find_top_components(cluster_18, 300)
comp_19 = find_top_components(cluster_19, 300)

In [179]:
comp_20 = find_top_components(cluster_20, 300)
comp_21 = find_top_components(cluster_21, 300)
comp_22 = find_top_components(cluster_22, 300)
comp_23 = find_top_components(cluster_23, 300)
comp_24 = find_top_components(cluster_24, 300)
comp_25 = find_top_components(cluster_25, 300)
comp_26 = find_top_components(cluster_26, 300)
comp_27 = find_top_components(cluster_27, 300)
comp_28 = find_top_components(cluster_28, 300)
comp_29 = find_top_components(cluster_29, 300)

In [180]:
comp_30 = find_top_components(cluster_30, 300)
comp_31 = find_top_components(cluster_31, 300)
comp_32 = find_top_components(cluster_32, 300)
comp_33 = find_top_components(cluster_33, 300)
comp_34 = find_top_components(cluster_34, 300)
comp_35 = find_top_components(cluster_35, 300)
comp_36 = find_top_components(cluster_36, 300)
comp_37 = find_top_components(cluster_37, 300)
comp_38 = find_top_components(cluster_38, 300)
comp_39 = find_top_components(cluster_39, 300)

### (8, 7, 3): 141 Skakespeare

This was the only cluster to stay exactly the same amoung clustering, so it's grouping must be strong.

In [181]:
print("Top Factors for K-Means 1, Cluster 8:")
show_top_factors(comp_18, cluster_18)

Top Factors for K-Means 1, Cluster 8:

Component 0:
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Name: 0, dtype: float64

Component 205:
Ham .    1.406077e-13
Ham .    1.397978e-13
Ham .    1.391472e-13
Ham .    1.391472e-13
Ham .    1.391472e-13
Name: 205, dtype: float64

Component 293:
Ham .    1.326131e-13
Ham .    1.295566e-13
Ham .    1.262961e-13
Ham .    1.262875e-13
Ham .    1.262875e-13
Name: 293, dtype: float64


In [182]:
print("Top Factors for K-Means 2, Cluster 7:")
show_top_factors(comp_27, cluster_27)

Top Factors for K-Means 2, Cluster 7:

Component 0:
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Name: 0, dtype: float64

Component 205:
Ham .    1.406077e-13
Ham .    1.397978e-13
Ham .    1.391472e-13
Ham .    1.391472e-13
Ham .    1.391472e-13
Name: 205, dtype: float64

Component 293:
Ham .    1.326131e-13
Ham .    1.295566e-13
Ham .    1.262961e-13
Ham .    1.262875e-13
Ham .    1.262875e-13
Name: 293, dtype: float64


In [183]:
print("Top Factors for K-Means 3, Cluster 3:")
show_top_factors(comp_33, cluster_33)

Top Factors for K-Means 3, Cluster 3:

Component 0:
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Ham .    1.0
Name: 0, dtype: float64

Component 205:
Ham .    1.406077e-13
Ham .    1.397978e-13
Ham .    1.391472e-13
Ham .    1.391472e-13
Ham .    1.391472e-13
Name: 205, dtype: float64

Component 293:
Ham .    1.326131e-13
Ham .    1.295566e-13
Ham .    1.262961e-13
Ham .    1.262875e-13
Ham .    1.262875e-13
Name: 293, dtype: float64


This cluster appears to be entirely made up of sentences where Hamlet is about to say a line. 

Component 0, the most important by far for each component simply seems to model the line prompt for Hamlet from _Hamlet_ (in this version of the play, lines are prompted by the first 3-4 letters of the character's name and a period). 

All the other component coefficients are extremely low for each entry, which is constantly Hamlet line prompts. These coeffecients are entirely unrelated, and impossible to discern from this alone.

This cluster is obviously just Hamlet line prompts, pure and simple. It is interesting that each KMeans iteration found this to be the most important cluster to create over and over. Probably because line prompts are extremely predictable and easy to model and Hamlet has a lot of lines in _Hamlet_.

### (9, 5, 0): Bryant, Carroll, and Shakespeare

Though not comletely the same between runs, this cluster was still relatively remarkably similar.

In [184]:
print("Top Factors for K-Means 1, Cluster 9:")
show_top_factors(comp_19, cluster_19)

Top Factors for K-Means 1, Cluster 9:

Component 4:
King .    0.969387
King .    0.969387
King .    0.969387
King .    0.969387
King .    0.969387
Name: 4, dtype: float64

Component 24:
' It ' s the oldest rule in the book ,' said the King .                                                                                                  0.527404
Aloft , like a royal czar and king , the sun seemed giving this gentle air to this bold and rolling sea ; even as bride to groom .                       0.057380
At the end of three years , the Pope called the Emperor of Allemaine and the King of Sicily , his brothers , to a great meeting in his city of Rome .    0.054680
And that same night a mighty sea - king came up and slew Frode and plundered his city .                                                                  0.051369
At last all the festivities were over , and the King of Sicily went home to his own land again , with his people .                                       0.045215
Name

In [185]:
print("Top Factors for K-Means 2, Cluster 5:")
show_top_factors(comp_25, cluster_25)

Top Factors for K-Means 2, Cluster 5:

Component 4:
King .    0.969387
King .    0.969387
King .    0.969387
King .    0.969387
King .    0.969387
Name: 4, dtype: float64

Component 24:
' It ' s the oldest rule in the book ,' said the King .                                                                                                  0.527404
Aloft , like a royal czar and king , the sun seemed giving this gentle air to this bold and rolling sea ; even as bride to groom .                       0.057380
At the end of three years , the Pope called the Emperor of Allemaine and the King of Sicily , his brothers , to a great meeting in his city of Rome .    0.054680
And that same night a mighty sea - king came up and slew Frode and plundered his city .                                                                  0.051369
At last all the festivities were over , and the King of Sicily went home to his own land again , with his people .                                       0.045215
Name

In [186]:
print("Top Factors for K-Means 3, Cluster 0:")
show_top_factors(comp_30, cluster_30)

Top Factors for K-Means 3, Cluster 0:

Component 4:
King .    0.969387
King .    0.969387
King .    0.969387
King .    0.969387
King .    0.969387
Name: 4, dtype: float64

Component 24:
' It ' s the oldest rule in the book ,' said the King .                                                                                                  0.527404
Aloft , like a royal czar and king , the sun seemed giving this gentle air to this bold and rolling sea ; even as bride to groom .                       0.057380
At the end of three years , the Pope called the Emperor of Allemaine and the King of Sicily , his brothers , to a great meeting in his city of Rome .    0.054680
And that same night a mighty sea - king came up and slew Frode and plundered his city .                                                                  0.051369
At last all the festivities were over , and the King of Sicily went home to his own land again , with his people .                                       0.045215
Name

This cluster shares all the same top components: 4, 24, and 35. 

Component 4 deals wih the line prompt for the King in Hamlet.

Component 24 deals with sentences about Kings and cities.

Component 35 deals with quotoations talking about or said by a king.

The obvious theme for this cluster is Kings, dealing with every aspect of the 3 texts which include kings.

### (6, 9, 9): Melville and Austin
This cluster was extremely similar in count between KMeans runs, though not exactly the same.

In [187]:
print("Top Factors for K-Means 1, Cluster6 6:")
show_top_factors(comp_16, cluster_16)

Top Factors for K-Means 1, Cluster6 6:

Component 1:
CHAPTER 64     0.999719
Chapter 4      0.999719
CHAPTER 106    0.999719
CHAPTER 69     0.999719
CHAPTER 30     0.999719
Name: 1, dtype: float64

Component 137:
CHAPTER II       0.316544
CHAPTER II .     0.316544
CHAPTER III .    0.059989
CHAPTER III .    0.059989
CHAPTER III      0.059989
Name: 137, dtype: float64

Component 226:
CHAPTER III                                                                  0.623005
CHAPTER III .                                                                0.623005
CHAPTER III .                                                                0.623005
CHAPTER IV .                                                                 0.017074
* Vide " Priestley ' s History of Vision ," chapter on coloured shadows .    0.001236
Name: 226, dtype: float64


In [188]:
print("Top Factors for K-Means 2, Cluster 9:")
show_top_factors(comp_29, cluster_29)

Top Factors for K-Means 2, Cluster 9:

Component 1:
CHAPTER 64     0.999719
Chapter 4      0.999719
CHAPTER 106    0.999719
CHAPTER 69     0.999719
CHAPTER 30     0.999719
Name: 1, dtype: float64

Component 137:
CHAPTER II       0.316544
CHAPTER II .     0.316544
CHAPTER III .    0.059989
CHAPTER III .    0.059989
CHAPTER III      0.059989
Name: 137, dtype: float64

Component 226:
CHAPTER III                                                                  0.623005
CHAPTER III .                                                                0.623005
CHAPTER III .                                                                0.623005
CHAPTER IV .                                                                 0.017074
* Vide " Priestley ' s History of Vision ," chapter on coloured shadows .    0.001236
Name: 226, dtype: float64


In [189]:
print("Top Factors for K-Means 3, Cluster 9:")
show_top_factors(comp_39, cluster_39)

Top Factors for K-Means 3, Cluster 9:

Component 1:
CHAPTER 64     0.999719
Chapter 4      0.999719
CHAPTER 106    0.999719
CHAPTER 127    0.999719
CHAPTER 69     0.999719
Name: 1, dtype: float64

Component 137:
CHAPTER II       0.316544
CHAPTER II .     0.316544
CHAPTER III .    0.059989
CHAPTER III .    0.059989
CHAPTER III      0.059989
Name: 137, dtype: float64

Component 226:
CHAPTER III                                                                  0.623005
CHAPTER III .                                                                0.623005
CHAPTER III .                                                                0.623005
CHAPTER IV .                                                                 0.017074
* Vide " Priestley ' s History of Vision ," chapter on coloured shadows .    0.001236
Name: 226, dtype: float64


The clusters here share the same top 3 components, a testiment to their similarity: 1, 137, and 226.

Component 1 deals with Chapter Titles ending in numbers.

Component 137 likely deals with something unrelated to chapters titles judging by the low component correlation of each entry.  This indicates this cluster mostly or only contains chapter titles.

Component 226 seems to deal with Chapter Titles with roman numerals.

This cluster seems to contain Chapter titles, and perhaps other sentence fragments here and there with the word 'chapter' judging by the fact that is isn't perfectly the same between iterations.

### __(5, 8, 8)__: High Pop, Edgeworth Dominated
This is the most populated cluster by far. It probably has a common theme that all authors share and the fact that it is mostly Edgeworth comes from the fact that Edgeworth simply has more data points.

In [193]:
print("Top Factors for K-Means 1, Cluster 5:")
show_top_factors(comp_15, cluster_15)

Top Factors for K-Means 1, Cluster 5:

Component 22:
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Name: 22, dtype: float64

Component 26:
Mar .    1.0
Mar .    1.0
Mar .    1.0
Mar .    1.0
Mar .    1.0
Name: 26, dtype: float64

Component 37:
Pol .    1.0
Pol .    1.0
Pol .    1.0
Pol .    1.0
Pol .    1.0
Name: 37, dtype: float64


In [191]:
print("Top Factors for K-Means 2, Cluster 8:")
show_top_factors(comp_28, cluster_28)

Top Factors for K-Means 2, Cluster 8:

Component 14:
Qu .    0.999992
Qu .    0.999992
Qu .    0.999992
Qu .    0.999992
Qu .    0.999992
Name: 14, dtype: float64

Component 37:
Pol .    1.0
Pol .    1.0
Pol .    1.0
Pol .    1.0
Pol .    1.0
Name: 37, dtype: float64

Component 91:
Osr .    1.0
Osr .    1.0
Osr .    1.0
Osr .    1.0
Osr .    1.0
Name: 91, dtype: float64


In [192]:
print("Top Factors for K-Means 3, Cluster 8:")
show_top_factors(comp_38, cluster_38)

Top Factors for K-Means 3, Cluster 8:

Component 22:
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Name: 22, dtype: float64

Component 26:
Mar .    1.0
Mar .    1.0
Mar .    1.0
Mar .    1.0
Mar .    1.0
Name: 26, dtype: float64

Component 37:
Pol .    1.0
Pol .    1.0
Pol .    1.0
Pol .    1.0
Pol .    1.0
Name: 37, dtype: float64


The most important factors to this cluster group were all different, but they were all Shakespeare play line prompts. This is interesting since Shakespeare is a low minority in this cluster.

This is explained by the fact that line prompts are a very predictable concept, so they will be an important factor to any cluster they are a part of. Because so many different authors are contained in this group, in fact near half of the text is contained in this grouped, it must model something more common than line prompts. 

This cluster likely models short sentences, which line prompts would all fall into and pretty much all texts have a lot of.

### (2, 0, 1): Byant and the Rest
This cluster is populated by a lot of authors, but the Bryant domination is clear.

In [194]:
print("Top Factors for K-Means 1, Cluster 2:")
show_top_factors(comp_12, cluster_12)

Top Factors for K-Means 1, Cluster 2:

Component 3:
" Oh , no , no ; you are too little !"                            0.662676
" Oh , such a little creature ; to have so much sense , too !"    0.572320
" Oh no , no , no ; you are too little , you are too little !"    0.435618
" Oh , sing again , little Nightingale ," begged Death .          0.347084
" Oh ," said the little Jackal , " you want my opinion ?          0.254034
Name: 3, dtype: float64

Component 15:
" A little .                   0.535618
THE LITTLE MERCHANTS .         0.529686
THE LITTLE VAGABOND            0.528746
TWO LITTLE RIDDLES IN RHYME    0.528551
LITTLE JACK ROLLAROUND         0.523913
Name: 15, dtype: float64

Component 38:
Once upon a time there was a little Red Hen , who lived on a farm all by herself .                                                                                                                                                                                                                    

In [195]:
print("Top Factors for K-Means 2, Cluster 0:")
show_top_factors(comp_20, cluster_20)

Top Factors for K-Means 2, Cluster 0:

Component 15:
" A little .                   0.535618
THE LITTLE MERCHANTS .         0.529686
THE LITTLE VAGABOND            0.528746
TWO LITTLE RIDDLES IN RHYME    0.528551
LITTLE JACK ROLLAROUND         0.523913
Name: 15, dtype: float64

Component 32:
" You dear , lovely little thing !"                                                                                                                                                                                    0.506123
" DEAR MARY , NANCY , AND LITTLE PEG ,                                                                                                                                                                                 0.452717
Once upon a time there was a dear little girl , whose name was Elsa .                                                                                                                                                  0.396748
The old Alligator was so furious th

In [196]:
print("Top Factors for K-Means 3, Cluster 1:")
show_top_factors(comp_31, cluster_31)

Top Factors for K-Means 3, Cluster 1:

Component 3:
" Oh , no , no ; you are too little !"                            0.662676
" Oh , such a little creature ; to have so much sense , too !"    0.572320
" Oh no , no , no ; you are too little , you are too little !"    0.435618
" Oh , sing again , little Nightingale ," begged Death .          0.347084
" Oh ," said the little Jackal , " you want my opinion ?          0.254034
Name: 3, dtype: float64

Component 15:
" A little .                   0.535618
THE LITTLE MERCHANTS .         0.529686
THE LITTLE VAGABOND            0.528746
TWO LITTLE RIDDLES IN RHYME    0.528551
LITTLE JACK ROLLAROUND         0.523913
Name: 15, dtype: float64

Component 38:
Once upon a time there was a little Red Hen , who lived on a farm all by herself .                                                                                                                                                                                                                    

Not all clusters share their top 3 components here, only 2 are shared: 15 and 38. The First and Third iteration share component 3 and the Second iteration has component 32.

Component 15 seems to deal with titles or other very short sentences using the word 'little'.

Component 38 deals with sentences with more than one clause which use the word 'little'.

Component 3 deals with dialouge where one character describes another as 'little' and uses the word 'oh'.

Component 32 deals with descriptive sentences where a character is decribed as little and somehow alone or unreachable by other characters.

The theme of this cluster is very clear, if not a little odd, no pun intended. It completely deals with sentences using the word 'little'. If I wanted to get rid of this cluster I might add 'little' to the stop words for this vectorizer.

### (4, 2, 6): Edgeworth, Chesterton, and Carroll, minus Blake, Burgess, and Skakespeare.
This cluster has many authors, but the fact that is never has Blake, Burgess, or Shakespeare may be a key to it's theme.

In [197]:
print("Top Factors for K-Means 1, Cluster 4:")
show_top_factors(comp_14, cluster_14)

Top Factors for K-Means 1, Cluster 4:

Component 2:
" Oh , at Harrow ," said the policeman                                                         0.735348
" Now you do that ," said he .                                                                 0.671664
" I have done it ," he said hoarsely .                                                         0.662345
Whoever degrades another degrades me , And whatever is done or said returns at last to me .    0.655795
" They have done us ," he said , with brief military irony .                                   0.648707
Name: 2, dtype: float64

Component 6:
" Yes , sir ," said Sir Arthur , pulling the lease out of his pocket .                                                                                           0.702896
" Yes , sir , everything ," said the attendant gravely .                                                                                                         0.594378
" You have done him no injury ," said Sir Arthur ,

In [198]:
print("Top Factors for K-Means 2, Cluster 2:")
show_top_factors(comp_22, cluster_22)

Top Factors for K-Means 2, Cluster 2:

Component 2:
" Now you do that ," said he .                                                                 0.671664
" I have done it ," he said hoarsely .                                                         0.662345
Whoever degrades another degrades me , And whatever is done or said returns at last to me .    0.655795
" They have done us ," he said , with brief military irony .                                   0.648707
" It is jolly to get some pals ," he said .                                                    0.643445
Name: 2, dtype: float64

Component 6:
" Yes , sir ," said Sir Arthur , pulling the lease out of his pocket .                                                                                           0.702896
" Yes , sir , everything ," said the attendant gravely .                                                                                                         0.594378
" You have done him no injury ," said Sir Arthur ,

In [199]:
print("Top Factors for K-Means 3, Cluster 6:")
show_top_factors(comp_36, cluster_36)

Top Factors for K-Means 3, Cluster 6:

Component 2:
" Oh , at Harrow ," said the policeman                                                         0.735348
" Now you do that ," said he .                                                                 0.671664
" I have done it ," he said hoarsely .                                                         0.662345
Whoever degrades another degrades me , And whatever is done or said returns at last to me .    0.655795
" They have done us ," he said , with brief military irony .                                   0.648707
Name: 2, dtype: float64

Component 3:
" Oh !"               0.873917
" Oh !                0.873917
" Oh !                0.873917
" Oh , EDICATION !    0.873917
" Oh !                0.873917
Name: 3, dtype: float64

Component 6:
" Yes , sir ," said Sir Arthur , pulling the lease out of his pocket .                                                                                           0.702896
" Yes , sir , everything ,"

This cluster also does not share all it's top components, only having 2 i all 3 iterations: components 2 and 6. The First and Second iteration share top component 35 and the Third has component 3. 

Component 2 deals with things being done to others, especially with the connotation of authority.

Component 6 deals with quotations using the word 'sir' and/or adressing a character with 'Sir' in their name.

Component 35 seems to deal with quotations where one character expresses some kind of will to not do something. 

Component 3 deals with exclaimations using 'Oh'.

This cluster seems to have a theme of quotation responding to authority, whether it be by complying or refusing to comply. The authors omitted make perfect sense because they do not use quotations in their writing.

### (7, 1, 4): Melville, Edgeworth, and Chesterton minus Blake, Burgess, and Carroll.
Once again, remebering which authors are omitted may be key to finding the theme.

In [200]:
print("Top Factors for K-Means 1, Cluster 7:")
show_top_factors(comp_17, cluster_17)

Top Factors for K-Means 1, Cluster 7:

Component 12:
" You don ' t know , man !                                                                                         0.647269
" But if you don ' t teach him better now he is a child , how will he know when he is a man ?"                     0.459112
" Did you know ," he asked , " that that man Gogol was one of us ?"                                                0.430968
" I know it , old man ; these stubbs will weld together like glue from the melted bones of murderers .             0.336463
" I did not know that I fixed my eyes upon you ; I was thinking of my fireworks ," said the poor man , simply .    0.302457
Name: 12, dtype: float64

Component 14:
Qu .    0.999992
Qu .    0.999992
Qu .    0.999992
Qu .    0.999992
Qu .    0.999992
Name: 14, dtype: float64

Component 17:
" What ' s the old man have so much to do with him for ?"    0.585315
Old Man .                                                    0.585315
" Have you not robbed 

In [201]:
print("Top Factors for K-Means 2, Cluster 1:")
show_top_factors(comp_21, cluster_21)

Top Factors for K-Means 2, Cluster 1:

Component 11:
Laer .    1.0
Laer .    1.0
Laer .    1.0
Laer .    1.0
Laer .    1.0
Name: 11, dtype: float64

Component 17:
Old Man .                                                    0.585315
" What ' s the old man have so much to do with him for ?"    0.585315
" Have you not robbed this old man ?                         0.579286
" Well , man !                                               0.566657
It is , man .                                                0.566657
Name: 17, dtype: float64

Component 22:
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Ophe .    1.0
Name: 22, dtype: float64


In [202]:
print("Top Factors for K-Means 3, Cluster 4:")
show_top_factors(comp_34, cluster_34)

Top Factors for K-Means 3, Cluster 4:

Component 2:
" Oh , they have been punished enough ," said the old man ; " forgive them , sir ."     0.571382
" Oh , shut it ," said the man in spectacles .                                          0.527614
" I am ," said the man .                                                                0.471594
" I found ," said he , " that I was considered by Harville an engaged man !             0.422711
" Don ' t be such a silly man ," he said , with the effeminate dignity of a curate .    0.409533
Name: 2, dtype: float64

Component 12:
" You don ' t know , man !                                                                                0.647269
" But if you don ' t teach him better now he is a child , how will he know when he is a man ?"            0.459112
" Did you know ," he asked , " that that man Gogol was one of us ?"                                       0.430968
" I don ' t know that , my little man ; I never yet saw him kneel ."           

This cluster was markedly less recognizeable than the others, and it shows in its top factors across iterations, only sharing 1 top component, 17. 12 was also shared by the First and Third iterations, but the rest of the top compenents, 2, 11, 12, 22, were unique.  Remember, eahc iteration undoubtedly values these compenents highly so they will all be usefull to find the theme; they just don't value them in exactly the same way.

Component 17 deals with quotations and short sentences address a man or containing the word 'man', especially and old man.

Component 12 deals with quotations about men, especially concerning their membership of a particular group. 

Component 2 deals with quotation about men, especially about one charter acknowledging another character. 

Component 14, which is only is the First iteration, deals with line prompts from Qu from _Hamlet_. So the first iteration got line prompts from one character.

Component 11 and 22, which are only top components in the Second iteration, deal with line prompts from 2 particular characters from _Hamlet_. So the second iteration got line prompts from 2 characters, indicating the overall theme may be weakest in it.

This cluster seems to deal with sentences and quotations dealing with the word 'man'. Though it is somewhat inconsistent, and while frequentll get line prompts, too.

### (1, 3, 7): Whitman and Melville
The least consistent of all the recognizable clusters.

In [207]:
print("Top Factors for K-Means 1, Cluster 1:")
show_top_factors(comp_11, cluster_11)

Top Factors for K-Means 1, Cluster 1:

Component 7:
Hor .    1.0
Hor .    1.0
Hor .    1.0
Hor .    1.0
Hor .    1.0
Name: 7, dtype: float64

Component 8:
Just .    0.848219
Just .    0.848219
Just .    0.848219
Just .    0.848219
Just .    0.848219
Name: 8, dtype: float64

Component 13:
" Great Lord !"                                                                                                                                                                                                                       0.777441
" Now the Lord had prepared a great fish to swallow up Jonah ."                                                                                                                                                                       0.596483
When serenely advancing on one of these journeys , if any strange suspicious sights are seen , my lord whale keeps a wary eye on his interesting family .                                                                             0.4

In [208]:
print("Top Factors for K-Means 2, Cluster 3:")
show_top_factors(comp_23, cluster_23)

Top Factors for K-Means 2, Cluster 3:

Component 7:
Hor .    1.0
Hor .    1.0
Hor .    1.0
Hor .    1.0
Hor .    1.0
Name: 7, dtype: float64

Component 23:
" And when shall I find it ?"                                                                                              0.764985
6 What will be will be well , for what is is well , To take interest is well , and not to take interest shall be well .    0.764985
" Where shall we go ?"                                                                                                     0.764985
" Perhaps I shall .                                                                                                        0.764985
"' Shall we ?'                                                                                                             0.764985
Name: 23, dtype: float64

Component 26:
Mar .    1.0
Mar .    1.0
Mar .    1.0
Mar .    1.0
Mar .    1.0
Name: 26, dtype: float64


In [209]:
print("Top Factors for K-Means 3, Cluster 7:")
show_top_factors(comp_37, cluster_37)

Top Factors for K-Means 3, Cluster 7:

Component 23:
" And when shall I find it ?"                                                                                              0.764985
"' Shall we ?'                                                                                                             0.764985
" Shall we go back , then ?"                                                                                               0.764985
6 What will be will be well , for what is is well , To take interest is well , and not to take interest shall be well .    0.764985
" Where shall we go ?"                                                                                                     0.764985
Name: 23, dtype: float64

Component 24:
[ BOOK XXV ]       0.804744
[ BOOK XXXI ]      0.804744
[ BOOK XXIX ]      0.804744
[ BOOK XXVIII ]    0.804744
[ BOOK XXXII .     0.804744
Name: 24, dtype: float64

Component 34:
6 Land of lands and bards to corroborate !                         

The irregularity of this cluster is really starting to show, having no top 3 components shared by all 3 iterations. Component 7 is shared by the First and Second iteration and Component 23 is shared by the Second and Third iteration.

Component 7 deals with Horatio line prompts.

Component 23 deals with questions using the word 'shall'

For the components only in the First iteration: Component 8 deals with Justine line prompts. Components 13 deals with quotations using the word 'lord', especially when using it to literally mean god.

The only top component in the Second iteration, 26, deals with Mar line prompts. 

The components only in the Third iteration are component 24 and 34. Component 24 deals with Book titles. Componenent 34 deals decriptive language that decribes natures, especially referring to god or personifying nature.

Though it has some inconsistencies, sometimes getting random line prompts and Chapter titles, the overaching theme of this cluster seems to be characterizing some kind of higher power, perhaps even asking it for something.

# Predicting Author Using Various Models

I will now test my ability to predict author from text. There are many different types of models with many different usesm but I will try 3 different ones here: Logistic Regression, Grandient-Boosted Decision Trees, and Random Forest. 

I will check each models cross validation score to check the overall health of the model

In [102]:
from sklearn.model_selection import cross_val_score

In [103]:
# Bag of Words Feature Set
X_bow = total_word_counts.drop(['text_author','text_sentence'],1)
y_bow = total_word_counts.text_author
Xtrain_bow, Xtest_bow, ytrain_bow, ytest_bow = train_test_split(X_bow,y_bow,test_size=0.5,random_state=42)

## Logistic Regression

In [104]:
from sklearn.linear_model import LogisticRegression

In [110]:
constants = [.01,.1,1,10,100,200,300,400,500,700,1000]
bestc = 0
bestscore = 0
for c in constants:
    lr_lsa = LogisticRegression(C=c,random_state=42)
    lr_lsa.fit(X_train_lsa, Y_train)
    score = lr_lsa.score(X_test_lsa, Y_test)
    if score > bestscore:
        bestc = c
        bestscore = score
        print('Best C value is ',c)

Best C value is  0.01
Best C value is  0.1
Best C value is  1
Best C value is  10
Best C value is  100
Best C value is  200


In [111]:
lr_lsa = LogisticRegression(C=bestc,random_state=42)

In [112]:
lr_lsa.fit(X_train_lsa, Y_train)
print('Training set score:', lr_lsa.score(X_train_lsa, Y_train))
print('\nTest set score:', lr_lsa.score(X_test_lsa, Y_test))

Training set score: 0.809009981116806

Test set score: 0.7098732128405719


In [113]:
lr_lsa_cv = cross_val_score(lr_lsa, X_test_lsa, Y_test, cv=10)

In [114]:
print('Logistic Regression Cross Validation\n')
display(lr_lsa_cv)
print('\nMean and Standard Error:')
print(round(lr_lsa_cv.mean(),3),' +/- ',round(lr_lsa_cv.std()*2,3))

Logistic Regression Cross Validation



array([0.68322148, 0.7033557 , 0.70659489, 0.68506057, 0.72005384,
       0.70350404, 0.68918919, 0.70135135, 0.70420624, 0.70380435])


Mean and Standard Error:
0.7  +/-  0.021


## Gradient-Boosted Decision Trees

In [115]:
from sklearn.ensemble import GradientBoostingClassifier

In [119]:
clf_lsa = GradientBoostingClassifier(n_estimators=50,max_depth=2,random_state=42)
clf_lsa.fit(X_train_lsa, Y_train)
print('Training set score:', clf_lsa.score(X_train_lsa, Y_train))
print('\nTest set score:', clf_lsa.score(X_test_lsa, Y_test))

Training set score: 0.668869705961694

Test set score: 0.5967089290531427


In [120]:
clf_lsa_cv = cross_val_score(clf_lsa, X_test_lsa, Y_test, cv=5)

In [121]:
print('Gradient-Boosted Tree Cross Validation\n')
display(clf_lsa_cv)
print('\nMean and Standard Error:')
print(round(clf_lsa_cv.mean(),3),' +/- ',round(clf_lsa_cv.std()*2,3))

Gradient-Boosted Tree Cross Validation



array([0.5870881 , 0.60740741, 0.60755226, 0.57837838, 0.5862069 ])


Mean and Standard Error:
0.593  +/-  0.024


## Random Forest

In [122]:
from sklearn.ensemble import RandomForestClassifier

In [173]:
rfc_lsa = RandomForestClassifier(n_estimators=100,random_state=42)
rfc_lsa.fit(X_train_lsa, Y_train)
print('Training set score:', rfc_lsa.score(X_train_lsa, Y_train))
print('\nTest set score:', rfc_lsa.score(X_test_lsa, Y_test))

Training set score: 0.9579174534664149

Test set score: 0.6154572430536822


In [127]:
rfc_lsa_cv = cross_val_score(rfc_lsa, X_test_lsa, Y_test, cv=5)

In [128]:
print('Random Forest Cross Validation\n')
display(rfc_lsa_cv)
print('\nMean and Standard Error:')
print(round(rfc_lsa_cv.mean(),3),' +/- ',round(rfc_lsa_cv.std()*2,3))

Random Forest Cross Validation



array([0.58439812, 0.5986532 , 0.60687795, 0.59459459, 0.5821501 ])


Mean and Standard Error:
0.593  +/-  0.018


## Modeling Conclusion

The Boosted model preformed the worst by far, having the lowest training and testing scores and performing badly in corss validation.

The Random Forest model did not perform as badly, but not that well either. It had a 91% training score, but only a 60% testing score, indicating some type of overfitting. Though its cross validation scores were not that bad, indicating that any overfitting was not severe.

The Logistic Regression model performed the best, having an 81% training score and a 71% percent testing score. Additionally, it performed the best in cross-validation.

# CLustering Holdout Group

Now I want to return to my clusters, and see if they retain similar groupings in the testing set.

In [166]:
kmt1 = KMeans(n_clusters=10, random_state=0)
y_predt1 = kmt1.fit_predict(X_test_lsa)

In [167]:
kmt2 = KMeans(n_clusters=10, random_state=42)
y_predt2 = kmt2.fit_predict(X_test_lsa)

In [168]:
kmt3 = KMeans(n_clusters=10, random_state=1337)
y_predt3 = kmt3.fit_predict(X_test_lsa)

In [169]:
Y_test = np.array(Y_test)

In [170]:
print('KMeans 1 Inertia: ',km1.inertia_)
print('\nComparing the K-Means 1 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt1))

KMeans 1 Inertia:  6588.094229689489

Comparing the K-Means 1 clusters to the actual author groupings:

col_0          0    1   2     3    4   5   6    7   8    9
row_0                                                     
Austen        97   20  23   262   38  10   0    0  19   27
Blake         12   13   1   116    2   1   0    0   0    7
Bryant        26  121   7   291   85  12  32    0   0   26
Burgess        1   30   2    85    6   4   0    0   0    8
Carroll       18   30  11   180   99   9  12    0   2   17
Chesterton    32   12  13   359  174   9   0    0   5   28
Edgeworth    127   70  34  1165  277  58   3    0  75   79
Melville      89   21  85   838   50  17  10    0  44  241
Shakespeare    0    0   0   283    0   0  49  134   0    2
Whitman       91   18   0   904    5  10   1    0   0  240


In [171]:
print('KMeans 2 Inertia: ',km2.inertia_)
print('\nComparing the K-Means 2 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt2))

KMeans 2 Inertia:  6572.738657087793

Comparing the K-Means 2 clusters to the actual author groupings:

col_0          0   1    2   3   4    5    6     7    8   9
row_0                                                     
Austen        30  15    0   9   0    0   45   377   18   2
Blake         14   2    0   0   0    0    2    83   50   1
Bryant       117  14    5   0  32    0   83   277   46  26
Burgess       34   2    0   0   0    0    6    87    4   3
Carroll       34   8    0   8  12    0  105   195   11   5
Chesterton    17  11    0   6   0    0  176   370   37  15
Edgeworth    120  56    0   6   3    0  313  1247   99  44
Melville      30  20  187  66   8    0   52   758  222  52
Shakespeare    0   0    0   0  49  134    0   281    4   0
Whitman       20   0    2   0   1    0    3   504  685  54


In [172]:
print('KMeans 3 Inertia: ',km3.inertia_)
print('\nComparing the K-Means 3 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt3))

KMeans 3 Inertia:  6586.917000775612

Comparing the K-Means 3 clusters to the actual author groupings:

col_0          0   1    2    3    4   5   6   7    8     9
row_0                                                     
Austen         0  14   28    0   51  15  12   9   40   327
Blake          0   4   14    0    9   2   2   0    2   119
Bryant         5  11  121    0   14  14  59   0   77   299
Burgess        0   3   34    0    4   2   0   0    6    87
Carroll        0   8   33    0   15   8  14   8  104   188
Chesterton     0  16   18    0   12  11  34   6  161   374
Edgeworth      0  28  117    0  128  57  40   6  305  1207
Melville     183  53   28    0  240  21  59  66   51   694
Shakespeare    0   0    0  134   21   0  82   0    0   231
Whitman        2  79   18    0  139   0  43   0    4   984


## Test Clusters Conclusion

The perfectly consistent Hamlet line prompt Shakespeare cluster makes an appearance. There is the super consistent king Bryant Carroll cluster. I can also see the authority Edgeworth-Chesterton-Carroll cluster. The 'little' Bryant dominated cluster is there as well. There is the commonly populated short-sentence cluster. The man cluster with low amounts of Blake, Burgess and Carroll looks to be there as well. I can also see the sparsely populated 'chapter' cluster with mostly Melville and Austin. There is even the inconsistence higher power Whitman and Melville cluster.

The clusters amoung my test set are similar to my training set. Though they did not model the author, the text groupings they made were meaningful and consistent.