In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

# Predicting Author From Text

I want to make a model that can predict who has written a story based on its text features.  To accomplish this, I will use data from the NLTK 'Gutenberg' corpus. (add data set decription)

In [3]:
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import genesis
from nltk.corpus import webtext

In [4]:
import en_core_web_sm

In [5]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


# Data cleaning / processing / language parsing

I need to make sure my data is clean and workable when I make my features.

In [6]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [7]:
# Load and Clean Persuasion by Jane Austen
persuasion_raw = gutenberg.raw('austen-persuasion.txt')

persuasion = re.sub(r'Chapter \d+', '', persuasion_raw)
persuasion = text_cleaner(persuasion)

In [8]:
# Load and Clean The Poems of William Blake
blake_raw = gutenberg.raw('blake-poems.txt')

blake = re.sub(r'[A-Z][^a-z]*[A-Z]\s','', blake_raw)
blake = re.sub(r'\n[a-z][a-z ]*\n','', blake)
blake = text_cleaner(blake)

In [9]:
# Load and Clean some of the short stories of William Cullen Bryant
bryant_raw = gutenberg.raw('bryant-stories.txt')

bryant = re.sub(r'[A-Z][^a-z]+[A-Z][A-Z]','', bryant_raw)
bryant = text_cleaner(bryant)

In [10]:
# Load and Clean The Adventures of Buster Bear by Thornton Burgess
buster_raw = gutenberg.raw('burgess-busterbrown.txt')

buster = re.sub(r'[A-Z][^a-z]+[A-Z][A-Z]','', buster_raw)
buster = text_cleaner(buster)

In [11]:
# Load and Clean Alice in Wonderland by Lewis Carroll
alice_raw = gutenberg.raw('carroll-alice.txt')

alice = re.sub(r'CHAPTER [A-Z]*.','', alice_raw)
alice = text_cleaner(alice)

In [12]:
# Load and Clean The Man Who Became Thursday by G. K. Chesterton
thursday_raw = gutenberg.raw('chesterton-thursday.txt')

thursday = re.sub(r'[A-Z][^a-z]+[A-Z][A-Z]','', thursday_raw)
thursday = text_cleaner(thursday)

In [13]:
# Load and Clean The Parent's Assistent by Richard Lovell Edgeworth
parents_raw = gutenberg.raw('edgeworth-parents.txt')

parents = re.sub(r'[A-Z][^a-z]+[A-Z][A-Z]\W','', parents_raw)
parents = text_cleaner(parents)

In [14]:
# Load and Clean Moby Dick by Herman Melville
moby_raw = gutenberg.raw('melville-moby_dick.txt')

#remove etomology and the extracts
moby_no_et = 'Call me Ishmael.' + moby_raw.split('Call me Ishmael.')[1]

moby = re.sub(r'CHAPTER\s*\d.\s*[\w\s\-]+\.','', moby_no_et)
moby= text_cleaner(moby)

In [15]:
# Load and Clean Hamlet by William Shakespeare
hamlet_raw = gutenberg.raw('shakespeare-hamlet.txt')

hamlet = re.sub(r'Actus [A-Z][a-z]*\.','', hamlet_raw) # Remove act numbers
hamlet = re.sub(r'Scoena [A-Z][a-z]*\.','', hamlet) # Remove scene numbers
hamlet = re.sub(r'\n\s*[A-Z][a-z]*\.','', hamlet) # Remove charactr's names before their lines
hamlet = re.sub(r'\n\n\s*[A-Z][a-z]*\.','', hamlet) # Remove charactr's names before their lines
hamlet = text_cleaner(hamlet)

In [16]:
# Load and Clean Leaves of Grass by Walt Whitman
leaves_raw = gutenberg.raw('whitman-leaves.txt')

leaves = re.sub(r'\s\s\d\n','', leaves_raw) # Remove line Numberings
leaves = re.sub(r'\}  [A-Z][\w\s\\\'\-\"\,\?\!]*[a-z\?\!]\n\n','', leaves_raw) # Remove Peom Titles
leaves = re.sub(r'Walt Whitman','', leaves_raw) # Remove anytime Walt Whitman signed his name
leaves = re.sub(r'Whitman','', leaves_raw) # Remove anytime Walt Whitman signed his name

leaves = text_cleaner(leaves)

In [17]:
# take a sample of each text, to prevent memory errors
persuasiontr = persuasion[:30000]
blaketr = blake[:30000]
bryanttr = bryant[:30000]
bustertr = buster[:30000]
alicetr = alice[:30000]
thursdaytr = thursday[:30000]
parentstr = parents[:30000]
mobytr = moby[:30000]
hamlettr = hamlet[:30000]
leavestr = leaves[:30000]

In [18]:
len(leaves)

692262

# Creating Features

I must extract features from the text, through a process called NLP, Natural Language Processing. Though there are many ways to accomplish this, I will use 2: Bag of Words and Term-Frequency with Inverse Document Frequency, or TF IDF.

In [19]:
nlp = en_core_web_sm.load()

In [20]:
persuasion_doc = nlp(persuasiontr)

In [21]:
blake_doc = nlp(blaketr)

In [22]:
bryant_doc = nlp(bryanttr)

In [23]:
buster_doc = nlp(bustertr)

In [24]:
alice_doc = nlp(alicetr)

In [25]:
thursday_doc = nlp(thursdaytr)

In [26]:
parents_doc = nlp(parentstr)

In [27]:
moby_doc = nlp(mobytr)

In [28]:
hamlet_doc = nlp(hamlettr)

In [29]:
leaves_doc = nlp(leavestr)

In [30]:
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
blake_sents = [[sent, "Blake"] for sent in blake_doc.sents]
bryant_sents = [[sent, "Bryant"] for sent in bryant_doc.sents]
buster_sents = [[sent, "Burgess"] for sent in buster_doc.sents]
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
thursday_sents = [[sent, "Chesterton"] for sent in thursday_doc.sents]
parents_sents = [[sent, "Edgeworth"] for sent in parents_doc.sents]
moby_sents = [[sent, "Melville"] for sent in moby_doc.sents]
hamlet_sents = [[sent, "Shakespeare"] for sent in hamlet_doc.sents]
leaves_sents = [[sent, "Whitman"] for sent in leaves_doc.sents]

In [31]:
sents_list = [persuasion_sents, bryant_sents, buster_sents, alice_sents, thursday_sents, 
            parents_sents, moby_sents, blake_sents, hamlet_sents, leaves_sents]

In [32]:
# remove the last sentence due to initial cutoff
for sent in sents_list:
    l = len(sent)
    del sent[l-1]

In [33]:
sents_grouped = [item for sublist in sents_list for item in sublist]

In [34]:
sentences = pd.DataFrame(sents_grouped)
sentences.head()

Unnamed: 0,0,1
0,"(Sir, Walter, Elliot, ,, of, Kellynch, Hall, ,...",Austen
1,"(This, was, the, page, at, which, the, favouri...",Austen
2,"(Walter, Elliot, ,, born, March, 1, ,, 1760, ,...",Austen
3,"(of, South, Park, ,, in, the, county, of, Glou...",Austen
4,"("", Precisely, such, had, the, paragraph, orig...",Austen


## Bag of Words

I will use the bag of words technique first. To use this, I will need to process the texts down to sentences. From there, I will extract information on each sentences verbosity and punctuation use. I will use this information to characterize each sentence.

In [35]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_author'] = sentences[1]
    df.loc[:, common_words] = 0
    df['punctuation'] = 0
    df['other punctuation'] = 0
    df.loc[:, '.'] = 0
    df.loc[:, '?'] = 0
    df.loc[:, '!'] = 0
    df.loc[:, ','] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation, stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # Get number of punctuation in a sentence
        puncs = [token.lemma_
                 for token in sentence
                 if (
                     token.is_punct
                 )]
        # Increase punctuation count by how many were use
        for punc in puncs:
            df.loc[i,'punctuation'] += 1
            try:
                df.loc[i,punc] += 1
            except:
                df.loc[i,'other punctuation'] += 1
                
            
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 200 == 0:
            print("Processing row {}".format(i))
            
    return df

In [36]:
# Set up the bags.
persuasionwords = bag_of_words(persuasion_doc)
blakewords = bag_of_words(blake_doc)
bryantwords = bag_of_words(bryant_doc)
busterwords = bag_of_words(buster_doc)
alicewords = bag_of_words(alice_doc)
thursdaywords = bag_of_words(thursday_doc)
parentswords = bag_of_words(parents_doc)
mobywords = bag_of_words(moby_doc)
hamletwords = bag_of_words(hamlet_doc)
leaveswords = bag_of_words(leaves_doc)

# Combine bags to create a set of unique words.
common_words = set(persuasionwords + bryantwords + busterwords + alicewords + thursdaywords + 
                   parentswords + mobywords + blakewords + hamletwords + leaveswords)

In [37]:
# Convert to data frame for the BOW
persuasion_sents = pd.DataFrame(persuasion_sents)
# Get BOW features
persuasion_word_counts = bow_features(persuasion_sents, common_words)

Processing row 0


In [38]:
# Convert to data frame for the BOW
blake_sents = pd.DataFrame(blake_sents)
# Get BOW features
blake_word_counts = bow_features(blake_sents, common_words)

Processing row 0
Processing row 200


In [39]:
# Convert to data frame for the BOW
bryant_sents = pd.DataFrame(bryant_sents)
# Get BOW features
bryant_word_counts = bow_features(bryant_sents, common_words)

Processing row 0
Processing row 200


In [40]:
# Convert to data frame for the BOW
buster_sents = pd.DataFrame(buster_sents)
# Get BOW features
buster_word_counts = bow_features(buster_sents, common_words)

Processing row 0
Processing row 200


In [41]:
# Convert to data frame for the BOW
alice_sents = pd.DataFrame(alice_sents)
# Get BOW features
alice_word_counts = bow_features(alice_sents, common_words)

Processing row 0
Processing row 200


In [42]:
# Convert to data frame for the BOW
thursday_sents = pd.DataFrame(thursday_sents)
# Get BOW features
thursday_word_counts = bow_features(thursday_sents, common_words)

Processing row 0
Processing row 200


In [43]:
# Convert to data frame for the BOW
parents_sents = pd.DataFrame(parents_sents)
# Get BOW features
parents_word_counts = bow_features(parents_sents, common_words)

Processing row 0
Processing row 200


In [44]:
# Convert to data frame for the BOW
moby_sents = pd.DataFrame(moby_sents)
# Get BOW features
moby_word_counts = bow_features(moby_sents, common_words)

Processing row 0
Processing row 200


In [45]:
# Convert to data frame for the BOW
hamlet_sents = pd.DataFrame(hamlet_sents)
# Get BOW features
hamlet_word_counts = bow_features(hamlet_sents, common_words)

Processing row 0
Processing row 200
Processing row 400


In [46]:
# Convert to data frame for the BOW
leaves_sents = pd.DataFrame(leaves_sents)
# Get BOW features
leaves_word_counts = bow_features(leaves_sents, common_words)

Processing row 0
Processing row 200


In [47]:
total_word_counts = pd.concat([persuasion_word_counts,alice_word_counts,thursday_word_counts,
                               parents_word_counts,moby_word_counts,blake_word_counts,hamlet_word_counts,leaves_word_counts])

## TF IDF

In [48]:
persuasion = gutenberg.paras('austen-persuasion.txt')
#processing
persuasion_paras=[]
for paragraph in persuasion:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    persuasion_paras.append(' '.join(para))
    
persuasion_len = len(persuasion_paras)

In [49]:
blake = gutenberg.paras('blake-poems.txt')
#processing
blake_paras=[]
for paragraph in blake:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    blake_paras.append(' '.join(para))
    
blake_len = len(blake_paras)

In [50]:
bryant = gutenberg.paras('bryant-stories.txt')
#processing
bryant_paras=[]
for paragraph in bryant:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    bryant_paras.append(' '.join(para))
    
bryant_len = len(bryant_paras)

In [51]:
buster = gutenberg.paras('burgess-busterbrown.txt')
#processing
buster_paras=[]
for paragraph in buster:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    buster_paras.append(' '.join(para))
    
buster_len = len(buster_paras)

In [52]:
alice = gutenberg.paras('carroll-alice.txt')
#processing
alice_paras=[]
for paragraph in alice:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    alice_paras.append(' '.join(para))
    
alice_len = len(alice_paras)

In [53]:
thursday = gutenberg.paras('chesterton-thursday.txt')
#processing
thursday_paras=[]
for paragraph in thursday:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    thursday_paras.append(' '.join(para))
    
thursday_len = len(thursday_paras)

In [54]:
parents = gutenberg.paras('edgeworth-parents.txt')
#processing
parents_paras=[]
for paragraph in parents:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    parents_paras.append(' '.join(para))
    
parents_len = len(parents_paras)

In [55]:
moby = gutenberg.paras('melville-moby_dick.txt')
#processing
moby_paras=[]
for paragraph in moby:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    moby_paras.append(' '.join(para))
    
moby_len = len(moby_paras)

In [56]:
hamlet = gutenberg.paras('shakespeare-hamlet.txt')
caesar = gutenberg.paras('shakespeare-caesar.txt')
macbeth = gutenberg.paras('shakespeare-macbeth.txt')

#processing
shake_paras=[]
for paragraph in hamlet:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    shake_paras.append(' '.join(para))
    
for paragraph in caesar:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    shake_paras.append(' '.join(para))
    
for paragraph in macbeth:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    shake_paras.append(' '.join(para))
    
shake_len = len(shake_paras)

In [57]:
leaves = gutenberg.paras('whitman-leaves.txt')
#processing
leaves_paras=[]
for paragraph in leaves:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    leaves_paras.append(' '.join(para))
    
leaves_len = len(leaves_paras)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [59]:
all_paras = persuasion_paras + blake_paras + bryant_paras + buster_paras + alice_paras + thursday_paras + parents_paras + moby_paras + shake_paras + leaves_paras

In [60]:
labels = []
for i in range(persuasion_len):
    labels.append('Austen')
for i in range(blake_len):
    labels.append('Blake')
for i in range(bryant_len):
    labels.append('Bryant')
for i in range(buster_len):
    labels.append('Burgess')
for i in range(alice_len):
    labels.append('Carroll')
for i in range(thursday_len):
    labels.append('Chesterton')
for i in range(parents_len):
    labels.append('Edgeworth')
for i in range(moby_len):
    labels.append('Melville')
for i in range(shake_len):
    labels.append('Shakespeare')
for i in range(leaves_len):
    labels.append('Whitman')

In [61]:
#Applying the vectorizer
all_paras_tfidf=vectorizer.fit_transform(all_paras)
print("Number of features: %d" % all_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, Y_train, Y_test = train_test_split(all_paras_tfidf, labels, test_size=0.5, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()
X_test_tfidf_csr = X_test_tfidf.tocsr()

Number of features: 10703


In [62]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130
svd= TruncatedSVD(500)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

(add code to show what the features are)

SyntaxError: invalid syntax (<ipython-input-62-c135d957f48e>, line 12)

# Creating Clusters

The first technique is to create a series of clusters. Try several techniques and pick the one you think best represents your data. Make sure there is a narrative and reasoning around why you have chosen the given clusters. Are authors consistently grouped into the same cluster?

## K-Means

In [None]:
km0 = KMeans(n_clusters=10, random_state=0)
y_pred0 = km0.fit_predict(X_train_lsa)

In [None]:
km1 = KMeans(n_clusters=10, random_state=7)
y_pred1 = km1.fit_predict(X_train_lsa)

In [None]:
km2 = KMeans(n_clusters=10, random_state=42)
y_pred2 = km2.fit_predict(X_train_lsa)

In [None]:
km3 = KMeans(n_clusters=10, random_state=121)
y_pred3 = km3.fit_predict(X_train_lsa)

In [None]:
km4 = KMeans(n_clusters=10, random_state=1337)
y_pred4 = km4.fit_predict(X_train_lsa)

In [None]:
Y_train = np.array(Y_train)

In [None]:
print('KMeans 0 Inertia: ',km0.inertia_)
print('\nComparing the K-Means 0 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred0))

In [None]:
print('KMeans 1 Inertia: ',km1.inertia_)
print('\nComparing the K-Means 1 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred1))

In [None]:
print('KMeans 2 Inertia: ',km2.inertia_)
print('\nComparing the K-Means 2 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred2))

In [None]:
print('KMeans 3 Inertia: ',km3.inertia_)
print('\nComparing the K-Means 3 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred3))

In [None]:
print('KMeans 4 Inertia: ',km4.inertia_)
print('\nComparing the K-Means 4 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_train,y_pred4))

# Predicting Author Using Various Models and Feature Sets

I will now test my ability to predict author from text. There are many different types of models with many different usesm but I will try 4 different ones here: Logistic Regression, Random Forest, Grandient-Boosted Decision Trees, and Support Vector Classifier. 

Additionally, I will be modeling with both of my features sets.

I will check each models cross validation score to check the overall health of the model

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Bag of Words Feature Set
X_bow = total_word_counts.drop(['text_author','text_sentence'],1)
y_bow = total_word_counts.text_author
Xtrain_bow, Xtest_bow, ytrain_bow, ytest_bow = train_test_split(X_bow,y_bow,test_size=0.5,random_state=42)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

### BOW Feature Set

In [None]:
constants = [.01,.1,1,5,7,8,9,10,11,12,15,20,30,50,70,100,150,200]
bestc = 0
bestscore = 0
for c in constants:
    lr_bow = LogisticRegression(C=c,random_state=42)
    lr_bow.fit(Xtrain_bow, ytrain_bow)
    score = lr_bow.score(Xtest_bow, ytest_bow)
    if score > bestscore:
        bestc = c
        bestscore = score
        print('Best C value is ',c)

In [None]:
lr_bow = LogisticRegression(C=bestc,random_state=42)

In [None]:
lr_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', lr_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', lr_bow.score(Xtest_bow, ytest_bow))

In [None]:
lr_bow_cv = cross_val_score(lr_bow, Xtest_bow, ytest_bow, cv=10)

In [None]:
print('Logistic Regression Cross Validation\n')
display(lr_bow_cv)
print('\nMean and Standard Error:')
print(round(lr_bow_cv.mean(),3),' +/- ',round(lr_bow_cv.std()*2,3))

### TF IDF Feature Set

In [None]:
constants = [.01,.1,1,10,100,200,300,400,500,700,1000]
bestc = 0
bestscore = 0
for c in constants:
    lr_lsa = LogisticRegression(C=c,random_state=42)
    lr_lsa.fit(X_train_lsa, Y_train)
    score = lr_lsa.score(X_test_lsa, Y_test)
    if score > bestscore:
        bestc = c
        bestscore = score
        print('Best C value is ',c)

In [None]:
lr_lsa = LogisticRegression(C=bestc,random_state=42)

In [None]:
lr_lsa.fit(X_train_lsa, Y_train)
print('Training set score:', lr_lsa.score(X_train_lsa, Y_train))
print('\nTest set score:', lr_lsa.score(X_test_lsa, Y_test))

In [None]:
lr_lsa_cv = cross_val_score(lr_lsa, X_test_lsa, Y_test, cv=10)

In [None]:
print('Logistic Regression Cross Validation\n')
display(lr_lsa_cv)
print('\nMean and Standard Error:')
print(round(lr_lsa_cv.mean(),3),' +/- ',round(lr_lsa_cv.std()*2,3))

## Gradient-Boosted Decision Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

### BOW Feature Set

In [None]:
clf_bow = GradientBoostingClassifier(n_estimators=50,max_depth=2,random_state=42)
clf_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', clf_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', clf_bow.score(Xtest_bow, ytest_bow))

In [None]:
clf_bow_cv = cross_val_score(clf_bow, Xtest_bow, ytest_bow, cv=5)

In [None]:
print('Gradient-Boosted Tree Cross Validation\n')
display(clf_bow_cv)
print('\nMean and Standard Error:')
print(round(clf_bow_cv.mean(),3),' +/- ',round(clf_bow_cv.std()*2,3))

### TF IDF Feature Set

In [None]:
clf_lsa = GradientBoostingClassifier(n_estimators=50,max_depth=2,random_state=42)
clf_lsa.fit(X_train_lsa, Y_train)
print('Training set score:', clf_lsa.score(X_train_lsa, Y_train))
print('\nTest set score:', clf_lsa.score(X_test_lsa, Y_test))

In [None]:
clf_lsa_cv = cross_val_score(clf_lsa, X_test_lsa, Y_test, cv=5)

In [None]:
print('Gradient-Boosted Tree Cross Validation\n')
display(clf_lsa_cv)
print('\nMean and Standard Error:')
print(round(clf_lsa_cv.mean(),3),' +/- ',round(clf_lsa_cv.std()*2,3))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### BOW Feature Set

In [None]:
rfc_bow = RandomForestClassifier(n_estimators=50,random_state=42)
rfc_bow.fit(Xtrain_bow, ytrain_bow)
print('Training set score:', rfc_bow.score(Xtrain_bow, ytrain_bow))
print('\nTest set score:', rfc_bow.score(Xtest_bow, ytest_bow))

In [None]:
rfc_bow_cv = cross_val_score(rfc_bow, Xtest_bow, ytest_bow, cv=5)

In [None]:
print('Random Forest Cross Validation\n')
display(rfc_bow_cv)
print('\nMean and Standard Error:')
print(round(rfc_bow_cv.mean(),3),' +/- ',round(rfc_bow_cv.std()*2,3))

### TF IDF Feature Set

In [None]:
rfc_lsa = RandomForestClassifier(n_estimators=50,random_state=42)
rfc_lsa.fit(X_train_lsa, Y_train)
print('Training set score:', rfc_lsa.score(X_train_lsa, Y_train))
print('\nTest set score:', rfc_lsa.score(X_test_lsa, Y_test))

In [None]:
rfc_lsa_cv = cross_val_score(rfc_lsa, X_test_lsa, Y_test, cv=5)

In [None]:
print('Random Forest Cross Validation\n')
display(rfc_lsa_cv)
print('\nMean and Standard Error:')
print(round(rfc_lsa_cv.mean(),3),' +/- ',round(rfc_lsa_cv.std()*2,3))

## Modeling Conclusion

The best model to model the BOW feature set was Logistic Regression, having the highest accuracy score and a fairly stable cross valiadation score.

The best model to predict with the TF IDF feature set was the Random Forest Classifier. 

Though Logistic Regression using BOW Features had the best testing score, the Random Forest using the TF IDF features showed a more stable cross validation score. Because of that, I will treat both of them as the most effective and try to improve on each of them.

# Testing Holdout Group

Lastly return to your holdout group. Does your clustering on those members perform as you'd expect? Have your clusters remained stable or changed dramatically? What about your model? Is it's performance consistent?

If there is a divergence in the relative stability of your model and your clusters, delve into why.

In [None]:
kmt0 = KMeans(n_clusters=10, random_state=0)
y_predt0 = kmt0.fit_predict(X_test_lsa)

In [None]:
kmt1 = KMeans(n_clusters=10, random_state=7)
y_predt1 = kmt1.fit_predict(X_test_lsa)

In [None]:
kmt2 = KMeans(n_clusters=10, random_state=42)
y_predt2 = kmt2.fit_predict(X_test_lsa)

In [None]:
kmt3 = KMeans(n_clusters=10, random_state=121)
y_predt3 = kmt3.fit_predict(X_test_lsa)

In [None]:
kmt4 = KMeans(n_clusters=10, random_state=1337)
y_predt4 = kmt4.fit_predict(X_test_lsa)

In [None]:
Y_test = np.array(Y_test)

In [None]:
print('KMeans 0 Inertia: ',km0.inertia_)
print('\nComparing the K-Means 0 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt0))

In [None]:
print('KMeans 1 Inertia: ',km1.inertia_)
print('\nComparing the K-Means 1 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt1))

In [None]:
print('KMeans 2 Inertia: ',km2.inertia_)
print('\nComparing the K-Means 2 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt2))

In [None]:
print('KMeans 3 Inertia: ',km3.inertia_)
print('\nComparing the K-Means 3 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt3))

In [None]:
print('KMeans 4 Inertia: ',km4.inertia_)
print('\nComparing the K-Means 4 clusters to the actual author groupings:\n')
print(pd.crosstab(Y_test,y_predt4))

## Test Clusters Conclusion

Find real meaning of what the clusters are grouping upon. poke around and see what the each is.