In [172]:
import re # regex
import nltk # natural language toolkit
import spacy # industrial strength natural language processing
import scipy # scientifici computing
import sklearn # machine learning in python
import warnings # Not condoned
import numpy as np # lib handles high dimensional data, etc
import pandas as pd # lib handles data in datastructures and timeseries
import seaborn as sns # lib based on matplotlib
from sklearn import svm
from sklearn import ensemble # combine base estimators to improve robustness
from collections import Counter # mod w/ specialized containers
import matplotlib.pyplot as plt # 2D plotting library 
from nltk.corpus import gutenberg, stopwords # Collection of written text ie Gutenberg
from sklearn.model_selection import cross_val_score # 
from sklearn.linear_model import LogisticRegression # linear combination of input vars
from sklearn.model_selection import train_test_split # Find best prediction params
warnings.filterwarnings("ignore")

In [173]:
def text_cleaner(text): # Clean text
    text = re.sub(r'--',' ',text) # regex replace argument
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split()) # join by space & split into list
    return text     
persuasion = gutenberg.raw('austen-persuasion.txt') # Jane Austen - Persuasion text
alice = gutenberg.raw('carroll-alice.txt') # Lewis Carroll - Alice in Wonderland text
persuasion = re.sub(r'Chapter \d+', '', persuasion) # Chapter heading + 1 or more digits (0-9)
alice = re.sub(r'CHAPTER .*', '', alice) # Chapter heading + any character @ any length
alice = text_cleaner(alice[:int(len(alice)/5)]) # text_clean function grab top portion
persuasion = text_cleaner(persuasion[:int(len(persuasion)/5)]) # Ditto '' 

In [174]:
nlp = spacy.load('en')  # load spacy english set
alice_doc = nlp(alice) # Parse Alice
persuasion_doc = nlp(persuasion) # Parse Persuasion
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents] # Group by sentences
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents] # Group by sentences
sentences = pd.DataFrame(alice_sents + persuasion_sents) # Combine books sentences into DF

In [175]:
def bag_of_words(text): 
  '''
  Bag of words is collection of the most common words.
  We'll create a list of the 2000 most common words.
  Filter out punctuation and stop words then return the most common words.
  '''
  allwords = [token.lemma_ for token in text if not token.is_punct and not token.is_stop]
  return [item[0] for item in Counter(allwords).most_common(2000)] 

def bow_features(sentences, common_words):
  '''
  Create dataframe w/ all words in the common words set
  Create a word count for each common word.
  
  '''
  df = pd.DataFrame(columns=common_words) # Dataframe w/ columns as common words
  df['text_sentence'] = sentences[0] # Create column w/ sentences 
  df['text_source'] = sentences[1] # Create column to categorize each book
  df.loc[:, common_words] = 0 # Initialize count to 0
  
  for i, sentence in enumerate(df['text_sentence']): #Counts of words
    words = [token.lemma_ for token in sentence if (not token.is_punct and not token.is_stop and token.lemma_ in common_words)]
    
    for word in words: # iterate through words
        df.loc[i, word] += 1 # Fill row w/ word count
    
    if i % 50 == 0: # Insure kernel doesn't hang
        print("Processing row {}".format(i))
  return df

alicewords = bag_of_words(alice_doc) # bag of words for alice
persuasionwords = bag_of_words(persuasion_doc) # bag of words for persuasion
common_words = set(alicewords + persuasionwords) # Combine bags to form unique sets.
word_counts = bow_features(sentences, common_words) # Create dataframe 

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900


In [187]:
word_counts.head()

Unnamed: 0,red,continually,prosecute,guidance,personableness,lazily,numerous,undesirableness,forbearance,possess,...,fall,bottle,severely,recollection,involve,certainty,text_sentence,text_source,num_words,num_punct
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,57,10
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,55,7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,28,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Oh, dear, !)",Carroll,2,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Oh, dear, !)",Carroll,2,1


In [176]:
rfc = ensemble.RandomForestClassifier() 
Y = word_counts['text_source'] # Book title
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
train = rfc.fit(X_train, y_train)
scores_rfc = cross_val_score(rfc, X, Y, cv=10)
print('Train score:', rfc.score(X_train, y_train))
print('Test score:', rfc.score(X_test, y_test))
print('Accuracy: %0.2f (+/- %0.2f)', scores_rfc.mean())

Train score: 0.962432915921288
Test score: 0.8743315508021391
Accuracy: %0.2f (+/- %0.2f) 0.8284259894760924


In [177]:
svm = svm.SVC(kernel='linear', C=1)
scores_svm = cross_val_score(svm, X, Y, cv=10)
print('Accuracy: %0.2f (+/- %0.2f)', scores_svm.mean())

Accuracy: %0.2f (+/- %0.2f) 0.8380004575611988


In [178]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print('Train score:', lr.score(X_train, y_train))
print('Test score:', lr.score(X_test, y_test))
scores_lr = cross_val_score(lr, X, Y, cv=10)
print('Accuracy: %0.2f (+/- %0.2f)', scores_lr.mean())

Train score: 0.9534883720930233
Test score: 0.8796791443850267
Accuracy: %0.2f (+/- %0.2f) 0.8691374971402425


In [179]:
ensemble = ensemble.GradientBoostingClassifier()
train = ensemble.fit(X_train, y_train)
print('Train score:', ensemble.score(X_train, y_train))
print('Test score:', ensemble.score(X_test, y_test))
scores_ensemble = cross_val_score(ensemble, X, Y, cv=10)
print('Accuracy: %0.2f (+/- %0.2f)', scores_ensemble.mean())

Train score: 0.8801431127012522
Test score: 0.8288770053475936
Accuracy: %0.2f (+/- %0.2f) 0.8112903225806452


In [180]:
emma = gutenberg.raw('austen-emma.txt') # Get text
emma = re.sub(r'VOLUME \w+', '', emma) # Remove Volume headings
emma = re.sub(r'CHAPTER \w+', '', emma) # Remove Chapter headings
emma = text_cleaner(emma[:int(len(emma)/60)]) # Get 60th of total corpus
emma_doc = nlp(emma) # Parse corpus
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents] # group by sent
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents] # group by sent
emma_sentences = pd.DataFrame(emma_sents) # Bag of words from emma word counts
emma_bow = bow_features(emma_sentences, common_words) # Same common words


Processing row 0
Processing row 50
Processing row 100
Processing row 150


In [181]:
X_Emma_test = np.concatenate((X_train[y_train[y_train=='Carroll'].index], # Combine X data from Alice+Emma
                              emma_bow.drop(['text_sentence','text_source'], 1)), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'], # Combine y data from Alice+Emma
                         pd.Series(['Austen'] * emma_bow.shape[0])])
print('\nTest score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test score: 0.5739130434782609


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,153,17
Carroll,130,45


In [None]:
# by increasing the words and changing the size of the bag of words you can get the accuracy for the logistic regression to above 90%