In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
import gensim
import time



In [99]:
# Utility function to clean text.
def text_cleaner(text):
    """Remove every double dash ('--') from each word in ``text``.

    Visual inspection showed that spaCy does not recognize the double dash,
    so it is stripped out of the words up front.

    text -- iterable of word strings
    Returns a new list with one entry per input word, in the same order;
    a word that consisted only of '--' becomes the empty string.
    """
    cleaned_words = []
    for word in text:
        cleaned_words.append(re.sub(r'--', '', word))
    return cleaned_words

def raw_text_cleaner(text):
    """Normalize a raw document string before further processing.

    Steps, in order:
      1. every double dash '--' is replaced by a single space;
      2. bracketed spans like '[note]' are removed. The match is non-greedy
         and '.' does not cross newlines, so only brackets that open and
         close on the same line are removed, and an empty '[]' is kept;
      3. every run of whitespace (newlines included) collapses to one space
         and leading/trailing whitespace is dropped.

    text -- raw document text (str)
    Returns the cleaned text as a single-line str.
    """
    text = re.sub(r'--', ' ', text)

    # Raw string on purpose: written as a normal string literal, '\[' and
    # '\]' are invalid escape sequences that newer Python versions warn about.
    text = re.sub(r'\[.+?\]', '', text)

    # Get rid of extra whitespace.
    return ' '.join(text.split())

def load_spacy(list_of_docs, max_length=2000000):
    """Run the spaCy 'en' pipeline over every document and collect the results.

    list_of_docs -- iterable of raw text strings
    max_length   -- value assigned to the pipeline's ``max_length`` attribute
                    before any document is processed

    Returns a list holding one processed document per input, in input order.
    The pipeline is loaded on every call, and a progress line is printed for
    each document.
    """
    print('Running Spacy...')
    pipeline = spacy.load('en')
    pipeline.max_length = max_length

    # Processed documents, in the same order as the inputs.
    processed = []
    for file_number, text in enumerate(list_of_docs, start=1):
        print('Processing file {}'.format(file_number))
        processed.append(pipeline(text))

    print('Done processing')
    return processed

def convert_to_tokens(doc):
    """Collect the lower-cased lemmas of ``doc`` into a list.

    Tokens flagged as stop words (``is_stop``) or punctuation (``is_punct``)
    are skipped; every remaining token contributes ``lemma_.lower()``.
    Order of appearance is preserved and duplicates are kept.
    """
    return [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]

In [3]:
# Load documents.
# Every Gutenberg file id is used both as the document title and, further
# down, as the class label of the text samples taken from that file.
doc_titles = [str(name) for name in gutenberg.fileids()]

# `docs` holds one text sample per paragraph and `doc_names` the file each
# sample came from; the two lists stay index-aligned.
doc_names, docs = [], []

print('Getting documents..')
for name in doc_titles:
    print('Processing', name)
    for paragraph in gutenberg.paras(name):
        # NOTE(review): NLTK returns each paragraph as a list of sentences
        # (each a list of words), so paragraph[0] keeps only the FIRST
        # sentence of the paragraph. Kept as-is so the downstream feature
        # counts and scores do not change; confirm this is intended.
        first_sentence = text_cleaner(paragraph[0])
        docs.append(' '.join(first_sentence))
        doc_names.append(name)

Getting documents..
Processing austen-emma.txt
Processing austen-persuasion.txt
Processing austen-sense.txt
Processing bible-kjv.txt
Processing blake-poems.txt
Processing bryant-stories.txt
Processing burgess-busterbrown.txt
Processing carroll-alice.txt
Processing chesterton-ball.txt
Processing chesterton-brown.txt
Processing chesterton-thursday.txt
Processing edgeworth-parents.txt
Processing melville-moby_dick.txt
Processing milton-paradise.txt
Processing shakespeare-caesar.txt
Processing shakespeare-hamlet.txt
Processing shakespeare-macbeth.txt
Processing whitman-leaves.txt


In [81]:
print('Getting documents..')
raw_docs, all_tokens = [], []

# Clean the complete raw text of every title. Texts of 2,000,000 characters
# or more are cut down to their first 1,500,000 characters before parsing.
for title in doc_titles:
    cleaned_text = raw_text_cleaner(gutenberg.raw(title))
    n_chars = len(cleaned_text)
    if n_chars >= 2000000:
        print('length of {} ({}) is too long, trimming.'.format(title, n_chars))
        cleaned_text = cleaned_text[:1500000]
    raw_docs.append(cleaned_text)

# One spaCy-processed document per title, in the order of doc_titles.
nlp_processed = load_spacy(raw_docs)

print('Adding tokens..')
# One list of filtered, lower-cased lemmas per document.
all_tokens.extend(convert_to_tokens(parsed_doc) for parsed_doc in nlp_processed)
print('Complete.')

Getting documents..
length of bible-kjv.txt (4305662) is too long, trimming.
Running Spacy...
Processing file 1
Processing file 2
Processing file 3
Processing file 4
Processing file 5
Processing file 6
Processing file 7
Processing file 8
Processing file 9
Processing file 10
Processing file 11
Processing file 12
Processing file 13
Processing file 14
Processing file 15
Processing file 16
Processing file 17
Processing file 18
Done processing
Adding tokens..
Complete.


In [5]:
# Hold out 40% of the text samples for testing; the seed keeps the split fixed.
train_texts, test_texts, y_train, y_test = train_test_split(
    docs, doc_names, test_size=0.4, random_state=0
)

tfidf_options = dict(
    # drop words that occur in more than half the paragraphs
    max_df=0.5,
    # only use words that appear in at least two paragraphs
    min_df=2,
    stop_words='english',
    # convert everything to lower case (since Alice in Wonderland has the
    # HABIT of CAPITALIZING WORDS for EMPHASIS)
    lowercase=True,
    # we definitely want to use inverse document frequencies in our weighting
    use_idf=True,
    # applies a correction factor so that longer paragraphs and shorter
    # paragraphs get treated equally
    norm=u'l2',
    # adds 1 to all document frequencies, as if an extra document existed
    # that used every word once; prevents divide-by-zero errors
    smooth_idf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary / idf weights on the training texts only, then apply
# the fitted vectorizer to the held-out texts.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [6]:
# The TF-IDF matrix has one column per feature.
n_features = X_train.get_shape()[1]
print("Number of features: %d" % n_features)

Number of features: 13332


In [7]:
# Random forests are stochastic (bootstrap samples, random feature subsets).
# Seed the model, as the train/test split is seeded, so that the scores
# below are reproducible on Restart & Run All.
rfc = ensemble.RandomForestClassifier(random_state=0)

# fit() returns the estimator itself, so `train` is another name for `rfc`.
train = rfc.fit(X_train, y_train)

# A large gap between the two scores indicates overfitting.
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9663441459000418

Test set score: 0.7777081701905507


In [8]:
# Logistic regression with default hyper-parameters on the TF-IDF features.
lr = LogisticRegression()
train = lr.fit(X_train, y_train)

# Accuracy on the data the model was fit on vs. the held-out data.
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
print('Training set score:', lr_train_score)
print('\nTest set score:', lr_test_score)

Training set score: 0.8691702631212586

Test set score: 0.8092926128948056


In [9]:
# Gradient boosting draws on a random state while growing its trees; seed it
# (same seed as the train/test split) so the scores below are reproducible.
clf = ensemble.GradientBoostingClassifier(random_state=0)

# fit() returns the estimator itself, so `train` is another name for `clf`.
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.826743700403731

Test set score: 0.7563560428086662


# Bag of Words (BoW)

In [102]:
# Build the gensim dictionary from the per-document token lists.
dic = gensim.corpora.Dictionary(all_tokens)

# Prune the vocabulary in place: first drop the 3 most frequent tokens, then
# the tokens that exist in almost all documents (more than 95% of them).
dic.filter_n_most_frequent(3)
dic.filter_extremes(no_above=0.95)

# Bag-of-words representation: one doc2bow() result per document.
corpus = []
for doc_tokens in all_tokens:
    corpus.append(dic.doc2bow(doc_tokens))

# Peek at the first 15 entries of every document's bag of words.
for bow in corpus:
    print(bow[:15])

[(0, 2), (1, 1), (2, 3), (3, 72), (4, 1), (5, 7), (6, 3), (7, 8), (8, 3), (9, 5), (10, 1), (11, 16), (12, 12), (13, 4), (14, 33)]
[(1, 1), (2, 3), (3, 30), (4, 1), (5, 1), (6, 1), (7, 5), (11, 9), (12, 5), (13, 3), (14, 5), (16, 6), (17, 1), (18, 1), (21, 3)]
[(0, 3), (2, 12), (3, 47), (4, 5), (6, 9), (7, 9), (9, 3), (10, 1), (11, 11), (12, 3), (13, 3), (14, 10), (16, 2), (17, 4), (18, 2)]
[(0, 15), (1, 31), (2, 1), (3, 55), (4, 39), (5, 6), (6, 3), (7, 33), (12, 1), (18, 9), (19, 1), (20, 7), (21, 2), (23, 21), (24, 2)]
[(7, 1), (40, 1), (63, 1), (96, 1), (98, 7), (102, 6), (103, 1), (107, 1), (110, 1), (118, 1), (137, 1), (138, 3), (139, 11), (146, 1), (147, 4)]
[(1, 1), (3, 13), (6, 2), (12, 1), (15, 1), (16, 1), (19, 1), (37, 2), (40, 1), (42, 1), (48, 5), (49, 3), (55, 1), (58, 2), (66, 2)]
[(2, 1), (3, 1), (23, 1), (34, 1), (43, 2), (53, 4), (65, 3), (71, 2), (73, 6), (74, 1), (88, 40), (89, 2), (94, 3), (96, 4), (98, 3)]
[(1, 1), (3, 1), (11, 1), (16, 2), (25, 1), (26, 2), (28, 

In [103]:
# Feature column names: the second element of every dictionary item,
# i.e. the token strings that survived the filtering.
columns = [token for _token_id, token in dic.items()]
len(columns)

5421

In [None]:
# One row per sentence: a 0/1 indicator for every vocabulary column,
# followed by the title of the document the sentence came from.
rows_list = []

start_time = time.time()
# Iterate `nlp_processed`, the global that holds load_spacy()'s result.
# (`nlp_docs` is only a local name inside load_spacy(), so reading it here
# raises NameError on a fresh kernel.)
for title, doc in zip(doc_titles, nlp_processed):
    print('Processing document {}'.format(title))
    # document level, searching by sentence
    for sentence in doc.sents:
        # The columns are lower-cased lemmas with stop words and punctuation
        # removed, so normalise the sentence the same way before comparing.
        # A substring test on the raw sentence text would match "art" inside
        # "party" and miss capitalised or inflected forms. Building the set
        # once per sentence also makes each column lookup O(1).
        sentence_lemmas = set(convert_to_tokens(sentence))

        # 1 if the column's word occurs in the sentence, else 0
        row = [1 if word in sentence_lemmas else 0 for word in columns]

        # now append source since columns are done
        row.append(title)
        rows_list.append(row)


print("--- Fitted in %s seconds ---" % (time.time() - start_time))

In [84]:
start_time = time.time()

# Column order matches the rows: every vocabulary word first, then the
# label column naming the source document.
header = columns + ['source']
df = pd.DataFrame(rows_list, columns=header)

print(df.head())
print("--- Generated dataframe in %s seconds ---" % (time.time() - start_time))

list