In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
from collections import Counter

# 1. Data cleaning / processing / language parsing

In [5]:
# Import stopwords and corpus options from nltk
from nltk.corpus import gutenberg, stopwords

# Look at the corpus options in gutenberg
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [6]:
# Grab and process the raw data.
moby = gutenberg.raw('melville-moby_dick.txt')
paradise = gutenberg.raw('milton-paradise.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')

In [7]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

# Get rid of the chapter and actus headings
moby = re.sub(r'CHAPTER .*', '', moby)
paradise = re.sub(r'Book .*', '', paradise)
hamlet = re.sub(r'Actus .*', '', hamlet)
macbeth = re.sub(r'Actus .*', '', macbeth)

# Clean the data
moby = text_cleaner(moby)
paradise = text_cleaner(paradise)
hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)

In [8]:
# Parse the cleaned novels.
nlp = spacy.load('en')
moby_doc = nlp(moby)
paradise_doc = nlp(paradise)
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)

# 2. Create features using two different NLP methods: For example, BoW vs tf-idf.

In [9]:
# Group into sentences.
moby_sents = [[sent, 'Melville'] for sent in moby_doc.sents]
paradise_sents = [[sent, 'Milton'] for sent in paradise_doc.sents]
hamlet_sents = [[sent, 'Shakespeare'] for sent in hamlet_doc.sents]
macbeth_sents = [[sent, 'Shakespeare'] for sent in macbeth_doc.sents]

# Combine the sentences from each of the novels into one dataframe
sentences = pd.DataFrame(moby_sents + paradise_sents + hamlet_sents + macbeth_sents)

In [10]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags
mobywords = bag_of_words(moby_doc)
paradisewords = bag_of_words(paradise_doc)
hamletwords = bag_of_words(hamlet_doc)
macbethwords = bag_of_words(macbeth_doc)

# Combine the bags to create a set of unique words
common_words = set(mobywords + paradisewords + hamletwords + macbethwords)

In [11]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000
Processing row 11500
Processing row 12000
Processing row 12500
Processing row 13000
Processing row 13500
Processing row 14000
Processing row 14500
Processing row 15000
Processing row 15500
Processing row 16000
Processing row 16500
Processing row 17000
Processing row 17500


Unnamed: 0,grace,synod,wrackt,courage,minde,falne,rapier,cabin,imagining,skinne,...,sits,sleeue,cerimony,neyth,elevate,who,indeede,trial,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ETYMOLOGY, .)",Melville
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Supplied, by, a, Late, Consumptive)",Melville
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Usher, to, a, Grammar, School, ))",Melville
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, pale, Usher, threadbare, in, coat, ,, he...",Melville
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(;, I, see, him, now, .)",Melville


# BoW with SVM

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

svc = SVC()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = svc.fit(X_train, y_train)

print('Training set score:', svc.score(X_train, y_train))
print('\nTest set score:', svc.score(X_test, y_test))

Training set score: 0.571976070294

Test set score: 0.567021873247


In [82]:
rfc = RandomForestClassifier()
fit = rfc.fit(X_train, y_train)
predict = rfc.predict(X_test)
score = rfc.score(X_test, y_test)

print('Random Forest Score: {}'.format(score))

Random Forest Score: 0.8025799214806506


In [83]:
cv_rfc = cross_val_score(rfc, X, Y)
print('Cross-Validated Score: {}'.format(cv_rfc))
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_rfc.mean(), cv_rfc.std() * 2))

Cross-Validated Score: [ 0.77792732  0.79067811  0.75130406]
Accuracy: 0.77 (+/- 0.03)


# tfidf in sklearn

In [13]:
len(gutenberg.paras('melville-moby_dick.txt'))

2793

In [14]:
mel_mob_paras = gutenberg.paras('melville-moby_dick.txt')
mobi_paras=[]
for paragraph in mel_mob_paras:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    mobi_paras.append(' '.join(para))

In [15]:
df_mel_mobi = pd.DataFrame({'author': ['Melville']*2793, 
                            'novel': ['Moby'] * 2793, 
                            'text': mobi_paras})

In [16]:
df_mel_mobi.head(5)

Unnamed: 0,author,novel,text
0,Melville,Moby,[ Moby Dick by Herman Melville 1851 ]
1,Melville,Moby,ETYMOLOGY .
2,Melville,Moby,( Supplied by a Late Consumptive Usher to a Gr...
3,Melville,Moby,"The pale Usher threadbare in coat , heart , b..."
4,Melville,Moby,""" While you take in hand to school others , an..."


In [17]:
len(gutenberg.paras('milton-paradise.txt'))

29

In [18]:
milton_paradise_paras = gutenberg.paras('milton-paradise.txt')
paradise_paras=[]
for paragraph in milton_paradise_paras:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    paradise_paras.append(' '.join(para))

In [19]:
df_milton_paradise = pd.DataFrame({'author': ['Milton']*29, 
                                   'novel': ['Paradise'] *29, 
                                   'text': paradise_paras})

In [20]:
df_milton_paradise.head(5)

Unnamed: 0,author,novel,text
0,Milton,Paradise,[ Paradise Lost by John Milton 1667 ]
1,Milton,Paradise,Book I
2,Milton,Paradise,"Of Man ' s first disobedience , and the fruit ..."
3,Milton,Paradise,Book II
4,Milton,Paradise,"High on a throne of royal state , which far Ou..."


In [21]:
len(gutenberg.paras('shakespeare-hamlet.txt'))

950

In [23]:
shakes_hamlet_paras = gutenberg.paras('shakespeare-hamlet.txt')
hamlet_paras=[]
for paragraph in shakes_hamlet_paras:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    hamlet_paras.append(' '.join(para))

In [24]:
df_shakes_hamlet = pd.DataFrame({'author': ['Shakespeare']*950, 
                                 'novel': ['Hamlet']*950, 
                                 'text': hamlet_paras})

In [25]:
df_shakes_hamlet.head(5)

Unnamed: 0,author,novel,text
0,Shakespeare,Hamlet,[ The Tragedie of Hamlet by William Shakespear...
1,Shakespeare,Hamlet,Actus Primus .
2,Shakespeare,Hamlet,Enter Barnardo and Francisco two Centinels .
3,Shakespeare,Hamlet,Barnardo .
4,Shakespeare,Hamlet,Bar .


In [26]:
len(gutenberg.paras('shakespeare-macbeth.txt'))

678

In [27]:
shakes_macbeth_paras = gutenberg.paras('shakespeare-macbeth.txt')
macbeth_paras=[]
for paragraph in shakes_macbeth_paras:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    macbeth_paras.append(' '.join(para))

In [28]:
df_shakes_macbeth = pd.DataFrame({'author': ['Shakespeare']*678, 
                                  'novel': ['Macbeth']*678, 
                                 'text': macbeth_paras})

In [29]:
df_shakes_macbeth.head(5)

Unnamed: 0,author,novel,text
0,Shakespeare,Macbeth,[ The Tragedie of Macbeth by William Shakespea...
1,Shakespeare,Macbeth,Actus Primus .
2,Shakespeare,Macbeth,Thunder and Lightning .
3,Shakespeare,Macbeth,1 .
4,Shakespeare,Macbeth,3 .


In [30]:
df_all_paras = pd.concat([df_mel_mobi,
                         df_milton_paradise,
                         df_shakes_hamlet,
                         df_shakes_macbeth])
df_all_paras.head(5)

Unnamed: 0,author,novel,text
0,Melville,Moby,[ Moby Dick by Herman Melville 1851 ]
1,Melville,Moby,ETYMOLOGY .
2,Melville,Moby,( Supplied by a Late Consumptive Usher to a Gr...
3,Melville,Moby,"The pale Usher threadbare in coat , heart , b..."
4,Melville,Moby,""" While you take in hand to school others , an..."


In [33]:
author = list(df_all_paras['author'])
novel = list(df_all_paras['novel'])
text = list(df_all_paras['text'])

In [None]:
df.to_csv('/path/filename.csv')

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_train, text_test, author_train, author_test = train_test_split(
    text, author, test_size=0.3, random_state=42, stratify=author)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case 
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
train_tfidf=vectorizer.fit_transform(text_train)
print("Number of features: %d" % train_tfidf.get_shape()[1])

test_tfidf = vectorizer.transform(text_test)
print("Number of features: %d" % test_tfidf.get_shape()[1])

Number of features: 2911
Number of features: 2911


In [74]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
fit = rfc.fit(train_tfidf, author_train)
predict = rfc.predict(test_tfidf)
score = rfc.score(test_tfidf, author_test)

In [76]:
print('Random Forest Score: {}'.format(score))

Random Forest Score: 0.9198501872659176


In [80]:
# cross validate

from sklearn.model_selection import cross_val_score

cv_tfidf = vectorizer.fit_transform(text)

cv_scores = cross_val_score(rfc, cv_tfidf, author)
print('Cross Validated Scores: {}'.format(cv_scores))
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

Cross Validated Scores: [ 0.87601078  0.90566038  0.90013495]
Accuracy: 0.89 (+/- 0.03)
