In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score
import gensim
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost
import nltk

In [2]:
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')

In [3]:
# Print the first 100 characters of Hamlet
print('\nHamlet Raw:\n', hamlet[0:100])
# Print the first 100 characters of Macbeth
print('\n--------------\nMacbeth Raw:\n', macbeth[0:100])


Hamlet Raw:
 [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo a

--------------
Macbeth Raw:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig


In [4]:
# Print the first 100 characters of Alice again.
print('Hamlet:\n', hamlet[0:100])
# All done with cleanup? Let's see how it looks.
print('\nMacbeth:\n', macbeth[0:100])

Hamlet:
 [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo a

Macbeth:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig


In [5]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)

In [6]:
nlp = spacy.load('en')
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)

In [7]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=False):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)

In [8]:
hamlet_freq = word_frequencies(hamlet_doc).most_common(2000)
macbeth_freq = word_frequencies(macbeth_doc).most_common(2000)

In [9]:
# Pull out just the text from our frequency lists.
hamlet_common = [pair[0] for pair in hamlet_freq]
macbeth_common = [pair[0] for pair in macbeth_freq]

# Use sets to find the unique values in each top ten.
hamlet_unique = set(hamlet_common) - set(macbeth_common)
macbeth_unique = set(macbeth_common) - set(hamlet_common)

In [10]:
# Group into sentences.
hamlet_sents = [[sent, "Hamlet"] for sent in hamlet_doc.sents]
macbeth_sents = [[sent, "Macbeth"] for sent in macbeth_doc.sents]

In [11]:
# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(hamlet_sents + macbeth_sents)

In [12]:
# Define Label
sentences["Hamlet?"] = np.where(sentences[1]== "Hamlet", 1, 0)

# Convert sentences tuples to string
sentences[0] = sentences[0].astype(str)

# Tokenize sentences
sentences[0] = sentences.apply(lambda row: nltk.word_tokenize(row[0]), axis=1)

# Remove all stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
sentences[0] = sentences[0].apply(lambda x: [item for item in x if item not in stop])

# Lower Case everything
sentences[0] = sentences[0].astype(str)
sentences[0] = sentences[0].apply(lambda x: x.lower())

# remove all punctuations
from string import punctuation
sentences[0] = sentences[0].apply(lambda x: ''.join(c for c in x if c not in punctuation))

In [46]:
sentences.head()

Unnamed: 0,0,1,Hamlet?
0,actus primus,Hamlet,1
1,scoena prima,Hamlet,1
2,enter barnardo francisco two centinels,Hamlet,1
3,barnardo,Hamlet,1
4,who s,Hamlet,1


In [48]:
x = sentences[0]
y = sentences['Hamlet?']

# Split data into training and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [49]:
# Create count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(x_train)

xtrain_count = count_vect.transform(x_train)
xtest_count = count_vect.transform(x_test)

In [50]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(sentences[0])
xtrain_tfidf =  tfidf_vect.transform(x_train)
xtest_tfidf =  tfidf_vect.transform(x_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(sentences[0])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(x_train)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(x_test)

In [51]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, y_test)

# TF-IDF

In [52]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), y_train, xtest_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

Xgb, Count Vectors:  0.7595959595959596


In [53]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), y_train, xtest_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), y_train, xtest_tfidf_ngram.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, WordLevel TF-IDF:  0.7474747474747475
Xgb, CharLevel Vectors:  0.6222222222222222


In [59]:
# Random Forest Classifier
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(n_estimators=10)
train = rfc.fit(xtrain_tfidf, y_train)

print(rfc.score(xtest_tfidf, y_test))

0.803030303030303


In [60]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(xtrain_tfidf, y_train)
print(xtrain_tfidf.shape, y_train.shape)
print('Training set score:', lr.score(xtrain_tfidf, y_train))
print('\nTest set score:', lr.score(xtest_tfidf, y_test))

(3956, 5000) (3956,)
Training set score: 0.8968655207280081

Test set score: 0.798989898989899




In [64]:
clf = ensemble.GradientBoostingClassifier(n_estimators=500)
train = clf.fit(xtrain_tfidf, y_train)

print('Training set score:', clf.score(xtrain_tfidf, y_train))
print('\nTest set score:', clf.score(xtest_tfidf, y_test))

Training set score: 0.8882709807886754

Test set score: 0.794949494949495


# Count Vectorizer

In [89]:
# Random Forest Classifier
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier(n_estimators=8)
train = rfc.fit(xtrain_count, y_train)

print(rfc.score(xtest_count, y_test))

0.7606060606060606


In [23]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(xtrain_count, y_train)
print(xtrain_count.shape, y_train.shape)
print('Training set score:', lr.score(xtrain_count, y_train))
print('\nTest set score:', lr.score(xtest_count, y_test))

(3956, 5726) (3956,)
Training set score: 0.9436299292214358

Test set score: 0.8080808080808081




In [24]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(xtrain_count, y_train)

print('Training set score:', clf.score(xtrain_count, y_train))
print('\nTest set score:', clf.score(xtest_count, y_test))

Training set score: 0.7591001011122346

Test set score: 0.7676767676767676


In [25]:
# Remove words that are found in both texts

In [26]:
hamlet = sentences[sentences[1]=='Hamlet']
hamlet = hamlet[0]

macbeth = sentences[sentences[1]=='Macbeth']
macbeth = macbeth[0]

from nltk.tokenize import word_tokenize

tokenized_hamlet = [word_tokenize(i) for i in list(hamlet)]
tokenized_macbeth = [word_tokenize(i) for i in list(macbeth)]

from itertools import chain
tokenized_hamlet = list(chain.from_iterable(tokenized_hamlet))
tokenized_macbeth = list(chain.from_iterable(tokenized_macbeth))

hamlet_unique = set(tokenized_hamlet) - set(tokenized_macbeth)
macbeth_unique = set(tokenized_macbeth) - set(tokenized_hamlet)

features1 = pd.DataFrame()
features1['Words'] = list(hamlet_unique)
features1['Text'] = 'Hamlet'

features2 = pd.DataFrame()
features2['Words'] = list(macbeth_unique)
features2['Text'] = 'Macbeth'

features = features1.append(features2)

features["Hamlet?"] = np.where(features['Text'] == 'Hamlet', 1, 0)

features.head()

Unnamed: 0,Words,Text,Hamlet?
0,quintessence,Hamlet,1
1,courteous,Hamlet,1
2,purples,Hamlet,1
3,comingled,Hamlet,1
4,leasure,Hamlet,1


In [27]:
x = features['Words']
y = features['Hamlet?']

# Split data into training and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [28]:
# Create count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(x_train)

xtrain_count = count_vect.transform(x_train)
xtest_count = count_vect.transform(x_test)

In [29]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(sentences[0])
xtrain_tfidf =  tfidf_vect.transform(x_train)
xtest_tfidf =  tfidf_vect.transform(x_test)

In [30]:
# Random Forest Classifier
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier()
train = rfc.fit(xtrain_tfidf, y_train)

print(rfc.score(xtest_tfidf, y_test))



0.6533742331288344


In [31]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(xtrain_tfidf, y_train)
print(xtrain_tfidf.shape, y_train.shape)
print('Training set score:', lr.score(xtrain_tfidf, y_train))
print('\nTest set score:', lr.score(xtest_tfidf, y_test))

(3911, 5000) (3911,)
Training set score: 0.6205574021989261

Test set score: 0.6533742331288344




In [32]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(xtrain_tfidf, y_train)

print('Training set score:', clf.score(xtrain_tfidf, y_train))
print('\nTest set score:', clf.score(xtest_tfidf, y_test))

Training set score: 0.6205574021989261

Test set score: 0.6533742331288344


In [33]:
# Random Forest Classifier
from sklearn import ensemble

rfc = ensemble.RandomForestClassifier()
train = rfc.fit(xtrain_count, y_train)

print(rfc.score(xtest_count, y_test))



0.6533742331288344


In [34]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(xtrain_count, y_train)
print(xtrain_count.shape, y_train.shape)
print('Training set score:', lr.score(xtrain_count, y_train))
print('\nTest set score:', lr.score(xtest_count, y_test))

(3911, 3911) (3911,)
Training set score: 0.6205574021989261

Test set score: 0.6533742331288344




In [35]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(xtrain_count, y_train)

print('Training set score:', clf.score(xtrain_count, y_train))
print('\nTest set score:', clf.score(xtest_count, y_test))

Training set score: 0.6205574021989261

Test set score: 0.6533742331288344
