In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score
import gensim
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost
import nltk

In [2]:
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')

In [3]:
# Print the first 100 characters of Hamlet
print('\nHamlet Raw:\n', hamlet[0:100])
# Print the first 100 characters of Macbeth
print('\n--------------\nMacbeth Raw:\n', macbeth[0:100])


Hamlet Raw:
 [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo a

--------------
Macbeth Raw:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig


In [4]:
# Print the first 100 characters of Alice again.
print('Hamlet:\n', hamlet[0:100])
# All done with cleanup? Let's see how it looks.
print('\nMacbeth:\n', macbeth[0:100])

Hamlet:
 [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo a

Macbeth:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig


In [5]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)

In [6]:
nlp = spacy.load('en')
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)

In [7]:
# Group into sentences.
hamlet_sents = [[sent, "Hamlet"] for sent in hamlet_doc.sents]
macbeth_sents = [[sent, "Macbeth"] for sent in macbeth_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(hamlet_sents + macbeth_sents)

# Define Label
sentences["Hamlet?"] = np.where(sentences[1]== "Hamlet", 1, 0)

# Convert sentences tuples to string
sentences[0] = sentences[0].astype(str)

# Tokenize sentences
sentences[0] = sentences.apply(lambda row: nltk.word_tokenize(row[0]), axis=1)

# Remove all stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
sentences[0] = sentences[0].apply(lambda x: [item for item in x if item not in stop])

# Lower Case everything
sentences[0] = sentences[0].astype(str)
sentences[0] = sentences[0].apply(lambda x: x.lower())

# remove all punctuations
from string import punctuation
sentences[0] = sentences[0].apply(lambda x: ''.join(c for c in x if c not in punctuation))

In [8]:
sentences.head()

Unnamed: 0,0,1,Hamlet?
0,actus primus,Hamlet,1
1,scoena prima,Hamlet,1
2,enter barnardo francisco two centinels,Hamlet,1
3,barnardo,Hamlet,1
4,who s,Hamlet,1


In [9]:
x = sentences[0]
y = sentences['Hamlet?']

# Split data into training and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [10]:
# Create count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(x_train)

xtrain_count = count_vect.transform(x_train)
xtest_count = count_vect.transform(x_test)

In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(sentences[0])
xtrain_tfidf =  tfidf_vect.transform(x_train)
xtest_tfidf =  tfidf_vect.transform(x_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(sentences[0])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(x_train)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(x_test)

In [12]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, y_test)

In [13]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), y_train, xtest_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

Xgb, Count Vectors:  0.7595959595959596


In [14]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), y_train, xtest_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), y_train, xtest_tfidf_ngram.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, WordLevel TF-IDF:  0.7474747474747475
Xgb, CharLevel Vectors:  0.6222222222222222
