In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim
import warnings
warnings.filterwarnings("ignore")

nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/abilenky/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# Utility function for standard text cleaning
def text_cleaner(text):
    """Return a cleaned copy of ``text`` ready for parsing.

    The following steps are applied in order:

    1. every double dash ``--`` is replaced by a single space;
    2. bracketed spans ``[...]`` are deleted (shortest match, and because
       ``.`` does not match a newline, only spans on a single line);
    3. standalone numbers (integers or decimals, together with any leading
       whitespace and an optional minus sign) are replaced by a space;
    4. all runs of whitespace are collapsed to one space and the ends
       are stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash --. Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string: "\[" and "\]" are not valid escapes in a plain string
    # literal, so the pattern must not go through string-escape processing.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    # split() with no arguments splits on any whitespace run and drops
    # leading/trailing whitespace, so the join normalises all spacing.
    return ' '.join(text.split())

In [3]:
# Fetch the raw text of both novels from the NLTK Gutenberg corpus
alice_raw = gutenberg.raw('carroll-alice.txt')
persuasion_raw = gutenberg.raw('austen-persuasion.txt')

# Each book marks its chapters differently, so each needs its own pattern:
# Alice uses "CHAPTER <rest of the line>", Persuasion uses "Chapter <digits>".
alice_no_headings = re.sub(r'CHAPTER .*', '', alice_raw)
persuasion_no_headings = re.sub(r'Chapter \d+', '', persuasion_raw)

# Run the shared cleaning routine over both texts
alice = text_cleaner(alice_no_headings)
persuasion = text_cleaner(persuasion_no_headings)

In [4]:
# Parse the cleaned novels. This can take some time.
# 'en' is a shortcut link that only exists on spaCy 2.x installations;
# spaCy 3 removed shortcut links and raises OSError for it, so fall back to
# the full package name of the small English pipeline in that case.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [5]:
# Label every spaCy sentence with the author of the novel it came from
def _label_sentences(doc, author):
    """Return one ``[sentence, author]`` row per sentence in ``doc.sents``."""
    return [[sent, author] for sent in doc.sents]


alice_sents = _label_sentences(alice_doc, "Carroll")
persuasion_sents = _label_sentences(persuasion_doc, "Austen")

# Stack both novels (Alice first, then Persuasion) into a single DataFrame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns=["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [6]:
# Get rid of stop words and punctuation,
# and lemmatize the tokens
def _content_lemmas(sentence):
    """Return the lemmas of the tokens in ``sentence`` that are neither
    punctuation nor stop words, in their original order."""
    return [
        token.lemma_
        for token in sentence
        if not token.is_punct and not token.is_stop
    ]


# Replace the whole column in one assignment instead of writing a list into
# each cell with .loc inside a Python loop. After this cell the "text"
# column holds lists of lemma strings, no longer parsed sentences.
sentences["text"] = sentences["text"].apply(_content_lemmas)

In [65]:
# Train word2vec on the sentences
size = 100  # dimensionality of each word vector (and of each sentence vector)
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=1,  # keep every lemma so each one can be looked up below
    window=8,
    sg=0,
    sample=50,
    size=size,  # NOTE(review): gensim >= 4.0 names this parameter `vector_size`
    hs=1
)

# One row per sentence: the element-wise mean of its lemma vectors.
word2vec_arr = np.zeros((sentences.shape[0], size))

for i, sentence in enumerate(sentences["text"]):
    # Vectors are looked up on the model's KeyedVectors (`model.wv`);
    # indexing the model object itself is deprecated. A sentence with no
    # lemmas left averages an empty list, which gives NaN for the whole row.
    word2vec_arr[i, :] = np.mean([model.wv[lemma] for lemma in sentence], axis=0)

# Row i above belongs to the i-th row of `sentences` by position, so reuse
# that frame's index; concat aligns on index labels, and a fresh RangeIndex
# would misalign rows whenever `sentences` is not indexed 0..n-1.
word2vec_arr = pd.DataFrame(word2vec_arr, index=sentences.index)

# Attach the vector columns and drop rows containing NaN (the empty sentences).
sentences = pd.concat([sentences[["author", "text"]], word2vec_arr], axis=1).dropna()

sentences.head()

Unnamed: 0,author,text,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,Carroll,"[Alice, begin, tired, sit, sister, bank, have,...",0.156718,0.002684,-0.233711,-0.071123,-0.053649,0.253381,-0.154396,0.329343,...,-0.092105,-0.13399,0.00877,0.052168,0.2261,0.158226,0.440264,0.106349,-0.103918,0.050352
1,Carroll,"[consider, mind, hot, day, feel, sleepy, stupi...",0.1112,0.032045,-0.114876,-0.023441,-0.009155,0.127409,-0.075661,0.146582,...,-0.050087,-0.067237,-0.021828,0.022497,0.119705,0.084655,0.194639,0.050525,-0.05951,0.041016
2,Carroll,"[remarkable, Alice, think, way, hear, Rabbit]",0.336701,0.042047,-0.402422,-0.196097,-0.074489,0.520906,-0.262239,0.529839,...,-0.145754,-0.194869,-0.018468,0.137644,0.418958,0.305857,0.671031,0.191104,-0.254096,0.113918
3,Carroll,"[oh, dear]",0.083552,0.007355,-0.129789,-0.042346,-0.028135,0.14197,-0.080866,0.170579,...,-0.057806,-0.066599,-0.009402,0.016764,0.131715,0.092017,0.22631,0.058827,-0.059215,0.034975
4,Carroll,"[oh, dear]",0.083552,0.007355,-0.129789,-0.042346,-0.028135,0.14197,-0.080866,0.170579,...,-0.057806,-0.066599,-0.009402,0.016764,0.131715,0.092017,0.22631,0.058827,-0.059215,0.034975


In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Target: the author label. Features: every remaining column, i.e. the
# averaged word2vec components.
Y = sentences['author']
# Use the `columns=` keyword: passing the axis as a second positional
# argument is rejected by recent pandas releases.
X = np.array(sentences.drop(columns=['text', 'author']))

# Split the dataset into training and test sets (40% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

# Fit each model on the training split, then report its score on both
# splits. The printed text is the same as the hand-written stanzas it replaces.
for title, clf in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    clf.fit(X_train, y_train)
    print("----------------------" + title + " Scores----------------------")
    print('Training set score:', clf.score(X_train, y_train))
    print('\nTest set score:', clf.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9002624671916011

Test set score: 0.9015748031496063
----------------------Random Forest Scores----------------------
Training set score: 0.9991251093613298

Test set score: 0.8989501312335958
----------------------Gradient Boosting Scores----------------------
Training set score: 0.9886264216972879

Test set score: 0.9068241469816273


In [68]:
from sklearn.model_selection import cross_val_score

# Score the logistic regression with 10-fold cross-validation over the full
# feature matrix, then show the average of the per-fold scores.
fold_scores = cross_val_score(lr, X, Y, cv=10)
fold_scores.mean()

0.8981427390465692