# Can NLP help to distinguish the authors of two books?

## Here we will use Jane Austen's *Persuasion* and Lewis Carroll's *Alice's Adventures in Wonderland* from NLTK's Gutenberg module. 
### The unit of observation (*documents*) will be the sentences of these novels.

In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import en_core_web_sm
nlp = en_core_web_sm.load()
from nltk.corpus import gutenberg
import nltk
import warnings
from sklearn import (datasets, model_selection, feature_extraction, linear_model, naive_bayes, ensemble)
import collections
from collections import Counter
import nltk
import gensim
import re
import multiprocessing as mp 
import textacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
#!python -m spacy download en
warnings.filterwarnings("ignore")
nltk.download('gutenberg')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

### A helper function for removing some punctuation marks and numbers from the text:

In [2]:
############################################# DO NOT DELETE ############################################# 
# Function to move specific column to the left side for easier view
def move_to_left(df, column_name):
    df= df[ [str(column_name)] + [ col for col in df.columns if col != str(column_name) ] ]
    return df

In [3]:
# Utility function for standard text cleaning
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [4]:
# Load and clean the data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

### The cleaned texts are stored in two variables called `alice` and `persuasion`. Later, we will split the texts into sentences. We will use spaCy English module and use spaCy to parse both the `alice` and `persuasion` texts:

In [5]:
# Parse the cleaned novels. This can take some time.
#nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [6]:
# POSs
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one DataFrame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = ["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


## Tokenize and Lemmaitize 

In [7]:
tokens = []
lemma = []
pos = []

for doc in nlp.pipe(sentences['text'].astype('unicode').values, batch_size=50,
                        n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
        
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

sentences['tokens'] = tokens
sentences['lemma'] = lemma
sentences['pos'] = pos

In [8]:
sentences.head(3)

Unnamed: 0,text,author,tokens,lemma,pos
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,"[Alice, was, beginning, to, get, very, tired, ...","[Alice, be, begin, to, get, very, tired, of, s...","[PROPN, AUX, VERB, PART, AUX, ADV, ADJ, ADP, V..."
1,"(So, she, was, considering, in, her, own, mind...",Carroll,"[So, she, was, considering, in, her, own, mind...","[so, -PRON-, be, consider, in, -PRON-, own, mi...","[ADV, PRON, AUX, VERB, ADP, DET, ADJ, NOUN, PU..."
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,"[There, was, nothing, so, VERY, remarkable, in...","[there, be, nothing, so, very, remarkable, in,...","[PRON, AUX, PRON, ADV, ADV, ADJ, ADP, DET, PUN..."


## Vectorize

In [9]:
sentences.columns

Index(['text', 'author', 'tokens', 'lemma', 'pos'], dtype='object')

In [10]:
sentences.head(2)

Unnamed: 0,text,author,tokens,lemma,pos
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,"[Alice, was, beginning, to, get, very, tired, ...","[Alice, be, begin, to, get, very, tired, of, s...","[PROPN, AUX, VERB, PART, AUX, ADV, ADJ, ADP, V..."
1,"(So, she, was, considering, in, her, own, mind...",Carroll,"[So, she, was, considering, in, her, own, mind...","[so, -PRON-, be, consider, in, -PRON-, own, mi...","[ADV, PRON, AUX, VERB, ADP, DET, ADJ, NOUN, PU..."


In [11]:
# Convert Pandas list to string
sentences['lemma'] = [', '.join(map(str, l)) for l in sentences['lemma']]
sentences['pos'] = [', '.join(map(str, l)) for l in sentences['pos']]

In [12]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["lemma"],
    workers=4,
    min_count=1,
    window=6,
    sg=0,
    sample=1e-3,
    size=100,
    hs=1
)

In [14]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

KeyError: "word 'lady' not in vocabulary"

In [12]:
# Vectorize Lemmas
vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform(sentences['lemma'])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_lemma = pd.concat([bow_df, sentences[[ "author"]]], axis=1)

In [13]:
df_lemma.shape

(5632, 5094)

In [14]:
# Vectorize Lemmas + POS
vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform(sentences['pos'])
bow_pos = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_lemma_pos = pd.concat([bow_pos, df_lemma], axis=1)

In [15]:
df_lemma_pos.shape

(5632, 5109)

In [16]:
# Vectorize 2-grams
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2))
X = vectorizer.fit_transform(sentences['lemma'])
bow_ngram = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_lemma_pos_2gram = pd.concat([bow_ngram, df_lemma_pos], axis=1)

In [17]:
df_lemma_pos_2gram.shape

(5632, 44899)

## Use multiple machine learning algorithms:

In [19]:
df_lemma_s = df_lemma.sample(5000)
df_lemma_pos_s = df_lemma_pos.sample(5000)
df_lemma_pos_2gram_s = df_lemma_pos_2gram.sample(5000)

## Use of Machine Learning on Lemmas only

In [21]:
Y = df_lemma_s['author']
X = np.array(df_lemma_s.drop(['author'], 1))
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [22]:
# Models
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],"max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

SDG_params= {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier()

NB_Multi_params = {'alpha': [1.0]}
NB_Multi= naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [23]:
# GridsearchCV
clf_lr = GridSearchCV(lr, lr_params, cv=5)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=5)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=5)
clf_gbc.fit(X_train, y_train)

clf_SDG = GridSearchCV(SDG, SDG_params, cv=5)
clf_SDG.fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(NB_Multi, NB_Multi_params, cv=5)
clf_NB_Multi.fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(NB_Bern, NB_Bern_params, cv=5)
clf_NB_Bern.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=BernoulliNB(), param_grid={'alpha': [1.0]})

In [24]:
# Fit the models
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)
SDG.fit(X_train, y_train)
NB_Multi.fit(X_train, y_train)
NB_Bern.fit(X_train, y_train)

BernoulliNB()

In [25]:
# Check some basic performance
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

print("----------------------Stochastic Gradient Descent Scores----------------------")
print('Training set score:', SDG.score(X_train, y_train))
print('\nTest set score:', SDG.score(X_test, y_test))

print("---------------------- Naive Bayes Multinominal Scores----------------------")
print('Training set score:', NB_Multi.score(X_train, y_train))
print('\nTest set score:', NB_Multi.score(X_test, y_test))

print("----------------------Naive Bayes Bernoulli Scores----------------------")
print('Training set score:', NB_Bern.score(X_train, y_train))
print('\nTest set score:', NB_Bern.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9523333333333334

Test set score: 0.8825
----------------------Random Forest Scores----------------------
Training set score: 0.9946666666666667

Test set score: 0.8525
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8553333333333333

Test set score: 0.834
----------------------Stochastic Gradient Descent Scores----------------------
Training set score: 0.98

Test set score: 0.8745
---------------------- Naive Bayes Multinominal Scores----------------------
Training set score: 0.9336666666666666

Test set score: 0.911
----------------------Naive Bayes Bernoulli Scores----------------------
Training set score: 0.8923333333333333

Test set score: 0.8715


## Use of Machine Learning on Lemmas + POS only

In [26]:
Y = df_lemma_pos_s['author']
X = np.array(df_lemma_pos_s.drop(['author'], 1))
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [27]:
# Models
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],"max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

SDG_params= {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier()

NB_Multi_params = {'alpha': [1.0]}
NB_Multi= naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [28]:
# GridsearchCV
clf_lr = GridSearchCV(lr, lr_params, cv=5)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=5)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=5)
clf_gbc.fit(X_train, y_train)

clf_SDG = GridSearchCV(SDG, SDG_params, cv=5)
clf_SDG.fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(NB_Multi, NB_Multi_params, cv=5)
clf_NB_Multi.fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(NB_Bern, NB_Bern_params, cv=5)
clf_NB_Bern.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=BernoulliNB(), param_grid={'alpha': [1.0]})

In [29]:
# Fit the models
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)
SDG.fit(X_train, y_train)
NB_Multi.fit(X_train, y_train)
NB_Bern.fit(X_train, y_train)

BernoulliNB()

In [30]:
# Check some basic performance
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

print("----------------------Stochastic Gradient Descent Scores----------------------")
print('Training set score:', SDG.score(X_train, y_train))
print('\nTest set score:', SDG.score(X_test, y_test))

print("---------------------- Naive Bayes Multinominal Scores----------------------")
print('Training set score:', NB_Multi.score(X_train, y_train))
print('\nTest set score:', NB_Multi.score(X_test, y_test))

print("----------------------Naive Bayes Bernoulli Scores----------------------")
print('Training set score:', NB_Bern.score(X_train, y_train))
print('\nTest set score:', NB_Bern.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.954

Test set score: 0.8775
----------------------Random Forest Scores----------------------
Training set score: 0.9936666666666667

Test set score: 0.8445
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8776666666666667

Test set score: 0.837
----------------------Stochastic Gradient Descent Scores----------------------
Training set score: 0.976

Test set score: 0.882
---------------------- Naive Bayes Multinominal Scores----------------------
Training set score: 0.925

Test set score: 0.893
----------------------Naive Bayes Bernoulli Scores----------------------
Training set score: 0.854

Test set score: 0.8365


## Use of Machine Learning on Lemmas + POS + 2grams only

In [31]:
Y = df_lemma_pos_2gram_s['author']
X = np.array(df_lemma_pos_2gram_s.drop(['author'], 1))
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [32]:
# Models
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],"max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

SDG_params= {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier()

NB_Multi_params = {'alpha': [1.0]}
NB_Multi= naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [33]:
# GridsearchCV
clf_lr = GridSearchCV(lr, lr_params, cv=5)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=5)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=5)
clf_gbc.fit(X_train, y_train)

clf_SDG = GridSearchCV(SDG, SDG_params, cv=5)
clf_SDG.fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(NB_Multi, NB_Multi_params, cv=5)
clf_NB_Multi.fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(NB_Bern, NB_Bern_params, cv=5)
clf_NB_Bern.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=BernoulliNB(), param_grid={'alpha': [1.0]})

In [34]:
# Fit the models
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)
SDG.fit(X_train, y_train)
NB_Multi.fit(X_train, y_train)
NB_Bern.fit(X_train, y_train)

BernoulliNB()

In [35]:
# Check some basic performance
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

print("----------------------Stochastic Gradient Descent Scores----------------------")
print('Training set score:', SDG.score(X_train, y_train))
print('\nTest set score:', SDG.score(X_test, y_test))

print("---------------------- Naive Bayes Multinominal Scores----------------------")
print('Training set score:', NB_Multi.score(X_train, y_train))
print('\nTest set score:', NB_Multi.score(X_test, y_test))

print("----------------------Naive Bayes Bernoulli Scores----------------------")
print('Training set score:', NB_Bern.score(X_train, y_train))
print('\nTest set score:', NB_Bern.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9836666666666667

Test set score: 0.886
----------------------Random Forest Scores----------------------
Training set score: 0.9933333333333333

Test set score: 0.841
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8823333333333333

Test set score: 0.847
----------------------Stochastic Gradient Descent Scores----------------------
Training set score: 0.9863333333333333

Test set score: 0.8845
---------------------- Naive Bayes Multinominal Scores----------------------
Training set score: 0.937

Test set score: 0.883
----------------------Naive Bayes Bernoulli Scores----------------------
Training set score: 0.8686666666666667

Test set score: 0.8385


# Conclustion

Adding POS to Lemmatized features did not improve the result. And adding 2-grams to Lemmatized with POS feature gave same result as Lemmatized features alone. Above all models Naive Bayes Multinominal performed best. 