# Use of Word2Vec to classify documents

### Here we will use Jane Austen's *Persuasion* and Lewis Carroll's *Alice's Adventures in Wonderland* from NLTK's Gutenberg module. 
### The unit of observation (*documents*) will be the sentences of these novels.

In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
from sklearn import (datasets, model_selection, feature_extraction, linear_model, naive_bayes, ensemble)
import collections
from collections import Counter
import nltk
import gensim
import re
import multiprocessing as mp 
#import textacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
#!python -m spacy download en
# Silence every warning for the rest of the session.
# Note: this also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")
# Make sure the Gutenberg corpus used below is available locally
# (the call logs its progress and evaluates to True, as shown in the cell output).
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\00233270\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# Build the spaCy pipeline object `nlp` that the parsing cells below call.
# NOTE: the `en_core_web_sm` module itself is only referenced by the commented-out
# alternative loader on the last line; spacy.load() is given the model by name.
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")
#nlp = en_core_web_sm.load()

### A helper function for removing some punctuation marks and numbers from the text:

In [3]:
############################################# DO NOT DELETE ############################################# 
# Function to move specific column to the left side for easier view
def move_to_left(df, column_name):
    """Return ``df`` with ``column_name`` as the first column.

    All other columns keep their original relative order. The input frame is
    not modified; a new frame produced by column selection is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be reordered.
    column_name : hashable
        Label of the column to move. The label is used exactly as given when
        it exists in ``df``; otherwise its ``str()`` form is tried, which keeps
        older calls such as ``move_to_left(df, 1)`` for a column named ``"1"``
        working.

    Returns
    -------
    pandas.DataFrame
        The reordered frame.

    Raises
    ------
    KeyError
        If neither ``column_name`` nor ``str(column_name)`` is a column of ``df``.
    """
    # Prefer the label as passed so that non-string labels (e.g. the int 0) are
    # found; unconditionally coercing to str made those raise KeyError.
    if column_name in df.columns:
        target = column_name
    else:
        target = str(column_name)

    if target not in df.columns:
        raise KeyError("column {!r} not found in DataFrame".format(column_name))

    remaining = [col for col in df.columns if col != target]
    return df[[target] + remaining]

In [4]:
# Utility function for standard text cleaning
def text_cleaner(text):
    """Remove editorial artefacts from a raw text and normalise its whitespace.

    The steps run in this order:

    1. every double dash ``--`` becomes a single space;
    2. bracketed spans ``[...]`` are deleted (non-greedy; ``.`` does not match
       a newline, so a bracket pair must sit on one line to be removed);
    3. stand-alone integers and decimals, together with any whitespace and
       minus sign directly in front of them, are replaced by a space;
    4. every run of whitespace is collapsed to one space and the ends trimmed.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned, single-spaced text.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string literal: the previous non-raw "[\[]...[\]]" relied on the invalid
    # escape sequences "\[" and "\]", which Python reports as a warning and plans
    # to reject. The compiled pattern is unchanged.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    # split() with no arguments splits on any whitespace run, including newlines.
    text = ' '.join(text.split())
    return text

In [5]:
# Read each novel from the NLTK Gutenberg corpus, strip its chapter headings and
# run the shared text cleaner over the result.
# The two books mark chapters differently, hence one pattern per book:
# "Chapter <digits>" for Persuasion, and "CHAPTER " plus the rest of that line for Alice.
persuasion = text_cleaner(
    re.sub(r'Chapter \d+', '', gutenberg.raw('austen-persuasion.txt'))
)
alice = text_cleaner(
    re.sub(r'CHAPTER .*', '', gutenberg.raw('carroll-alice.txt'))
)

### The cleaned texts are stored in two variables called `alice` and `persuasion`. We now parse both texts with spaCy's English model; afterwards, we will split the parsed texts into sentences:

In [34]:
# Run the spaCy pipeline over both cleaned novels (Alice first, then Persuasion).
# Each book is parsed in one call, so this cell can take some time.
alice_doc, persuasion_doc = (nlp(novel_text) for novel_text in (alice, persuasion))

In [35]:
# Pair every sentence produced by spaCy's sentence segmentation with its author label
alice_sents = [[span, "Carroll"] for span in alice_doc.sents]
persuasion_sents = [[span, "Austen"] for span in persuasion_doc.sents]

# Stack the labelled sentences of both novels (Alice first) into one two-column DataFrame
sentences = pd.DataFrame(
    data=[*alice_sents, *persuasion_sents],
    columns=["text", "author"],
)
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


## Tokenize and Lemmatize

In [38]:
# Per-sentence results, kept in the same order (and length) as the rows of `sentences`.
tokens = []
lemma = []

# Feed the sentence strings back through the pipeline in batches of 50.
# `str` is the plain spelling of the legacy 'unicode' dtype alias.
# `n_threads` is no longer passed: it has been a deprecated no-op since spaCy 2.1
# and is not an accepted argument of `nlp.pipe` in spaCy 3.
sentence_texts = sentences['text'].astype(str).values

for doc in nlp.pipe(sentence_texts, batch_size=50):
    if doc.is_parsed:
        tokens.append([tok.text for tok in doc])
        lemma.append([tok.lemma_ for tok in doc])
    else:
        # Keep one entry per DataFrame row even when the parse fails, so the
        # lists below still line up with `sentences`. Downstream cells must be
        # prepared to meet these None placeholders.
        tokens.append(None)
        lemma.append(None)

sentences['tokens'] = tokens
sentences['lemma'] = lemma

In [41]:
# Preview the first three rows, now including the `tokens` and `lemma` columns added above.
sentences.head(3)

Unnamed: 0,text,author,tokens,lemma
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,"[Alice, was, beginning, to, get, very, tired, ...","(Alice, be, begin, to, get, very, tired, of, s..."
1,"(So, she, was, considering, in, her, own, mind...",Carroll,"[So, she, was, considering, in, her, own, mind...","(so, -PRON-, be, consider, in, -PRON-, own, mi..."
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,"[There, was, nothing, so, VERY, remarkable, in...","(there, be, nothing, so, very, remarkable, in,..."


## Vectorize

In [40]:
# Freeze each row's lemma list as a tuple.
# Rows whose parse failed hold None (the tokenizing loop appends None in its
# else-branch); tuple(None) raises TypeError, so those rows become an empty tuple.
sentences['lemma'] = [tuple(x) if x is not None else () for x in sentences['lemma']]

In [29]:
# Flatten each row's lemma sequence into a single ", "-separated string.
# This overwrites the `lemma` column in place with those strings.
sentences['lemma'] = sentences['lemma'].map(
    lambda row_lemmas: ', '.join(str(item) for item in row_lemmas)
)

In [31]:
# Count the rows whose lemma string contains 'man' anywhere; this is a pattern
# search on the whole string, not a match against one whole lemma.
lemma_has_man = sentences['lemma'].str.contains('man')
wrd = lemma_has_man.sum()
if wrd > 0:
    message = "There are {m}".format(m=wrd)
    print(message)

There are 490


In [59]:
# Train word2vec on the sentences.
#
# Word2Vec needs every sentence as a sequence of tokens. When the notebook is run
# top to bottom, the "Convert Pandas list to string" cell has replaced each row of
# sentences["lemma"] with one ", "-joined string; iterating such a string yields
# single characters, so the model would learn a character vocabulary and lookups
# such as 'man' would fail. Rebuild proper token lists first, whatever form the
# column currently holds.
def _lemma_tokens(row):
    """Return one sentence's lemmas as a list of tokens."""
    if row is None:
        # placeholder left by a failed parse
        return []
    if isinstance(row, str):
        # undo the ", ".join(...) of the earlier cell; an empty string has no tokens
        return row.split(', ') if row else []
    return list(row)


lemma_sentences = [_lemma_tokens(row) for row in sentences["lemma"]]

# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
model = gensim.models.Word2Vec(lemma_sentences, workers=4, min_count=5, window=6, sg=1, sample=1e-3, size=100, hs=1)

In [60]:
# Five nearest neighbours of the vector 'man' - 'woman'
man_minus_woman = model.wv.most_similar(positive=['man'], negative=['woman'], topn=5)
print(man_minus_woman)

[('with', 0.35816746950149536), ('Captain', 0.25637108087539673), ('towards', 0.25195395946502686), ('cry', 0.20996823906898499), ('smile', 0.20057760179042816)]


In [61]:
# Similarity between the vectors of 'woman' and 'man'.
# Query through `model.wv`, like the most_similar() calls in the neighbouring cells:
# calling similarity() on the model object itself is deprecated in gensim 3.x and
# removed in gensim 4.
print(model.wv.similarity('woman', 'man'))
#print(model.wv.similarity('horse', 'cat'))

0.82254267


In [62]:
# Nearest neighbours of 'king' + 'queen' - 'man' (default number of results)
model.wv.most_similar(
    positive=['king', 'queen'],
    negative=['man'],
)

[('verse', 0.9277867674827576),
 ('inn', 0.8846830129623413),
 ('executioner', 0.8718481659889221),
 ('stop', 0.870466947555542),
 ('suddenly', 0.8691501021385193),
 ('swam', 0.8632456064224243),
 ('sneeze', 0.860865592956543),
 ('cross', 0.8572542667388916),
 ('sadly', 0.8532841205596924),
 ('Rabbit', 0.8526747226715088)]

In [63]:
# Thirty nearest neighbours of 'king' + 'woman' - 'man'
model.wv.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=30,
)

[('sneeze', 0.8358640670776367),
 ('carry', 0.830854058265686),
 ('grass', 0.8272877931594849),
 ('adventure', 0.8253511786460876),
 ('cook', 0.8239763379096985),
 ('sadly', 0.8204719424247742),
 ('blow', 0.8202990889549255),
 ('argument', 0.8198114633560181),
 ('summer', 0.8170920610427856),
 ('knock', 0.8148330450057983),
 ('strange', 0.8147205114364624),
 ('bend', 0.8146598935127258),
 ('player', 0.8134366273880005),
 ('execution', 0.8128384947776794),
 ('arch', 0.8126190900802612),
 ('gardener', 0.8099405169487),
 ('swam', 0.808670163154602),
 ('verse', 0.8077516555786133),
 ('ring', 0.806759238243103),
 ('ear', 0.8043306469917297),
 ('slate', 0.8040744066238403),
 ('nibble', 0.8033859729766846),
 ('tail', 0.8025743961334229),
 ('flamingo', 0.802452802658081),
 ('stop', 0.8019906282424927),
 ('startled', 0.8017464876174927),
 ('shoulder', 0.8009440898895264),
 ('procession', 0.8004547357559204),
 ('stare', 0.8000544309616089),
 ('elbow', 0.7985504865646362)]

## Use multiple machine learning algorithms:

In [None]:
# Print training-set and test-set scores for each classifier, one banner per model.
# NOTE(review): none of lr, rfc, gbc, SDG, NB_Multi, NB_Bern or the X_/y_ splits is
# defined in a cell above this one; in this notebook they are first created in the
# cells further down. Confirm whether this cell is meant to sit here.
for heading, fitted_model in (
    ("----------------------Logistic Regression Scores----------------------", lr),
    ("----------------------Random Forest Scores----------------------", rfc),
    ("----------------------Gradient Boosting Scores----------------------", gbc),
    ("----------------------Stochastic Gradient Descent Scores----------------------", SDG),
    ("---------------------- Naive Bayes Multinominal Scores----------------------", NB_Multi),
    ("----------------------Naive Bayes Bernoulli Scores----------------------", NB_Bern),
):
    print(heading)
    print('Training set score:', fitted_model.score(X_train, y_train))
    print('\nTest set score:', fitted_model.score(X_test, y_test))
    print()

## Use of Machine Learning on Lemmas + POS only

In [None]:
# NOTE(review): `df_lemma_pos_s` is not created in any cell of this notebook as
# shown; it must already exist in the kernel for this cell to run.
# Target: the author label. Features: every remaining column, as a NumPy array.
Y = df_lemma_pos_s['author']
# `columns=` replaces the positional axis argument of drop(['author'], 1), which
# pandas deprecated in 1.3 and no longer accepts in 2.0.
X = np.array(df_lemma_pos_s.drop(columns=['author']))
# Split the dataset into training and test sets (40% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [None]:
# Models, each paired with the hyper-parameter grid searched for it.
# The estimators that use randomness (random forest, gradient boosting, SGD) get a
# fixed random_state, the same seed as the train/test split, so that re-running the
# notebook reproduces the same scores.
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier(random_state=123)

gbc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier(random_state=123)

# NOTE(review): the last alpha is 1e3 while the others step from 1e-4 to 1e-1;
# confirm it is intended and not a typo.
SDG_params = {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier(random_state=123)

NB_Multi_params = {'alpha': [1.0]}
NB_Multi = naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [None]:
# 5-fold cross-validated grid search for every model over its parameter grid.
# fit() returns the search object itself, so each search is built and fitted in one step.
clf_lr = GridSearchCV(estimator=lr, param_grid=lr_params, cv=5).fit(X_train, y_train)

clf_rfc = GridSearchCV(estimator=rfc, param_grid=rfc_params, cv=5).fit(X_train, y_train)

clf_gbc = GridSearchCV(estimator=gbc, param_grid=gbc_params, cv=5).fit(X_train, y_train)

clf_SDG = GridSearchCV(estimator=SDG, param_grid=SDG_params, cv=5).fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(estimator=NB_Multi, param_grid=NB_Multi_params, cv=5).fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(estimator=NB_Bern, param_grid=NB_Bern_params, cv=5).fit(X_train, y_train)

# Show the last fitted search as the cell's output
clf_NB_Bern

In [None]:
# Use the tuned models.
# Previously this cell fitted the untuned default estimators, so the grid searches
# run in the cell above never influenced the scores reported below.
# GridSearchCV refits the best parameter combination on the whole training set by
# default, so `best_estimator_` is already fitted and needs no further fit() call.
lr = clf_lr.best_estimator_
rfc = clf_rfc.best_estimator_
gbc = clf_gbc.best_estimator_
SDG = clf_SDG.best_estimator_
NB_Multi = clf_NB_Multi.best_estimator_
NB_Bern = clf_NB_Bern.best_estimator_

In [None]:
# Print training-set and test-set scores for each classifier, one banner per model
for heading, fitted_model in (
    ("----------------------Logistic Regression Scores----------------------", lr),
    ("----------------------Random Forest Scores----------------------", rfc),
    ("----------------------Gradient Boosting Scores----------------------", gbc),
    ("----------------------Stochastic Gradient Descent Scores----------------------", SDG),
    ("---------------------- Naive Bayes Multinominal Scores----------------------", NB_Multi),
    ("----------------------Naive Bayes Bernoulli Scores----------------------", NB_Bern),
):
    print(heading)
    print('Training set score:', fitted_model.score(X_train, y_train))
    print('\nTest set score:', fitted_model.score(X_test, y_test))
    print()

## Use of Machine Learning on Lemmas + POS + 2grams only

In [None]:
# NOTE(review): `df_lemma_pos_2gram_s` is not created in any cell of this notebook
# as shown; it must already exist in the kernel for this cell to run.
# Target: the author label. Features: every remaining column, as a NumPy array.
Y = df_lemma_pos_2gram_s['author']
# `columns=` replaces the positional axis argument of drop(['author'], 1), which
# pandas deprecated in 1.3 and no longer accepts in 2.0.
X = np.array(df_lemma_pos_2gram_s.drop(columns=['author']))
# Split the dataset into training and test sets (40% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [None]:
# Models, each paired with the hyper-parameter grid searched for it.
# The estimators that use randomness (random forest, gradient boosting, SGD) get a
# fixed random_state, the same seed as the train/test split, so that re-running the
# notebook reproduces the same scores.
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier(random_state=123)

gbc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier(random_state=123)

# NOTE(review): the last alpha is 1e3 while the others step from 1e-4 to 1e-1;
# confirm it is intended and not a typo.
SDG_params = {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier(random_state=123)

NB_Multi_params = {'alpha': [1.0]}
NB_Multi = naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [None]:
# 5-fold cross-validated grid search for every model over its parameter grid.
# fit() returns the search object itself, so each search is built and fitted in one step.
clf_lr = GridSearchCV(estimator=lr, param_grid=lr_params, cv=5).fit(X_train, y_train)

clf_rfc = GridSearchCV(estimator=rfc, param_grid=rfc_params, cv=5).fit(X_train, y_train)

clf_gbc = GridSearchCV(estimator=gbc, param_grid=gbc_params, cv=5).fit(X_train, y_train)

clf_SDG = GridSearchCV(estimator=SDG, param_grid=SDG_params, cv=5).fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(estimator=NB_Multi, param_grid=NB_Multi_params, cv=5).fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(estimator=NB_Bern, param_grid=NB_Bern_params, cv=5).fit(X_train, y_train)

# Show the last fitted search as the cell's output
clf_NB_Bern

In [None]:
# Use the tuned models.
# Previously this cell fitted the untuned default estimators, so the grid searches
# run in the cell above never influenced the scores reported below.
# GridSearchCV refits the best parameter combination on the whole training set by
# default, so `best_estimator_` is already fitted and needs no further fit() call.
lr = clf_lr.best_estimator_
rfc = clf_rfc.best_estimator_
gbc = clf_gbc.best_estimator_
SDG = clf_SDG.best_estimator_
NB_Multi = clf_NB_Multi.best_estimator_
NB_Bern = clf_NB_Bern.best_estimator_

In [None]:
# Print training-set and test-set scores for each classifier, one banner per model
for heading, fitted_model in (
    ("----------------------Logistic Regression Scores----------------------", lr),
    ("----------------------Random Forest Scores----------------------", rfc),
    ("----------------------Gradient Boosting Scores----------------------", gbc),
    ("----------------------Stochastic Gradient Descent Scores----------------------", SDG),
    ("---------------------- Naive Bayes Multinominal Scores----------------------", NB_Multi),
    ("----------------------Naive Bayes Bernoulli Scores----------------------", NB_Bern),
):
    print(heading)
    print('Training set score:', fitted_model.score(X_train, y_train))
    print('\nTest set score:', fitted_model.score(X_test, y_test))
    print()

# Conclusion

Adding POS tags to the lemmatized features did not improve the results, and adding 2-grams to the lemma + POS features gave the same results as the lemmatized features alone. Of all the models, multinomial Naive Bayes performed best.