# Use of Word2Vec to classify documents

### Here we will use Jane Austen's *Persuasion* and Lewis Carroll's *Alice's Adventures in Wonderland* from NLTK's Gutenberg module. 
### The unit of observation (*documents*) will be the sentences of these novels.

In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
from sklearn import (datasets, model_selection, feature_extraction, linear_model, naive_bayes, ensemble)
import collections
from collections import Counter
import nltk
import gensim
import re
import multiprocessing as mp 
#import textacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
#!python -m spacy download en
warnings.filterwarnings("ignore")
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\00233270\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")
#nlp = en_core_web_sm.load()

### A helper function for removing some punctuation marks and numbers from the text:

In [3]:
############################################# DO NOT DELETE ############################################# 
# Function to move specific column to the left side for easier view
def move_to_left(df, column_name):
    df= df[ [str(column_name)] + [ col for col in df.columns if col != str(column_name) ] ]
    return df

In [4]:
# Utility function for standard text cleaning
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [5]:
# Load and clean the data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

### The cleaned texts are stored in two variables called `alice` and `persuasion`. Later, we will split the texts into sentences. We will use spaCy English module and use spaCy to parse both the `alice` and `persuasion` texts:

In [6]:
# Parse the cleaned novels. This can take some time.
#nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [7]:
# POSs
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one DataFrame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = ["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


## Tokenize and Lemmaitize 

In [18]:
tokens = []
lemma = []
#pos = []

for doc in nlp.pipe(sentences['text'].astype('unicode').values, batch_size=50,
                        n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
  #      pos.append([n.pos_ for n in doc])
        
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
 #       pos.append(None)

sentences['tokens'] = tokens
sentences['lemma'] = lemma
#sentences['pos'] = pos

In [21]:
sentences.head(3)

Unnamed: 0,text,author,tokens,lemma
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,"[Alice, was, beginning, to, get, very, tired, ...","(Alice, be, begin, to, get, very, tired, of, s..."
1,"(So, she, was, considering, in, her, own, mind...",Carroll,"[So, she, was, considering, in, her, own, mind...","(so, -PRON-, be, consider, in, -PRON-, own, mi..."
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,"[There, was, nothing, so, VERY, remarkable, in...","(there, be, nothing, so, very, remarkable, in,..."


## Vectorize

In [20]:
sentences['lemma'] = [tuple(x) for x in sentences['lemma']]

In [11]:
# Convert Pandas list to string
#sentences['lemma'] = [', '.join(map(str, l)) for l in sentences['lemma']]
#sentences['pos'] = [', '.join(map(str, l)) for l in sentences['pos']]

In [40]:
wrd = sentences['lemma'].str.contains('peep').sum()
if wrd>0:
    print ("There are {m}".format(m=wrd))

In [41]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(sentences["lemma"], workers=4, min_count=1, window=6, sg=1, sample=1e-3, size=100, hs=1)

In [42]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

[('people', 0.7728192806243896), ('navy', 0.7627352476119995), ('gentleman', 0.7612074613571167), ('company', 0.7387996912002563), ('between', 0.7269482016563416)]
uncle
0.90191114
0.51556206


In [43]:
print(model.wv.most_similar(positive=['man'], negative=['woman'], topn=5))

[('Turtle', 0.2830783724784851), ('March', 0.1869412511587143), ('voice', 0.1750718355178833), ('Captain', 0.16483615338802338), ('tone', 0.16350901126861572)]


In [44]:
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)

[('shoulder', 0.9129279851913452),
 ('knee', 0.9075636863708496),
 ('queen', 0.9064924716949463),
 ('toe', 0.905070424079895),
 ('farther', 0.9045087695121765)]

In [45]:
sentences.shape

(5632, 4)

In [46]:
sentences.head(2)

Unnamed: 0,text,author,tokens,lemma
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,"[Alice, was, beginning, to, get, very, tired, ...","(Alice, be, begin, to, get, very, tired, of, s..."
1,"(So, she, was, considering, in, her, own, mind...",Carroll,"[So, she, was, considering, in, her, own, mind...","(so, -PRON-, be, consider, in, -PRON-, own, mi..."


In [49]:
word2vec_arr = np.zeros((sentences.shape[0],100))

for i, sentence in enumerate(sentences["lemma"]):
    word2vec_arr[i,:] = np.mean([model[lemma] for lemma in sentence], axis=0)

word2vec_arr = pd.DataFrame(word2vec_arr)
sentences = pd.concat([sentences[["author", "text"]],word2vec_arr], axis=1)
sentences.dropna(inplace=True)

sentences.head()

Unnamed: 0,author,text,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,Carroll,"(Alice, was, beginning, to, get, very, tired, ...",0.158378,0.060011,0.15775,-0.029606,-0.236574,0.036433,0.28354,-0.247457,...,-0.002104,-0.223912,-0.268862,0.030572,0.20309,-0.025319,-0.156174,-0.392246,-0.055377,-0.251224
1,Carroll,"(So, she, was, considering, in, her, own, mind...",0.172123,0.068418,0.06649,0.023262,-0.253577,0.043693,0.291482,-0.182503,...,0.002124,-0.208643,-0.264256,0.020851,0.184978,0.031838,-0.096466,-0.332626,-0.098685,-0.210926
2,Carroll,"(There, was, nothing, so, VERY, remarkable, in...",0.243648,0.087739,0.124225,-0.016461,-0.293289,-0.024041,0.241044,-0.261307,...,-0.007751,-0.234767,-0.26595,0.011011,0.256305,-0.002414,-0.096413,-0.379305,-0.086064,-0.248957
3,Carroll,"(Oh, dear, !)",0.042823,0.081108,0.248071,0.15329,-0.109163,0.08095,0.034149,-0.121746,...,-0.048959,-0.059532,-0.493349,0.311079,0.629674,-0.022445,-0.147822,-0.627011,-0.26978,-0.358592
4,Carroll,"(Oh, dear, !)",0.042823,0.081108,0.248071,0.15329,-0.109163,0.08095,0.034149,-0.121746,...,-0.048959,-0.059532,-0.493349,0.311079,0.629674,-0.022445,-0.147822,-0.627011,-0.26978,-0.358592


## Use multiple machine learning algorithms on 1-gram:

In [52]:
Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [53]:
# Models
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],"max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

SDG_params= {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier()

In [57]:
# GridsearchCV
clf_lr = GridSearchCV(lr, lr_params, cv=2)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=2)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=2)
clf_gbc.fit(X_train, y_train)

clf_SDG = GridSearchCV(SDG, SDG_params, cv=2)
clf_SDG.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=SGDClassifier(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1000.0],
                         'n_jobs': [-1], 'penalty': ['l2']})

In [58]:
# Fit the models
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)
SDG.fit(X_train, y_train)

SGDClassifier()

In [59]:
# Check some basic performance
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

print("----------------------Stochastic Gradient Descent Scores----------------------")
print('Training set score:', SDG.score(X_train, y_train))
print('\nTest set score:', SDG.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9274933412252145

Test set score: 0.9289835774522859
----------------------Random Forest Scores----------------------
Training set score: 0.999408108907961

Test set score: 0.9369729249889037
----------------------Gradient Boosting Scores----------------------
Training set score: 0.977212192956496

Test set score: 0.9445184198845983
----------------------Stochastic Gradient Descent Scores----------------------
Training set score: 0.940514945250074

Test set score: 0.9422991566799822
