# Compare NLP Techniques: Build Model On doc2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

### Create doc2vec Vectors

In [2]:
# Created TaggedDocument vectors for each text message in the training and test sets
tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i])
                     for i, v in enumerate(X_train['clean_text'])]
tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i])
                    for i, v in enumerate(X_test['clean_text'])]

In [3]:
# What do these TaggedDocument objects look like?
tagged_docs_train[:10]

[TaggedDocument(words="['dear', 'call']", tags=[0]),
 TaggedDocument(words="['jus', 'came', 'back', 'fr', 'lunch', 'wif', 'sis', 'u', 'leh']", tags=[1]),
 TaggedDocument(words="['played', 'smash', 'bros', 'ltgt', 'religiously']", tags=[2]),
 TaggedDocument(words="['asked', 'hows', 'anthony', 'dad', 'bf']", tags=[3]),
 TaggedDocument(words="['slow', 'using', 'biolas', 'fne']", tags=[4]),
 TaggedDocument(words="['urgent', '4', 'costa', 'del', 'sol', 'holiday', 'å', '5000', 'await', 'collection', 'call', '09050090044', 'toclaim', 'sae', 'tc', 'pobox334', 'stockport', 'sk38xh', 'costå', '150pm', 'max10mins']", tags=[5]),
 TaggedDocument(words="['dunno', 'lei', 'might', 'b', 'eatin', 'wif', 'frens', 'ì', 'wan', 'eat', 'wait', '4', 'ì', 'lar']", tags=[6]),
 TaggedDocument(words="['hiya', 'comin', '2', 'bristol', '1', 'st', 'week', 'april', 'les', 'got', 'rudi', 'new', 'yrs', 'eve', 'snoringthey', 'drunk', 'u', 'bak', 'college', 'yet', 'work', 'sends', 'ink', '2', 'bath']", tags=[7]),
 Tagged

In [5]:
# Train a basic doc2vec model
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                 vector_size=100,
                                 window=5,
                                 min_count=2)

In [7]:
# Infer the vectors to be used in training and testing
train_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_test]

### Fit RandomForestClassifier On Top Of Document Vectors

In [9]:
# Fit a basic model, make predictions on the holdout test set, and the generate the evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

rf = RandomForestClassifier()
rf_model = rf.fit(train_vectors, y_train.values.ravel())

y_pred = rf_model.predict(test_vectors)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.764 / Recall: 0.29 / Accuracy: 0.896
