# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

# Read the four pre-split CSVs (features first, then labels) in one pass.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../data/X_train.csv',
        '../../../data/X_test.csv',
        '../../../data/y_train.csv',
        '../../../data/y_test.csv',
    )
]

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [4]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

  after removing the cwd from sys.path.
  


In [5]:
# Collapse each message's word vectors into one fixed-length sentence vector.
# Messages for which the model knew none of the words get a vector of zeros.
def _sentence_vector(word_vecs):
    """Column-wise mean of the word vectors, or 100 zeros when there are none."""
    if not word_vecs.size:
        return np.zeros(100, dtype=float)
    return word_vecs.mean(axis=0)


X_train_vect_avg = [_sentence_vector(msg_vecs) for msg_vecs in X_train_vect]
X_test_vect_avg = [_sentence_vector(msg_vecs) for msg_vecs in X_test_vect]

In [6]:
# What does the unaveraged version look like?
# Displays the entry for the first training message: one row per item of the
# message that was found in the word2vec vocabulary.
X_train_vect[0]

array([[-0.00384351, -0.00394247, -0.00390022,  0.00192951, -0.00309063,
        -0.00385922, -0.00446761, -0.00021046, -0.00306967, -0.00063658,
        -0.00223586, -0.00033802,  0.00423961,  0.00039754,  0.00475842,
        -0.00261165,  0.00219575, -0.00346471, -0.00245117,  0.00200349,
        -0.00106253, -0.00180041,  0.00174103, -0.00077727,  0.00227746,
        -0.00280013,  0.00397571,  0.00079452, -0.00040931, -0.0042494 ,
        -0.0042759 , -0.00103996, -0.0013569 ,  0.00138008,  0.00026145,
        -0.0006727 ,  0.00227036,  0.00141514, -0.00489185, -0.00336901,
         0.00275691,  0.00421509,  0.00268406, -0.00478705,  0.00093469,
        -0.00189115, -0.00494519,  0.00166919,  0.0003545 ,  0.00404843,
        -0.00335258, -0.00372556,  0.00142032,  0.00195562,  0.00180276,
        -0.00129065, -0.00030903,  0.00225197, -0.0005384 ,  0.00023738,
         0.00010827, -0.00377346,  0.0033225 ,  0.00310372, -0.00497917,
        -0.00214435, -0.00237416, -0.00101849,  0.0

In [7]:
# What does the averaged version look like?
# Displays the single vector kept for the first training message: the
# column-wise mean of its word vectors, or zeros if none were found.
X_train_vect_avg[0]

array([-0.00384351, -0.00394247, -0.00390022,  0.00192951, -0.00309063,
       -0.00385922, -0.00446761, -0.00021046, -0.00306967, -0.00063658,
       -0.00223586, -0.00033802,  0.00423961,  0.00039754,  0.00475842,
       -0.00261165,  0.00219575, -0.00346471, -0.00245117,  0.00200349,
       -0.00106253, -0.00180041,  0.00174103, -0.00077727,  0.00227746,
       -0.00280013,  0.00397571,  0.00079452, -0.00040931, -0.0042494 ,
       -0.0042759 , -0.00103996, -0.0013569 ,  0.00138008,  0.00026145,
       -0.0006727 ,  0.00227036,  0.00141514, -0.00489185, -0.00336901,
        0.00275691,  0.00421509,  0.00268406, -0.00478705,  0.00093469,
       -0.00189115, -0.00494519,  0.00166919,  0.0003545 ,  0.00404843,
       -0.00335258, -0.00372556,  0.00142032,  0.00195562,  0.00180276,
       -0.00129065, -0.00030903,  0.00225197, -0.0005384 ,  0.00023738,
        0.00010827, -0.00377346,  0.0033225 ,  0.00310372, -0.00497917,
       -0.00214435, -0.00237416, -0.00101849,  0.00204516,  0.00

### Fit RandomForestClassifier On Top Of Word Vectors

In [11]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

# Pin the forest's own randomness so that, given the same input vectors,
# re-running this cell produces the same fitted model and the same metrics.
rf = RandomForestClassifier(random_state=42)

# y_train is a DataFrame; .values.ravel() flattens it to the 1-D label array
# that fit() expects. fit() returns the estimator itself, so `rf_model` and
# `rf` refer to the same fitted object.
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [12]:
# Use the trained model to make predictions on the test data
# (one predicted label per averaged test-message vector).
y_pred = rf_model.predict(X_test_vect_avg)

In [13]:
# Score the hold-out predictions: precision, recall, and plain accuracy
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Accuracy is the share of predictions that match the `label` column.
n_correct = (y_pred == y_test['label']).sum()
accuracy = n_correct / len(y_pred)

report_template = 'Precision: {} / Recall: {} / Accuracy: {}'
print(report_template.format(*[round(metric, 3) for metric in (precision, recall, accuracy)]))

Precision: 0.48 / Recall: 0.166 / Accuracy: 0.868
