# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('../../../data/X_train.csv', header=0)
X_test = pd.read_csv('../../../data/X_test.csv', header=0)
y_train = pd.read_csv('../../../data/y_train.csv', header=0)
y_test = pd.read_csv('../../../data/y_test.csv', header=0)

In [2]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))


4457
1115
4457
1115


### Create word2vec Vectors

In [3]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [4]:
print(type(X_train))
print(X_train.shape)

<class 'pandas.core.frame.DataFrame'>
(4457, 1)


In [12]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key)

# again the following lines do not work 

# X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
#                          for ls in X_train['clean_text']])
# X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
#                          for ls in X_test['clean_text']])



AttributeError: 'list' object has no attribute 'mean'

In [None]:
# A working solution is to use list comprehension instead of np.array as a first step

# X_train_list = [np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key for ls in X_train])]

X_train_list = []
for n in range(len(X_train)):
    ls = X_train.iloc[n,0]
    train_item = [w2v_model.wv[i] for i in ls if i in words]
    X_train_list.append(train_item)
    
print(len(X_train_list))
print(len(X_train_list[0]))
print(type(X_train_list[0]))
print(type(X_train_list))

# X_test_list = [np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key for ls in X_test])]

X_test_list = []
for n in range(len(X_test)):
    ls = X_test.iloc[n,0]
    test_item = [w2v_model.wv[i] for i in ls if i in words]
    X_test_list.append(test_item)

# len(X_train_list)


In [6]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training

# again the following lines do not work
# X_train_vect_avg = []
# for v in X_train_vect:
#     if v.size:
#         X_train_vect_avg.append(v.mean(axis=0))
#     else:
#         X_train_vect_avg.append(np.zeros(100, dtype=float))
        
# X_test_vect_avg = []
# for v in X_test_vect:
#     if v.size:
#         X_test_vect_avg.append(v.mean(axis=0))
#     else:
#         X_test_vect_avg.append(np.zeros(100, dtype=float))

# A working solution is 
X_train_vect = []
for n in range(len(X_train_list)):
    if len(X_train_list[n])!=0:
        X_train_vect.append(np.array(X_train_list[n]).mean(axis=0))
    else:
        X_train_vect.append(np.zeros(100))

X_test_vect = []
for n in range(len(X_test_list)):
    if len(X_test_list[n])!=0:
        X_test_vect.append(np.array(X_test_list[n]).mean(axis=0))
    else:
        X_test_vect.append(np.zeros(100))

In [7]:
print(len(X_train_vect))
print(len(X_test_vect))

4457
1115


### Fit RandomForestClassifier On Top Of Word Vectors

In [8]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [9]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [10]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.712 / Recall: 0.292 / Accuracy: 0.893
