## Verify later
Since word2vec relies on predicting words by context, we do not need to eliminate stop words, since they eliminate valuable contextual information.  

In [26]:
import gensim
import pandas as pd
import numpy as np

In [3]:
train_data_path = '../data/raw_data/train.csv'
test_data_path = '../data/raw_data/test.csv'

In [4]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
print(train_data.shape)
print(test_data.shape)

(159571, 2)
(63978, 2)


In [5]:
comments = list(train_data['Comment'])
comments[0]

"Explanation\r\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [6]:
'''Simple preprocess removes common accent marks and converts the text to lowercase. 
Contrast to more advanced preprocessing techniques in tf-idf.'''
preprocessed_comments = []
for i, line in enumerate(comments):
    preprocessed_comments.append(gensim.utils.simple_preprocess(line))

In [51]:
comments_test = list(test_data['Comment'])
preprocessed_test = []
for i, line in enumerate(comments_test):
    preprocessed_test.append(gensim.utils.simple_preprocess(line))

In [8]:
preprocessed_comments[0]

['explanation',
 'why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'hardcore',
 'metallica',
 'fan',
 'were',
 'reverted',
 'they',
 'weren',
 'vandalisms',
 'just',
 'closure',
 'on',
 'some',
 'gas',
 'after',
 'voted',
 'at',
 'new',
 'york',
 'dolls',
 'fac',
 'and',
 'please',
 'don',
 'remove',
 'the',
 'template',
 'from',
 'the',
 'talk',
 'page',
 'since',
 'retired',
 'now']

In [10]:
'''The number of features indicate what dimension of a word vector we shall be using'''
num_features = 300 # The dimension of the word vector(HyperParameter)
min_word_count = 3# The minimum word count(HyperParameter)
num_of_workers = 4 # Number of threads to be used in parallel
context = 10 # The context window size (HyperParameter)
downsampling = 1e-3

In [11]:
model = gensim.models.Word2Vec(
        preprocessed_comments,
        size = num_features,
        window = context,
        min_count = min_word_count,
        workers = num_of_workers,
        sample = downsampling)

In [13]:
model.train(preprocessed_comments, total_examples = len(comments), epochs=10)

(77612069, 101876800)

In [18]:
model.init_sims(replace=True)

<bound method BaseWordEmbeddingsModel.estimate_memory of <gensim.models.word2vec.Word2Vec object at 0x00000235C9A71C88>>

In [27]:
# Vocabulary size
len(model.wv.vocab)

59701

In [23]:
w1 = "hey"
model.wv.most_similar(positive=w1)

[('hi', 0.6357199549674988),
 ('hello', 0.5268846750259399),
 ('yo', 0.472095251083374),
 ('howdy', 0.4563082456588745),
 ('dude', 0.45139196515083313),
 ('sup', 0.4460800886154175),
 ('haha', 0.4426163136959076),
 ('mate', 0.4378395080566406),
 ('btw', 0.4340026080608368),
 ('hahahaha', 0.42958885431289673)]

In [20]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

Once we have a word2vec embedding trained, we still need to take into account multiple length comments. Because of which we cannot simply convert a paragraph to a vector embedding. However, we can take many different approaches, one of which is averaging the word vectors. 

In [42]:
def make_feature_vec(words, model, num_features):
    feature_vec = np.zeros((num_features, ), dtype="float32")
    number_of_words_added = 0
    
    #convert the vocabulary of the word2vec model to a set for speed
    word_set = set(model.wv.index2word)
    for word in words:
        if word in word_set:
            number_of_words_added = number_of_words_added + 1
            feature_vec = np.add(feature_vec, model[word])
    
    #Normalize to 1 by dividing by length
    feature_vec = np.divide(feature_vec, number_of_words_added)
    return feature_vec

In [43]:
print(make_feature_vec(preprocessed_comments[0], model, 300))

[-5.36105130e-03 -4.12288075e-03 -8.66610650e-03  1.52602168e-02
 -1.12523902e-02 -3.89550882e-03 -1.98000297e-02  1.18375907e-03
  1.22291446e-02  2.22750939e-02  2.80690915e-03 -1.56473219e-02
 -6.44711405e-03 -2.02375408e-02 -3.34329624e-03 -1.69196120e-03
  6.34606229e-03 -1.97174447e-03  4.01650369e-03 -1.29603120e-02
 -2.20447686e-02  1.83563767e-04  1.12800812e-02  5.34207327e-04
 -1.01370402e-02 -6.78514922e-03  1.98464096e-02 -1.72577868e-03
 -5.87076973e-03 -1.84394326e-02  1.30102551e-02  1.19437659e-02
  2.21704021e-02  6.86688232e-04  1.44102843e-03 -2.87136771e-02
  9.92098916e-03 -9.96799674e-03 -3.16679813e-02 -3.66068911e-03
 -1.28029482e-02 -1.64133925e-02  2.12430907e-03  1.77914314e-02
 -8.32850300e-03 -1.39359117e-03  2.28292346e-02 -1.07039353e-02
  1.70682240e-02 -7.75591098e-03  1.08979968e-02 -2.04704311e-02
 -1.08303018e-02 -9.26215388e-03 -1.66471247e-02 -7.32635520e-03
  1.04776770e-02  5.96658420e-03  2.99230567e-03  6.95470115e-03
  3.58129083e-03 -8.13404

  # Remove the CWD from sys.path while we load stuff.


In [44]:
''' 
Convert a list of sentences (our data) to word2vec embedding
'''
def get_feature_vec_data(model, num_features, data):
    current_count = 0
    
    feature_vec_data = np.zeros((len(data), num_features), dtype="float32")
    
    for comment in data:
        if current_count % 1000 == 0:
            print("Current processing comment %d of %d" % (current_count, len(data)))
            
        feature_vec_data[current_count] = make_feature_vec(comment, model, num_features)
        current_count = current_count + 1
    return feature_vec_data

In [47]:
wv_data = get_feature_vec_data(model, num_features, preprocessed_comments)

Current processing comment 0 of 159571


  # Remove the CWD from sys.path while we load stuff.


Current processing comment 1000 of 159571
Current processing comment 2000 of 159571
Current processing comment 3000 of 159571
Current processing comment 4000 of 159571


  del sys.path[0]


Current processing comment 5000 of 159571
Current processing comment 6000 of 159571
Current processing comment 7000 of 159571
Current processing comment 8000 of 159571
Current processing comment 9000 of 159571
Current processing comment 10000 of 159571
Current processing comment 11000 of 159571
Current processing comment 12000 of 159571
Current processing comment 13000 of 159571
Current processing comment 14000 of 159571
Current processing comment 15000 of 159571
Current processing comment 16000 of 159571
Current processing comment 17000 of 159571
Current processing comment 18000 of 159571
Current processing comment 19000 of 159571
Current processing comment 20000 of 159571
Current processing comment 21000 of 159571
Current processing comment 22000 of 159571
Current processing comment 23000 of 159571
Current processing comment 24000 of 159571
Current processing comment 25000 of 159571
Current processing comment 26000 of 159571
Current processing comment 27000 of 159571
Current processi

In [49]:
wv_data.shape

(159571, 300)

In [52]:
# Do the same for test data
wv_test = get_feature_vec_data(model, num_features, preprocessed_test)

Current processing comment 0 of 63978


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Current processing comment 1000 of 63978
Current processing comment 2000 of 63978
Current processing comment 3000 of 63978
Current processing comment 4000 of 63978
Current processing comment 5000 of 63978
Current processing comment 6000 of 63978
Current processing comment 7000 of 63978
Current processing comment 8000 of 63978
Current processing comment 9000 of 63978
Current processing comment 10000 of 63978
Current processing comment 11000 of 63978
Current processing comment 12000 of 63978
Current processing comment 13000 of 63978
Current processing comment 14000 of 63978
Current processing comment 15000 of 63978
Current processing comment 16000 of 63978
Current processing comment 17000 of 63978
Current processing comment 18000 of 63978
Current processing comment 19000 of 63978
Current processing comment 20000 of 63978
Current processing comment 21000 of 63978
Current processing comment 22000 of 63978
Current processing comment 23000 of 63978
Current processing comment 24000 of 63978
C

In [69]:
from sklearn.preprocessing import Imputer
wv_test = Imputer().fit_transform(wv_test)
wv_data = Imputer().fit_transform(wv_data)

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

forest = RandomForestClassifier(n_estimators=100)

print("Fitting random forest to data.")
forest = forest.fit(wv_data, train_data['Labels'])

result = forest.predict(wv_test)
confusion_matrix(test_data['Labels'], result)

Fitting random forest to data.


array([[56735,  1000],
       [ 3633,  2610]], dtype=int64)