## Verify later
Since word2vec relies on predicting words by context, we do not need to eliminate stop words, since they eliminate valuable contextual information.  

In [34]:
from time import time
import gensim
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import svm
from sklearn.metrics import confusion_matrix, f1_score

In [35]:
train_data_path = '../data/raw_data/train.csv'
test_data_path = '../data/raw_data/test.csv'

In [36]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
print(train_data.shape)
print(test_data.shape)

(159571, 2)
(63978, 2)


In [37]:
comments = list(train_data['Comment'])
comments[0]

"Explanation\r\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [38]:
'''Simple preprocess removes common accent marks and converts the text to lowercase. 
Contrast to more advanced preprocessing techniques in tf-idf.'''
preprocessed_comments = []
for i, line in enumerate(comments):
    preprocessed_comments.append(gensim.utils.simple_preprocess(line))

In [39]:
comments_test = list(test_data['Comment'])
preprocessed_test = []
for i, line in enumerate(comments_test):
    preprocessed_test.append(gensim.utils.simple_preprocess(line))

In [40]:
preprocessed_comments[0]

['explanation',
 'why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'hardcore',
 'metallica',
 'fan',
 'were',
 'reverted',
 'they',
 'weren',
 'vandalisms',
 'just',
 'closure',
 'on',
 'some',
 'gas',
 'after',
 'voted',
 'at',
 'new',
 'york',
 'dolls',
 'fac',
 'and',
 'please',
 'don',
 'remove',
 'the',
 'template',
 'from',
 'the',
 'talk',
 'page',
 'since',
 'retired',
 'now']

Once we have a word2vec embedding trained, we still need to take into account multiple length comments. Because of which we cannot simply convert a paragraph to a vector embedding. However, we can take many different approaches, one of which is averaging the word vectors. 

In [41]:
def make_feature_vec(words, model, num_features):
    feature_vec = np.zeros((num_features, ), dtype="float32")
    number_of_words_added = 0
    
    #convert the vocabulary of the word2vec model to a set for speed
    word_set = set(model.wv.index2word)
    for word in words:
        if word in word_set:
            number_of_words_added = number_of_words_added + 1
            feature_vec = np.add(feature_vec, model[word])
    
    #Normalize to 1 by dividing by length
    feature_vec = np.divide(feature_vec, number_of_words_added)
    return feature_vec

In [42]:
''' 
Convert a list of sentences (our data) to word2vec embedding
'''
def get_feature_vec_data(model, num_features, data):
    current_count = 0
    
    feature_vec_data = np.zeros((len(data), num_features), dtype="float32")
    
    for comment in data:
        if current_count % 1000 == 0:
            print("Current processing comment %d of %d" % (current_count, len(data)))
            
        feature_vec_data[current_count] = make_feature_vec(comment, model, num_features)
        current_count = current_count + 1
    return feature_vec_data

In [43]:
def crossval():
    
    start = time()
    '''The number of features indicate what dimension of a word vector we shall be using'''
    num_features = 300 # The dimension of the word vector(HyperParameter)
    min_word_count = 3# The minimum word count(HyperParameter)
    num_of_workers = 4 # Number of threads to be used in parallel
    context = 10 # The context window size (HyperParameter)
    downsampling = 1e-3
    
    print("Starting to train the word2vec model.")
    model = gensim.models.Word2Vec(
        preprocessed_comments,
        size = num_features,
        window = context,
        min_count = min_word_count,
        workers = num_of_workers,
        sample = downsampling)
    
    model.train(preprocessed_comments, total_examples = len(comments), epochs=10)
    print("Training complete!")
    model.init_sims(replace=True)
    print("Extracting feature representation from word2vec model.")
    wv_data = get_feature_vec_data(model, num_features, preprocessed_comments)
    # Do the same for test data
    wv_test = get_feature_vec_data(model, num_features, preprocessed_test)
    wv_test = Imputer().fit_transform(wv_test)
    wv_data = Imputer().fit_transform(wv_data)
    clf = svm.LinearSVC(dual=False, class_weight="balanced")

    print("Fitting SVM to data.")
    
    clf.fit(wv_data, train_data['Labels'])

    result = clf.predict(wv_test)
    conf_mat = confusion_matrix(test_data['Labels'], result)
    fscore = f1_score(test_data['Labels'], result, 'weighted')
    end = time()
    print("The total time taken is :", end-start)
    print("F Score is: ", fscore)
    return model, conf_mat, clf
model, conf, clf = crossval()

Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.
Current processing comment 0 of 159571


  # Remove the CWD from sys.path while we load stuff.


Current processing comment 1000 of 159571
Current processing comment 2000 of 159571
Current processing comment 3000 of 159571
Current processing comment 4000 of 159571


  del sys.path[0]


Current processing comment 5000 of 159571
Current processing comment 6000 of 159571
Current processing comment 7000 of 159571
Current processing comment 8000 of 159571
Current processing comment 9000 of 159571
Current processing comment 10000 of 159571
Current processing comment 11000 of 159571
Current processing comment 12000 of 159571
Current processing comment 13000 of 159571
Current processing comment 14000 of 159571
Current processing comment 15000 of 159571
Current processing comment 16000 of 159571
Current processing comment 17000 of 159571
Current processing comment 18000 of 159571
Current processing comment 19000 of 159571
Current processing comment 20000 of 159571
Current processing comment 21000 of 159571
Current processing comment 22000 of 159571
Current processing comment 23000 of 159571
Current processing comment 24000 of 159571
Current processing comment 25000 of 159571
Current processing comment 26000 of 159571
Current processing comment 27000 of 159571
Current processi

Current processing comment 36000 of 63978
Current processing comment 37000 of 63978
Current processing comment 38000 of 63978
Current processing comment 39000 of 63978
Current processing comment 40000 of 63978
Current processing comment 41000 of 63978
Current processing comment 42000 of 63978
Current processing comment 43000 of 63978
Current processing comment 44000 of 63978
Current processing comment 45000 of 63978
Current processing comment 46000 of 63978
Current processing comment 47000 of 63978
Current processing comment 48000 of 63978
Current processing comment 49000 of 63978
Current processing comment 50000 of 63978
Current processing comment 51000 of 63978
Current processing comment 52000 of 63978
Current processing comment 53000 of 63978
Current processing comment 54000 of 63978
Current processing comment 55000 of 63978
Current processing comment 56000 of 63978
Current processing comment 57000 of 63978
Current processing comment 58000 of 63978
Current processing comment 59000 o



Fitting SVM to data.
The total time taken is : 1905.3048725128174
F Score is:  0.5424708533359459


In [46]:
w1 = "hey"
model.wv.most_similar(positive=w1)

[('hi', 0.6419724822044373),
 ('hello', 0.5233088731765747),
 ('dude', 0.483754962682724),
 ('yo', 0.4745360314846039),
 ('howdy', 0.4510461688041687),
 ('haha', 0.43993693590164185),
 ('btw', 0.4390471279621124),
 ('wtf', 0.4210343658924103),
 ('omg', 0.420622318983078),
 ('hahahaha', 0.4120093286037445)]

In [47]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [48]:
conf

array([[49101,  8634],
       [  706,  5537]], dtype=int64)