## Verify later
Since word2vec relies on predicting words by context, we do not need to eliminate stop words, since they eliminate valuable contextual information.  

In [27]:
from time import time
import gensim
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import svm
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

In [28]:
train_data_path = '../data/raw_data/train.csv'
test_data_path = '../data/raw_data/test.csv'
unlabelled_test_path = '../data/raw_data/unlabelled_test.csv'

In [29]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
unlabelled_test = pd.read_csv(unlabelled_test_path)
print(train_data.shape)
print(test_data.shape)
print(unlabelled_test.shape)

(159571, 2)
(63978, 2)
(89186, 1)


In [30]:
comments = list(train_data['Comment'])
unlabelled_test['Comment']

0        Yo bitch Ja Rule is more succesful then you'll...
1        == From RfC == \r\r\n\r\r\n The title is fine ...
2        " \r\r\n\r\r\n == Sources == \r\r\n\r\r\n * Za...
3        :If you have a look back at the source, the in...
4                I don't anonymously edit articles at all.
5        Please do not add nonsense to Wikipedia. Such ...
6        " \r\r\n Only a fool can believe in such numbe...
7        == Double Redirects == \r\r\n\r\r\n When fixin...
8        I think its crap that the link to roggenbier i...
9        , 25 February 2010 (UTC) \r\r\n\r\r\n :::Looki...
10       ==Current Position== \r\r\n Anyone have confir...
11       :: Wallamoose was changing the cited material ...
12       ==Indefinitely blocked== \r\r\n I have indefin...
13       :yeah, thanks for reviving the tradition of pi...
14       MLM Software,NBFC software,Non Banking Financi...
16       " \r\r\n *Support Per Jimbo and WP:google ""Cl...
17       :::::I have added more wikilinks to my section.

In [31]:
'''Simple preprocess removes common accent marks and converts the text to lowercase. 
Contrast to more advanced preprocessing techniques in tf-idf.'''
preprocessed_comments = []
for i, line in enumerate(comments):
    preprocessed_comments.append(gensim.utils.simple_preprocess(line))

In [32]:
comments_test = list(test_data['Comment'])
preprocessed_test = []
for i, line in enumerate(comments_test):
    preprocessed_test.append(gensim.utils.simple_preprocess(line))

In [33]:
comments_unlabelled_test = list(unlabelled_test['Comment'])
preprocessed_comments_total = preprocessed_comments.copy()
for i, line in enumerate(comments_unlabelled_test):
    preprocessed_comments_total.append(gensim.utils.simple_preprocess(line))

In [34]:
print(preprocessed_comments_total[159571])
len(preprocessed_comments_total)

['yo', 'bitch', 'ja', 'rule', 'is', 'more', 'succesful', 'then', 'you', 'll', 'ever', 'be', 'whats', 'up', 'with', 'you', 'and', 'hating', 'you', 'sad', 'mofuckas', 'should', 'bitch', 'slap', 'ur', 'pethedic', 'white', 'faces', 'and', 'get', 'you', 'to', 'kiss', 'my', 'ass', 'you', 'guys', 'sicken', 'me', 'ja', 'rule', 'is', 'about', 'pride', 'in', 'da', 'music', 'man', 'dont', 'diss', 'that', 'shit', 'on', 'him', 'and', 'nothin', 'is', 'wrong', 'bein', 'like', 'tupac', 'he', 'was', 'brother', 'too', 'fuckin', 'white', 'boys', 'get', 'things', 'right', 'next', 'time']


248757

In [35]:
train_label = train_data['Labels']
test_label = test_data['Labels']

Once we have a word2vec embedding trained, we still need to take into account multiple length comments. Because of which we cannot simply convert a paragraph to a vector embedding. However, we can take many different approaches, one of which is averaging the word vectors. 

In [36]:
def make_feature_vec(words, model, num_features):
    feature_vec = np.zeros((num_features, ), dtype="float32")
    number_of_words_added = 0
    
    #convert the vocabulary of the word2vec model to a set for speed
    word_set = set(model.wv.index2word)
    for word in words:
        if word in word_set:
            number_of_words_added = number_of_words_added + 1
            feature_vec = np.add(feature_vec, model[word])
    
    #Normalize to 1 by dividing by length
    feature_vec = np.divide(feature_vec, number_of_words_added)
    return feature_vec

In [37]:
''' 
Convert a list of sentences (our data) to word2vec embedding
'''
def get_feature_vec_data(model, num_features, data):
    current_count = 0
    
    feature_vec_data = np.zeros((len(data), num_features), dtype="float32")
    
    for comment in data:
#         if current_count % 10000 == 0:
#             print("Current processing comment %d of %d" % (current_count, len(data)))
            
        feature_vec_data[current_count] = make_feature_vec(comment, model, num_features)
        current_count = current_count + 1
    return feature_vec_data

In [38]:
def eval_model(num_features, min_word_count, context, downsampling, preprocessed_comments, train_label, num_epochs):
    
    start = time()
    X_train, X_test, y_train, y_test = train_test_split(preprocessed_comments, train_label, test_size = 0.2, random_state = 2)
    print("Starting to train the word2vec model.")
    num_of_workers = 6 # Number of threads to be used in parallel

    model = gensim.models.Word2Vec(
        X_train,
        size = num_features,
        window = context,
        min_count = min_word_count,
        workers = num_of_workers,
        sample = downsampling)
    
    model.train(X_train, total_examples = len(X_train), epochs=num_epochs)
    print("Training complete!")
    model.init_sims(replace=True)
    
    print("Extracting feature representation from word2vec model.")
    wv_data = get_feature_vec_data(model, num_features, X_train)
    wv_data = Imputer().fit_transform(wv_data)
    
    wv_test = get_feature_vec_data(model, num_features, X_test)
    wv_test = Imputer().fit_transform(wv_test)
    clf = svm.LinearSVC(dual=False, class_weight="balanced")
    print("Fitting SVM to data.")
    
    clf.fit(wv_data, y_train)

    result = clf.predict(wv_test)
    #conf_mat = confusion_matrix(test_data['Labels'], result)
    fscore = f1_score(y_test, result, 'micro')
    end = time()
    print("The total time taken for this iteration is :", end-start)
    print("F Score is: ", fscore)
    return fscore

In [39]:
def crossval():
    # Parameters will be num_features, min_word_count, context and downsampling
    params = [[25, 3, 10, 1e-3, 10], [50, 3, 10, 1e-3, 10], [100, 3, 10, 1e-3, 10], [200, 3, 10, 1e-3, 10], [300, 3, 10, 1e-3, 10],
              [400, 3, 10, 1e-3, 10],
             [300, 1, 10, 1e-3, 10], [300, 5, 10, 1e-3, 10], [300, 10, 10, 1e-3, 10], [300, 15, 10, 1e-3, 10], [300, 20, 10, 1e-3, 10],
             [300, 3, 2, 1e-3, 10], [300, 3, 4, 1e-3, 10], [300, 3, 6, 1e-3, 10], [300, 3, 8, 1e-3, 10], [300, 3, 10, 1e-3, 10],
             [300, 3, 14, 1e-3, 10], [300, 3, 16, 1e-3, 10], [300, 3, 18, 1e-3, 10], [300, 3, 20, 1e-3, 10],
             [300, 3, 10, 1e-1, 10], [300, 3, 10, 1e-2, 10], [300, 3, 10, 1e-3, 10], [300, 3, 10, 1e-4, 10],
             [300, 3, 10, 1e-1, 5], [300, 3, 10, 1e-1, 10], [300, 3, 10, 1e-1, 15], [300, 3, 10, 1e-1, 20]]
    results_fscore = []
    for i, param in enumerate(params):
        [num_features, min_word_count, context, downsampling, num_epochs] = param
        print("Iteration %d, Current parameters: num_features: %d, min_word_count: %d, context: %d, downsampling: %d, num_epochs: %d" % (i, num_features, min_word_count, context, downsampling, num_epochs))
        results_fscore.append(eval_model(num_features, min_word_count, context, downsampling, preprocessed_comments, train_label, num_epochs))
crossval()

Iteration 0, Current parameters: num_features: 25, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1273.246200799942
F Score is:  0.5806108332446526
Iteration 1, Current parameters: num_features: 50, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1172.086415529251
F Score is:  0.6107837687604225
Iteration 2, Current parameters: num_features: 100, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1192.5031650066376
F Score is:  0.6367784622404745
Iteration 3, Current parameters: num_features: 200, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1195.4171497821808
F Score is:  0.6496460485087617
Iteration 4, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1196.2022576332092
F Score is:  0.6549295774647886
Iteration 5, Current parameters: num_features: 400, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1211.7028076648712
F Score is:  0.6603129779974114
Iteration 6, Current parameters: num_features: 300, min_word_count: 1, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 3635.063700437546
F Score is:  0.6497722760714703
Iteration 7, Current parameters: num_features: 300, min_word_count: 5, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 820.2226836681366
F Score is:  0.6553312079348211
Iteration 8, Current parameters: num_features: 300, min_word_count: 10, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 536.7127857208252
F Score is:  0.6545540349221332
Iteration 9, Current parameters: num_features: 300, min_word_count: 15, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 276.71703577041626
F Score is:  0.6521483225426721
Iteration 10, Current parameters: num_features: 300, min_word_count: 20, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 243.70531153678894
F Score is:  0.6518275538894096
Iteration 11, Current parameters: num_features: 300, min_word_count: 3, context: 2, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1186.207531929016
F Score is:  0.6533443046902551
Iteration 12, Current parameters: num_features: 300, min_word_count: 3, context: 4, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1190.6870489120483
F Score is:  0.6572304788865299
Iteration 13, Current parameters: num_features: 300, min_word_count: 3, context: 6, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1197.3945755958557
F Score is:  0.6591070163004962
Iteration 14, Current parameters: num_features: 300, min_word_count: 3, context: 8, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1201.0423386096954
F Score is:  0.6552008481564378
Iteration 15, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1208.89262008667
F Score is:  0.6512443042411497
Iteration 16, Current parameters: num_features: 300, min_word_count: 3, context: 14, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1217.3011827468872
F Score is:  0.6546349466776046
Iteration 17, Current parameters: num_features: 300, min_word_count: 3, context: 16, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1225.8190321922302
F Score is:  0.6503512880562061
Iteration 18, Current parameters: num_features: 300, min_word_count: 3, context: 18, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1240.0390212535858
F Score is:  0.6495447116507121
Iteration 19, Current parameters: num_features: 300, min_word_count: 3, context: 20, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1240.8129189014435
F Score is:  0.65272748510688
Iteration 20, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1257.7218894958496
F Score is:  0.6554205497227793
Iteration 21, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1252.1020619869232
F Score is:  0.6574369897053602
Iteration 22, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1218.0296983718872
F Score is:  0.6574117647058823
Iteration 23, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1194.8412473201752
F Score is:  0.657888493475682
Iteration 24, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 5
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1222.8956875801086
F Score is:  0.6524039026683908
Iteration 25, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 10
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1260.196490764618
F Score is:  0.6564903561708674
Iteration 26, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 15
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1303.2881739139557
F Score is:  0.6565859421145894
Iteration 27, Current parameters: num_features: 300, min_word_count: 3, context: 10, downsampling: 0, num_epochs: 20
Starting to train the word2vec model.
Training complete!
Extracting feature representation from word2vec model.


  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


Fitting SVM to data.
The total time taken for this iteration is : 1345.8484227657318
F Score is:  0.6534653465346535
