In [2]:
from time import time
import gensim
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import svm
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Relative paths to the raw CSV inputs: labelled train, labelled test and
# the unlabelled test set.
train_data_path, test_data_path, unlabelled_test_path = (
    '../../data/raw_data/train.csv',
    '../../data/raw_data/test.csv',
    '../../data/raw_data/unlabelled_test.csv',
)

In [16]:
# Read the three raw CSV files into DataFrames (train, test, unlabelled, in
# that order).
train_data, test_data, unlabelled_test = map(
    pd.read_csv, (train_data_path, test_data_path, unlabelled_test_path)
)
# Show the (rows, columns) shape of each frame, one per line.
print(train_data.shape, test_data.shape, unlabelled_test.shape, sep='\n')

(159571, 2)
(63978, 2)
(89186, 1)


In [17]:
# Sanity check: at this point unlabelled_test is still the object returned by
# pd.read_csv (a DataFrame), before it is converted to a list further down.
type(unlabelled_test)

pandas.core.frame.DataFrame

In [18]:
# Pull the 'Comment' column out of each labelled frame as a plain Python list.
comments, comments_test = (list(frame['Comment']) for frame in (train_data, test_data))
# NOTE: this rebinds unlabelled_test from a DataFrame to a list of its
# 'Comment' values, so this cell cannot be re-run without reloading the CSV.
unlabelled_test = list(unlabelled_test['Comment'])
# Assemble labels: train labels first, then test labels.
data_label = pd.concat([frame['Labels'] for frame in (train_data, test_data)])

In [19]:
# Tokenize every labelled comment with gensim's simple_preprocess, which
# lowercases the text and splits it into tokens. It is called with its default
# arguments, so accent marks are NOT stripped (that would need deacc=True).
# This is intentionally lighter than the more advanced preprocessing used for
# the tf-idf features.
# Order matters: all train comments first, then all test comments, so the rows
# line up with the concatenated label series.
preprocessed_comments = [
    gensim.utils.simple_preprocess(line)
    for source in (comments, comments_test)
    for line in source
]
    


In [21]:
# Full corpus = labelled comments (shallow copy, so the labelled list itself is
# left untouched) followed by the tokenized unlabelled comments.
preprocessed_comments_total = list(preprocessed_comments)
preprocessed_comments_total.extend(
    gensim.utils.simple_preprocess(text) for text in unlabelled_test
)
# Display the total number of tokenized comments.
len(preprocessed_comments_total)

312735

In [22]:
# Hold out 15% of the pooled, tokenized labelled comments, stratified by label
# and with a fixed random_state so the split is reproducible.
# NOTE: this rebinds train_data / test_data, which previously held the
# DataFrames read from CSV, to the two halves of the tokenized split.
train_data, test_data, train_labels, test_labels = train_test_split(
    preprocessed_comments,
    data_label,
    test_size=0.15,
    random_state=2,
    stratify=data_label,
)

In [23]:
def make_feature_vec(words, model, num_features, word_set):
    """Average the word vectors of the in-vocabulary tokens of one comment.

    Args:
        words: iterable of tokens making up the comment.
        model: object indexable by token (``model[word]``) that returns the
            token's vector of length ``num_features``.
        num_features: dimensionality of the word vectors.
        word_set: container of in-vocabulary tokens; tokens not in it are
            skipped. Pass a ``set`` so the membership test is fast.

    Returns:
        Array of shape ``(num_features,)`` holding the element-wise mean of the
        vectors of all tokens found in ``word_set``. If no token is in the
        vocabulary the result is all-NaN, which callers are expected to impute.
    """
    feature_vec = np.zeros((num_features, ), dtype="float32")
    number_of_words_added = 0

    # Sum the vectors of every token the model knows about.
    for word in words:
        if word in word_set:
            number_of_words_added += 1
            feature_vec = np.add(feature_vec, model[word])

    if number_of_words_added == 0:
        # Nothing to average. Return the same all-NaN vector the 0/0 division
        # used to produce, but explicitly and without a NumPy RuntimeWarning.
        return np.full((num_features, ), np.nan, dtype="float32")

    # Turn the sum into a mean by dividing by the number of vectors added.
    return np.divide(feature_vec, number_of_words_added)

In [24]:
def get_feature_vec_data(model, num_features, data):
    """Convert a list of tokenized sentences (our data) to word2vec embeddings.

    Args:
        model: trained word2vec model; its vocabulary is read from
            ``model.wv.index2word``.
        num_features: dimensionality of the word vectors.
        data: sized sequence of comments, each an iterable of tokens.

    Returns:
        float32 array of shape ``(len(data), num_features)`` whose i-th row is
        the averaged word vector of the i-th comment.
    """
    # Build the vocabulary set once so per-token membership tests are cheap.
    vocabulary = set(model.wv.index2word)
    embedded = np.zeros((len(data), num_features), dtype="float32")
    for row, tokens in enumerate(data):
        embedded[row] = make_feature_vec(tokens, model, num_features, vocabulary)
    return embedded

In [25]:
def eval_model(num_features, min_word_count, context, downsampling, preprocessed_comments, train_label, num_epochs):
    """Score one word2vec hyper-parameter setting with a linear SVM.

    The data is split into a training and a held-out part; a word2vec model is
    trained on the training part, every comment is embedded as the mean of its
    word vectors, a class-balanced LinearSVC is fitted on those embeddings and
    the macro-averaged F1 score on the held-out part is returned.

    Args:
        num_features: dimensionality of the word vectors.
        min_word_count: minimum frequency for a word to enter the vocabulary.
        context: word2vec context window size.
        downsampling: word2vec sub-sampling threshold for frequent words.
        preprocessed_comments: list of tokenized comments.
        train_label: labels aligned with ``preprocessed_comments``.
        num_epochs: number of epochs passed to ``model.train``.

    Returns:
        Macro-averaged F1 score on the held-out split.
    """
    start = time()
    # Fixed random_state keeps the internal split reproducible between calls.
    X_train, X_test, y_train, y_test = train_test_split(
        preprocessed_comments, train_label, test_size=0.176, random_state=2)
    print("Starting to train the word2vec model.")
    num_of_workers = 6 # Number of threads to be used in parallel

    model = gensim.models.Word2Vec(
        X_train,
        size=num_features,
        window=context,
        min_count=min_word_count,
        workers=num_of_workers,
        sample=downsampling)

    # NOTE(review): the constructor above already trains on X_train (with
    # gensim's default number of iterations) because the corpus is passed in,
    # so this call adds num_epochs further epochs on top. Kept as-is to
    # preserve results; confirm this double training is intended.
    model.train(X_train, total_examples=len(X_train), epochs=num_epochs)
    print("Training complete!")
    model.init_sims(replace=True)

    print("Extracting feature representation from word2vec model.")
    # Comments without any in-vocabulary token embed to NaN rows. Fit the
    # imputer on the training features only and reuse it for the held-out
    # features, so no statistics are learned from the evaluation data.
    imputer = Imputer()
    wv_data = imputer.fit_transform(get_feature_vec_data(model, num_features, X_train))
    wv_test = imputer.transform(get_feature_vec_data(model, num_features, X_test))

    clf = svm.LinearSVC(dual=False, class_weight="balanced")
    print("Fitting SVM to data.")
    clf.fit(wv_data, y_train)

    result = clf.predict(wv_test)
    # `average` must be passed by keyword: the third positional parameter of
    # f1_score is `labels`, so a positional 'macro' would not select macro
    # averaging.
    fscore = f1_score(y_test, result, average='macro')
    end = time()
    print("The total time taken for this iteration is :", end-start)
    print("F Score is: ", fscore)
    return fscore