## Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import gensim.downloader as gensim_api 

from tqdm import tqdm 
tqdm.pandas()
import gc

In [2]:
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

train_dset_y = train_dset_df["target"].to_numpy()

In [3]:
NUM_TRAIN_EXAMPLES = len(train_dset_df)

## Embedder function

In [4]:
def embedding_entire_dset(dset_df_rows, embedding_vectors):
    '''
    dset_df_rows: A pandas Series. Each entry is a string of space-separated words without numbers and special characters.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim 
    num_embedding_columns: The number of columns.
    ''' 
    num_embedding_columns=300
    num_embedding_rows = len(dset_df_rows)
    mean_embedding_X = np.zeros((num_embedding_rows, num_embedding_columns))
    sum_embedding_X = np.zeros((num_embedding_rows, num_embedding_columns))
    min_embedding_X = np.zeros((num_embedding_rows, num_embedding_columns))
    max_embedding_X = np.zeros((num_embedding_rows, num_embedding_columns))
    for row_index in tqdm(range(num_embedding_rows)):
        words = [word for word in dset_df_rows[row_index].split() if word in embedding_vectors]
        if(len(words) > 0):
            sentence_embedding_matrix = embedding_vectors[words]
            mean_embedding_X[row_index,:] = np.mean(sentence_embedding_matrix,axis=0)
            sum_embedding_X[row_index,:] = np.sum(sentence_embedding_matrix, axis=0)
            min_embedding_X[row_index,:] = np.min(sentence_embedding_matrix, axis=0)
            max_embedding_X[row_index,:] = np.max(sentence_embedding_matrix, axis=0)
    embedding_X = np.concatenate([mean_embedding_X, sum_embedding_X, min_embedding_X, max_embedding_X], axis=1)
    return embedding_X


## Metricsifier

In [5]:
from sklearn.metrics import f1_score, plot_confusion_matrix, precision_score, recall_score

In [6]:
def summarize(model, X, y):
    yhat = model.predict(X)
    print("F1 score:", f1_score(y, yhat))
    print("Precision:", precision_score(y, yhat))
    print("Recall:", recall_score(y, yhat))
    print("Confusion matrix:")
    plot_confusion_matrix(model, X, y)
    plt.show()

## LinearSVC

In [7]:
# from sklearn.svm import LinearSVC

In [8]:
# svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:3}, C=1)

## Cross-validation with LinearSVC

In [9]:
# from sklearn.model_selection import KFold

In [10]:
# kfcv = KFold(n_splits=10, shuffle=True)
# kfcv_array = np.array(list(range(NUM_TRAIN_EXAMPLES))).reshape(-1, 1)
# for train_indices, test_indices in kfcv.split(kfcv_array):
#     word2vec = gensim_api.load("word2vec-google-news-300")
#     train_embedded_X = embedding_entire_dset(train_dset_df["preprocessed_joined",word2vec)
#     trainset_embedded_X = train_embedded_X[train_indices,:]
#     trainset_y = train_dset_y[train_indices]
#     testset_embedded_X = train_embedded_X[test_indices,:]
#     testset_y = train_dset_y[test_indices]
#     word2vec = None 
#     train_embedded_X = None 
#     [gc.collect() for i in range(3)]
#     svm.fit(trainset_embedded_X, trainset_y)

#     print("\n\nTRAINING:")
#     summarize(svm, trainset_embedded_X, trainset_y)

#     print("\nTESTING:")
#     summarize(svm, testset_embedded_X, testset_y)


    

## SGDC With Cross-validation

Tried SGDC squared_hinge 400 iters alpha 0.0001 weights 1 : 2.5 eta0 0.00001 validation f1 score 0.58

In [11]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold

In [12]:
sgdc = SGDClassifier(loss="modified_huber", alpha=0.0001, class_weight={0:1, 1:2.5}, learning_rate="constant", eta0 = 0.000001)

In [13]:
kfcv = KFold(n_splits=10, shuffle=True)
kfcv_array = np.array(list(range(NUM_TRAIN_EXAMPLES))).reshape(-1, 1)
for train_indices, test_indices in kfcv.split(kfcv_array):
    word2vec = gensim_api.load("word2vec-google-news-300")
    train_embedded_X = embedding_entire_dset(train_dset_df["preprocessed_joined"],word2vec)
    trainset_embedded_X = train_embedded_X[train_indices,:]
    trainset_y = train_dset_y[train_indices]
    testset_embedded_X = train_embedded_X[test_indices,:]
    testset_y = train_dset_y[test_indices]
    word2vec = None 
    train_embedded_X = None 
    [gc.collect() for i in range(3)]
    for i in range(1600):
        sgdc.partial_fit(trainset_embedded_X, trainset_y, classes=(0, 1))
        trainset_yhat = sgdc.predict(trainset_embedded_X)
        trainset_yhat = np.round(trainset_yhat)
        new_train_f1_score = f1_score(trainset_y, trainset_yhat)
        print(i)
        print("\tTRAIN F1 score:", new_train_f1_score)

    print("\n\nTRAINING:")
    summarize(sgdc, trainset_embedded_X, trainset_y)

    print("\nTESTING:")
    summarize(sgdc, testset_embedded_X, testset_y)


    

100%|██████████| 783673/783673 [01:13<00:00, 10660.84it/s]
0
	TRAIN F1 score: 0.46136203490058997
1
	TRAIN F1 score: 0.5087972525694946
2
	TRAIN F1 score: 0.5143385051507252
3
	TRAIN F1 score: 0.5289523947680181
4
	TRAIN F1 score: 0.5270646458397774
5
	TRAIN F1 score: 0.5267993491050194
6
	TRAIN F1 score: 0.5320700247973648
7
	TRAIN F1 score: 0.5340092006758673
8
	TRAIN F1 score: 0.531867638251148
9
	TRAIN F1 score: 0.5483998053347547
10
	TRAIN F1 score: 0.5433802005854408
11
	TRAIN F1 score: 0.5423117863363655
12
	TRAIN F1 score: 0.5384463673791386
13
	TRAIN F1 score: 0.5402911072384085
14
	TRAIN F1 score: 0.5456205695061878
15
	TRAIN F1 score: 0.5464414090582316
16
	TRAIN F1 score: 0.5450149888792186
17
	TRAIN F1 score: 0.5428875539841199
18
	TRAIN F1 score: 0.5513099793935826
19
	TRAIN F1 score: 0.5501396565044274
20
	TRAIN F1 score: 0.5507105522039563
21
	TRAIN F1 score: 0.5414822324983308
22
	TRAIN F1 score: 0.5493754645057659
23
	TRAIN F1 score: 0.547111650485437
24
	TRAIN F1 sco