## Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import gensim.downloader as gensim_api 

from tqdm import tqdm 
tqdm.pandas()
import gc

In [2]:
# Load the preprocessed train/test splits from CSV.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text with "" so that later `.split()` calls never see NaN.
# Assign the result back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary object, which pandas warns about and which leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Labels as a NumPy array, in the same row order as train_dset_df.
train_dset_y = train_dset_df["target"].to_numpy()
# Pretrained word vectors obtained through gensim's downloader API
# (the model name suggests 300-dimensional vectors -- confirm if it matters).
word2vec = gensim_api.load("word2vec-google-news-300")

## Embedder function

In [3]:
def concatenated_embedding(dset_df_rows, embedding_vectors, num_embedding_columns):
    '''
    Embed every text as the concatenation of its word vectors, truncated or
    zero-padded on the right to exactly num_embedding_columns values.

    dset_df_rows: A sized iterable of strings (pandas Series, NumPy array or list). Each entry is a string of space-separated words without numbers and special characters. Rows are consumed by position, so a Series with a non-default index (e.g. a slice of a larger Series) is handled correctly.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim. It must support `word in embedding_vectors` and `embedding_vectors[list_of_words]` (returning one vector per word).
    num_embedding_columns: The number of columns of the returned matrix.

    Returns a NumPy array of shape (len(dset_df_rows), num_embedding_columns).
    Words missing from embedding_vectors are skipped; a row with no known
    words stays all zeros.
    '''
    num_embedding_rows = len(dset_df_rows)
    embedding_X = np.zeros((num_embedding_rows, num_embedding_columns))
    # Iterate over the values instead of indexing with `dset_df_rows[row_index]`:
    # on a pandas Series that is a *label* lookup, which fails (or returns the
    # wrong row) whenever the index is not the default 0..n-1 range.
    for row_index, text in enumerate(tqdm(dset_df_rows)):
        words = [word for word in text.split() if word in embedding_vectors]
        if words:
            # Flatten the (num_words, vector_size) lookup into one long vector.
            embedding_curr_row = embedding_vectors[words].reshape(-1)
            # Copy only as many values as fit; the rest of the row stays zero.
            num_values_used = min(len(embedding_curr_row), num_embedding_columns)
            embedding_X[row_index, :num_values_used] = embedding_curr_row[:num_values_used]
    return embedding_X


## Metrics summary

In [4]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score

In [5]:
def summarize(y, yhat):
    '''
    Print the F1 score, precision, recall and confusion matrix of a set of predictions.

    y: The true labels.
    yhat: The predicted labels, in the same order as y.
    '''
    # Scalar metrics share one output format: "<label> <value>".
    labelled_metrics = (
        ("F1 score:", f1_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y, yhat))

    # The confusion matrix goes on its own lines under a header.
    print("Confusion matrix:")
    matrix = confusion_matrix(y, yhat)
    print(matrix)

## SGDClassifier

## Pipeline

In [6]:
from sklearn.linear_model import SGDClassifier

In [7]:
# Logistic-regression SGD classifier trained incrementally with partial_fit.
# NOTE(review): scikit-learn 1.1 renamed loss="log" to "log_loss" and 1.3
# removed the old spelling; switch the string when upgrading scikit-learn.
sgdc = SGDClassifier(loss="log", class_weight={0:1,1:2.3}, alpha = 0.0001, learning_rate = "constant", eta0=0.0001)

F1_SCORE_DELTA_THRESHOLD = 0.00001  # stop once the training F1 changes by no more than this between epochs
SPLIT_AMOUNT = 400000               # rows embedded and held in memory at once
NUM_TRAINING_EXAMPLES = len(train_dset_df)
EMBEDDING_SIZE = 300*40             # width of the concatenated-embedding feature vector


def _chunk_bounds(num_examples, chunk_size):
    """Yield (start, end) pairs that cover range(num_examples) in chunk_size steps."""
    for start in range(0, num_examples, chunk_size):
        yield start, min(start + chunk_size, num_examples)


old_f1_score = None
new_f1_score = None
# Features and labels are both re-read from the DataFrame and are never
# reordered in place, so re-running this cell cannot leave them misaligned.
train_dset_x = train_dset_df["preprocessed_joined"].to_numpy(dtype="object")
train_dset_y = train_dset_df["target"].to_numpy()
train_dset_yhat = np.zeros((NUM_TRAINING_EXAMPLES,))

num_epochs = 0
firstTime = True

# Train one epoch at a time until the training-set F1 stops moving. At least
# two epochs always run, because both scores must be set before comparing.
while old_f1_score is None or new_f1_score is None or abs(new_f1_score - old_f1_score) > F1_SCORE_DELTA_THRESHOLD:

    old_f1_score = new_f1_score

    # A fresh random visiting order for this epoch. The data arrays stay in
    # their original order; each chunk is picked out through these indices.
    shuffle_markers = np.array(range(NUM_TRAINING_EXAMPLES))
    np.random.shuffle(shuffle_markers)
    gc.collect()

    print("TRAINING")
    for split_marker, split_end_marker in _chunk_bounds(NUM_TRAINING_EXAMPLES, SPLIT_AMOUNT):
        subset_indices = shuffle_markers[split_marker:split_end_marker]
        train_dset_x_subset_embedded = concatenated_embedding(train_dset_x[subset_indices], word2vec, EMBEDDING_SIZE)
        train_dset_y_subset = train_dset_y[subset_indices]

        if firstTime:
            # The very first partial_fit call must be told the full label set.
            sgdc.partial_fit(train_dset_x_subset_embedded, train_dset_y_subset, classes=(0, 1))
            firstTime = False
        else:
            sgdc.partial_fit(train_dset_x_subset_embedded, train_dset_y_subset)

        # Release the large embedded chunk before building the next one.
        del train_dset_x_subset_embedded, train_dset_y_subset, subset_indices
        gc.collect()

    print("PREDICTING")
    # Predict in the original row order so train_dset_yhat lines up with train_dset_y.
    for split_marker, split_end_marker in _chunk_bounds(NUM_TRAINING_EXAMPLES, SPLIT_AMOUNT):
        train_dset_x_subset_embedded = concatenated_embedding(train_dset_x[split_marker:split_end_marker], word2vec, EMBEDDING_SIZE)
        train_dset_yhat[split_marker:split_end_marker] = sgdc.predict(train_dset_x_subset_embedded)

        del train_dset_x_subset_embedded
        gc.collect()

    print("Epoch number:", num_epochs)
    num_epochs += 1
    summarize(train_dset_y, train_dset_yhat)
    # Record this epoch's F1 so the loop condition can detect convergence.
    # Without this assignment the loop never terminates.
    new_f1_score = f1_score(train_dset_y, train_dset_yhat)
    

    


    
    

  1%|          | 2321/400000 [00:00<00:17, 23209.14it/s]TRAINING
100%|██████████| 400000/400000 [00:17<00:00, 22867.06it/s]
100%|██████████| 383673/383673 [00:16<00:00, 22602.00it/s]
  1%|          | 2267/400000 [00:00<00:17, 22666.56it/s]PREDICTING
100%|██████████| 400000/400000 [00:17<00:00, 22597.23it/s]
100%|██████████| 383673/383673 [00:16<00:00, 22955.18it/s]
Epoch number: 0
F1 score: 0.4762936221419976
Precision: 0.5711564547317383
Recall: 0.40845390188025016
Confusion matrix:
[[720363  14859]
 [ 28661  19790]]
  1%|          | 2214/400000 [00:00<00:17, 22135.06it/s]TRAINING
100%|██████████| 400000/400000 [00:17<00:00, 22929.93it/s]
100%|██████████| 383673/383673 [00:16<00:00, 22760.27it/s]
  1%|          | 2260/400000 [00:00<00:17, 22594.04it/s]PREDICTING
100%|██████████| 400000/400000 [00:17<00:00, 22394.98it/s]
100%|██████████| 383673/383673 [00:16<00:00, 22623.70it/s]
Epoch number: 1
F1 score: 0.512587841986764
Precision: 0.5707376364234888
Recall: 0.4651916369115189
Confusi

## Conclusion: Concatenated embeddings do not seem to make a major difference compared with averaged embeddings.