## Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import gensim.downloader as gensim_api 

from tqdm import tqdm 
tqdm.pandas()
import gc

In [2]:
# Load the preprocessed train / test splits.
# NOTE(review): the file names suggest "no stemming, no stop-word removal" — confirm.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text with an empty string so later `.split()` calls never see NaN.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# column selection: that form operates on an intermediate Series and, with pandas
# Copy-on-Write semantics, leaves the DataFrame itself unchanged.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Labels of the training split as a plain NumPy array.
train_dset_y = train_dset_df["target"].to_numpy()

In [3]:
# Number of rows in the training frame.
NUM_TRAIN_EXAMPLES = train_dset_df.shape[0]

## Embedder function

In [4]:
def embedding_entire_dset(dset_df_rows, embedding_vectors, num_embedding_columns=300):
    '''
    Turn every text entry into one fixed-length feature row by pooling its word vectors.

    For each entry, the words that are present in `embedding_vectors` are looked up and
    pooled column-wise in four ways (mean, sum, min, max). The four pooled vectors are
    concatenated in that order.

    dset_df_rows: A pandas Series (or any sized iterable of strings). Each entry is a string of space-separated words without numbers and special characters. Entries are read in positional order, so the Series index does not have to be 0..n-1.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim. It must support `word in embedding_vectors` and indexing with a list of words.
    num_embedding_columns: The number of columns of a single word vector (default 300). Must match the width of the vectors returned by `embedding_vectors`.

    Returns: a NumPy array of shape (len(dset_df_rows), 4 * num_embedding_columns).
    Entries with no known word keep an all-zero row.
    '''
    num_embedding_rows = len(dset_df_rows)
    block_shape = (num_embedding_rows, num_embedding_columns)
    # One zero-initialised block per pooling mode; rows without known words stay zero.
    mean_embedding_X = np.zeros(block_shape)
    sum_embedding_X = np.zeros(block_shape)
    min_embedding_X = np.zeros(block_shape)
    max_embedding_X = np.zeros(block_shape)
    # Iterate over the values themselves rather than indexing with `dset_df_rows[i]`:
    # on a Series that is a label lookup and breaks when the index is not 0..n-1.
    for row_index, sentence in enumerate(tqdm(dset_df_rows)):
        words = [word for word in sentence.split() if word in embedding_vectors]
        if words:
            # Matrix with one row per known word.
            sentence_embedding_matrix = embedding_vectors[words]
            mean_embedding_X[row_index, :] = np.mean(sentence_embedding_matrix, axis=0)
            sum_embedding_X[row_index, :] = np.sum(sentence_embedding_matrix, axis=0)
            min_embedding_X[row_index, :] = np.min(sentence_embedding_matrix, axis=0)
            max_embedding_X[row_index, :] = np.max(sentence_embedding_matrix, axis=0)
    embedding_X = np.concatenate([mean_embedding_X, sum_embedding_X, min_embedding_X, max_embedding_X], axis=1)
    return embedding_X


## Metricsifier

In [5]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score

In [6]:
def summarize(model, X, y):
    '''
    Print F1, precision, recall and the confusion matrix of `model` on (X, y).

    model: an object exposing `predict(X)`.
    X: the inputs handed to `model.predict`.
    y: the true labels the predictions are compared against.

    Returns nothing; all results are written to standard output.
    '''
    yhat = model.predict(X)
    print("F1 score:", f1_score(y, yhat))
    print("Precision:", precision_score(y, yhat))
    print("Recall:", recall_score(y, yhat))
    print("Confusion matrix:")
    # A bare expression inside a function is silently discarded (only the last
    # top-level expression of a cell is echoed), so print the matrix explicitly.
    print(confusion_matrix(y, yhat))

# Bagged SVC

In [7]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.svm import SVC


In [8]:
# Base learner: an RBF-kernel SVC with C=0.01 and both classes weighted equally.
svm = SVC(kernel="rbf", C=0.01, class_weight={0: 1, 1: 1})

# Ensemble of ten such SVCs, each built with max_samples=0.1, fitted with up to 4 jobs.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version before upgrading.
bgclf = BalancedBaggingClassifier(
    base_estimator=svm,
    n_estimators=10,
    max_samples=0.1,
    n_jobs=4,
    verbose=3,
)

In [9]:
# KFold provides the train/test index splits used by the cross-validation cell below.
from sklearn.model_selection import KFold
# `gc` is already imported in the first cell; importing it again is a harmless no-op.
import gc
# Run a garbage-collection pass before the cross-validation cell, which loads the
# word2vec model and embeds the whole training set.
gc.collect()

0

In [10]:
# 10-fold cross-validation of the bagged SVC on the pooled word2vec features.
CV_RANDOM_STATE = 0  # fixed seed so the shuffled fold assignment is reproducible
kfcv = KFold(n_splits=10, shuffle=True, random_state=CV_RANDOM_STATE)
# KFold only needs to know how many samples there are; a column of row positions suffices.
kfcv_array = np.arange(NUM_TRAIN_EXAMPLES).reshape(-1, 1)

# Neither the embedding model nor the embedded matrix depends on the fold, so build
# them once here instead of re-loading and re-embedding inside every iteration.
word2vec = gensim_api.load("word2vec-google-news-300")
train_embedded_X = embedding_entire_dset(train_dset_df["preprocessed_joined"], word2vec)
word2vec = None  # drop the only reference so the model can be reclaimed before fitting
gc.collect()
# NOTE(review): the full embedded matrix now stays resident while each fold is
# fitted. If that is too much memory, persist it with np.save and re-open it per
# fold with np.load(..., mmap_mode="r") instead.

for train_indices, test_indices in kfcv.split(kfcv_array):
    # Fancy indexing copies the selected rows, giving each fold its own arrays.
    trainset_embedded_X = train_embedded_X[train_indices, :]
    trainset_y = train_dset_y[train_indices]
    testset_embedded_X = train_embedded_X[test_indices, :]
    testset_y = train_dset_y[test_indices]

    bgclf.fit(trainset_embedded_X, trainset_y)

    print("\n\nTRAINING:")
    summarize(bgclf, trainset_embedded_X, trainset_y)

    print("\nTESTING:")
    summarize(bgclf, testset_embedded_X, testset_y)

# All folds are done: release the full matrix (the name is left bound to None,
# as it was at the end of the loop before this change).
train_embedded_X = None
gc.collect()


    

100%|██████████| 783673/783673 [01:12<00:00, 10771.48it/s]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers