## Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import gensim.downloader as gensim_api 

from tqdm import tqdm 
tqdm.pandas()
import gc

In [2]:
# Load the pre-processed training set.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")

# Replace missing text with "" so that every entry is a string the embedder can .split().
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: the in-place form works on an intermediate object, triggers a pandas warning
# in recent versions and does not update the frame at all under Copy-on-Write.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
train_dset_y = train_dset_df["target"].to_numpy()

## Embedder function

In [3]:
def embedding_entire_dset(dset_df_rows, embedding_vectors, num_embedding_columns=None):
    '''
    Turn every document into one fixed-length vector by pooling its word embeddings.

    For each row, the words found in `embedding_vectors` are looked up and pooled four
    ways along the word axis (mean, sum, min, max); the four pooled vectors are laid
    side by side.

    dset_df_rows: A pandas Series (or any iterable of strings). Each entry is a string of
        space-separated words without numbers and special characters. Entries are read in
        positional order, so a Series does not need a 0..n-1 index.
    embedding_vectors: A non-trainable embedding vectors instance from Gensim. It must
        support `word in embedding_vectors` and `embedding_vectors[list_of_words]`.
    num_embedding_columns: The number of columns of a single word vector. When omitted it
        is taken from `embedding_vectors.vector_size` if that attribute exists, else 300.

    Returns: a float64 array of shape (number of rows, 4 * num_embedding_columns) laid out
        as [mean | sum | min | max]. Rows without any in-vocabulary word (including empty
        or non-string entries) stay all zeros.
    '''
    if num_embedding_columns is None:
        num_embedding_columns = getattr(embedding_vectors, "vector_size", 300)
    width = num_embedding_columns

    # Materialise the values so rows are addressed by position, not by index label.
    documents = list(dset_df_rows)
    num_embedding_rows = len(documents)

    # One output buffer filled through four column views: this avoids holding four
    # full-size arrays and their concatenated copy in memory at the same time.
    embedding_X = np.zeros((num_embedding_rows, 4 * width))
    mean_embedding_X = embedding_X[:, 0 * width:1 * width]
    sum_embedding_X = embedding_X[:, 1 * width:2 * width]
    min_embedding_X = embedding_X[:, 2 * width:3 * width]
    max_embedding_X = embedding_X[:, 3 * width:4 * width]

    for row_index, document in enumerate(tqdm(documents)):
        if not isinstance(document, str):
            # e.g. NaN/None from a missing value: treated like an empty document.
            continue
        words = [word for word in document.split() if word in embedding_vectors]
        if words:
            sentence_embedding_matrix = embedding_vectors[words]
            mean_embedding_X[row_index, :] = np.mean(sentence_embedding_matrix, axis=0)
            sum_embedding_X[row_index, :] = np.sum(sentence_embedding_matrix, axis=0)
            min_embedding_X[row_index, :] = np.min(sentence_embedding_matrix, axis=0)
            max_embedding_X[row_index, :] = np.max(sentence_embedding_matrix, axis=0)
    return embedding_X


## Metrics helper

In [4]:
from sklearn.metrics import f1_score, plot_confusion_matrix, precision_score, recall_score

In [5]:
def summarize(model, X, y):
    '''
    Print F1, precision and recall of `model` on (X, y) and draw the confusion matrix.

    model: A fitted classifier exposing `predict(X)`.
    X: Feature matrix handed to `model.predict`.
    y: True labels for the rows of X.

    The three scores are computed with scikit-learn's default arguments.
    Returns None; results are printed and the matrix is shown as a matplotlib figure.
    '''
    # Imported locally so this helper does not depend on `plot_confusion_matrix`,
    # which scikit-learn removed in version 1.2.
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    # Predict once and reuse the result for every metric and for the plot
    # (the previous plotting call ran a second full prediction on X).
    yhat = model.predict(X)
    print("F1 score:", f1_score(y, yhat))
    print("Precision:", precision_score(y, yhat))
    print("Recall:", recall_score(y, yhat))
    print("Confusion matrix:")
    # Label set = every class seen in the truth or in the predictions, sorted.
    labels = np.unique(np.concatenate([np.ravel(y), np.ravel(yhat)]))
    matrix = confusion_matrix(y, yhat, labels=labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot()
    plt.show()

## Correlation analysis

In [6]:
# Pre-trained word vectors, fetched through the gensim downloader.
word2vec = gensim_api.load("word2vec-google-news-300")

# Pool the word vectors of every pre-processed document into one feature row.
train_embedded_X = embedding_entire_dset(
    dset_df_rows=train_dset_df["preprocessed_joined"],
    embedding_vectors=word2vec,
)

100%|██████████| 783673/783673 [01:11<00:00, 10944.87it/s]


In [7]:
# The text column has been embedded and the labels were already copied into train_dset_y,
# so drop the reference to the DataFrame and run a garbage-collection pass.
train_dset_df = None 
gc.collect()

0

In [8]:
# Correlation coefficients between feature columns (rowvar=False: columns are the variables).
corrmatrix = np.corrcoef(train_embedded_X, rowvar=False)

In [9]:
# Sanity check: one row and one column per embedding feature.
corrmatrix.shape

(1200, 1200)

In [10]:
# Collect every [i, j] with j < i (strict lower triangle, so each unordered pair of
# columns is visited once) whose correlation is at least the threshold.
threshold = 0.8
correlated_pairs = [
    [row, col]
    for row in tqdm(range(corrmatrix.shape[0]))
    for col in range(row)
    if corrmatrix[row][col] >= threshold
]

100%|██████████| 1200/1200 [00:00<00:00, 4331.80it/s]


In [11]:
# Number of column pairs at or above the correlation threshold.
len(correlated_pairs)

166

In [12]:
# From each correlated pair, mark the second entry (the lower column index) for removal.
to_remove = [lower_index for _higher_index, lower_index in correlated_pairs]

In [13]:
# Column indices that survive, in ascending order. A set difference replaces the
# per-index scan of the `to_remove` list, which was needlessly quadratic.
to_keep = sorted(set(range(corrmatrix.shape[0])).difference(to_remove))

In [14]:
# Keep only the columns listed in to_keep.
# NOTE(review): this cell overwrites train_embedded_X, so it is not idempotent — running it
# a second time would apply the original-width indices to the already reduced array.
train_embedded_X = train_embedded_X[:,to_keep]

In [15]:
# Sanity check: same number of rows, one column per index in to_keep.
train_embedded_X.shape

(783673, 1034)

In [16]:
# No later visible cell uses the embedding model; drop the reference to it.
word2vec = None

In [17]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column with a StandardScaler using its default settings.
# NOTE(review): the scaler is fitted on the full matrix before the train/test split made
# further down, so the held-out rows contribute to the scaling statistics — consider
# fitting on the training part only.
scaler = StandardScaler()
train_embedded_X = scaler.fit_transform(train_embedded_X)

## With Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 10% of the rows for evaluation.
# random_state makes the split repeatable across kernel restarts; stratify keeps the
# class ratio of the target the same in the training and the held-out part.
trainset_X, testset_X, trainset_Y, testset_Y = train_test_split(
    train_embedded_X,
    train_dset_y,
    test_size=0.1,
    random_state=42,
    stratify=train_dset_y,
)
# The full matrix is not needed once it has been split; release it.
train_embedded_X = None
[gc.collect() for _ in range(3)]

[0, 0, 0]

## Oversampling (imbalanced-learn)

In [20]:
from imblearn.over_sampling import RandomOverSampler

In [21]:
# Randomly oversample the training part only (sampling_strategy=0.5).
# random_state makes the resampling repeatable across kernel restarts.
ros = RandomOverSampler(sampling_strategy=0.5, random_state=42)

trainset_X_oversampled, trainset_Y_oversampled = ros.fit_resample(trainset_X, trainset_Y)

## XGBoost

In [23]:
from xgboost import XGBClassifier

# Gradient-boosted classifier: 100 estimators of depth 2, verbose logging enabled.
xgbc = XGBClassifier(
    n_estimators=100,
    learning_rate=0.3,
    gamma=0.1,
    max_depth=2,
    verbosity=2,
)

In [24]:
# Fit on the oversampled training split; testset_X / testset_Y are not passed here.
# NOTE(review): the SyntaxError recorded under this cell does not match this line, which
# is valid Python — it looks like stale output from an earlier edit; re-run to refresh.
xgbc.fit(trainset_X_oversampled, trainset_Y_oversampled)

SyntaxError: invalid syntax (<ipython-input-24-86edd1a3af55>, line 1)