# Attempt at removing redundant columns

Conclusion - every feature is (somehow) useful -- none can be removed at this stage.

## Imports and vectorization

In [5]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()



## Removing redundant columns

First, let us remove those columns which have virtually no predictive power.

In [6]:
sparse_train_x_csc = sparse_train_x.tocsc()

In [60]:
train_dset_y = train_dset_y.reshape(-1, 1)
n_y = np.sum(train_dset_y)
n_all = sparse_train_x_csc.shape[0]
p_y = n_y/n_all
probability_epsilon = 0.05
to_keep = []
for column_id in tqdm(range(sparse_train_x_csc.shape[1])):
    curr_col = sparse_train_x_csc[:,column_id]
    curr_col = ((curr_col > 0).astype(int)).toarray().reshape(-1, 1)
    n_curr_col = curr_col.sum()
    # print("n_curr_col:",n_curr_col)
    n_not_curr_col = n_all - n_curr_col
    # print("n_not_curr_col:",n_not_curr_col)
    n_y_and_curr_col = np.sum((np.logical_and(train_dset_y, curr_col)).astype(int))
    # print("n_y_andcurr_col:",n_y_and_curr_col)
    n_y_and_not_curr_col = np.sum(np.sum(np.logical_and(train_dset_y, np.logical_not(curr_col))).astype(int))
    # print("n_y_and_not_curr_col:",n_y_and_not_curr_col)
    p_y_and_curr_col = n_y_and_curr_col/n_curr_col 
    p_y_and_not_curr_col = n_y_and_not_curr_col/n_not_curr_col
    if(abs(p_y_and_curr_col - p_y) > probability_epsilon or abs(p_y_and_not_curr_col - p_y) > probability_epsilon):
        to_keep.append(column_id)

    

100%|██████████| 54972/54972 [05:16<00:00, 173.78it/s]


In [61]:
len(to_keep), sparse_train_x.shape[1]

(45973, 54972)

In [62]:
sparse_train_x_reduced = sparse_train_x[:, to_keep]

## LinearSVC

In [63]:
svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:4}, C=0.01)
svm.fit(sparse_train_x_reduced, train_dset_y)

LinearSVC(C=0.01, class_weight={0: 1, 1: 4})

In [64]:
train_dset_yhat = svm.predict(sparse_train_x_reduced)
f1_score(train_dset_y, train_dset_yhat)

0.6223723232077537

In [65]:
train_dset_df["yhat"] = train_dset_yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
print(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))
print(len(train_dset_df.groupby(by="target").get_group(0)), len(train_dset_df.groupby(by="target").get_group(1)))

25177 15188
735222 48451


## Cross-validation

In [66]:
gc.collect()

54

In [67]:
kfcv = KFold(n_splits=10, shuffle=True)
train_f1_scores = []
test_f1_scores = []
i=0
for train_index, test_index in kfcv.split(sparse_train_x_reduced):
    print("TRAINING AGAIN.", i)
    i+=1
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = sparse_train_x_reduced[train_index], sparse_train_x_reduced[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]
    svm.fit(x_train, y_train)
    train_yhat = svm.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN. 0
0
0
0
TRAINING AGAIN. 1
0
0
0
TRAINING AGAIN. 2
0
0
0
TRAINING AGAIN. 3
0
0
0
TRAINING AGAIN. 4
0
0
0
TRAINING AGAIN. 5
0
0
0
TRAINING AGAIN. 6
0
0
0
TRAINING AGAIN. 7
0
0
0
TRAINING AGAIN. 8
0
0
0
TRAINING AGAIN. 9
0
0
0


In [68]:
train_f1_scores

[0.6211308037943086,
 0.6228194346142637,
 0.622641901437713,
 0.6213503269607231,
 0.6218340247232704,
 0.6229157804085953,
 0.6229228092474709,
 0.6235417991514027,
 0.6211023425038833,
 0.623167475349881]

In [69]:
test_f1_scores

[0.6117735423903858,
 0.5933213508655757,
 0.5969441017367372,
 0.6084507042253521,
 0.6061682242990654,
 0.5965952563121654,
 0.5987355894384531,
 0.5949024367472692,
 0.6070898292501856,
 0.5946153109130977]

## Testset write