# Dimensionality Reduction

The reason I have returned to dimensionality reduction is because it is simply not possible to handle the current scenario for the nonlinear SVCs.

Update: Threshold of 100 occurences seems to have favorable results. Dimension reduces to 5900 with only a 0.05 decrease in F1 score (test, train).

Test f1 score during crossval:

 0.6193501542929751,
 0.6124941286989197,
 0.6131977784053468,
 0.6122831692451945,
 0.6105302118996947,
 0.6111627906976744,
 0.6039510818438383,
 0.6179420505200595,
 0.610534194031247,
 0.6130737134909596

 Train f1 score:
 0.6475649945075064

## Imports and preprocessing

In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

vectorizer = TfidfVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
original_sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
original_sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()

def summarize(y, yhat):
    '''
    y and yhat are both 1-dimensional ndarrays where every entry is either 0 or 1. 
    y and yhat must have the same size 
    '''
    print("Number of zeros in y:", np.sum( (y == 0).astype(int) ))
    print(" Number of ones in y:", np.sum((y == 1).astype(int)))
    print("            F1 score:", f1_score(y, yhat))
    print(" # of zeros wrong yh:", np.sum(np.logical_and(y == 0, yhat == 1).astype(int)))
    print("  # of ones wrong yh:", np.sum(np.logical_and(y == 1, yhat == 0).astype(int)))


## Dimensionality reduction

In [99]:
original_sparse_train_x_csc = original_sparse_train_x.tocsc()
THRESHOLD = 100
columns_to_keep = []
for column_id in tqdm(range(original_sparse_train_x_csc.shape[1])):
    if np.sum((original_sparse_train_x_csc[:,column_id] > 0).astype(int)) > THRESHOLD:
        columns_to_keep.append(column_id)

100%|██████████| 54972/54972 [00:50<00:00, 1078.46it/s]


In [100]:
original_sparse_train_x.shape[1]

54972

In [101]:
len(columns_to_keep)

5600

In [102]:
sparse_train_x = original_sparse_train_x[:,columns_to_keep]

## LinearSVM

In [113]:
svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:2.9}, C=1)
svm.fit(sparse_train_x, train_dset_y)

LinearSVC(C=1, class_weight={0: 1, 1: 2.9})

In [114]:
train_dset_yhat = svm.predict(sparse_train_x)
summarize(train_dset_y, train_dset_yhat)

Number of zeros in y: 735222
 Number of ones in y: 48451
            F1 score: 0.601800142015468
 # of zeros wrong yh: 24405
  # of ones wrong yh: 17093


## Cross validation

In [115]:
from sklearn.model_selection import KFold
import gc

In [116]:
gc.collect()

0

In [117]:
kfcv = KFold(n_splits=10, shuffle=True)
train_f1_scores = []
test_f1_scores = []

In [118]:
i=0
for train_index, test_index in kfcv.split(sparse_train_x):
    print("TRAINING AGAIN.", i)
    i+=1
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = sparse_train_x[train_index], sparse_train_x[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]
    svm.fit(x_train, y_train)
    train_yhat = svm.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN. 0
0
0
0
TRAINING AGAIN. 1
0
0
0
TRAINING AGAIN. 2
0
0
0
TRAINING AGAIN. 3
0
0
0
TRAINING AGAIN. 4
0
0
0
TRAINING AGAIN. 5
0
0
0
TRAINING AGAIN. 6
0
0
0
TRAINING AGAIN. 7
0
0
0
TRAINING AGAIN. 8
0
0
0
TRAINING AGAIN. 9
0
0
0


In [119]:
train_f1_scores


[0.6040131214586973,
 0.6044318917828436,
 0.6028393758066408,
 0.6017199906337144,
 0.6025703977284858,
 0.6025582311780431,
 0.6019121193367336,
 0.6037341522450764,
 0.6039779146858811,
 0.6026827160098601]

In [120]:
test_f1_scores

[0.5795683453237411,
 0.5768404608474907,
 0.5851074077641846,
 0.5889167793010234,
 0.5855556610040809,
 0.5860230687215275,
 0.5929878048780488,
 0.5837025621267579,
 0.5803741886216113,
 0.5785455239549532]

## Testset write

In [367]:
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
test_yhat = svm.predict(sparse_test_x)
output_df = test_dset_df.copy()
output_df.info()
output_df["preprocessed_joined"] = test_yhat
output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
output_df.target = output_df.target.apply(round)
output_df.to_csv("./outputs/2020_10_19_a_testset_output.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB
