# Further SVM Tuning

## Imports and preprocessing

In [306]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()



## Removing redundant columns

SKIP this for now.

In [209]:
sparse_train_x.shape

(783673, 40882)

In [210]:
usefulness_threshold = 300

In [211]:
sums =  sparse_train_x.sum(axis=0)

In [212]:
sums.shape

(1, 40882)

In [213]:
to_keep = []
for i in range(sums.shape[1]):
    if(sums[0,i]) > usefulness_threshold:
        to_keep.append(i)

In [214]:
len(to_keep)

2516

In [215]:
sparse_train_x = sparse_train_x[:, to_keep]

## Quadratic features

SKIP this as well.

In [236]:
from sklearn.preprocessing import PolynomialFeatures

In [237]:
pf = PolynomialFeatures(degree=2)

In [239]:
sparse_train_x = pf.fit_transform(sparse_train_x)

## LinearSVC

In [307]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


In [364]:
svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:4}, C=0.01)
svm.fit(sparse_train_x, train_dset_y)

LinearSVC(C=0.01, class_weight={0: 1, 1: 4})

In [365]:
train_dset_yhat = svm.predict(sparse_train_x)
f1_score(train_dset_y, train_dset_yhat)

0.6475649945075064

In [366]:
train_dset_df["yhat"] = train_dset_yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
print(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))
print(len(train_dset_df.groupby(by="target").get_group(0)), len(train_dset_df.groupby(by="target").get_group(1)))

25419 13081
735222 48451


## Cross validation

In [324]:
from sklearn.model_selection import KFold
import gc

In [326]:
gc.collect()

0

In [360]:
kfcv = KFold(n_splits=10, shuffle=True)
svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:4}, C=0.01)
train_f1_scores = []
test_f1_scores = []

In [361]:
i=0
for train_index, test_index in kfcv.split(sparse_train_x):
    print("TRAINING AGAIN.", i)
    i+=1
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = sparse_train_x[train_index], sparse_train_x[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]
    svm.fit(x_train, y_train)
    train_yhat = svm.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN. 0
0
0
0
TRAINING AGAIN. 1
0
0
0
TRAINING AGAIN. 2
0
0
0
TRAINING AGAIN. 3
0
0
0
TRAINING AGAIN. 4
0
0
0
TRAINING AGAIN. 5
0
0
0
TRAINING AGAIN. 6
0
0
0
TRAINING AGAIN. 7
0
0
0
TRAINING AGAIN. 8
0
0
0
TRAINING AGAIN. 9
0
0
0


In [362]:
train_f1_scores


[0.6361956667595661,
 0.6378047402323884,
 0.6372843788160204,
 0.6365356144102648,
 0.6372809456147939,
 0.6379340465106719,
 0.638624740701185,
 0.6369170025499713,
 0.637918460005547,
 0.6372732700582653]

In [363]:
test_f1_scores

[0.6193501542929751,
 0.6124941286989197,
 0.6131977784053468,
 0.6122831692451945,
 0.6105302118996947,
 0.6111627906976744,
 0.6039510818438383,
 0.6179420505200595,
 0.610534194031247,
 0.6130737134909596]

## Testset write

In [367]:
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
test_yhat = svm.predict(sparse_test_x)
output_df = test_dset_df.copy()
output_df.info()
output_df["preprocessed_joined"] = test_yhat
output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
output_df.target = output_df.target.apply(round)
output_df.to_csv("./outputs/2020_10_19_a_testset_output.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB
