# Further SVM Tuning

## Imports and preprocessing

In [208]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
# Load the train and test tables from CSV.
train_dset_df = pd.read_csv("2020_10_18_train_dset_df.csv")
test_dset_df = pd.read_csv("2020_10_18_test_dset_df.csv")

# Replace missing text with the empty string so the vectorizer receives only str values.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# `df[col]` acts on an intermediate Series, which pandas warns about and which
# leaves the DataFrame unchanged once Copy-on-Write is enabled.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoding. The vocabulary is learned from the training text only;
# the test text is then encoded against that same vocabulary.
vectorizer = CountVectorizer()
train_text = train_dset_df["preprocessed_joined"]
test_text = test_dset_df["preprocessed_joined"]
sparse_train_x = vectorizer.fit_transform(train_text)
sparse_test_x = vectorizer.transform(test_text)

# Training labels as a NumPy array, so they can be indexed with the integer
# index arrays produced by the cross-validation splitter.
train_dset_y = train_dset_df["target"].to_numpy()



## Removing low-frequency columns

In [209]:
# Size of the bag-of-words training matrix: (documents, vocabulary terms).
sparse_train_x.shape

(783673, 40882)

In [210]:
# A vocabulary column is kept only if its total count over the training set is
# strictly greater than this value.
usefulness_threshold = 300

In [211]:
# Column-wise totals: the summed count of each vocabulary term over all training rows.
sums =  sparse_train_x.sum(axis=0)

In [212]:
# The totals come back as a single-row 2-D result, one entry per vocabulary column.
sums.shape

(1, 40882)

In [213]:
# Indices of the vocabulary columns whose total training count is strictly above
# the threshold. One vectorised comparison replaces a Python loop of scalar
# look-ups into the 1xN totals; np.asarray(...).ravel() flattens that single row
# to 1-D, and .tolist() keeps `to_keep` a plain list of Python ints as before.
column_totals = np.asarray(sums).ravel()
to_keep = np.flatnonzero(column_totals > usefulness_threshold).tolist()

In [214]:
# Number of vocabulary columns that pass the count threshold.
len(to_keep)

2516

In [215]:
# Keep only the selected vocabulary columns, in BOTH matrices, so that anything
# fitted on the training features can also be applied to the test features.
#
# Each matrix is filtered only while it still has the full vocabulary width that
# `to_keep` was computed against. This makes the cell safe to re-run: without the
# guard, a second run (or a run after the quadratic expansion further down) would
# index an already-transformed matrix and silently select the wrong columns.
full_vocab_width = sums.shape[1]
if sparse_train_x.shape[1] == full_vocab_width:
    sparse_train_x = sparse_train_x[:, to_keep]
if sparse_test_x.shape[1] == full_vocab_width:
    sparse_test_x = sparse_test_x[:, to_keep]

## Quadratic features

In [236]:
from sklearn.preprocessing import PolynomialFeatures

In [237]:
# Degree-2 feature expansion: the input columns plus their squares and pairwise
# products (scikit-learn also adds a constant bias column by default).
pf = PolynomialFeatures(degree=2)

In [239]:
# Fit the expansion on the training matrix and replace it with the expanded version.
# NOTE(review): this overwrites sparse_train_x with a transform of itself, so running
# the cell a second time expands the already-expanded matrix.
# NOTE(review): in the cells shown here sparse_test_x is never passed through
# pf.transform, so it cannot yet be scored by a model fitted on these features.
sparse_train_x = pf.fit_transform(sparse_train_x)

## LinearSVC

In [216]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


In [277]:
# L2-regularised linear SVM fitted on the full training matrix.
# class_weight makes errors on class 1 cost 3.5x as much as errors on class 0.
# random_state pins the shuffling done by the dual solver, so re-running the
# cell reproduces the same fitted model.
svm = LinearSVC(
    penalty="l2",
    dual=True,
    class_weight={0: 1, 1: 3.5},
    C=0.05,
    random_state=0,
)
svm.fit(sparse_train_x, train_dset_y)

LinearSVC(C=0.05, class_weight={0: 1, 1: 3.5})

In [278]:
# In-sample check: the model predicts the same rows it was fitted on, so this F1
# measures fit to the training data rather than performance on unseen data.
train_dset_yhat = svm.predict(sparse_train_x)
f1_score(y_true=train_dset_y, y_pred=train_dset_yhat)

0.9018491886213192

In [279]:
# Attach the in-sample predictions and collect the misclassified rows.
train_dset_df["yhat"] = train_dset_yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]

# Per-class counts of misclassified rows and of all rows.
# value_counts() with .get(label, 0) reports 0 for a class that has no rows,
# where groupby(...).get_group(label) would raise KeyError; it also avoids
# rebuilding the same groupby four times.
wrong_per_class = wrongs["target"].value_counts()
total_per_class = train_dset_df["target"].value_counts()

# First line: misclassified rows with target 0, then target 1.
# Second line: all rows with target 0, then target 1.
print(wrong_per_class.get(0, 0), wrong_per_class.get(1, 0))
print(total_per_class.get(0, 0), total_per_class.get(1, 0))

5382 4241
735222 48451


## Cross validation

In [252]:
from sklearn.model_selection import KFold
import gc

In [281]:
# Run a garbage-collection pass before cross-validation (presumably to release memory
# held by earlier cells); the displayed value is the number of unreachable objects found.
gc.collect()

0

In [302]:
# Two shuffled folds. random_state fixes the shuffle so the fold membership, and
# therefore the scores collected below, are the same on every run.
# NOTE(review): the recorded class counts are heavily imbalanced (735222 vs 48451);
# a stratified splitter would keep that ratio equal across folds.
kfcv = KFold(n_splits=2, shuffle=True, random_state=0)

# Fresh estimator for cross-validation. This rebinds `svm`, replacing the model
# fitted earlier. random_state pins the dual solver's internal shuffling.
svm = LinearSVC(
    penalty="l2",
    dual=True,
    class_weight={0: 1, 1: 4},
    C=0.1,
    random_state=0,
)

# One F1 score per fold is appended to each list by the cross-validation loop.
train_f1_scores = []
test_f1_scores = []

In [303]:
# Cross-validation: for every fold, refit the SVM on the training part and record
# F1 on both the fitted rows and the held-out rows.
for fold, (train_index, test_index) in enumerate(kfcv.split(sparse_train_x)):
    print("TRAINING AGAIN.", fold)

    # Row subsets of the feature matrix and labels for this fold.
    x_train, y_train = sparse_train_x[train_index], train_dset_y[train_index]
    x_test, y_test = sparse_train_x[test_index], train_dset_y[test_index]

    svm.fit(x_train, y_train)

    # Score the fitted rows first, then the held-out rows.
    train_yhat = svm.predict(x_train)
    fold_train_f1 = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    fold_test_f1 = f1_score(y_test, test_yhat)

    train_f1_scores.append(fold_train_f1)
    test_f1_scores.append(fold_test_f1)

    # Drop the references to this fold's arrays so they can be reclaimed before
    # the next fold is sliced, then run (and print the result of) three
    # garbage-collection passes.
    train_yhat = test_yhat = None
    x_train = x_test = None
    y_train = y_test = None
    for gc_pass in range(3):
        print(gc.collect())

TRAINING AGAIN. 0
0
0
0
TRAINING AGAIN. 1
0
0
0


In [304]:
# Per-fold F1 on the rows each fold's model was fitted on.
train_f1_scores


[0.9440556578678126, 0.9453638630695987]

In [305]:
# Per-fold F1 on the held-out rows of each fold.
# NOTE(review): in the recorded run these are about 0.50 against about 0.94 on the
# fitted rows, a large gap between fitted and held-out performance.
test_f1_scores

[0.503162177701119, 0.5065915042833681]