# Further SVM Tuning

## Imports and preprocessing

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
# Load the preprocessed train/test splits.
# NOTE(review): the `nostem_nostoprem` suffix presumably means "no stemming,
# no stop-word removal" -- confirm against the preprocessing notebook.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text with "" so every row is a string.
# The result is assigned back instead of using `inplace=True` on a column
# selection: that chained in-place form warns on pandas 2.x and leaves the
# frame unchanged under Copy-on-Write.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is learned from the training text only
# and then reused unchanged to encode the test text.
vectorizer = CountVectorizer().fit(train_dset_df["preprocessed_joined"])

sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])

# Labels as a plain NumPy array so they can be indexed by fold indices later.
train_dset_y = train_dset_df["target"].to_numpy()



## LinearSVC

In [2]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


In [334]:
# L2-regularised linear SVM solved in the dual, with class 1 weighted 3.35x
# relative to class 0 and a small C (strong regularisation).
svm = LinearSVC(
    C=0.0125,
    penalty="l2",
    dual=True,
    class_weight={0: 1, 1: 3.35},
)
# Fit on the full training matrix; the fitted estimator is the cell's output.
svm.fit(sparse_train_x, train_dset_y)

LinearSVC(C=0.0125, class_weight={0: 1, 1: 3.35})

In [326]:
# Predictions on the data the model was trained on (an optimistic estimate).
train_dset_yhat = svm.predict(sparse_train_x)
# Training-set F1; displayed as the cell's output.
f1_score(y_true=train_dset_y, y_pred=train_dset_yhat)

0.6504465956079168

In [327]:
# Keep the predictions next to the labels so misclassified rows can be inspected.
train_dset_df["yhat"] = train_dset_yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]

# Per-class counts via boolean masks: one pass each, and a class with no
# misclassified rows yields 0 instead of a KeyError from `get_group`.
n_wrong_zeros = int((wrongs["target"] == 0).sum())
n_wrong_ones = int((wrongs["target"] == 1).sum())
n_total_zeros = int((train_dset_df["target"] == 0).sum())
n_total_ones = int((train_dset_df["target"] == 1).sum())

# First line: misclassified rows per true class (0, 1).
# Second line: total rows per true class (0, 1).
print(n_wrong_zeros, n_wrong_ones)
print(n_total_zeros, n_total_ones)

21883 14552
735222 48451


## Cross validation

In [328]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score
import gc

In [329]:
# Force a garbage-collection pass before cross-validation; the displayed value
# is the number of unreachable objects the collector found.
gc.collect()

0

In [330]:
# 10-fold CV with shuffling. The seed is fixed so the fold assignment, and
# therefore the per-fold scores, are the same on every run of the notebook.
kfcv = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold F1 scores, appended to by the cross-validation loop.
train_f1_scores = []
test_f1_scores = []

In [331]:
from sklearn.base import clone

# Cross-validate the hyperparameters of `svm`.
# Each fold fits a fresh, unfitted copy of the estimator. Refitting `svm`
# itself would leave it trained on only the last fold's training split, and any
# later cell that predicts with `svm` would silently use that partial model.
for fold, (train_index, test_index) in enumerate(kfcv.split(sparse_train_x)):
    print("TRAINING AGAIN.", fold)
    x_train, x_test = sparse_train_x[train_index], sparse_train_x[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]

    fold_svm = clone(svm).fit(x_train, y_train)

    train_yhat = fold_svm.predict(x_train)
    test_yhat = fold_svm.predict(x_test)
    train_f1_score = f1_score(y_train, train_yhat)
    test_f1_score = f1_score(y_test, test_yhat)

    # Held-out error breakdown: false negatives (true 1 predicted 0) and
    # false positives (true 0 predicted 1).
    n_y_one_and_yhat_zero = np.count_nonzero((y_test == 1) & (test_yhat == 0))
    n_y_zero_and_yhat_one = np.count_nonzero((y_test == 0) & (test_yhat == 1))

    print("Zeros wrong, ones wrong in test = ", n_y_zero_and_yhat_one, n_y_one_and_yhat_zero)
    print("Test f1 score:", test_f1_score)
    print("Test precision score:", precision_score(y_test, test_yhat))
    print("Recall score:", recall_score(y_test, test_yhat))

    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)

    # Drop the per-fold matrices, predictions and model before the next fold
    # so two folds' worth of data are not held at once.
    del x_train, x_test, y_train, y_test, train_yhat, test_yhat, fold_svm
    for _ in range(3):
        print(gc.collect())

TRAINING AGAIN. 0
Zeros wrong, ones wrong in test =  2202 1712
Test f1 score: 0.6165752351097179
Test precision score: 0.588334268087493
Recall score: 0.6476641284214859
0
0
0
TRAINING AGAIN. 1
Zeros wrong, ones wrong in test =  2361 1674
Test f1 score: 0.6120565330256706
Test precision score: 0.5741341991341992
Recall score: 0.6553428042001236
0
0
0
TRAINING AGAIN. 2
Zeros wrong, ones wrong in test =  2266 1687
Test f1 score: 0.6215414073719483
Test precision score: 0.5888969521044993
Recall score: 0.6580174336103791
0
0
0
TRAINING AGAIN. 3
Zeros wrong, ones wrong in test =  2321 1559
Test f1 score: 0.6137766275134381
Test precision score: 0.5705033308660251
Recall score: 0.6641533821628608
0
0
0
TRAINING AGAIN. 4
Zeros wrong, ones wrong in test =  2337 1651
Test f1 score: 0.6193930139339568
Test precision score: 0.5813328556073092
Recall score: 0.6627859477124183
0
0
0
TRAINING AGAIN. 5
Zeros wrong, ones wrong in test =  2291 1618
Test f1 score: 0.6165015206514274
Test precision scor

In [332]:
train_f1_scores


[0.6523289074840969,
 0.6516767560295954,
 0.6506225131562059,
 0.6518062048448788,
 0.650933804248943,
 0.6507430884781336,
 0.6511568288149716,
 0.6511399538895055,
 0.6521511634115348,
 0.6512646797961126]

In [333]:
test_f1_scores                                                                  

[0.6165752351097179,
 0.6120565330256706,
 0.6215414073719483,
 0.6137766275134381,
 0.6193930139339568,
 0.6165015206514274,
 0.6122527737578389,
 0.6165020337013365,
 0.6074116965836712,
 0.616070563148202]

## Test set write

In [335]:
from pathlib import Path

# NOTE(review): this predicts with whatever `svm` currently holds -- make sure
# it is the model fitted on the full training set, not one left over from a
# cross-validation fold.
# The test text is re-vectorised here so the cell does not depend on an earlier
# `sparse_test_x` still being in memory.
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
test_yhat = svm.predict(sparse_test_x)

# Show the schema of the frame the submission is derived from.
test_dset_df.info()

# Submission frame: same columns as the test frame, with the text column
# replaced by integer predictions and renamed to "target".
# `np.rint(...).astype(int)` is the vectorised equivalent of a per-row `round`.
output_df = (
    test_dset_df
    .assign(preprocessed_joined=np.rint(test_yhat).astype(int))
    .rename(columns={"preprocessed_joined": "target"})
)

# Create the output directory if needed; `to_csv` fails when it is missing.
output_path = Path("./outputs/2020_10_28_a_testset_output.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB
