# Kernel SVM 

## Imports and vectorization

In [92]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

vectorizer = TfidfVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
original_sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
original_sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()

def summarize(y, yhat):
    '''
    y and yhat are both 1-dimensional ndarrays where every entry is either 0 or 1. 
    y and yhat must have the same size 
    '''
    print("Number of zeros in y:", np.sum( (y == 0).astype(int) ))
    print(" Number of ones in y:", np.sum((y == 1).astype(int)))
    print("            F1 score:", f1_score(y, yhat))
    print(" # of zeros wrong yh:", np.sum(np.logical_and(y == 0, yhat == 1).astype(int)))
    print("  # of ones wrong yh:", np.sum(np.logical_and(y == 1, yhat == 0).astype(int)))

## Dimensionality reduction

Refer to 2020_10_23_dimensionality_reduction.ipynb

In [93]:
original_sparse_train_x_csc = original_sparse_train_x.tocsc()
THRESHOLD = 100
columns_to_keep = []
for column_id in tqdm(range(original_sparse_train_x_csc.shape[1])):
    if np.sum((original_sparse_train_x_csc[:,column_id] > 0).astype(int)) > THRESHOLD:
        columns_to_keep.append(column_id)

100%|██████████| 54972/54972 [00:50<00:00, 1089.39it/s]


In [96]:
sparse_train_x = original_sparse_train_x[:, columns_to_keep]
sparse_test_x = original_sparse_test_x[:,columns_to_keep]

## Kernel SVM on small subset

In [97]:
svc = SVC(C=1.0, kernel='rbf', cache_size=2048, gamma="scale", class_weight={0:1,1:32})

In [98]:
SUBSET_SIZE = 70000
sparse_train_x_subset = sparse_train_x[:SUBSET_SIZE,:]
train_dset_y_subset = train_dset_y[:SUBSET_SIZE]

In [99]:
svc.fit(sparse_train_x_subset, train_dset_y_subset)

SVC(cache_size=2048, class_weight={0: 1, 1: 32})

In [100]:
yhat = svc.predict(sparse_train_x_subset)

In [101]:
f1_score(train_dset_y_subset, yhat)

0.9530500724880117

In [102]:
yhat_test = svc.predict(sparse_train_x[SUBSET_SIZE:,])

In [103]:
y_test = train_dset_y[SUBSET_SIZE:]

In [104]:
f1_score(y_test, yhat_test)

0.5287402716986843

## Kernel SVM on small subset -- balanced sampling

In [53]:
sparse_train_x_subset_one = sparse_train_x[train_dset_y == 1 ,:] 

In [54]:
sparse_train_x_subset_one.shape

(48451, 54972)

In [55]:
n_ones = np.sum(train_dset_y)

In [56]:
sparse_train_x_subset_zero = sparse_train_x[train_dset_y == 0,:][:n_ones,:]

In [62]:
from scipy.sparse import vstack

In [66]:
sparse_train_x_subset = vstack([sparse_train_x_subset_zero, sparse_train_x_subset_one])

In [67]:
train_y_subset = np.concatenate((np.zeros((n_ones,)), np.ones((n_ones,))))

In [68]:
train_y_subset.shape

(96902,)

In [69]:
sparse_train_x_subset.shape

(96902, 54972)

In [70]:
train_on_x = sparse_train_x_subset[::2,:]
train_on_y = train_y_subset[::2]
test_on_x  = sparse_train_x_subset[1::2,:]
test_on_y  = train_y_subset[1::2] 

In [71]:
svc = SVC(C=1.0, kernel='rbf', cache_size=20480, gamma="scale")

In [72]:
svc.fit(train_on_x, train_on_y)

SVC(cache_size=2048)

In [73]:
train_on_yhat = svc.predict(train_on_x)

In [74]:
f1_score(train_on_y, train_on_yhat)

0.9707386944956006

In [75]:
np.sum(np.logical_and(train_on_y == 0, train_on_yhat == 1).astype(int))

925

In [76]:
np.sum(np.logical_and(train_on_y == 1, train_on_yhat == 0).astype(int))

505

In [81]:
test_on_yhat = svc.predict(test_on_x)

In [89]:
summarize(test_on_y, test_on_yhat)

Number of zeros in y: 24225
 Number of ones in y: 24226
            F1 score: 0.8866421821483101
 # of zeros wrong yh: 2754
  # of ones wrong yh: 2740


In [86]:
train_dset_yhat = svc.predict(sparse_train_x)

In [88]:
summarize(train_dset_y, train_dset_yhat)

Number of zeros in y: 735222
 Number of ones in y: 48451
            F1 score: 0.5068931685027892
 # of zeros wrong yh: 84708
  # of ones wrong yh: 3245


Conclusion: Balanced sampling takes it too far. Must sample in correct proportion and apply the balancer accordingly. 

In [90]:
svc.support_vectors_

<26281x54972 sparse matrix of type '<class 'numpy.float64'>'
	with 352159 stored elements in Compressed Sparse Row format>

The number of support vectors is HUGE. Must try to reduce it.