# Nonlinear SVM

## Imports and vectorization

In [1]:
import time
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
original_sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
original_sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()

def summarize(y, yhat):
    '''
    y and yhat are both 1-dimensional ndarrays where every entry is either 0 or 1. 
    y and yhat must have the same size 
    '''
    print("Number of zeros in y:", np.sum( (y == 0).astype(int) ))
    print(" Number of ones in y:", np.sum((y == 1).astype(int)))
    print("            F1 score:", f1_score(y, yhat))
    print(" # of zeros wrong yh:", np.sum(np.logical_and(y == 0, yhat == 1).astype(int)))
    print("  # of ones wrong yh:", np.sum(np.logical_and(y == 1, yhat == 0).astype(int)))

In [2]:
start_time = time.time()

## Dimensionality reduction

In [3]:
original_sparse_train_x_csc = original_sparse_train_x.tocsc()
THRESHOLD = 100
columns_to_keep = []
for column_id in tqdm(range(original_sparse_train_x_csc.shape[1])):
    if np.sum((original_sparse_train_x_csc[:,column_id] > 0).astype(int)) > THRESHOLD:
        columns_to_keep.append(column_id)

sparse_train_x = original_sparse_train_x[:, columns_to_keep]
sparse_test_x = original_sparse_test_x[:,columns_to_keep]

100%|██████████| 54972/54972 [00:53<00:00, 1032.69it/s]


## Experiments on a small subset

In [4]:
train_subset_size = sparse_train_x.shape[0]//10
train_subset_x = sparse_train_x[:train_subset_size,]
test_subset_x = sparse_train_x[train_subset_size:,]
train_subset_y = train_dset_y[:train_subset_size]
test_subset_y = train_dset_y[train_subset_size:]

In [5]:
svc = SVC(C=0.3, cache_size=20480, gamma=0.9, verbose=3, class_weight={0:1,1:3.5})

In [6]:
gc.collect()

0

In [7]:
svc.fit(train_subset_x, train_subset_y)

[LibSVM]

SVC(C=0.3, cache_size=20480, class_weight={0: 1, 1: 3.5}, gamma=0.9, verbose=3)

In [8]:
train_subset_yhat = svc.predict(train_subset_x)
summarize(train_subset_y, train_subset_yhat)

Number of zeros in y: 73562
 Number of ones in y: 4805
            F1 score: 0.9949916527545909
 # of zeros wrong yh: 11
  # of ones wrong yh: 37


In [9]:
test_subset_yhat = svc.predict(test_subset_x)
summarize(test_subset_y, test_subset_yhat)

In [None]:
end_time = time.time()

In [None]:
import joblib 
import pickle 
pickled_model = pickle.dumps(svc)
joblib.dump(pickled_model,'2020_10_26b_pickled_model.joblib')
