# Kernel SVM with bagging

## Imports and vectorization

In [1]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Load the preprocessed train/test frames; paths are relative to the notebook's working directory.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text with an empty string before vectorizing.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning on pandas 2.x
# and leaves the frame unchanged when Copy-on-Write is enabled.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# Learn the TF-IDF vocabulary/weights on the training text only, then reuse the
# fitted vectorizer for the test text so both matrices share the same columns.
vectorizer = TfidfVectorizer()
sparse_train_x = vectorizer.fit_transform(train_dset_df["preprocessed_joined"])
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])

# Labels as a plain NumPy array, row-aligned with sparse_train_x.
train_dset_y = train_dset_df["target"].to_numpy()

## Kernel SVM on small subset

In [11]:
# RBF-kernel SVM. Errors on label 1 are penalised 8x as heavily as errors on label 0
# (class_weight); cache_size is the kernel cache size passed to SVC.
svc = SVC(
    kernel="rbf",
    C=1.0,
    gamma="scale",
    class_weight={0: 1, 1: 8},
    cache_size=2048,
)

In [12]:
# Use only the leading SUBSET_SIZE rows for fitting; the rows after them are
# scored separately further down as a hold-out check.
SUBSET_SIZE = 70_000
train_dset_y_subset = train_dset_y[:SUBSET_SIZE]
sparse_train_x_subset = sparse_train_x[:SUBSET_SIZE]

In [13]:
# Fit the kernel SVM on the subset only.
svc.fit(X=sparse_train_x_subset, y=train_dset_y_subset)

SVC(cache_size=2048, class_weight={0: 1, 1: 8})

In [14]:
# Predictions on the very rows the SVM was fitted on (in-sample).
yhat = svc.predict(X=sparse_train_x_subset)

In [15]:
# In-sample F1: computed on the rows the model was fitted on.
f1_score(y_true=train_dset_y_subset, y_pred=yhat)

0.9806303724928367

In [19]:
# Predict on the training rows that were NOT used for fitting (everything after
# the first SUBSET_SIZE rows). Despite the name, `yhat_test` holds predictions
# for held-out training rows, not for sparse_test_x.
yhat_test = svc.predict(sparse_train_x[SUBSET_SIZE:])

In [20]:
# Hold-out F1 on the training rows the SVM never saw during fitting.
f1_score(y_true=train_dset_y[SUBSET_SIZE:], y_pred=yhat_test)

0.5139276340199811

10066 25474
735222 48451


## Bagging classifier

In [25]:
# Wrap the RBF SVM in a bagging ensemble; each base model is fitted on a random
# sample of 10% of the rows (max_samples=0.1), seeded for repeatability.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4; the positional form is accepted by both old and new releases.
# NOTE(review): with n_estimators=1 only a single SVM is trained, so n_jobs=6
# cannot parallelise fitting — confirm this is intended.
bcf = BaggingClassifier(svc, n_estimators=1, n_jobs=6, max_samples=0.1, random_state=42, verbose=3)

In [26]:
# Fit the bagged SVM; the ensemble draws its own row sample from the full training matrix.
bcf.fit(X=sparse_train_x, y=train_dset_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Building estimator 1 of 1 for this parallel run (total 1)...


In [23]:
# Predict on the full training matrix with the bagged model.
# NOTE: this rebinds `yhat`, which previously held the plain-SVM subset predictions.
yhat = bcf.predict(X=sparse_train_x)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed: 69.4min remaining: 34.7min


KeyboardInterrupt: 

In [None]:
# F1 of the bagged model over the whole training set; requires `yhat` to hold
# predictions for every row of train_dset_y.
f1_score(y_true=train_dset_y, y_pred=yhat)