# ~~Nystroem~~ Kernel Approximation 

## Imports and vectorization

In [1]:
import time
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Load the train/test splits; both CSVs are expected in the working directory.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text (NaN) with "" so the vectorizer only ever sees strings.
# The result is assigned back rather than using `df[col].fillna(..., inplace=True)`:
# that form acts on a column selection and, under pandas Copy-on-Write, leaves
# the DataFrame itself unchanged.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

# The vocabulary and IDF weights are learned from the training text only; the
# test text is merely transformed with them. `fit_transform` is the one-pass
# equivalent of `fit` followed by `transform` on the same data.
vectorizer = TfidfVectorizer(max_features=10000)
original_sparse_train_x = vectorizer.fit_transform(train_dset_df["preprocessed_joined"])
original_sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()

def summarize(y, yhat):
    '''
    Print a short binary-classification report for predictions `yhat` against labels `y`.

    y and yhat are both 1-dimensional ndarrays of equal size whose entries are 0 or 1.

    Five lines are printed, in this order:
      - how many entries of y are 0
      - how many entries of y are 1
      - the F1 score of yhat against y
      - how many entries have y == 0 but yhat == 1
      - how many entries have y == 1 but yhat == 0

    Nothing is returned.
    '''
    true_is_zero = (y == 0)
    print("Number of zeros in y:", true_is_zero.sum())

    true_is_one = (y == 1)
    print(" Number of ones in y:", true_is_one.sum())

    print("            F1 score:", f1_score(y, yhat))

    # Actual 0 predicted as 1.
    zeros_missed = true_is_zero & (yhat == 1)
    print(" # of zeros wrong yh:", zeros_missed.sum())

    # Actual 1 predicted as 0.
    ones_missed = true_is_one & (yhat == 0)
    print("  # of ones wrong yh:", ones_missed.sum())

In [2]:
# Working names for the TF-IDF matrices. These are plain rebinds: both names
# refer to the very same objects as the `original_*` variables, no copy is made.
sparse_train_x, sparse_test_x = original_sparse_train_x, original_sparse_test_x

## ~~Nystroem~~ Random Kitchen Sinks

In [3]:
# Approximate RBF-kernel feature map ("random kitchen sinks"), seeded so the
# sampled map is the same on every run.
rbfsampler = RBFSampler(
    gamma=2,
    n_components=1000,
    random_state=42,
)

In [4]:
# Fit the sampler on the training TF-IDF matrix. Kept as the cell's final bare
# expression so the returned estimator is what the notebook displays.
rbfsampler.fit(sparse_train_x)

RBFSampler(gamma=2, n_components=1000, random_state=42)

In [5]:
# Map the training TF-IDF matrix through the fitted sampler; the result is the
# feature matrix the linear SVM below is trained and evaluated on.
transformed_x = rbfsampler.transform(sparse_train_x)

In [6]:
# Linear SVM on the approximate-kernel features. Errors on class 1 cost 4x as
# much as errors on class 0 (presumably to offset the class imbalance).
# `random_state` is fixed because the dual solver (dual=True) draws on a random
# number generator; without a seed, repeated runs can yield different models.
svm = LinearSVC(
    penalty="l2",
    dual=True,
    class_weight={0: 1, 1: 4},
    C=1,
    random_state=42,
)
# Final bare expression: the fitted estimator is shown as the cell output.
svm.fit(transformed_x, train_dset_y)

LinearSVC(C=1, class_weight={0: 1, 1: 4})

In [7]:
# Predict on the same transformed training rows the model was fitted on, so
# anything computed from train_yhat is a training-set figure, not a held-out estimate.
train_yhat = svm.predict(transformed_x)

In [8]:
# Print class counts, F1 score and both error counts for the training-set predictions.
summarize(train_dset_y, train_yhat)

Number of zeros in y: 735222
 Number of ones in y: 48451
            F1 score: 0.16655498589705628
 # of zeros wrong yh: 10376
  # of ones wrong yh: 43107
