# Imbalanced Learning

## Imports and preprocessing

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
# Load the train and test splits from CSV files in the working directory.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")

# Replace missing text with an empty string.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on the column selection: that form operates on
# an intermediate object, emits a chained-assignment warning on recent pandas,
# and leaves the DataFrame untouched once Copy-on-Write is enabled.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed_joined"].fillna("")

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is learned from the training text only
# and then reused unchanged for the test text.
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary and builds the training matrix in a
# single pass, instead of tokenising the training corpus twice (fit, then
# transform). The resulting matrix is the same.
sparse_train_x = vectorizer.fit_transform(train_dset_df["preprocessed_joined"])
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
# Labels as a NumPy array so they can be indexed with the fold index arrays.
train_dset_y = train_dset_df["target"].to_numpy()



## Describer

In [2]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


def describe(model, X, y):
    """Print F1, precision, recall and the confusion matrix of `model` on (X, y).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix passed to ``model.predict``.
    y : true labels compared against the predictions.

    Returns
    -------
    None. All results are written to stdout.
    """
    predictions = model.predict(X)
    # Metrics are reported in a fixed order: F1, precision, recall.
    labelled_metrics = (
        ("F1 SCORE:", f1_score),
        ("Precision SCORE:", precision_score),
        ("Recall SCORE:", recall_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y, predictions))
    print(confusion_matrix(y, predictions))

### Imblearn balanced bagging classifier over an SVM (RBF kernel)

In [3]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.svm import SVC


In [4]:
# Base learner: RBF-kernel SVM with C=0.01; class 1 is given half the weight
# of class 0.
# NOTE(review): the cross-validation output recorded in this notebook shows no
# sample predicted as class 1 (F1 = 0). The down-weighting of class 1 combined
# with the small C is a likely cause -- confirm before relying on this setup.
svm = SVC(C=0.01, class_weight={0:1,1:0.5}, kernel="rbf")

# Ensemble of 10 SVMs, each trained on a bag drawn from 10% of the samples.
# random_state pins the bag sampling so repeated runs give the same ensemble.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator`; check against the installed version before upgrading.
bgclf = BalancedBaggingClassifier(
    base_estimator=svm,
    n_estimators=10,
    max_samples=0.1,
    random_state=42,
    verbose=3,
    n_jobs=6,
)

## Cross validation

In [5]:
from sklearn.model_selection import KFold
import gc

In [6]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

0

In [7]:
# 10-fold cross-validation of `bgclf` on the vectorised training data.
# random_state fixes the shuffled fold assignment so the splits (and therefore
# the reported metrics) are the same on every run.
kfcv = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfcv.split(sparse_train_x)):
    print("TRAINING AGAIN.", fold)
    x_train, x_test = sparse_train_x[train_index], sparse_train_x[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]

    # fit() retrains the ensemble from scratch on this fold's training part.
    bgclf.fit(x_train, y_train)

    # Report metrics on the data the model was fitted on and on the held-out part.
    print("TRAINING:")
    describe(bgclf, x_train, y_train)
    print("TESTING:")
    describe(bgclf, x_test, y_test)

    # Drop the references to this fold's slices so they can be reclaimed
    # before the next fold is materialised; one collection pass is sufficient.
    x_train = x_test = y_train = y_test = None
    print(gc.collect())

TRAINING AGAIN. 0
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:   19.6s remaining:    9.8s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:   21.2s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
TRAINING:
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed: 22.5min remaining: 11.2min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 22.6min finished
F1 SCORE: 0.0
Precision SCORE: 0.0
  _warn_prf(average, modifier, msg_start, len(result))
Recall SCORE: 0.0
[[661679      0]
 [ 43626      0]]
TESTING:
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:  2.4min remaining:  1.2min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:  2.5min finished
F1 SCORE: 0.0
Precision SCORE: 0.0
Recall SCORE: 0.0
[[73543     0]
 [ 4825     0]]
276
0
0
TRAINING AGAIN. 1
[Parallel(n_jobs=6)]: Using backend

## Testset write