In [1]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
import gc
from tqdm import tqdm 
tqdm.pandas()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()



In [84]:
svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:8}, C=1)


In [None]:
svm.fit(sparse_train_x, train_dset_y)

In [27]:
train_dset_yhat = svm.predict(sparse_train_x)
f1_score(train_dset_y, train_dset_yhat)

0.49093287106719125

In [28]:
train_dset_df["yhat"] = train_dset_yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
print(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))
print(len(train_dset_df.groupby(by="target").get_group(0)), len(train_dset_df.groupby(by="target").get_group(1)))

96928 1156
735222 48451


## Bagging classifier

In [85]:
bcl = BaggingClassifier(base_estimator=svm,max_samples=0.2, n_estimators=200, n_jobs=6, verbose=3)

In [86]:
bcl.fit(sparse_train_x, train_dset_y)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed: 13.0min remaining:  6.5min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 13.4min finished


BaggingClassifier(base_estimator=LinearSVC(C=1, class_weight={0: 1, 1: 8}),
                  max_samples=0.2, n_estimators=200, n_jobs=6, verbose=3)

In [87]:
yhat = bcl.predict(sparse_train_x)
f1_score(train_dset_y, yhat)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:   31.7s remaining:   15.9s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:   34.1s finished


0.6494870969225814

In [88]:
f1_score(train_dset_y, yhat)

0.6494870969225814

In [89]:
train_dset_df["yhat"] = yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
print(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))
print(len(train_dset_df.groupby(by="target").get_group(0)), len(train_dset_df.groupby(by="target").get_group(1)))

24946 13153
735222 48451


## Bagging classifier cross validation

In [90]:
gc.collect()

1213

In [91]:
kfcv = KFold(n_splits=10, shuffle=True)
train_f1_scores = []
test_f1_scores = []
i=0
for train_index, test_index in kfcv.split(sparse_train_x):
    print("TRAINING AGAIN.", i)
    i+=1
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = sparse_train_x[train_index], sparse_train_x[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]
    bcl.fit(x_train, y_train)
    train_yhat = bcl.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = bcl.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN. 0
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed: 11.4min remaining:  5.7min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 11.6min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:   29.4s remaining:   14.7s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:   30.3s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    3.5s remaining:    1.7s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    3.6s finished
405
0
0
TRAINING AGAIN. 1
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed: 11.5min remaining:  5.7min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 11.6min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 c

In [92]:
train_f1_scores

[0.6522757905650597,
 0.651940013293453,
 0.6509643955315821,
 0.6493697500750913,
 0.6520638005354613,
 0.6514187410616205,
 0.6520383966201032,
 0.650428937459155,
 0.6522485410230003,
 0.6505745449999479]

In [93]:
test_f1_scores

[0.5847497863450766,
 0.5909090909090909,
 0.5987776210625294,
 0.6029617211511596,
 0.5856952634582462,
 0.5961013278086449,
 0.5927043076633046,
 0.5931060322218059,
 0.5946745562130178,
 0.5970971890501562]

## Testset Write

In [26]:
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
test_yhat = bcl.predict(sparse_test_x)
output_df = test_dset_df.copy()
output_df.info()
output_df["preprocessed_joined"] = test_yhat
output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
output_df.target = output_df.target.apply(round)
output_df.to_csv("./outputs/2020_10_20_a_testset_output.csv", index=False)


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    1.1s remaining:    0.5s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    1.1s finished
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB
