# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
train_dset_df = pd.read_csv("2020_10_18_train_dset_df.csv")
test_dset_df = pd.read_csv("2020_10_18_test_dset_df.csv")

In [3]:
train_dset_df["preprocessed_joined"].fillna("", inplace=True)

In [4]:
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

# TFIDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
vectorizer = TfidfVectorizer()

In [7]:
vectorizer.fit(train_dset_df["preprocessed_joined"])

TfidfVectorizer()

In [8]:
sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])

In [9]:
y = train_dset_df["target"]

# Naive Bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [43]:
mnb = MultinomialNB(class_prior=(0.95, 0.05))

In [44]:
mnb.fit(sparse_train_x, y)

MultinomialNB(class_prior=(0.95, 0.05))

In [45]:
yhat = mnb.predict(sparse_train_x)


# Training F1 Score -- Naive Bayes

In [18]:
from sklearn.metrics import f1_score

In [47]:
f1_score(y, yhat)

0.5546479020430912

In [49]:
train_dset_df["yhat"] = yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))

(34531, 16607)

# LinearSVM

In [21]:
from sklearn.svm import LinearSVC

In [47]:
svm = LinearSVC(class_weight={0:1,1:3.5}, C=0.1)

In [48]:
svm.fit(sparse_train_x, y)

LinearSVC(C=0.1, class_weight={0: 1, 1: 3.5})

In [49]:
yhat = svm.predict(sparse_train_x)

In [50]:
f1_score(y, yhat)

0.6208430999121008

In [51]:
train_dset_df["yhat"] = yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))

(26862, 14548)

# Testset Write

In [272]:
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])

In [273]:
test_yhat = svm.predict(sparse_test_x)

In [274]:
output_df = test_dset_df.copy()

In [275]:
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB


In [276]:
output_df["preprocessed_joined"] = test_yhat

In [277]:
output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
output_df.target = output_df.target.apply(round)

In [279]:
output_df.to_csv("./outputs/2020_10_18_e_testset_output.csv", index=False)

# Bagged Linear SVM

In [52]:
from sklearn.ensemble import BaggingClassifier

In [84]:
svm = LinearSVC(class_weight={0:1,1:4}, C=0.1)

In [85]:
bgcf = BaggingClassifier(base_estimator=svm, n_estimators=100, n_jobs=6, random_state=42, verbose=3)

In [86]:
bgcf.fit(sparse_train_x, y)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:  2.4min remaining:  1.2min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:  2.6min finished


BaggingClassifier(base_estimator=LinearSVC(C=0.1, class_weight={0: 1, 1: 4}),
                  n_estimators=100, n_jobs=6, random_state=42, verbose=3)

In [87]:
yhat = bgcf.predict(sparse_train_x)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:   14.4s remaining:    7.2s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:   15.7s finished


In [88]:
f1_score(y, yhat)

0.6194977527288292

In [89]:
train_dset_df["yhat"] = yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))

(28951, 13717)

# Poly features

In [10]:
from sklearn.preprocessing import PolynomialFeatures

In [11]:
pf = PolynomialFeatures(degree=2)

In [12]:
poly_sparse_train_x = pf.fit_transform(sparse_train_x)

In [13]:
from sklearn.svm import LinearSVC

In [14]:
svm = LinearSVC(class_weight={0:1,1:4}, C=0.1)

In [15]:
svm.fit(poly_sparse_train_x, y)

LinearSVC(C=0.1, class_weight={0: 1, 1: 4})

In [16]:
yhat = svm.predict(poly_sparse_train_x)

In [17]:
from sklearn.metrics import f1_score

In [18]:
f1_score(y, yhat)

0.7518560951186728

# K-Fold Cross Validation 

In [76]:
poly_sparse_train_x

<783673x835730286 sparse matrix of type '<class 'numpy.float64'>'
	with 25604397 stored elements in Compressed Sparse Row format>

In [46]:
# y = y.to_numpy()

In [47]:
from sklearn.model_selection import KFold

In [49]:
from sklearn.svm import LinearSVC

In [68]:
from sklearn.metrics import f1_score

In [70]:
import gc

In [82]:
gc.collect()

0

In [86]:
kfcv = KFold(n_splits=10, shuffle=True)
svm = LinearSVC(class_weight={0:1,1:8}, C=1)
train_f1_scores = []
test_f1_scores = []

In [88]:
gc.collect()

0

In [89]:
for train_index, test_index in kfcv.split(poly_sparse_train_x):
    print("TRAINING AGAIN.")
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = poly_sparse_train_x[train_index], poly_sparse_train_x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svm.fit(x_train, y_train)
    train_yhat = svm.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0


In [100]:
train_f1_scores

[0.3985977438470383,
 0.39761029116546487,
 0.39706825763407744,
 0.3986589454434624,
 0.39799527826794623,
 0.39789978129422365,
 0.3973462898096882,
 0.3974186695848789,
 0.39758939203759136,
 0.39784790347782667]

In [101]:
test_f1_scores

[0.38934842548251347,
 0.3977763148270061,
 0.397607934655776,
 0.38902305159165756,
 0.398705966930266,
 0.39599070307960493,
 0.40177358220696563,
 0.4021864211737629,
 0.39933251106435463,
 0.39677672395350494]

Conclusion: Overfitting a probable conclusion to this situation. Attempting truncated SVD followed by quadratic features.

# Truncated SVD

In [11]:
import gc

In [13]:
gc.collect()

0

In [14]:
from sklearn.decomposition import TruncatedSVD

In [15]:
tsvd = TruncatedSVD(n_components = 800)

In [16]:
decomposed_sparse_train_x = tsvd.fit_transform(sparse_train_x)

In [18]:
from sklearn.svm import LinearSVC

In [39]:
svm = LinearSVC(class_weight={0:1,1:3.5}, C=4)

In [40]:
svm.fit(decomposed_sparse_train_x, y)

LinearSVC(C=4, class_weight={0: 1, 1: 3.5})

In [41]:
yhat = svm.predict(decomposed_sparse_train_x)

In [42]:
from sklearn.metrics import f1_score

In [43]:
f1_score(y, yhat)

0.48184331061460317

In [44]:
train_dset_df["yhat"] = yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))

(27617, 24308)

# K-Fold CrossVal -- Truncated SVD

In [46]:
from sklearn.model_selection import KFold

In [47]:
kfcv = KFold(n_splits=10, shuffle=True)
svm = LinearSVC(class_weight={0:1,1:3.5}, C=3)
train_f1_scores = []
test_f1_scores = []

In [48]:
import gc

In [50]:
gc.collect()

0

In [51]:
for train_index, test_index in kfcv.split(decomposed_sparse_train_x):
    print("TRAINING AGAIN.")
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = decomposed_sparse_train_x[train_index], decomposed_sparse_train_x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svm.fit(x_train, y_train)
    train_yhat = svm.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0
TRAINING AGAIN.
0
0
0


# Polynomial feature generation from decomposed matrix 

In [57]:
from sklearn.preprocessing import PolynomialFeatures

In [58]:
pf = PolynomialFeatures(degree=2)

In [61]:
decomposed_sparse_train_x.dtype 

dtype('float64')

In [63]:
gc.collect()

0