# Embedding AND CountV -- LinearSVC?

## Imports and preprocessing

In [10]:
import gc
import numpy as np 
 
from tqdm import tqdm 
tqdm.pandas()
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.metrics import f1_score, plot_confusion_matrix, confusion_matrix, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer

def summarize(model, X, y):
    """Print F1, precision, recall and the confusion matrix of ``model`` on ``(X, y)``.

    Predictions are taken from ``model.predict(X)`` and passed through
    ``np.round`` before being compared against ``y``. Nothing is returned;
    the report is written to stdout.
    """
    predicted = np.round(model.predict(X))
    scorers = (
        ("F1 score:", f1_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
    )
    for caption, scorer in scorers:
        print(caption, scorer(y, predicted))
    print("Confusion matrix:")
    print(confusion_matrix(y, predicted))


# Load the preprocessed training set; a missing text field becomes an empty document.
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place call mutates an intermediate Series and no
# longer reaches the DataFrame once pandas Copy-on-Write is active.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed_joined"].fillna("")

# Learn the vocabulary and build the count matrix in a single pass over the text.
vectorizer = CountVectorizer()
train_bow_X = vectorizer.fit_transform(train_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()

# Drop the reference to the DataFrame and run the collector a few times so the
# memory it held can be reclaimed before the later cells run.
train_dset_df = None
[gc.collect() for _ in range(3)]



[0, 0, 0]

## Information Gain metric

In [11]:
from sklearn.preprocessing import Binarizer

In [12]:
# Binarizer built with its default settings; it is applied to the count matrix below.
bina = Binarizer()

In [13]:
# Binarize the bag-of-words count matrix; binarized_X is the input to the
# feature-selection step further down.
binarized_X = bina.fit_transform(train_bow_X)

In [14]:
# Show the matrix repr (shape, dtype, number of stored elements) as a sanity check.
binarized_X

<783673x54972 sparse matrix of type '<class 'numpy.int64'>'
	with 8974499 stored elements in Compressed Sparse Row format>

## Feature selection

In [15]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2

In [16]:
# Keep the fitted selector instead of discarding it: without the fitted object
# the chosen columns cannot be re-applied to another matrix
# (feature_selector.transform) or inspected (feature_selector.get_support()).
# NOTE(review): the selector is fit on every row together with its label, so
# any validation split later carved out of final_X is not fully held out.
feature_selector = SelectKBest(chi2, k=20000)
final_X = feature_selector.fit_transform(binarized_X, train_dset_y)

## LinearSVC (currently fitted with `LogisticRegression`)

In [17]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [18]:
# 10-fold cross-validation splitter and the class-weighted logistic regression
# that is refit on every fold.
kfcv = KFold(n_splits=10)
lsvc = LogisticRegression(
    C=0.25,
    class_weight={0: 1, 1: 2.3},
    max_iter=1000,
)

# For each fold: fit on the training rows, then report metrics on both the rows
# the model was fit on and the rows that were held out of the fit.
for train_indices, test_indices in kfcv.split(final_X):
    trainset_X, trainset_y = final_X[train_indices, :], train_dset_y[train_indices]
    testset_X, testset_y = final_X[test_indices, :], train_dset_y[test_indices]
    lsvc.fit(trainset_X, trainset_y)
    for heading, part_X, part_y in (
        ("\n\nTraining:", trainset_X, trainset_y),
        ("Testing:", testset_X, testset_y),
    ):
        print(heading)
        summarize(lsvc, part_X, part_y)



Training:
F1 score: 0.6404598596145697
Precision: 0.6401739727595285
Recall: 0.640746001924575
Confusion matrix:
[[645940  15719]
 [ 15680  27966]]
Testing:
F1 score: 0.6157383216637057
Precision: 0.6183879093198993
Recall: 0.6131113423517169
Confusion matrix:
[[71745  1818]
 [ 1859  2946]]


Training:
F1 score: 0.6420091950868044
Precision: 0.6412611377655929
Recall: 0.6427589997251992
Confusion matrix:
[[645935  15702]
 [ 15600  28068]]
Testing:
F1 score: 0.6060351172326779
Precision: 0.6095600676818951
Recall: 0.6025507003972402
Confusion matrix:
[[71739  1846]
 [ 1901  2882]]


Training:
F1 score: 0.6424021006964266
Precision: 0.640909401553637
Recall: 0.6439017691621084
Confusion matrix:
[[645849  15763]
 [ 15559  28134]]
Testing:
F1 score: 0.6043933054393306
Precision: 0.6016243231986672
Recall: 0.60718789407314
Confusion matrix:
[[71697  1913]
 [ 1869  2889]]


Training:
F1 score: 0.6429973523235644
Precision: 0.6417084282460137
Recall: 0.6442914646418443
Confusion matrix:
[[6