In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
# Read the Quora train and test CSVs into DataFrames (paths are relative to
# the notebook's working directory).
train_dset_df, test_dset_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./dataset/quora/train.csv", "./dataset/quora/test.csv")
)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Labels as a plain NumPy array, taken from the "target" column.
train_y = train_dset_df["target"].to_numpy()

# TF-IDF features over unigrams and bigrams. Case is preserved
# (lowercase=False). The token pattern keeps words of two or more word
# characters and also treats each of ! ? " ' as a token of its own.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'",
    lowercase=False,
)

# Learn the vocabulary/IDF weights from the training questions and encode them.
train_X = vectorizer.fit_transform(train_dset_df["question_text"])

In [4]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [15]:
from sklearn.svm import LinearSVC

In [16]:
# L2-regularised linear SVM solved in the dual, with balanced class weights.
# With dual=True the solver shuffles the data using a pseudo-random generator,
# so random_state is pinned to make repeated fits reproducible.
svm = LinearSVC(
    penalty="l2",
    dual=True,
    class_weight='balanced',
    C=0.65,
    max_iter=10000,
    random_state=42,
)

In [11]:
from sklearn.model_selection import train_test_split

In [23]:
from sklearn.calibration import CalibratedClassifierCV

In [24]:
# Wrap the LinearSVC in a probability calibrator; later cells call
# model2.predict_proba to sweep decision thresholds. No calibration options
# are overridden here, so the library defaults apply.
model2 = CalibratedClassifierCV(svm)

In [25]:
# Hold out 20% of the training rows for validation, stratified on the label so
# both parts keep the same class ratio. random_state pins the shuffle so the
# split, the reported F1 and the threshold tuned on this split are the same on
# every Restart & Run All.
label = train_y
x_train, x_val, y_train, y_val = train_test_split(
    train_X,
    label,
    test_size=0.2,
    stratify=label,
    random_state=42,
)

# Fit the calibrated classifier on the training part only.
model2.fit(x_train, y_train)

# Baseline F1 using the model's own default decision rule.
y_preds = model2.predict(x_val)
val_f1_score = f1_score(y_val, y_preds)
print("f1 score:", val_f1_score)

f1 score: 0.552428393524284


In [26]:
# Sweep decision thresholds 0.00, 0.01, ..., 0.99 over the validation
# probabilities and keep the one with the highest F1. Ties keep the lowest
# threshold because the comparison below is strict.
y_probs = model2.predict_proba(x_val)

# Column 1 is used as the positive-class probability (presumably class 1 for a
# 0/1 target; confirm with model2.classes_). Extracted once, outside the loop.
pos_probs = y_probs[:, 1]

best_threshold = best_f1 = 0

for i in range(0, 100):
    threshold = i / 100
    # Vectorised comparison instead of building a Python list row by row.
    y2_preds = (pos_probs >= threshold).astype(int)
    cur_f1 = f1_score(y_val, y2_preds)
    print(i, cur_f1)
    if cur_f1 > best_f1:
        best_f1 = cur_f1
        best_threshold = threshold

print(f"Best f1 score {best_f1}, best threshold {best_threshold}")

0 0.11644885083370887
1 0.22637941833394395
2 0.3228196552611268
3 0.38942642129893124
4 0.43928847641144625
5 0.4770348216193121
6 0.5048230585389732
7 0.5290223720502605
8 0.5462401650973817
9 0.5611389720978762
10 0.5744896887363887
11 0.5864737793851719
12 0.5949796725224721
13 0.603168514327362
14 0.609240222143448
15 0.6155212791730599
16 0.6189670120969407
17 0.6236386661038413
18 0.6247365024736502
19 0.6271453590192645
20 0.6280770602925437
21 0.6290475974819981
22 0.6302238977518274
23 0.6303901437371664
24 0.6297820021752494
25 0.6292867763441891
26 0.6292931746417294
27 0.6288664863535777
28 0.6270173341303048
29 0.6246088624205107
30 0.6239713774597495
31 0.622215330542203
32 0.6202677264170676
33 0.6182318104906938
34 0.6154585246076651
35 0.6130599028602266
36 0.609042262749809
37 0.6061441729634328
38 0.602707671736587
39 0.5992007654640626
40 0.5947727272727272
41 0.589565516845549
42 0.5852414352791141
43 0.5795880149812734
44 0.576271186440678
45 0.5731133480254929
4

In [19]:
# Inspect the test frame's columns, non-null counts and dtypes before
# vectorising it.
test_dset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            522449 non-null  object
 1   question_text  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB


In [21]:
# Encode the test questions with the vectorizer already fitted on the training
# set (transform only, so the vocabulary and IDF weights are not refit here).
test_X = vectorizer.transform(test_dset_df["question_text"])

LinearSVC(C=0.65, class_weight='balanced', max_iter=10000)

In [28]:
# Positive-class probabilities for every test question.
test_proba = model2.predict_proba(test_X)

# Apply the threshold chosen by the validation sweep (best_threshold) rather
# than a number copied by hand from its printed output, so the submission
# stays consistent whenever the split or the model changes.
test_yhat = (test_proba[:, 1] >= best_threshold).astype(int)

# Schema check of the frame the submission is derived from.
test_dset_df.info()

# Build the submission without mutating test_dset_df: drop the raw text and
# attach the 0/1 predictions as the "target" column.
output_df = test_dset_df.drop(columns="question_text").assign(target=test_yhat)
output_df.to_csv("28_11_test2.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            522449 non-null  object
 1   question_text  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB
