In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Enable tqdm's pandas integration for the rest of the notebook.
tqdm.pandas()

# Folder holding the dataset CSVs; file names are appended to it directly,
# so the trailing slash matters.
DSET_FOLDER_PATH = './dataset/quora/'

# Load the training split first, then the test split.
train_dset_df, test_dset_df = (
    pd.read_csv(DSET_FOLDER_PATH + csv_name)
    for csv_name in ("train.csv", "test.csv")
)

In [2]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import KFold
def summarize(model, X, y):
    """Print classification metrics for `model` evaluated on (X, y).

    The model's predictions for X are rounded with ``np.round`` and then
    compared against y. F1, precision, recall and the confusion matrix
    are printed, in that order. Nothing is returned.
    """
    predictions = np.round(model.predict(X))

    # Label/scorer pairs, listed in the order they are reported.
    scalar_metrics = (
        ("F1 score:", f1_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
    )
    for label, scorer in scalar_metrics:
        print(label, scorer(y, predictions))

    print("Confusion matrix:")
    print(confusion_matrix(y, predictions))
def cross_validate(model, n_folds, X, y):
    """Refit and report `model` on every fold of a K-fold split of (X, y).

    For each fold the model is fitted on the training rows, then
    `summarize` prints metrics first for those training rows and then for
    the held-out rows.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` (and ``predict(X)``, which
        `summarize` calls). It is refitted in place on every fold.
    n_folds : int
        Number of folds, forwarded to ``KFold(n_splits=...)``.
    X
        Feature matrix; must support ``X[row_indices, :]`` indexing.
    y
        Targets; must support ``y[row_indices]`` indexing.

    Returns
    -------
    None
        All results are printed.
    """
    # Use the caller's fold count and split the X that was passed in, so the
    # function does not depend on any notebook-level variable.
    kfcv = KFold(n_splits=n_folds)
    for train_indices, test_indices in kfcv.split(X):
        trainset_X = X[train_indices, :]
        testset_X = X[test_indices, :]
        trainset_y = y[train_indices]
        testset_y = y[test_indices]

        model.fit(trainset_X, trainset_y)

        print("\n\nTraining:")
        summarize(model, trainset_X, trainset_y)
        print("Testing:")
        summarize(model, testset_X, testset_y)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
train_questions = train_dset_df["question_text"]

# Case is left as-is (lowercase=False). The token pattern matches words of two
# or more word characters, plus the single characters ! ? " and ' as tokens.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'",
)

# Fit the vectoriser on the training questions and build the feature matrix.
train_X = vectorizer.fit_transform(train_questions)
train_y = train_dset_df["target"].to_numpy()

In [4]:
from sklearn.svm import LinearSVC

In [19]:
# Linear SVM; class 1 is weighted 3.2x relative to class 0.
model = LinearSVC(
    C=0.1,
    class_weight={0: 1, 1: 3.2},
    max_iter=10000,
)


In [20]:
# Cross-validate the model on the full training matrix with n_folds=10.
cross_validate(model, n_folds=10, X=train_X, y=train_y)



Training:
F1 score: 0.6863281413120641
Precision: 0.6304131026811399
Recall: 0.7531274343582459
Confusion matrix:
[[642388  19271]
 [ 10775  32871]]
Testing:
F1 score: 0.6221706326175276
Precision: 0.5812398337249232
Recall: 0.6693028095733611
Confusion matrix:
[[71246  2317]
 [ 1589  3216]]


Training:
F1 score: 0.6863605566268828
Precision: 0.6299003195132684
Recall: 0.7539388110286709
Confusion matrix:
[[642293  19344]
 [ 10745  32923]]
Testing:
F1 score: 0.6147380766223612
Precision: 0.577170122958341
Recall: 0.6575371106000418
Confusion matrix:
[[71281  2304]
 [ 1638  3145]]


Training:
F1 score: 0.6872302720733869
Precision: 0.6310278155330513
Recall: 0.7544229052708672
Confusion matrix:
[[642338  19274]
 [ 10730  32963]]
Testing:
F1 score: 0.6163037194064366
Precision: 0.5690391459074733
Recall: 0.6721311475409836
Confusion matrix:
[[71188  2422]
 [ 1560  3198]]


Training:
F1 score: 0.6864024815497194
Precision: 0.6298786894641322
Recall: 0.7540709907602232
Confusion matrix:


## Write Test-Set Predictions

In [21]:
# Vectorise the test questions with the already-fitted vectoriser
# (transform only, no refit).
test_questions = test_dset_df["question_text"]
test_X = vectorizer.transform(test_questions)

# Fit on the complete training set. Kept as the last expression so the cell
# echoes the fitted estimator.
model.fit(train_X, train_y)

LinearSVC(C=0.1, class_weight={0: 1, 1: 3.2}, max_iter=10000)

In [22]:
# Predict a label for every test question.
test_yhat = model.predict(test_X)

# Show the schema of the raw test frame (qid + question_text) for reference.
test_dset_df.info()

# Build the submission frame without mutating anything in place: drop the raw
# text and attach the predictions directly under their final name, "target".
# drop() returns a new frame, so test_dset_df itself is left untouched.
output_df = test_dset_df.drop(columns="question_text").assign(target=test_yhat)

# Round each prediction to an integer before writing.
output_df["target"] = output_df["target"].apply(round)

# NOTE: the ./outputs directory must already exist.
output_df.to_csv("./outputs/2020_11_28_b_testset_output.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            522449 non-null  object
 1   question_text  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB


In [32]:
# Display the submission frame (qid, target); pandas truncates the middle rows.
output_df

Unnamed: 0,qid,target
0,f56a9a31974dc66186e8,0
1,d957c3758060f45da303,0
2,ad822d5abaedb9e247b9,0
3,4e979c23eeb6a4bd1f2e,0
4,333cc031262566b8da49,0
...,...,...
522444,e8e6aa5226f36c27fe41,0
522445,015fd068afcb9d0b4007,0
522446,9f0ef49eff6a3ff9e735,0
522447,d6b02f52f76dc4c22afd,0
