### Load in data as dataframe

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
train_file = "./data/train.txt"
test_file = "./data/test.txt"

In [3]:
# Read in data file
train_df = pd.read_csv(train_file, sep="\t", header=None, names=["q1", "q2", "label"])
test_df = pd.read_csv(test_file, sep="\t", header=None, names=["q1", "q2", "label"])

In [4]:
# print(train_df.info())
print(train_df.label.value_counts())
# print(test_df.info())
print(test_df.label.value_counts())

0    50220
1    11266
Name: label, dtype: int64
0    25169
1     5575
Name: label, dtype: int64


*相似的（label=1）的数量约为不相似的（label=0）的数量的1/5，样本不均衡，不过暂时不考虑。*

In [5]:
train_df.head()

Unnamed: 0,q1,q2,label
0,如何得知关闭借呗,想永久关闭借呗,0
1,花呗扫码付钱,二维码扫描可以用花呗吗,0
2,花呗逾期后不能分期吗,我这个 逾期后还完了 最低还款 后 能分期吗,0
3,花呗分期清空,花呗分期查询,0
4,借呗逾期短信通知,如何购买花呗短信通知,0


In [6]:
# Read in stopwords from web EDA
with open("./data/stop_words.txt","r",encoding="utf-8") as f:
    stop_words_list = [line.strip() for line in f]

In [7]:
# Read in spelling correction from web EDA
with open("./data/spelling_corrections.json", "r", encoding="utf-8") as f:
    spell_chk = json.loads(f.read())

In [8]:
import jieba
jieba.load_userdict("./data/dict_all.txt")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.556 seconds.
Prefix dict has been built succesfully.


In [9]:
def preprocessing_n_seq(text):
    for token_str,replac_str in spell_chk.items():
        text = text.replace(token_str, replac_str)
        
    tokens = [t for t in jieba.cut(text.strip()) if t not in stop_words_list]
    return tokens
    

In [10]:
%%time
train_df['q1_tokens'] = train_df['q1'].apply(lambda x: preprocessing_n_seq(x))
train_df['q2_tokens'] = train_df['q2'].apply(lambda x: preprocessing_n_seq(x))
test_df['q1_tokens'] = test_df['q1'].apply(lambda x: preprocessing_n_seq(x))
test_df['q2_tokens'] = test_df['q2'].apply(lambda x: preprocessing_n_seq(x))


CPU times: user 13.8 s, sys: 30.7 ms, total: 13.8 s
Wall time: 13.8 s


In [11]:
train_df.sample(n=5)

Unnamed: 0,q1,q2,label,q1_tokens,q2_tokens
17641,如何才能让蚂蚁借呗重新恢复额度,蚂蚁借呗还款成功后没恢复额度,0,"[如何, 才能, 让, 蚂蚁借呗, 重新, 恢复, 额度]","[蚂蚁借呗, 还款, 成功, 没, 恢复, 额度]"
20682,我的借呗为什么不能正常使用了,我的蚂蚁借呗为什么用不了,0,"[借呗, 为什么, 不能, 正常, 使用]","[蚂蚁借呗, 为什么, 用不了]"
43083,一个身份证只能开一个手机号的花呗吗,公司手机已绑订我的身份证，已开通花呗，自己新办手机号也是绑定本人身份证，还能开通花呗吗,1,"[一个, 身份证, 只能, 开, 一个, 手机号, 花呗]","[公司, 手机, 已, 绑, 订, 身份证, 已, 开通, 花呗, 自己, 新办, 手机号,..."
9428,花呗和信用卡收款。是怎么收费,花呗收款的手续费,0,"[花呗, 信用度, 卡, 收款, 是, 怎么, 收费]","[花呗, 收款, 手续费]"
32099,借呗已还怎么还有负面记录消息来,蚂蚁花呗有负面信息是怎么回事,1,"[借呗, 已, 怎么, 还有, 负面, 记录, 消息, 来]","[蚂蚁花呗, 有, 负面, 信息, 是]"


In [12]:
# Check the tokens length distribution
print(train_df.q1_tokens.str.len().describe())
print(train_df.q2_tokens.str.len().describe())

count    61486.000000
mean         5.720782
std          2.483575
min          0.000000
25%          4.000000
50%          5.000000
75%          7.000000
max         38.000000
Name: q1_tokens, dtype: float64
count    61486.000000
mean         5.724946
std          2.482776
min          0.000000
25%          4.000000
50%          5.000000
75%          7.000000
max         40.000000
Name: q2_tokens, dtype: float64


# Traditional methods to get features like word count, common tokens and various distances, etc. Then fit to traditional Machine Learning models to see how is it.
## metrics: F1 score and accuracy

### Try gensim word2vec first to get word to vector

In [13]:
from gensim.models import word2vec

In [14]:
texts = []
texts_q1_test = [token for token in test_df['q1_tokens'].tolist()]
texts_q2_test = [token for token in test_df['q2_tokens'].tolist()]

texts_q1_train = [token for token in train_df['q1_tokens'].tolist()]
texts_q2_train = [token for token in train_df['q2_tokens'].tolist()]

texts.extend(texts_q1_test)
texts.extend(texts_q2_test)
texts.extend(texts_q1_train)
texts.extend(texts_q2_train)


In [15]:
%%time
gensim_w2v_model = word2vec.Word2Vec(sentences=texts,size=300,window=2,min_count=3,workers=2)
norm_gensim_w2v_model = word2vec.Word2Vec(sentences=texts,size=300,window=2,min_count=3,workers=2)
norm_gensim_w2v_model.init_sims(replace=True)

CPU times: user 13.5 s, sys: 86.1 ms, total: 13.6 s
Wall time: 7.1 s


In [16]:
gensim_w2v_model['借呗']

  """Entry point for launching an IPython kernel.


array([ 4.44445685e-02,  2.73065835e-01,  2.52912194e-01,  9.98831093e-02,
       -5.26858168e-03, -7.32327163e-01, -1.72038257e-01,  6.28688157e-01,
       -3.58915418e-01, -4.23523009e-01,  1.04867674e-01,  1.43855929e-01,
       -1.58883799e-02, -6.84987307e-02,  7.94039607e-01,  3.15837972e-02,
       -1.15724072e-01, -2.36712068e-01, -1.24650836e-01, -5.91551773e-02,
        3.90599161e-01, -3.87325108e-01, -5.00738561e-01, -8.35867301e-02,
        3.01632285e-01,  9.87665504e-02, -4.05652106e-01, -3.91770571e-01,
       -4.27176893e-01, -5.03384829e-01,  2.37702787e-01, -6.74819827e-01,
       -1.53576404e-01, -1.77623071e-02, -1.43175974e-01,  1.58097432e-03,
        1.28692597e-01, -8.18581045e-01, -5.08349985e-02, -6.74639195e-02,
       -8.46400857e-01,  1.80441961e-01,  1.11271538e-01,  6.80248082e-01,
        6.69473946e-01,  6.80518210e-01,  1.65595621e-01,  1.70601696e-01,
       -3.03341091e-01, -1.53055146e-01,  5.77953607e-02,  3.44372749e-01,
        7.41366446e-02, -

### Build features for feeding model

In [17]:
from tqdm import tqdm_notebook
from fuzzywuzzy import fuzz
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis



In [18]:
def get_traditional_features(df):
    feature_df = pd.DataFrame()
    feature_df['len_q1'] = df.q1.apply(lambda x: len(str(x)))
    feature_df['len_q2'] = df.q2.apply(lambda x: len(str(x)))
    feature_df['diff_len'] = np.abs(feature_df.len_q1 - feature_df.len_q2)
    feature_df['len_q1_valid_tokens'] = df.q1_tokens.apply(lambda x: len(x))
    feature_df['len_q2_valid_tokens'] = df.q2_tokens.apply(lambda x: len(x))
    feature_df['common_tokens'] = df.apply(lambda x: len(set(x['q1_tokens']).intersection(set(x['q2_tokens']))), axis=1)
    feature_df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['q1']), str(x['q2'])), axis=1)
    
    feature_df['wmd'] = df.apply(lambda x: gensim_w2v_model.wv.wmdistance(x['q1'], x['q2']), axis=1)
    feature_df['norm_wmd'] = df.apply(lambda x: norm_gensim_w2v_model.wv.wmdistance(x['q1'], x['q2']), axis=1)
    
    feature_df['wmd_tokens'] = df.apply(lambda x: gensim_w2v_model.wv.wmdistance(x['q1_tokens'], x['q2_tokens']), axis=1)
    feature_df['norm_wmd_tokens'] = df.apply(lambda x: norm_gensim_w2v_model.wv.wmdistance(x['q1_tokens'], x['q2_tokens']), axis=1)
    return feature_df

In [19]:
%%time
train_feature_df = get_traditional_features(train_df)

CPU times: user 4min 33s, sys: 473 ms, total: 4min 33s
Wall time: 4min 33s


In [20]:
train_feature_df.sample(n=5)

Unnamed: 0,len_q1,len_q2,diff_len,len_q1_valid_tokens,len_q2_valid_tokens,common_tokens,fuzz_ratio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,wmd,norm_wmd,wmd_tokens,norm_wmd_tokens
4907,9,6,3,4,3,2,53,33,33,33,53,53,2.864579,0.796665,2.135019,0.468668
46002,9,15,6,3,7,1,33,31,24,24,17,17,3.487158,0.698987,4.367845,0.874672
6734,13,14,1,6,5,2,52,56,56,56,52,52,3.263695,0.622171,4.26473,0.659276
18556,8,10,2,5,6,3,56,62,62,62,56,56,3.281595,0.632771,2.947248,0.583106
2415,9,9,0,4,5,3,67,67,67,67,67,67,1.941517,0.282135,3.849705,0.525148


In [21]:
def _mean_tokens2vec(tokens, model):
    M = []
    for w in tokens:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

def get_gensim_vec(df, model):
    q1_vectors = np.zeros((df.shape[0], 300))

    for i, q in enumerate(tqdm_notebook(df.q1_tokens.values)):
        q1_vectors[i, :] = _mean_tokens2vec(q, model)

    q2_vectors  = np.zeros((df.shape[0], 300))
    for i, q in enumerate(tqdm_notebook(df.q2_tokens.values)):
        q2_vectors[i, :] = _mean_tokens2vec(q, model)
    return q1_vectors, q2_vectors

In [22]:
def get_distance_features(q1v, q2v):
    df = pd.DataFrame()
    df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(q1v)]
    df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(q2v)]
    df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(q1v)]
    df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(q2v)]
    return df

In [23]:
train_df_q1_vec, train_df_q2_vec = get_gensim_vec(train_df, gensim_w2v_model)

HBox(children=(IntProgress(value=0, max=61486), HTML(value='')))

  """
  # Remove the CWD from sys.path while we load stuff.





HBox(children=(IntProgress(value=0, max=61486), HTML(value='')))




In [24]:
%%time
train_distance_df = get_distance_features(train_df_q1_vec, train_df_q2_vec)

  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


CPU times: user 54.5 s, sys: 990 ms, total: 55.5 s
Wall time: 54.4 s


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [26]:
# Create model input dataframe with label and features columns.
train_input_df = pd.concat([train_df['label'], train_feature_df, train_distance_df], axis=1)

# Replace all infinite value as nan.
train_input_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [30]:
# Check nan(null)
train_input_df.isnull().sum() > 0

label                            False
len_q1                           False
len_q2                           False
diff_len                         False
len_q1_valid_tokens              False
len_q2_valid_tokens              False
common_tokens                    False
fuzz_ratio                       False
fuzz_partial_ratio               False
fuzz_partial_token_set_ratio     False
fuzz_partial_token_sort_ratio    False
fuzz_token_set_ratio             False
fuzz_token_sort_ratio            False
wmd                              False
norm_wmd                         False
wmd_tokens                       False
norm_wmd_tokens                  False
cosine_distance                  False
cityblock_distance               False
jaccard_distance                 False
canberra_distance                False
euclidean_distance               False
minkowski_distance               False
braycurtis_distance              False
skew_q1vec                       False
skew_q2vec               

In [28]:
# Check infinite
# np.isfinite(train_input_df).all()
ori_train_input_df = train_input_df.copy()

In [29]:
# Remove those row with null
# train_input_df = train_input_df[pd.notnull(train_input_df['cosine_distance'])]
# train_input_df = train_input_df[pd.notnull(train_input_df['braycurtis_distance'])]
# train_input_df = train_input_df[pd.notnull(train_input_df['wmd'])]
# train_input_df = train_input_df[pd.notnull(train_input_df['norm_wmd'])]
# train_input_df = train_input_df[pd.notnull(train_input_df['wmd_tokens'])]
# train_input_df = train_input_df[pd.notnull(train_input_df['norm_wmd_tokens'])]

In [29]:
train_input_df.fillna(0, inplace=True)

In [31]:
# Define feature columns and label columns
x_col = [col for col in train_input_df.columns if col != 'label']
y_col = ['label']

In [32]:
X = train_input_df[x_col]
y = train_input_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Try some baseline models
- SVC
- XGBoost

In [33]:
# SVC classifier.
from sklearn.svm import SVC

In [35]:
%%time
svc_clf = SVC(gamma='auto')
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14940   162]
 [ 3221   121]]

F1 score 0.06675862068965517
Accuracy 0.8165799175883757
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     15102
           1       0.43      0.04      0.07      3342

    accuracy                           0.82     18444
   macro avg       0.63      0.51      0.48     18444
weighted avg       0.75      0.82      0.75     18444

CPU times: user 5min 28s, sys: 2.99 s, total: 5min 31s
Wall time: 5min 34s


SVC without finetune got 0.81 accuracy but only 0.06 F1 score. The recall is very low (0.04) for label 1. 

In [34]:
%%time
# xgboost classifier
import xgboost as xgb

model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14661   418]
 [ 2836   531]]

F1 score 0.24606116774791473
Accuracy 0.8235931909357043
              precision    recall  f1-score   support

           0       0.84      0.97      0.90     15079
           1       0.56      0.16      0.25      3367

    accuracy                           0.82     18446
   macro avg       0.70      0.56      0.57     18446
weighted avg       0.79      0.82      0.78     18446

CPU times: user 23.2 s, sys: 152 ms, total: 23.3 s
Wall time: 23.5 s


xgboost got 0.82 acc and got 0.239 F1 score which it's much better than SVC(without tunning). The recall for label 1 is still rather low with 0.15. May try to handle the unbalance classes distribution.

#### Try to handle the imbalance data
Upsample data with label == 1

In [35]:
from sklearn.utils import resample

In [36]:
ready_to_upsampled_df = pd.concat([X_train, y_train], axis=1)

In [37]:
df_majority = ready_to_upsampled_df[ready_to_upsampled_df.label==0]
df_minority = ready_to_upsampled_df[ready_to_upsampled_df.label==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
upsampled_df = pd.concat([df_majority, df_minority_upsampled])

print(upsampled_df.label.value_counts())
X_train = upsampled_df[x_col]
y_train = upsampled_df['label']

1    35141
0    35141
Name: label, dtype: int64


In [38]:
upsampled_df.head()

Unnamed: 0,len_q1,len_q2,diff_len,len_q1_valid_tokens,len_q2_valid_tokens,common_tokens,fuzz_ratio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,...,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,label
27129,21,12,9,8,5,2,42,50,50,50,...,1.0,127.972577,0.562474,0.252353,0.29304,0.01549,0.008702,0.032413,-0.137284,0
45835,8,12,4,3,4,1,40,50,50,50,...,1.0,152.599768,0.824399,0.376181,0.441109,0.06742,0.192468,-0.276271,-0.251577,0
33446,11,17,6,5,6,3,36,38,38,38,...,1.0,149.24295,0.730654,0.319568,0.395178,0.075762,0.039882,-0.37198,-0.425226,0
8336,11,6,5,5,3,1,47,67,67,67,...,1.0,138.238877,0.699606,0.313598,0.366224,0.252862,-0.12906,-0.371942,-0.119832,0
1702,10,14,4,5,6,3,33,40,40,40,...,1.0,120.027857,0.513289,0.232205,0.263658,-0.012085,0.001224,-0.054914,-0.290441,0


In [42]:
%%time
svc_clf = SVC(gamma='auto')
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14670   432]
 [ 2828   514]]

F1 score 0.21446563512054695
Accuracy 0.763283452613316
              precision    recall  f1-score   support

           0       0.83      0.89      0.86     15102
           1       0.27      0.18      0.21      3342

    accuracy                           0.76     18444
   macro avg       0.55      0.54      0.54     18444
weighted avg       0.73      0.76      0.74     18444

CPU times: user 15min 32s, sys: 7.5 s, total: 15min 39s
Wall time: 15min 51s


For SVC(no tunning), after upsampled the traning set from 1:5 to 1:5, F1 score raise from 0.066 to 0.21.

In [46]:
%%time
model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print('ROC AUC SCORE', roc_auc_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[22929  2240]
 [ 3767  1808]]

F1 score 0.3757663930167307
Accuracy 0.8046122820712984
ROC AUC SCORE 0.6176532808617334
              precision    recall  f1-score   support

           0       0.86      0.91      0.88     25169
           1       0.45      0.32      0.38      5575

    accuracy                           0.80     30744
   macro avg       0.65      0.62      0.63     30744
weighted avg       0.78      0.80      0.79     30744

CPU times: user 41 s, sys: 23.9 ms, total: 41 s
Wall time: 41 s


For xgboost, after upsampled the traning set from 1:5 to 1:5, F1 score raise from 0.239 to 0.384.  
Also the roc_auc_score for xgboost is below:

# Run xgboost

In [47]:
xgb_model = model # From above

In [43]:
%%time
test_feature_df = get_traditional_features(test_df)
test_df_q1_vec, test_df_q2_vec = get_gensim_vec(test_df, gensim_w2v_model)
test_distance_df = get_distance_features(test_df_q1_vec, test_df_q2_vec)
test_input_df = pd.concat([test_df['label'], test_feature_df, test_distance_df], axis=1)

# Replace all infinite value as nan.
test_input_df.replace([np.inf, -np.inf], np.nan, inplace=True)
test_input_df.fillna(0, inplace=True)

HBox(children=(IntProgress(value=0, max=30744), HTML(value='')))

  """





HBox(children=(IntProgress(value=0, max=30744), HTML(value='')))

  # Remove the CWD from sys.path while we load stuff.





  dist = 1.0 - uv / np.sqrt(uu * vv)


CPU times: user 2min 46s, sys: 457 ms, total: 2min 47s
Wall time: 2min 46s


In [48]:
X_run_test = test_input_df[x_col]
y_run_test = test_input_df['label']

In [50]:
%%time
y_run_pred = model.predict(X_run_test)
cm = confusion_matrix(y_run_test, y_run_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_run_test, y_run_pred))
print('Accuracy', accuracy_score(y_run_test, y_run_pred))
print('ROC AUC SCORE', roc_auc_score(y_run_test, y_run_pred))
print(classification_report(y_run_test, y_run_pred))

[[22929  2240]
 [ 3767  1808]]

F1 score 0.3757663930167307
Accuracy 0.8046122820712984
ROC AUC SCORE 0.6176532808617334
              precision    recall  f1-score   support

           0       0.86      0.91      0.88     25169
           1       0.45      0.32      0.38      5575

    accuracy                           0.80     30744
   macro avg       0.65      0.62      0.63     30744
weighted avg       0.78      0.80      0.79     30744

CPU times: user 555 ms, sys: 12 ms, total: 567 ms
Wall time: 565 ms


The result run on test.txt is above, F1 score around 0.37.