### Load in data as dataframe

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
train_file = "./data/train.txt"
test_file = "./data/test.txt"

In [3]:
# Read in data file
train_df = pd.read_csv(train_file, sep="\t", header=None, names=["q1", "q2", "label"])
test_df = pd.read_csv(test_file, sep="\t", header=None, names=["q1", "q2", "label"])

In [4]:
# print(train_df.info())
print(train_df.label.value_counts())
# print(test_df.info())
print(test_df.label.value_counts())

0    50220
1    11266
Name: label, dtype: int64
0    25169
1     5575
Name: label, dtype: int64


*相似的（label=1）的数量约为不相似的（label=0）的数量的1/5，样本不均衡，不过暂时不考虑。*

In [5]:
train_df.head()

Unnamed: 0,q1,q2,label
0,如何得知关闭借呗,想永久关闭借呗,0
1,花呗扫码付钱,二维码扫描可以用花呗吗,0
2,花呗逾期后不能分期吗,我这个 逾期后还完了 最低还款 后 能分期吗,0
3,花呗分期清空,花呗分期查询,0
4,借呗逾期短信通知,如何购买花呗短信通知,0


In [6]:
# Read in stopwords from web EDA
with open("./data/stop_words.txt","r",encoding="utf-8") as f:
    stop_words_list = [line.strip() for line in f]

In [7]:
# Read in spelling correction from web EDA
with open("./data/spelling_corrections.json", "r", encoding="utf-8") as f:
    spell_chk = json.loads(f.read())

In [8]:
import jieba
jieba.load_userdict("./data/dict_all.txt")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/2x/34c79f294593wb79lvjc34g00000gn/T/jieba.cache
Loading model cost 0.857 seconds.
Prefix dict has been built succesfully.


In [9]:
def preprocessing_n_seq(text):
    for token_str,replac_str in spell_chk.items():
        text = text.replace(token_str, replac_str)
        
    tokens = [t for t in jieba.cut(text.strip()) if t not in stop_words_list]
    return tokens
    

In [10]:
%%time
train_df['q1_tokens'] = train_df['q1'].apply(lambda x: preprocessing_n_seq(x))
train_df['q2_tokens'] = train_df['q2'].apply(lambda x: preprocessing_n_seq(x))
test_df['q1_tokens'] = test_df['q1'].apply(lambda x: preprocessing_n_seq(x))
test_df['q2_tokens'] = test_df['q2'].apply(lambda x: preprocessing_n_seq(x))


CPU times: user 19.6 s, sys: 242 ms, total: 19.8 s
Wall time: 20.1 s


In [11]:
train_df.sample(n=5)

Unnamed: 0,q1,q2,label,q1_tokens,q2_tokens
14180,为什么提示退款方式是花呗，但是我银行卡没收到,花呗显示我还款，可我没收到货,0,"[为什么, 提示, 退款, 方式, 是, 花呗, 但是, 银行卡, 没收, 到]","[花呗, 显示, 还款, 可, 没收, 到货]"
15005,借呗我都是提前还款的，为什么额度不能用了,借呗提前还款能撤回吗,0,"[借呗, 都, 是, 提前, 还款, 为什么, 额度, 不能, 用]","[借呗, 提前, 还款, 能, 撤回]"
10773,花呗还款后还可以分期吗,蚂蚁花呗最低还款后还能不能分期,0,"[花呗, 还款, 可以, 分期]","[蚂蚁花呗, 最低还款, 能不能, 分期]"
23243,分期买手机花呗额度不够怎么办,想分期买手机 但是花呗额度不够 怎么办,1,"[分期, 买手机, 花呗, 额度, 不够, 怎么办]","[分期, 买手机, 但是, 花呗, 额度, 不够, 怎么办]"
36495,问网商贷变成借呗,有网商贷无借呗了,0,"[问网, 商贷, 变成, 借呗]","[有网, 商贷, 无, 借呗]"


In [12]:
# Check the tokens length distribution
print(train_df.q1_tokens.str.len().describe())
print(train_df.q2_tokens.str.len().describe())

count    61486.000000
mean         5.720782
std          2.483575
min          0.000000
25%          4.000000
50%          5.000000
75%          7.000000
max         38.000000
Name: q1_tokens, dtype: float64
count    61486.000000
mean         5.724946
std          2.482776
min          0.000000
25%          4.000000
50%          5.000000
75%          7.000000
max         40.000000
Name: q2_tokens, dtype: float64


# Traditional methods to get features like word count, common tokens and various distances, etc. Then fit to traditional Machine Learning models to see how is it.
## metrics: F1 score and accuracy

### Try gensim word2vec first to get word to vector

In [13]:
from gensim.models import word2vec

In [14]:
texts = []
texts_q1_test = [token for token in test_df['q1_tokens'].tolist()]
texts_q2_test = [token for token in test_df['q2_tokens'].tolist()]

texts_q1_train = [token for token in train_df['q1_tokens'].tolist()]
texts_q2_train = [token for token in train_df['q2_tokens'].tolist()]

texts.extend(texts_q1_test)
texts.extend(texts_q2_test)
texts.extend(texts_q1_train)
texts.extend(texts_q2_train)


In [15]:
%%time
gensim_w2v_model = word2vec.Word2Vec(sentences=texts,size=300,window=2,min_count=3,workers=2)
norm_gensim_w2v_model = word2vec.Word2Vec(sentences=texts,size=300,window=2,min_count=3,workers=2)
norm_gensim_w2v_model.init_sims(replace=True)

CPU times: user 23.5 s, sys: 283 ms, total: 23.8 s
Wall time: 13.4 s


In [16]:
gensim_w2v_model['借呗']

  """Entry point for launching an IPython kernel.


array([-0.16388924,  0.12370202, -0.32948422,  0.31775227,  0.30674124,
        0.48292193, -0.44773853,  0.5484049 ,  0.47294316, -0.22289571,
        0.47108287, -0.20134819, -0.43011713, -0.12828882, -0.34325904,
        0.00419096, -0.24503604, -0.10676538, -0.2408275 , -0.4916334 ,
        0.45288894,  0.08521263, -0.32905066,  0.11315393, -0.40717593,
        0.0571479 ,  0.21153755,  0.28471413, -0.32342988, -0.9595714 ,
        0.10519151, -0.02142261, -0.23709074,  0.24291766,  0.474836  ,
        0.12801127,  0.09043673, -0.09429657, -0.53035665, -0.3552063 ,
        0.3035092 ,  0.03315999,  0.78071964, -0.33745334,  0.4885339 ,
        0.4576801 ,  0.1887161 ,  0.24093445, -0.02487267,  0.17115097,
        1.1247121 ,  0.37978643,  0.20268022, -0.10992901,  0.21299408,
        0.22083037, -0.16118012, -0.78722584, -0.6230208 ,  0.06565747,
       -0.00892398,  0.10780478,  0.91244274,  0.12706824, -0.2808622 ,
       -0.09230115,  0.35025463, -0.2606098 ,  0.1135275 , -0.39

### Build features for feeding model

In [17]:
from tqdm import tqdm_notebook
from fuzzywuzzy import fuzz
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis



In [18]:
def get_traditional_features(df):
    feature_df = pd.DataFrame()
    feature_df['len_q1'] = df.q1.apply(lambda x: len(str(x)))
    feature_df['len_q2'] = df.q2.apply(lambda x: len(str(x)))
    feature_df['diff_len'] = np.abs(feature_df.len_q1 - feature_df.len_q2)
    feature_df['len_q1_valid_tokens'] = df.q1_tokens.apply(lambda x: len(x))
    feature_df['len_q2_valid_tokens'] = df.q2_tokens.apply(lambda x: len(x))
    feature_df['common_tokens'] = df.apply(lambda x: len(set(x['q1_tokens']).intersection(set(x['q2_tokens']))), axis=1)
    feature_df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['q1']), str(x['q2'])), axis=1)
    feature_df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['q1']), str(x['q2'])), axis=1)
    
    feature_df['wmd'] = df.apply(lambda x: gensim_w2v_model.wv.wmdistance(x['q1'], x['q2']), axis=1)
    feature_df['norm_wmd'] = df.apply(lambda x: norm_gensim_w2v_model.wv.wmdistance(x['q1'], x['q2']), axis=1)
    
    feature_df['wmd_tokens'] = df.apply(lambda x: gensim_w2v_model.wv.wmdistance(x['q1_tokens'], x['q2_tokens']), axis=1)
    feature_df['norm_wmd_tokens'] = df.apply(lambda x: norm_gensim_w2v_model.wv.wmdistance(x['q1_tokens'], x['q2_tokens']), axis=1)
    return feature_df

In [19]:
%%time
train_feature_df = get_traditional_features(train_df)

CPU times: user 5min 55s, sys: 1.04 s, total: 5min 56s
Wall time: 5min 58s


In [20]:
train_feature_df.sample(n=5)

Unnamed: 0,len_q1,len_q2,diff_len,len_q1_valid_tokens,len_q2_valid_tokens,common_tokens,fuzz_ratio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,wmd,norm_wmd,wmd_tokens,norm_wmd_tokens
59503,19,20,1,7,8,2,26,26,26,26,26,26,2.089146,0.544929,3.74435,0.724484
28953,11,8,3,4,3,3,84,88,88,88,84,84,0.999256,0.213689,1.946794,0.350498
57776,7,10,3,3,4,0,24,40,40,40,24,24,2.55722,0.602747,7.096186,0.989645
18241,10,26,16,5,12,2,39,50,50,50,39,39,2.625573,0.564653,4.325525,0.737729
41539,18,13,5,7,7,2,32,38,32,32,19,19,4.255405,0.770281,6.0705,0.957819


In [21]:
def _mean_tokens2vec(tokens, model):
    M = []
    for w in tokens:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

def get_gensim_vec(df, model):
    q1_vectors = np.zeros((df.shape[0], 300))

    for i, q in enumerate(tqdm_notebook(df.q1_tokens.values)):
        q1_vectors[i, :] = _mean_tokens2vec(q, model)

    q2_vectors  = np.zeros((df.shape[0], 300))
    for i, q in enumerate(tqdm_notebook(df.q2_tokens.values)):
        q2_vectors[i, :] = _mean_tokens2vec(q, model)
    return q1_vectors, q2_vectors

In [22]:
def get_distance_features(q1v, q2v):
    df = pd.DataFrame()
    df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(q1v), np.nan_to_num(q2v))]
    df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(q1v)]
    df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(q2v)]
    df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(q1v)]
    df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(q2v)]
    return df

In [23]:
train_df_q1_vec, train_df_q2_vec = get_gensim_vec(train_df, gensim_w2v_model)

HBox(children=(IntProgress(value=0, max=61486), HTML(value='')))

  """
  # Remove the CWD from sys.path while we load stuff.





HBox(children=(IntProgress(value=0, max=61486), HTML(value='')))




In [24]:
%%time
train_distance_df = get_distance_features(train_df_q1_vec, train_df_q2_vec)

  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


CPU times: user 1min 4s, sys: 1.81 s, total: 1min 6s
Wall time: 1min 6s


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [26]:
# Create model input dataframe with label and features columns.
train_input_df = pd.concat([train_df['label'], train_feature_df, train_distance_df], axis=1)

# Replace all infinite value as nan.
train_input_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [27]:
# Check nan(null)
train_input_df.isnull().sum() > 0

label                            False
len_q1                           False
len_q2                           False
diff_len                         False
len_q1_valid_tokens              False
len_q2_valid_tokens              False
common_tokens                    False
fuzz_ratio                       False
fuzz_partial_ratio               False
fuzz_partial_token_set_ratio     False
fuzz_partial_token_sort_ratio    False
fuzz_token_set_ratio             False
fuzz_token_sort_ratio            False
wmd                               True
norm_wmd                          True
wmd_tokens                        True
norm_wmd_tokens                   True
cosine_distance                   True
cityblock_distance               False
jaccard_distance                 False
canberra_distance                False
euclidean_distance               False
minkowski_distance               False
braycurtis_distance               True
skew_q1vec                       False
skew_q2vec               

In [28]:
# Check infinite
# np.isfinite(train_input_df).all()

In [29]:
# Remove those row with null
train_input_df = train_input_df[pd.notnull(train_input_df['cosine_distance'])]
train_input_df = train_input_df[pd.notnull(train_input_df['braycurtis_distance'])]
train_input_df = train_input_df[pd.notnull(train_input_df['wmd'])]
train_input_df = train_input_df[pd.notnull(train_input_df['norm_wmd'])]
train_input_df = train_input_df[pd.notnull(train_input_df['wmd_tokens'])]
train_input_df = train_input_df[pd.notnull(train_input_df['norm_wmd_tokens'])]

In [30]:
# Define feature columns and label columns
x_col = [col for col in train_input_df.columns if col != 'label']
y_col = ['label']

In [31]:
X = train_input_df[x_col]
y = train_input_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Try some baseline models
- SVC
- XGBoost

In [32]:
# SVC classifier.
from sklearn.svm import SVC

In [35]:
%%time
svc_clf = SVC(gamma='auto')
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14940   162]
 [ 3221   121]]

F1 score 0.06675862068965517
Accuracy 0.8165799175883757
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     15102
           1       0.43      0.04      0.07      3342

    accuracy                           0.82     18444
   macro avg       0.63      0.51      0.48     18444
weighted avg       0.75      0.82      0.75     18444

CPU times: user 5min 28s, sys: 2.99 s, total: 5min 31s
Wall time: 5min 34s


SVC without finetune got 0.81 accuracy but only 0.06 F1 score. The recall is very low (0.04) for label 1. 

In [36]:
%%time
# xgboost classifier
import xgboost as xgb

model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14670   432]
 [ 2828   514]]

F1 score 0.23973880597014927
Accuracy 0.8232487529819996
              precision    recall  f1-score   support

           0       0.84      0.97      0.90     15102
           1       0.54      0.15      0.24      3342

    accuracy                           0.82     18444
   macro avg       0.69      0.56      0.57     18444
weighted avg       0.78      0.82      0.78     18444

CPU times: user 30.9 s, sys: 271 ms, total: 31.1 s
Wall time: 31.6 s


xgboost got 0.82 acc and got 0.239 F1 score which it's much better than SVC(without tunning). The recall for label 1 is still rather low with 0.15. May try to handle the unbalance classes distribution.

#### Try to handle the imbalance data
Upsample data with label == 1

In [37]:
from sklearn.utils import resample

In [38]:
ready_to_upsampled_df = pd.concat([X_train, y_train], axis=1)

In [39]:
df_majority = ready_to_upsampled_df[ready_to_upsampled_df.label==0]
df_minority = ready_to_upsampled_df[ready_to_upsampled_df.label==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
upsampled_df = pd.concat([df_majority, df_minority_upsampled])

print(upsampled_df.label.value_counts())
X_train = upsampled_df[x_col]
y_train = upsampled_df['label']

1    35112
0    35112
Name: label, dtype: int64


In [41]:
upsampled_df.head()

Unnamed: 0,len_q1,len_q2,diff_len,len_q1_valid_tokens,len_q2_valid_tokens,common_tokens,fuzz_ratio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,...,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,label
33418,10,19,9,5,9,4,28,40,40,40,...,1.0,130.852996,0.627109,0.283735,0.331299,-0.227624,0.016756,0.191222,0.211301,0
50787,8,13,5,4,5,1,29,29,29,29,...,1.0,172.853957,0.968921,0.433302,0.559625,-0.019332,0.042006,0.66647,0.310729,0
8337,20,10,10,8,4,2,60,50,50,50,...,1.0,132.802012,0.643083,0.291153,0.330427,0.071679,0.018534,-0.506822,-0.25035,0
1702,10,14,4,5,6,3,33,40,40,40,...,1.0,112.936358,0.515693,0.238553,0.258521,-0.042675,0.080588,-0.093799,-0.23779,0
7188,12,29,17,6,13,5,49,83,83,83,...,1.0,92.647931,0.362148,0.166803,0.179454,-0.226362,-0.111589,0.035104,0.117231,0


In [42]:
%%time
svc_clf = SVC(gamma='auto')
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14670   432]
 [ 2828   514]]

F1 score 0.21446563512054695
Accuracy 0.763283452613316
              precision    recall  f1-score   support

           0       0.83      0.89      0.86     15102
           1       0.27      0.18      0.21      3342

    accuracy                           0.76     18444
   macro avg       0.55      0.54      0.54     18444
weighted avg       0.73      0.76      0.74     18444

CPU times: user 15min 32s, sys: 7.5 s, total: 15min 39s
Wall time: 15min 51s


For SVC(no tunning), after upsampled the traning set from 1:5 to 1:5, F1 score raise from 0.066 to 0.21.

In [43]:
%%time
model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[13842  1260]
 [ 2248  1094]]

F1 score 0.3841292134831461
Accuracy 0.8098026458468879
              precision    recall  f1-score   support

           0       0.86      0.92      0.89     15102
           1       0.46      0.33      0.38      3342

    accuracy                           0.81     18444
   macro avg       0.66      0.62      0.64     18444
weighted avg       0.79      0.81      0.80     18444

CPU times: user 55 s, sys: 522 ms, total: 55.5 s
Wall time: 56.3 s


For xgboost, after upsampled the traning set from 1:5 to 1:5, F1 score raise from 0.239 to 0.384.  
Also the roc_auc_score for xgboost is below:

In [44]:
roc_auc_score(y_test, y_pred)

0.6219581174762067