In [273]:
import pandas as pd


def load_dataset(dataname):
    train = pd.read_csv('{}/train.tsv'.format(dataname),
                        sep='[\t\f\r\v]', names=['query1', 'query2', 'label'])

    valid = pd.read_csv('{}/dev.tsv'.format(dataname),
                        sep='\t', names=['query1', 'query2', 'label'])

    test = pd.read_csv('{}/test.tsv'.format(dataname),
                       sep='\t', names=['query1', 'query2', 'label'])

    return train, valid, test


name = 'paws-x-zh'
train, valid, test = load_dataset(name)
print(train.values.shape, valid.values.shape, test.values.shape)
train.head()

(49401, 3) (2000, 3) (2000, 3)


  train = pd.read_csv('{}/train.tsv'.format(dataname),


Unnamed: 0,query1,query2,label
0,1560年10月，他在巴黎秘密会见了英国大使Nicolas Throckmorton，要求他...,1560年10月，他在巴黎秘密会见了英国大使尼古拉斯·斯罗克莫顿，并要求他通过英格兰返回苏格...,0.0
1,1975年的NBA赛季 - 76赛季是全美篮球协会的第30个赛季。,1975-76赛季的全国篮球协会是NBA的第30个赛季。,1.0
2,还有具体的讨论，公众形象辩论和项目讨论。,还有公开讨论，特定档案讨论和项目讨论。,0.0
3,当可以保持相当的流速时，结果很高。,当可以保持可比较的流速时，结果很高。,1.0
4,它是Akmola地区Zerendi区的所在地。,它是Akmola地区Zerendi区的所在地。,1.0


jieba库的分词原理

1. 根据字典统计的每个词的词频，构建前缀树trie树
2. 根据当前的句子，统计出现在词典的词构建DAG，利用动态规划进行最大概率路径计算
3. 对于OOV的词，采用序列标注，利用HMM模型，viterbi算法进行解码。

In [274]:
# 1. 句子A\B包含的字符个数
# 2. 句子A与B的编辑距离
# 3. 句子A与B的字符个数

In [275]:
# import nltk
#
# nltk.download()

In [276]:
from nltk.corpus import stopwords
import jieba

stops = set(stopwords.words('chinese'))


# A、B共有的字符数
def word_match_share(row):
    q1words = {}
    q2words = {}
    if row['query1'] is not None:
        for word in row['query1']:
            if word not in stops:
                q1words[word] = 1
    if row['query2'] is not None:
        for word in row['query2']:
            if word not in stops:
                q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) / (len(q2words) + len(q1words))
    return R


def token_match_share(row):
    q1words = {}
    q2words = {}
    if row['query1'] is not None:
        q1 = jieba.lcut(row['query1'])
        for word in q1:
            if word not in stops:
                q1words[word] = 1
    if row['query2'] is not None:
        q2 = jieba.lcut(row['query2'])
        for word in q2:
            if word not in stops:
                q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) / (len(q2words) + len(q1words))
    return R

In [277]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np

tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")


def tf_idf(row):
    if row['query1'] is None or row['query2'] is None:
        return np.array([0, 0])
    q1_words = jieba.lcut(row['query1'])
    q2_words = jieba.lcut(row['query2'])
    document_1 = ' '.join(q1_words)
    document_2 = ' '.join(q2_words)
    vector = tfidf_model.fit_transform([document_1, document_2]).toarray()
    return np.array(vector)


def tf_idf_q1(row):
    vector = tf_idf(row)
    return vector[0]


def tf_idf_q2(row):
    vector = tf_idf(row)
    return vector[1]


S = StandardScaler()
for index, dataset in enumerate([train, valid, test]):
    if index != 2:
        dataset.dropna(axis=0, subset='label', inplace=True)
    dataset["characters_of_query1"] = dataset.apply(lambda x: len(x["query1"]) if x['query1'] is not None else 0,
                                                    axis=1)  # 有可能为空
    dataset["characters_of_query2"] = dataset.apply(lambda x: len(x["query2"]) if x['query2'] is not None else 1,
                                                    axis=1)
    dataset["word_match_share"] = dataset.apply(word_match_share, axis=1)
    dataset["token_match_share"] = dataset.apply(token_match_share, axis=1)
    dataset["tf_idf_q1"] = dataset.apply(tf_idf_q1, axis=1)
    dataset["tf_idf_q2"] = dataset.apply(tf_idf_q2, axis=1)
    dataset['dist'] = dataset.apply(lambda row: np.sum(np.multiply(row['tf_idf_q1'], row['tf_idf_q2'])), axis=1)
    if index == 0:
        dataset['dist'] = S.fit_transform(dataset['dist'][:, np.newaxis])
    else:
        dataset['dist'] = S.transform(dataset['dist'][:, np.newaxis])
train

  dataset['dist'] = S.fit_transform(dataset['dist'][:, np.newaxis])
  dataset['dist'] = S.transform(dataset['dist'][:, np.newaxis])
  dataset['dist'] = S.transform(dataset['dist'][:, np.newaxis])


Unnamed: 0,query1,query2,label,characters_of_query1,characters_of_query2,word_match_share,token_match_share,tf_idf_q1,tf_idf_q2,dist
0,1560年10月，他在巴黎秘密会见了英国大使Nicolas Throckmorton，要求他...,1560年10月，他在巴黎秘密会见了英国大使尼古拉斯·斯罗克莫顿，并要求他通过英格兰返回苏格...,0.0,56,51,0.666667,0.722222,"[0.1963024680302519, 0.1963024680302519, 0.275...","[0.16950940407963241, 0.16950940407963241, 0.0...",-0.467640
1,1975年的NBA赛季 - 76赛季是全美篮球协会的第30个赛季。,1975-76赛季的全国篮球协会是NBA的第30个赛季。,1.0,34,28,0.904762,0.818182,"[0.1963024680302519, 0.1963024680302519, 0.196...","[0.2295648050491987, 0.2295648050491987, 0.229...",0.574701
2,还有具体的讨论，公众形象辩论和项目讨论。,还有公开讨论，特定档案讨论和项目讨论。,0.0,20,19,0.560000,0.615385,"[0.36408901109085745, 0.0, 0.36408901109085745...","[0.0, 0.3319543885703534, 0.0, 0.2361882814843...",-1.095266
3,当可以保持相当的流速时，结果很高。,当可以保持可比较的流速时，结果很高。,1.0,17,18,0.956522,1.000000,"[0.30184998836473537, 0.0, 0.30184998836473537...","[0.2778778796561673, 0.39054766417182263, 0.27...",0.020937
4,它是Akmola地区Zerendi区的所在地。,它是Akmola地区Zerendi区的所在地。,1.0,23,23,1.000000,1.000000,"[0.35355339059327373, 0.35355339059327373, 0.3...","[0.35355339059327373, 0.35355339059327373, 0.3...",1.360523
...,...,...,...,...,...,...,...,...,...,...
49396,``我们的学校是精神和精神，热爱（时间路径）是我们的第一承诺''。,``我们的学校属于时间和精神，对Rehit的爱（精神之路）是我们的第一承诺。 “”,0.0,33,41,0.714286,0.689655,"[0.0, 0.0, 0.20019763520116748, 0.200197635201...","[0.23807087006547265, 0.23807087006547265, 0.1...",-0.212755
49397,她于6月24日在科克，并于7月8日抵达。,她于6月24日在科克，并于7月8日抵达唐斯。,1.0,20,22,0.928571,0.956522,"[0.23570226039551587, 0.23570226039551587, 0.2...","[0.223744822803537, 0.223744822803537, 0.22374...",1.083258
49398,Cornelia Stuyvesant Vanderbilt（George和Edith Va...,John John F. A. Cecil（George和Cornelia Stuyvesa...,0.0,110,117,0.949495,0.920000,"[0.17414275484853994, 0.17414275484853994, 0.3...","[0.15826034850909818, 0.15826034850909818, 0.3...",0.715150
49399,第三季于2010年6月7日首播，第四季是混合情侣竞赛系统。,第四季于2010年6月7日首播。就像第三季一样，比赛系统是混合情侣。,0.0,29,34,0.956522,0.933333,"[0.25019294054773356, 0.25019294054773356, 0.2...","[0.21368043965123457, 0.21368043965123457, 0.2...",-0.014250


In [278]:
columns = ["characters_of_query1", "characters_of_query2", "word_match_share", "token_match_share", "dist"]
train_feature_data = train[columns]
train_target_data = train["label"]
valid_feature_data = valid[columns]
valid_target_data = valid["label"]
test_feature_data = test[columns]
train_feature_data

Unnamed: 0,characters_of_query1,characters_of_query2,word_match_share,token_match_share,dist
0,56,51,0.666667,0.722222,-0.467640
1,34,28,0.904762,0.818182,0.574701
2,20,19,0.560000,0.615385,-1.095266
3,17,18,0.956522,1.000000,0.020937
4,23,23,1.000000,1.000000,1.360523
...,...,...,...,...,...
49396,33,41,0.714286,0.689655,-0.212755
49397,20,22,0.928571,0.956522,1.083258
49398,110,117,0.949495,0.920000,0.715150
49399,29,34,0.956522,0.933333,-0.014250


In [279]:
import xgboost as xgb

d_train_data = xgb.DMatrix(train_feature_data.values, label=train_target_data.values)
d_eval_data = xgb.DMatrix(valid_feature_data.values, label=valid_target_data.values)
params = {'max_depth': 4, 'objective': 'binary:logistic', 'eval_metric': ['logloss', 'auc'], 'eta': 0.02}
watchlist = [(d_train_data, 'train'), (d_eval_data, 'valid')]
bst = xgb.train(params, d_train_data, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)

[0]	train-logloss:0.69236	train-auc:0.59032	valid-logloss:0.69241	valid-auc:0.52625
[10]	train-logloss:0.68599	train-auc:0.59265	valid-logloss:0.68682	valid-auc:0.52726
[20]	train-logloss:0.68165	train-auc:0.59475	valid-logloss:0.68359	valid-auc:0.52735




[30]	train-logloss:0.67862	train-auc:0.59554	valid-logloss:0.68188	valid-auc:0.53074
[40]	train-logloss:0.67647	train-auc:0.59621	valid-logloss:0.68104	valid-auc:0.53206
[50]	train-logloss:0.67491	train-auc:0.59709	valid-logloss:0.68082	valid-auc:0.53326
[60]	train-logloss:0.67375	train-auc:0.59795	valid-logloss:0.68085	valid-auc:0.53366
[70]	train-logloss:0.67288	train-auc:0.59854	valid-logloss:0.68113	valid-auc:0.53386
[80]	train-logloss:0.67218	train-auc:0.59947	valid-logloss:0.68148	valid-auc:0.53388
[90]	train-logloss:0.67160	train-auc:0.60050	valid-logloss:0.68181	valid-auc:0.53458
[100]	train-logloss:0.67114	train-auc:0.60125	valid-logloss:0.68230	valid-auc:0.53469
[110]	train-logloss:0.67075	train-auc:0.60198	valid-logloss:0.68272	valid-auc:0.53514
[120]	train-logloss:0.67037	train-auc:0.60286	valid-logloss:0.68312	valid-auc:0.53531
[130]	train-logloss:0.67005	train-auc:0.60367	valid-logloss:0.68346	valid-auc:0.53542
[140]	train-logloss:0.66978	train-auc:0.60446	valid-logloss:0

In [280]:
d_test_data = xgb.DMatrix(test_feature_data)
predict_test_data = bst.predict(d_test_data)
predict_test_data = [0 if data < 0.5 else 1 for data in predict_test_data]
sub = pd.DataFrame(columns=['index', 'prediction'])
sub['index'] = test_feature_data.index
sub['prediction'] = predict_test_data
sub.to_csv('{}.tsv'.format(name), index=False, sep='\t')

In [281]:
sub

Unnamed: 0,index,prediction
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1995,1995,0
1996,1996,0
1997,1997,1
1998,1998,0
