# Read in data

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
train_file = "./data/train.txt"
test_file = "./data/test.txt"

In [3]:
# Read in data file
train_df = pd.read_csv(train_file, sep="\t", header=None, names=["q1", "q2", "label"])
test_df = pd.read_csv(test_file, sep="\t", header=None, names=["q1", "q2", "label"])

In [4]:
# Read in stopwords from web EDA
with open("./data/stop_words.txt","r",encoding="utf-8") as f:
    stop_words_list = [line.strip() for line in f]
    
# Read in spelling correction from web EDA
with open("./data/spelling_corrections.json", "r", encoding="utf-8") as f:
    spell_chk = json.loads(f.read())
    
import jieba
jieba.load_userdict("./data/dict_all.txt")

def preprocessing_n_seq(text):
    for token_str,replac_str in spell_chk.items():
        text = text.replace(token_str, replac_str)
        
    tokens = [t for t in jieba.cut(text.strip()) if t not in stop_words_list]
    return tokens

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.559 seconds.
Prefix dict has been built succesfully.


In [5]:
%%time
train_df['q1_tokens'] = train_df['q1'].apply(lambda x: preprocessing_n_seq(x))
train_df['q2_tokens'] = train_df['q2'].apply(lambda x: preprocessing_n_seq(x))
test_df['q1_tokens'] = test_df['q1'].apply(lambda x: preprocessing_n_seq(x))
test_df['q2_tokens'] = test_df['q2'].apply(lambda x: preprocessing_n_seq(x))


CPU times: user 13.8 s, sys: 34.7 ms, total: 13.9 s
Wall time: 13.9 s


In [6]:
# Load in tencent pretrained word vector
tencent_pretrained_vec_loc = "/data/Tencent_AILab_ChineseEmbedding.txt"
def process_raw_fileline(line, number_dim):
    line = line.rstrip().decode('utf8')
    pieces = line.rsplit(' ', int(number_dim))
    word = pieces[0]
    vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
    return word, vector

def load_tencent_pretrained_vectors():
    from multiprocessing import Pool  # For CPU
    from multiprocessing.dummy import Pool as ThreadPool  # For IO
    from functools import partial

    with open(tencent_pretrained_vec_loc, "rb") as f:
        header = f.readline()
        number_row, number_dim = header.split()

        pool = Pool(8)
        # pool = ThreadPool(8)
        process_raw_fileline_100dim = partial(process_raw_fileline, number_dim=number_dim)
        wv_set_list = pool.map(process_raw_fileline_100dim, f)
        pool.close()
        pool.join()
        # print("pool done.")
        w2v = dict(wv_set_list)
    return w2v

In [7]:
%%time
tencent_w2v = load_tencent_pretrained_vectors()

CPU times: user 43.3 s, sys: 37.5 s, total: 1min 20s
Wall time: 5min 19s


In [8]:
tencent_w2v["我"]

array([ 0.250116, -0.366958,  0.065014,  0.010725,  0.231398, -0.177817,
        0.064359, -0.005259,  0.115888,  0.154   ,  0.171935,  0.07247 ,
       -0.003175, -0.09248 ,  0.20276 , -0.030792, -0.306991, -0.289693,
       -0.055264, -0.189153,  0.122888,  0.081699,  0.017909,  0.158469,
        0.147464,  0.079238, -0.224966,  0.145837,  0.182973, -0.149864,
       -0.156044,  0.044855, -0.237059,  0.174146, -0.108293, -0.066462,
        0.140773,  0.092687, -0.124868, -0.026098,  0.167881, -0.117048,
        0.39075 , -0.036812, -0.051702, -0.161367, -0.355791, -0.311515,
       -0.090306,  0.084679, -0.184472,  0.090339, -0.098312,  0.256595,
        0.292554,  0.274648,  0.039325,  0.150774,  0.239049, -0.011787,
       -0.014104,  0.135062,  0.151537, -0.208729, -0.171538,  0.08003 ,
       -0.116087,  0.159768, -0.061878, -0.149166, -0.065586,  0.029528,
       -0.020271,  0.098718, -0.068513,  0.238489,  0.174631, -0.003655,
        0.161002, -0.002617,  0.202535,  0.276254, 

# Define the pre-processing pipeline

In [9]:
import keras
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin
import jieba

jieba.load_userdict("./data/dict_all.txt")

Using TensorFlow backend.


In [11]:
EMBEDDING_DIM = np.shape(next(iter(tencent_w2v.values())))[0]
MAX_NUM_WORDS = 20000
print(EMBEDDING_DIM, MAX_NUM_WORDS, len(tencent_w2v))

200 20000 8824330


In [12]:
def preprocess_texts(raw_texts):
    import re
    from bs4 import BeautifulSoup as bs
    def bs_unescape_html(text):
        return bs(text, "lxml").get_text()
    def preprocess_text(raw_text):
        # text = re.sub(r'\(.*\)', '', raw_text)
        text = " ".join(jieba.cut(raw_text))
        return bs_unescape_html(text.lower().strip())
    
    return [preprocess_text(t) for t in raw_texts]


class Text2SeqTransformer(Tokenizer, BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        """
        params: num_words: to be a vocab_size of returning tokenzier and dictionary.
        """
        super().__init__(**kwargs)
        
    def fit(self, texts, y=None):
        """
        params: texts: list of strings
        """
        self.fit_on_texts(texts)
        return self

    def transform(self, texts, y=None):
        return np.array(self.texts_to_sequences(texts))
    
# Implement a padder transformer base on keras pad_sequences
class PaddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, maxlen=200):
        self.maxlen = maxlen
        self.max_index = None
    def fit(self, X, y=None):
        self.max_index = pad_sequences(X, maxlen=self.maxlen).max()
        return self
    
    def transform(self, X, y=None):
        X = pad_sequences(X, maxlen=self.maxlen)
        X[X > self.max_index] = 0
        return X

In [13]:
# build preprocess pipeline
import re
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

max_words = 200
vocab_size = len(tencent_w2v)

preprocess_pipeline = make_pipeline(
    FunctionTransformer(preprocess_texts, validate=False)
    , Text2SeqTransformer(num_words=MAX_NUM_WORDS)
    , PaddingTransformer(maxlen=max_words)
)


In [14]:
%%time
preprocess_pipeline.fit(list(train_df['q1']) + list(train_df['q2']))

CPU times: user 34.2 s, sys: 475 ms, total: 34.7 s
Wall time: 37.3 s


Pipeline(memory=None,
         steps=[('functiontransformer',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function preprocess_texts at 0x7f9f2ce2d0d0>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, pass_y='deprecated',
                                     validate=False)),
                ('text2seqtransformer', Text2SeqTransformer()),
                ('paddingtransformer', PaddingTransformer(maxlen=200))],
         verbose=False)

In [15]:
x_left = preprocess_pipeline.transform(train_df['q1'])
x_right = preprocess_pipeline.transform(train_df['q2'])
# this will be the input of the siamese network
x_pairs = [x_left, x_right]   

y_pairs = train_df['label'].values
print(np.shape(x_pairs))

(2, 61486, 200)


# build the network

In [16]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Lambda, Subtract, Conv1D, MaxPooling1D, Flatten, Embedding, LSTM, Bidirectional
from keras.layers import BatchNormalization, concatenate
from keras import backend as K

In [18]:
# get word dict
tokenize_texts = preprocess_texts(list(train_df['q1']) + list(train_df['q2']))
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(tokenize_texts)
word_index = tokenizer.word_index
print(len(word_index))

10278


In [19]:
word_index

{'花呗': 1,
 '我': 2,
 '的': 3,
 '，': 4,
 '了': 5,
 '吗': 6,
 '借呗': 7,
 '怎么': 8,
 '还': 9,
 '可以': 10,
 '还款': 11,
 '为什么': 12,
 '额度': 13,
 '蚂蚁借呗': 14,
 '是': 15,
 '分期': 16,
 '开通': 17,
 '用花呗': 18,
 '钱': 19,
 '用': 20,
 '没有': 21,
 '有': 22,
 '什么': 23,
 '不能': 24,
 '不': 25,
 '在': 26,
 '能': 27,
 '支付宝': 28,
 '付款': 29,
 '月': 30,
 '支付': 31,
 '蚂蚁花呗': 32,
 '多少': 33,
 '使用': 34,
 '显示': 35,
 '没': 36,
 '现在': 37,
 '提前': 38,
 '到': 39,
 '会': 40,
 '退款': 41,
 '银行卡': 42,
 '时候': 43,
 '后': 44,
 '不了': 45,
 '申请': 46,
 '关闭': 47,
 '要': 48,
 '这个': 49,
 '和': 50,
 '借': 51,
 '如何': 52,
 '已经': 53,
 '怎么办': 54,
 '么': 55,
 '收款': 56,
 '都': 57,
 '使': 58,
 '账单': 59,
 '买': 60,
 '淘宝': 61,
 '影响': 62,
 '想': 63,
 '上': 64,
 '账号': 65,
 '还清': 66,
 '就': 67,
 '还了': 68,
 '临时': 69,
 '还是': 70,
 '号': 71,
 '里': 72,
 '利息': 73,
 '自动': 74,
 '逾期': 75,
 '绑定': 76,
 '信用卡': 77,
 '支持': 78,
 '把': 79,
 '手续费': 80,
 '余额宝': 81,
 '但是': 82,
 '取消': 83,
 '给': 84,
 '用不了': 85,
 '扣款': 86,
 '商家': 87,
 '扣': 88,
 '一个': 89,
 '退': 90,
 '花呗逾期': 91,
 '东西': 92,
 '恢复': 93,
 '需要'

In [20]:
def get_embeddings(w2v):
    num_words = min(MAX_NUM_WORDS, len(w2v) + 1) if MAX_NUM_WORDS else len(w2v) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = w2v.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def get_embeddings_loop():
    num_words = min(MAX_NUM_WORDS, len(w2v) + 1) if MAX_NUM_WORDS else len(w2v) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = w2v.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def get_keras_embeddings_layers(w2v, maxlen):
    from keras.layers import Embedding
    embeddings = get_embeddings(w2v)
    x_embedded = Embedding(
        embeddings.shape[0]
        , embeddings.shape[1]
        , input_length=maxlen
        , trainable=False
        , weights=[embeddings]
    )
    return x_embedded

In [21]:
def exponent_neg_manhattan_distance(difference):
    """ Compute the exponent of the opposite of the L1 norm of a vector, to get the left/right inputs
    similarity from the inputs differences. This function is used to turned the unbounded
    L1 distance to a similarity measure between 0 and 1"""
    return K.exp(-K.sum(K.abs(difference), axis=1, keepdims=True))

In [77]:
def siamese_lstm(max_length, embedding_layer):
    """ Define, compile and return a siamese LSTM model """
    input_shape = (max_length,)
    left_input = Input(input_shape, name='left_input')
    right_input = Input(input_shape, name='right_input')

    # Define a single sequential model for both arms.
    # In this example I've chosen a simple bidirectional LSTM with no dropout
    seq = Sequential(name='sequential_network')
    seq.add(embedding_layer)
    seq.add(Bidirectional(LSTM(32, dropout=0., recurrent_dropout=0.)))
    
    left_output = seq(left_input)
    right_output = seq(right_input)

    # Here we subtract the neuron values of the last layer from the left arm 
    # with the corresponding values from the right arm
    subtracted = Subtract(name='pair_representations_difference')([left_output, right_output])
    
    # 1 This is exponent negative manhattan distance
    
    manhattan_lstm_distance = Lambda(exponent_neg_manhattan_distance, name='masltsm_distance')(subtracted)
    
    # 2 This is sigmoid 
    L1_layer = Lambda(lambda tensors: K.abs(tensors))
    L1_distance = L1_layer(subtracted)
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    
    # 3 Use bn and dense to make the distance.
    
    concated = concatenate([left_output, right_output])
    concated = BatchNormalization()(concated)
    concated = Dropout(0.3)(concated)
    concated = Dense(64, activation='relu')(concated)
    concated = BatchNormalization()(concated)
    concated = Dropout(0.3)(concated)
    preds = Dense(1, activation='sigmoid')(concated)

    
    # siamese_net = Model(inputs=[left_input, right_input], outputs=manhattan_lstm_distance)
    # siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)
    siamese_net = Model(inputs=[left_input, right_input], outputs=preds)
    siamese_net.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
    return siamese_net

In [78]:
embedding_layer = get_keras_embeddings_layers(tencent_w2v, maxlen=max_words)

In [79]:
one_epo_siamese_lstm = siamese_lstm(max_words, embedding_layer)

In [80]:
one_epo_siamese_lstm.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
left_input (InputLayer)         (None, 200)          0                                            
__________________________________________________________________________________________________
right_input (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
sequential_network (Sequential) (None, 64)           4059648     left_input[0][0]                 
                                                                 right_input[0][0]                
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 128)          0           sequential_network[1][0]   

In [81]:
%%time
one_epo_siamese_lstm.fit(x_pairs, y_pairs, validation_split=0.1, epochs=2)

Train on 55337 samples, validate on 6149 samples
Epoch 1/2
Epoch 2/2
CPU times: user 40min 59s, sys: 3min 23s, total: 44min 22s
Wall time: 18min 24s


<keras.callbacks.callbacks.History at 0x7f9f9e499dd8>

In [27]:
x_test_left = preprocess_pipeline.transform(test_df['q1'])
x_test_right = preprocess_pipeline.transform(test_df['q2'])
# this will be the input of the siamese network
x_test_pairs = [x_test_left, x_test_right]   

y_test_pairs = test_df['label'].values
print(np.shape(x_test_pairs))

(2, 30744, 200)


In [37]:
dry_run_x_test_pairs = [x_test_left[:3], x_test_right[:3]]

In [70]:
print(y_test_pairs[:3])
one_epo_siamese_lstm.predict(dry_run_x_test_pairs)

[1 0 0]


array([[0.23769563],
       [0.11171392],
       [0.13364395]], dtype=float32)

In [71]:
score = one_epo_siamese_lstm.evaluate(x_test_pairs, y_test_pairs, verbose=0)

In [72]:
print("loss, accuracy:\n".format(score))
[0.4546530999896615, 0.8186963200569153]

loss, accuracy:



[0.4546530999896615, 0.8186963200569153]

In [82]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [83]:
y_prop = one_epo_siamese_lstm.predict(x_test_pairs)
y_pred = np.where(y_prop > 0.5, 1, 0).flatten()

In [84]:
cm = confusion_matrix(y_test_pairs, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test_pairs, y_pred))
print('Accuracy', accuracy_score(y_test_pairs, y_pred))
print('ROC AUC SCORE', roc_auc_score(y_test_pairs, y_pred))
print(classification_report(y_test_pairs, y_pred))

[[25162     7]
 [ 5556    19]]

F1 score 0.006784502767362971
Accuracy 0.8190541243819932
ROC AUC SCORE 0.5015649759197333
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     25169
           1       0.73      0.00      0.01      5575

    accuracy                           0.82     30744
   macro avg       0.77      0.50      0.45     30744
weighted avg       0.80      0.82      0.74     30744



In [None]:
'''
manhattan_lstm_distance
[[24975   194]
 [ 5157   418]]

F1 score 0.1351220300630354
Accuracy 0.8259497788186313
ROC AUC SCORE 0.5336348419215253
              precision    recall  f1-score   support

           0       0.83      0.99      0.90     25169
           1       0.68      0.07      0.14      5575

    accuracy                           0.83     30744
   macro avg       0.76      0.53      0.52     30744
weighted avg       0.80      0.83      0.76     30744
'''

'''
bn and dense to make the distance.
[[25162     7]
 [ 5556    19]]

F1 score 0.006784502767362971
Accuracy 0.8190541243819932
ROC AUC SCORE 0.5015649759197333
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     25169
           1       0.73      0.00      0.01      5575

    accuracy                           0.82     30744
   macro avg       0.77      0.50      0.45     30744
weighted avg       0.80      0.82      0.74     30744


'''