# Read in data

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
train_file = "./data/train.txt"
test_file = "./data/test.txt"

In [3]:
# Read in data file
train_df = pd.read_csv(train_file, sep="\t", header=None, names=["q1", "q2", "label"])
test_df = pd.read_csv(test_file, sep="\t", header=None, names=["q1", "q2", "label"])

In [4]:
# Read in stopwords from web EDA
with open("./data/stop_words.txt","r",encoding="utf-8") as f:
    stop_words_list = [line.strip() for line in f]
    
# Read in spelling correction from web EDA
with open("./data/spelling_corrections.json", "r", encoding="utf-8") as f:
    spell_chk = json.loads(f.read())
    
import jieba
jieba.load_userdict("./data/dict_all.txt")

def preprocessing_n_seq(text):
    for token_str,replac_str in spell_chk.items():
        text = text.replace(token_str, replac_str)
        
    tokens = [t for t in jieba.cut(text.strip()) if t not in stop_words_list]
    return tokens

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.728 seconds.
Prefix dict has been built succesfully.


In [5]:
%%time
train_df['q1_tokens'] = train_df['q1'].apply(lambda x: preprocessing_n_seq(x))
train_df['q2_tokens'] = train_df['q2'].apply(lambda x: preprocessing_n_seq(x))
test_df['q1_tokens'] = test_df['q1'].apply(lambda x: preprocessing_n_seq(x))
test_df['q2_tokens'] = test_df['q2'].apply(lambda x: preprocessing_n_seq(x))


CPU times: user 13.9 s, sys: 54.8 ms, total: 14 s
Wall time: 14 s


In [6]:
# Load in tencent pretrained word vector
tencent_pretrained_vec_loc = "/data/Tencent_AILab_ChineseEmbedding.txt"
def process_raw_fileline(line, number_dim):
    line = line.rstrip().decode('utf8')
    pieces = line.rsplit(' ', int(number_dim))
    word = pieces[0]
    vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
    return word, vector

def load_tencent_pretrained_vectors():
    from multiprocessing import Pool  # For CPU
    from multiprocessing.dummy import Pool as ThreadPool  # For IO
    from functools import partial

    with open(tencent_pretrained_vec_loc, "rb") as f:
        header = f.readline()
        number_row, number_dim = header.split()

        pool = Pool(8)
        # pool = ThreadPool(8)
        process_raw_fileline_100dim = partial(process_raw_fileline, number_dim=number_dim)
        wv_set_list = pool.map(process_raw_fileline_100dim, f)
        pool.close()
        pool.join()
        # print("pool done.")
        w2v = dict(wv_set_list)
    return w2v

In [8]:
%%time
tencent_w2v = load_tencent_pretrained_vectors()

CPU times: user 43.5 s, sys: 35.5 s, total: 1min 19s
Wall time: 6min 27s


In [10]:
# tencent_w2v["我"]

In [11]:
# Try to handle the imbalance data
# Upsample data with label == 1

from sklearn.utils import resample

ori_train_df = train_df.copy()
ready_to_upsampled_df = train_df.copy()

df_majority = ready_to_upsampled_df[ready_to_upsampled_df.label==0]
df_minority = ready_to_upsampled_df[ready_to_upsampled_df.label==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
upsampled_df = pd.concat([df_majority, df_minority_upsampled])

print(upsampled_df.label.value_counts())

train_df = upsampled_df

1    50220
0    50220
Name: label, dtype: int64


# Define the pre-processing pipeline

In [12]:
import keras
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin
import jieba

jieba.load_userdict("./data/dict_all.txt")

Using TensorFlow backend.


In [13]:
EMBEDDING_DIM = np.shape(next(iter(tencent_w2v.values())))[0]
MAX_NUM_WORDS = 20000
print(EMBEDDING_DIM, MAX_NUM_WORDS, len(tencent_w2v))

200 20000 8824330


In [14]:
def preprocess_texts(raw_texts):
    import re
    from bs4 import BeautifulSoup as bs
    def bs_unescape_html(text):
        return bs(text, "lxml").get_text()
    def preprocess_text(raw_text):
        # text = re.sub(r'\(.*\)', '', raw_text)
        text = " ".join(jieba.cut(raw_text))
        return bs_unescape_html(text.lower().strip())
    
    return [preprocess_text(t) for t in raw_texts]


class Text2SeqTransformer(Tokenizer, BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        """
        params: num_words: to be a vocab_size of returning tokenzier and dictionary.
        """
        super().__init__(**kwargs)
        
    def fit(self, texts, y=None):
        """
        params: texts: list of strings
        """
        self.fit_on_texts(texts)
        return self

    def transform(self, texts, y=None):
        return np.array(self.texts_to_sequences(texts))
    
# Implement a padder transformer base on keras pad_sequences
class PaddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, maxlen=200):
        self.maxlen = maxlen
        self.max_index = None
    def fit(self, X, y=None):
        self.max_index = pad_sequences(X, maxlen=self.maxlen).max()
        return self
    
    def transform(self, X, y=None):
        X = pad_sequences(X, maxlen=self.maxlen)
        X[X > self.max_index] = 0
        return X

In [15]:
# build preprocess pipeline
import re
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

max_words = 200
vocab_size = len(tencent_w2v)

preprocess_pipeline = make_pipeline(
    FunctionTransformer(preprocess_texts, validate=False)
    , Text2SeqTransformer(num_words=MAX_NUM_WORDS)
    , PaddingTransformer(maxlen=max_words)
)


In [16]:
%%time
preprocess_pipeline.fit(list(train_df['q1']) + list(train_df['q2']))

CPU times: user 55.8 s, sys: 414 ms, total: 56.3 s
Wall time: 59.4 s


Pipeline(memory=None,
         steps=[('functiontransformer',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function preprocess_texts at 0x7fb62e13e268>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, pass_y='deprecated',
                                     validate=False)),
                ('text2seqtransformer', Text2SeqTransformer()),
                ('paddingtransformer', PaddingTransformer(maxlen=200))],
         verbose=False)

In [17]:
x_left = preprocess_pipeline.transform(train_df['q1'])
x_right = preprocess_pipeline.transform(train_df['q2'])
# this will be the input of the siamese network
x_pairs = [x_left, x_right]   

y_pairs = train_df['label'].values
print(np.shape(x_pairs))

(2, 100440, 200)


# build the network

In [18]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Lambda, Subtract, Conv1D, Flatten, Embedding, LSTM, GRU, Bidirectional
from keras.layers import BatchNormalization, concatenate, Concatenate, Multiply, AveragePooling1D, MaxPooling1D
from keras import backend as K

In [19]:
# get word dict
tokenize_texts = preprocess_texts(list(train_df['q1']) + list(train_df['q2']))
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(tokenize_texts)
word_index = tokenizer.word_index
print(len(word_index))

10269


In [20]:
# word_index

In [20]:
def get_embeddings(w2v):
    num_words = min(MAX_NUM_WORDS, len(w2v) + 1) if MAX_NUM_WORDS else len(w2v) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = w2v.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def get_embeddings_loop():
    num_words = min(MAX_NUM_WORDS, len(w2v) + 1) if MAX_NUM_WORDS else len(w2v) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = w2v.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def get_keras_embeddings_layers(w2v, maxlen):
    from keras.layers import Embedding
    embeddings = get_embeddings(w2v)
    x_embedded = Embedding(
        embeddings.shape[0]
        , embeddings.shape[1]
        , input_length=maxlen
        , trainable=False
        , weights=[embeddings]
    )
    return x_embedded

In [21]:
def exponent_neg_manhattan_distance(difference):
    """ Compute the exponent of the opposite of the L1 norm of a vector, to get the left/right inputs
    similarity from the inputs differences. This function is used to turned the unbounded
    L1 distance to a similarity measure between 0 and 1"""
    return K.exp(-K.sum(K.abs(difference), axis=1, keepdims=True))

def cosine_distance(inputs):
    return K.sum(inputs[0] * inputs[1],axis=1,keepdims=True)/(K.sum(inputs[0]**2,axis=1,keepdims=True) * K.sum(inputs[1]**2,axis=1,keepdims=True))

In [23]:
def siamese_lstm(max_length, embedding_layer):
    """ Define, compile and return a siamese LSTM model """
    input_shape = (max_length,)
    left_input = Input(input_shape, name='left_input')
    right_input = Input(input_shape, name='right_input')

    # Define a single sequential model for both arms.
    # In this example I've chosen a simple bidirectional LSTM with no dropout
    seq = Sequential(name='sequential_network')
    seq.add(embedding_layer)
    seq.add(Bidirectional(LSTM(32, dropout=0., recurrent_dropout=0.)))
    
    left_output = seq(left_input)
    right_output = seq(right_input)

    # Here we subtract the neuron values of the last layer from the left arm 
    # with the corresponding values from the right arm
    subtracted = Subtract(name='pair_representations_difference')([left_output, right_output])
    
    # 1 This is exponent negative manhattan distance
    
    manhattan_lstm_distance = Lambda(exponent_neg_manhattan_distance, name='masltsm_distance')(subtracted)
    
    # 2 This is sigmoid 
    L1_layer = Lambda(lambda tensors: K.abs(tensors))
    L1_distance = L1_layer(subtracted)
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    
    # 3 Use bn and dense to make the distance.
    
    concated = concatenate([left_output, right_output])
    concated = BatchNormalization()(concated)
    concated = Dropout(0.3)(concated)
    concated = Dense(64, activation='relu')(concated)
    concated = BatchNormalization()(concated)
    concated = Dropout(0.3)(concated)
    preds = Dense(1, activation='sigmoid')(concated)

    
    siamese_net = Model(inputs=[left_input, right_input], outputs=manhattan_lstm_distance)
    # siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)
    # siamese_net = Model(inputs=[left_input, right_input], outputs=preds)
    siamese_net.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
    return siamese_net

In [24]:
embedding_layer = get_keras_embeddings_layers(tencent_w2v, maxlen=max_words)

In [25]:
one_epo_siamese_lstm = siamese_lstm(max_words, embedding_layer)

In [26]:
one_epo_siamese_lstm.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
left_input (InputLayer)         (None, 200)          0                                            
__________________________________________________________________________________________________
right_input (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
sequential_network (Sequential) (None, 64)           4059648     left_input[0][0]                 
                                                                 right_input[0][0]                
__________________________________________________________________________________________________
pair_representations_difference (None, 64)           0           sequential_network[1][0]   

In [27]:
%%time
one_epo_siamese_lstm.fit(x_pairs, y_pairs, validation_split=0.1, epochs=2)

Train on 90396 samples, validate on 10044 samples
Epoch 1/2
Epoch 2/2
CPU times: user 1h 8min 38s, sys: 5min 31s, total: 1h 14min 10s
Wall time: 30min 7s


<keras.callbacks.callbacks.History at 0x7f2afe228d68>

In [29]:
x_test_left = preprocess_pipeline.transform(test_df['q1'])
x_test_right = preprocess_pipeline.transform(test_df['q2'])
# this will be the input of the siamese network
x_test_pairs = [x_test_left, x_test_right]   

y_test_pairs = test_df['label'].values
print(np.shape(x_test_pairs))

(2, 30744, 200)


In [29]:
dry_run_x_test_pairs = [x_test_left[:3], x_test_right[:3]]

In [30]:
print(y_test_pairs[:3])
one_epo_siamese_lstm.predict(dry_run_x_test_pairs)

[1 0 0]


array([[0.47568128],
       [0.24673058],
       [0.14412421]], dtype=float32)

In [31]:
score = one_epo_siamese_lstm.evaluate(x_test_pairs, y_test_pairs, verbose=0)

In [32]:
print("loss, accuracy:\n".format(score))
[0.4546530999896615, 0.8186963200569153]

loss, accuracy:



[0.4546530999896615, 0.8186963200569153]

In [32]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [34]:
y_prop = one_epo_siamese_lstm.predict(x_test_pairs)
y_pred = np.where(y_prop > 0.5, 1, 0).flatten()

In [35]:
cm = confusion_matrix(y_test_pairs, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test_pairs, y_pred))
print('Accuracy', accuracy_score(y_test_pairs, y_pred))
print('ROC AUC SCORE', roc_auc_score(y_test_pairs, y_pred))
print(classification_report(y_test_pairs, y_pred))

[[22562  2607]
 [ 3144  2431]]

F1 score 0.4581174031847734
Accuracy 0.8129391100702577
ROC AUC SCORE 0.6662370055554496
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     25169
           1       0.48      0.44      0.46      5575

    accuracy                           0.81     30744
   macro avg       0.68      0.67      0.67     30744
weighted avg       0.81      0.81      0.81     30744



# Results

In [None]:
'''
manhattan_lstm_distance (imbalanced)
[[24975   194]
 [ 5157   418]]

F1 score 0.1351220300630354
Accuracy 0.8259497788186313
ROC AUC SCORE 0.5336348419215253
              precision    recall  f1-score   support

           0       0.83      0.99      0.90     25169
           1       0.68      0.07      0.14      5575

    accuracy                           0.83     30744
   macro avg       0.76      0.53      0.52     30744
weighted avg       0.80      0.83      0.76     30744
'''

'''
manhattan_lstm_distance (upsampled)

[[22562  2607]
 [ 3144  2431]]

F1 score 0.4581174031847734
Accuracy 0.8129391100702577
ROC AUC SCORE 0.6662370055554496
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     25169
           1       0.48      0.44      0.46      5575

    accuracy                           0.81     30744
   macro avg       0.68      0.67      0.67     30744
weighted avg       0.81      0.81      0.81     30744

'''

'''
bn and dense to make the distance.
[[25162     7]
 [ 5556    19]]

F1 score 0.006784502767362971
Accuracy 0.8190541243819932
ROC AUC SCORE 0.5015649759197333
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     25169
           1       0.73      0.00      0.01      5575

    accuracy                           0.82     30744
   macro avg       0.77      0.50      0.45     30744
weighted avg       0.80      0.82      0.74     30744


'''

In [22]:
def siamese_gru_nn(max_length, embedding_layer):
    
    input_shape = (max_length,)
    left_input = Input(input_shape, name='left_input')
    right_input = Input(input_shape, name='right_input')

    # Define a single sequential model for both arms.
    seq = Sequential(name='sequential_network')
    seq.add(embedding_layer)
    seq.add(Conv1D(16, kernel_size=3, activation='relu'))
    seq.add(AveragePooling1D(2))
    seq.add(MaxPooling1D(2))
    seq.add(Bidirectional(GRU(32, dropout=0., recurrent_dropout=0.)))
    
    left_output = seq(left_input)
    right_output = seq(right_input)

    # Here we subtract the neuron values of the last layer from the left arm 
    # with the corresponding values from the right arm
    subtracted = Subtract(name='pair_representations_difference')([left_output, right_output])
    
    
    manhattan_distance = Lambda(exponent_neg_manhattan_distance, name='manh_distance')(subtracted)
    cos_dist = Lambda(cosine_distance, name='cosine_distance')([left_output, right_output])
    
    L1_layer = Lambda(lambda tensors: K.abs(tensors))
    L1_distance = L1_layer(subtracted)
    
    l_r_mul = Multiply(name="pair_rep_multiply")([left_output, right_output])
    
    
    concated = Concatenate(axis=1)([L1_distance, l_r_mul, manhattan_distance, cos_dist])
    concated = Dropout(0.05)(concated)
    concated = BatchNormalization()(concated)
    concated = Dropout(0.05)(concated)
    concated = Dense(64, activation='relu')(concated)
    concated = BatchNormalization()(concated)
    concated = Dropout(0.05)(concated)
    preds = Dense(1, activation='sigmoid')(concated)
    
    _net = Model(inputs=[left_input, right_input], outputs=preds)
    _net.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
    return _net

In [25]:
simple_nn = siamese_gru_nn(max_words, embedding_layer)

In [26]:
simple_nn.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
left_input (InputLayer)         (None, 200)          0                                            
__________________________________________________________________________________________________
right_input (InputLayer)        (None, 200)          0                                            
__________________________________________________________________________________________________
sequential_network (Sequential) (None, 64)           4019024     left_input[0][0]                 
                                                                 right_input[0][0]                
__________________________________________________________________________________________________
pair_representations_difference (None, 64)           0           sequential_network[1][0]   

In [34]:
%%time
simple_nn.fit(x_pairs, y_pairs, validation_split=0.1, epochs=3)

Train on 90396 samples, validate on 10044 samples
Epoch 1/1
CPU times: user 10min 27s, sys: 48.4 s, total: 11min 15s
Wall time: 4min 46s


<keras.callbacks.callbacks.History at 0x7fb78b1e2828>

In [35]:
y_prop = simple_nn.predict(x_test_pairs)
y_pred = np.where(y_prop > 0.5, 1, 0).flatten()

In [36]:
cm = confusion_matrix(y_test_pairs, y_pred)  
print(cm)  
print()
print("F1 score", f1_score(y_test_pairs, y_pred))
print('Accuracy', accuracy_score(y_test_pairs, y_pred))
print('ROC AUC SCORE', roc_auc_score(y_test_pairs, y_pred))
print(classification_report(y_test_pairs, y_pred))

[[18389  6780]
 [ 2895  2680]]

F1 score 0.35650149650814766
Accuracy 0.685304449648712
ROC AUC SCORE 0.6056692454077699
              precision    recall  f1-score   support

           0       0.86      0.73      0.79     25169
           1       0.28      0.48      0.36      5575

    accuracy                           0.69     30744
   macro avg       0.57      0.61      0.57     30744
weighted avg       0.76      0.69      0.71     30744



In [None]:
"""
simple nn with concate L1 distance, cosine distance, exp neg manhattan distance, and multiply and then MLP.
2 epochs.

Confusion matrix:
[[14502 10667]
 [ 2013  3562]]

F1 score 0.3597253080185821
Accuracy 0.5875618006765547
ROC AUC SCORE 0.6075543781436592
              precision    recall  f1-score   support

           0       0.88      0.58      0.70     25169
           1       0.25      0.64      0.36      5575

    accuracy                           0.59     30744
   macro avg       0.56      0.61      0.53     30744
weighted avg       0.76      0.59      0.63     30744
"""

"""

"""



"""
simple nn with concate L1 distance, cosine distance, exp neg manhattan distance, and multiply and then MLP.
6 epochs.

[[17591  7578]
 [ 2686  2889]]

F1 score 0.36017952873706516
Accuracy 0.6661462399167317
ROC AUC SCORE 0.6085608051900987
              precision    recall  f1-score   support

           0       0.87      0.70      0.77     25169
           1       0.28      0.52      0.36      5575

    accuracy                           0.67     30744
   macro avg       0.57      0.61      0.57     30744
weighted avg       0.76      0.67      0.70     30744

"""