## Siamese Network

Use Quora Question Pairs competition data and pretrained word embeddings (GloVe), build a simple siamese network.

Siamese network is a type of neural network that uses same weight for two different inputs to get a comparison. Here is the example of using siamese network on supervised text similarity task.

In [1]:
import os
import string
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from abc import ABC, abstractmethod

from keras.preprocessing import text, sequence
from keras.layers import Input, Embedding, LSTM, CuDNNLSTM, Dense, Layer, Lambda
from keras.layers import concatenate, subtract, add, maximum, multiply
from keras.models import Model
from keras import optimizers
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras import backend as K
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine

warnings.filterwarnings('ignore')
%matplotlib inline

Using TensorFlow backend.


In [2]:
def process_text_to_sequence(X_train, X_test, **kwargs):
    """
    Process text data (array) to equal length sequences use keras
    :param X_train : np.array with shape (m, )
    :param X_test  : np.array with shape (n, )
    :param kwargs  : other parameters needed
    :return:
    """
    max_features = kwargs.get('max_features', 10000)
    max_len = kwargs.get('max_len', 50)

    tokenizer = text.Tokenizer(num_words=max_features, lower=True, split=' ',
                               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                               char_level=False)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))

    # process text to sequence
    X_train_sequence = tokenizer.texts_to_sequences(X_train)
    X_test_sequence = tokenizer.texts_to_sequences(X_test)

    # truncate / padding
    X_train_sequence_pad = sequence.pad_sequences(X_train_sequence, maxlen=max_len)
    X_test_sequence_pad = sequence.pad_sequences(X_test_sequence, maxlen=max_len)

    return dict(X_train_sequence=X_train_sequence,
                X_test_sequence=X_test_sequence,
                X_train_sequence_pad=X_train_sequence_pad,
                X_test_sequence_pad=X_test_sequence_pad,
                tokenizer=tokenizer)

def load_pretrained_word_embeddings(embedding_path, tokenizer, **kwargs):
    """
    Load pretrained word embeddings
    :param embedding_path : str, example: './embeddings/glove.840B.300d/glove.840B.300d.txt'
    :param tokenizer      : keras tokenizer, return from process_text_to_sequence
    :param kwargs         : other parameters needed
    :return:
    """

    embedding_size = kwargs.get('embedding_size', 300)
    max_features = kwargs.get('max_features', 10000)
    
    # word_pretrained_index: key = word, value = index in pretrained embeddings
    word_pretrained_index = {}
    
    def _get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = dict(
        _get_coefs(*o.strip().rsplit(' ')) for o in open(embedding_path))
    embeddings_vocab = list(embeddings_index.keys())
    word_index = tokenizer.word_index
    num_words = min(max_features, len(word_index))
    embeddings_matrix = np.zeros((num_words, embedding_size))
    for word, i in word_index.items():
        i -= 1
        if i >= max_features:
            continue
        embeddings_vector = embeddings_index.get(word)
        # oov or not
        if embeddings_vector is not None:
            embeddings_matrix[i] = embeddings_vector
            word_pretrained_index[word] = embeddings_vocab.index(word)
        
        # make 0 as start
        word_index[word] -= 1
        
    return dict(word_pretrained_index=word_pretrained_index,
                word_index=word_index,
                embeddings_matrix=embeddings_matrix)

In [3]:
data_path = '../input/quora-question-pairs/'
word_embedding_file = '../input/glove840b300dtxt/glove.840B.300d.txt'

train = pd.read_csv(os.path.join(data_path, 'train.csv.zip'))
test = pd.read_csv(os.path.join(data_path, 'test.csv.zip'))

X_train_q1 = train['question1'].fillna('').values
X_train_q2 = train['question2'].fillna('').values
X_test_q1 = test['question1'].fillna('').values
X_test_q2 = test['question2'].fillna('').values
y_train = train['is_duplicate'].values
print(X_train_q1.shape, X_train_q2.shape, X_test_q1.shape, X_test_q2.shape)

(404290,) (404290,) (3563475,) (3563475,)


In [4]:
train_size = train.shape[0]
X_train_full = np.hstack([X_train_q1, X_train_q2])
X_test_full = np.hstack([X_test_q1, X_test_q2])

output = process_text_to_sequence(X_train_full, X_test_full, max_features=None, max_len=40)

X_train_q1_, X_valid_q1_, X_train_q2_, X_valid_q2_, y_train_, y_valid_ = train_test_split(
    output['X_train_sequence_pad'][:train_size],
    output['X_train_sequence_pad'][train_size:],
    y_train, train_size=0.8, random_state=2020)

X_train_ = [X_train_q1_, X_train_q2_]
X_valid_ = [X_valid_q1_, X_valid_q2_]

print(X_train_q1_.shape, X_valid_q1_.shape, X_train_q2_.shape, X_valid_q2_.shape)

(323432, 40) (80858, 40) (323432, 40) (80858, 40)


In [5]:
output_embeddings = load_pretrained_word_embeddings(word_embedding_file,
                                                    output['tokenizer'],
                                                    max_features=200000)

In [6]:
def make_model(model_params):
    """ Build siamese model """
    input_size = model_params['input_size']
    output_size = model_params['output_size']
    max_features = model_params['max_features']
    embeddings_size = model_params['embeddings_size']
    embeddings_matrix = model_params['embeddings_matrix']
    loss = model_params['loss']
    optimizer = model_params['optimizer']
    metrics = model_params['metrics']
    
    input_q1 = Input(shape=input_size)
    input_q2 = Input(shape=input_size)    
    x_1 = input_q1
    x_2 = input_q2
    
    embedding_layer = Embedding(input_dim=max_features,
                    output_dim=embeddings_size,
                    weights=[embeddings_matrix],
                    trainable=False)
    x_1 = embedding_layer(x_1)
    x_2 = embedding_layer(x_2)
    
    lstm_layer = LSTM(64, return_sequences=False)
    x_1 = lstm_layer(x_1)
    x_2 = lstm_layer(x_2)
    
    x = subtract([x_1, x_2])
    x = Dense(units=64, activation='relu')(x)
    output = Dense(units=output_size, activation='sigmoid')(x)
    
    model = Model(inputs=[input_q1, input_q2], outputs=output)
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    
    return model

In [7]:
class BaseNN(ABC):
    @abstractmethod
    def initialize(self):
        pass
    
    @abstractmethod
    def fit(self):
        pass
    
    @abstractmethod
    def predict(self):
        pass
    
    @abstractmethod
    def save(self):
        pass
    
class SiameseNetwork(BaseNN):
    def __init__(self, model_params):
        self.model_params = model_params
    
    def initialize(self):
        """ Initialize model """
        model = make_model(self.model_params)
        print('Model Summary:')
        print(model.summary())
        self._model = model
    
    def fit(self, X_train, y_train, X_val, y_val):
        assert self._model is not None
        self.train_params = self.model_params['train_params']
        self.patience = self.train_params['patience']
        self.nb_epochs = self.train_params['nb_epochs']
        self.batch_size = self.train_params['batch_size']
        self.filepath = self.model_params['filepath']
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=self.patience,
                                      mode='min')
        checkpointer = ModelCheckpoint(filepath=self.filepath, save_best_only=True)
        history = self._model.fit(X_train, y_train,
                                  epochs=self.nb_epochs,
                                  batch_size=self.batch_size,
                                  validation_data=(X_val, y_val),
                                  callbacks=[earlystopping, checkpointer]).history
        return self._model
    
    def predict(self, X):
        try:
            if self.saved_model_destination is None:
                return self._model.predict(X)
            else:
                loaded_model = load_model(self.saved_model_destination)
                return loaded_model.predict(X)
        except AttributeError:
            raise AttributeError("Model not saved, try .save() first.")
            
    def save(self, saved_model_destination):
        assert self._model is not None
        self.saved_model_destination = saved_model_destination
        self._model.save(self.saved_model_destination)

In [8]:
model_params = {
    'input_size'  : (40, ), # = max_len
    'output_size' : 1,
    'train_params': {'batch_size': 512,
                     'patience'  : 2,
                     'nb_epochs' : 10},
    'max_features': output_embeddings['embeddings_matrix'].shape[0],
    'embeddings_size': 300,
    'embeddings_matrix': output_embeddings['embeddings_matrix'],
    'loss'     : 'binary_crossentropy',
    'optimizer': optimizers.Adam(),
    'metrics'  : ['accuracy'],
    'filepath' : './model.h5'
}

In [9]:
model = SiameseNetwork(model_params)
model.initialize()
_ = model.fit(X_train_, y_train_, X_valid_, y_valid_)
model.save('./siamese_network.h5')

Model Summary:
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40, 300)      41112600    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 64)           93440       embedding_1[

### Another Way to Calculate Similarity

We can also get the encoding output from LSTM layer

In [10]:
siamese_network = model._model
get_lstm_output = K.function([siamese_network.layers[0].input,
                              siamese_network.layers[1].input],
                             [siamese_network.layers[3].get_output_at(0),
                              siamese_network.layers[3].get_output_at(1)])

def calculate_lstm_encoding_similarity(idx):
    sentence_1 = X_train_q1_[idx].reshape(-1, 1)
    sentence_2 = X_train_q2_[idx].reshape(-1, 1)
    lstm_output = get_lstm_output([sentence_1, sentence_2])
    
    sentence_1_encoding = lstm_output[0][:, -1]
    sentence_2_encoding = lstm_output[1][:, -1]
    return 1 - cosine(sentence_1_encoding, sentence_2_encoding)

In [11]:
score = []
for idx in range(20):
    score = calculate_lstm_encoding_similarity(idx)
    label = train['is_duplicate'].iloc[idx]
    print(f'True label: {label} - LSTM Encoding Similarity = {score:.4f}')

True label: 0 - LSTM Encoding Similarity = 0.8509
True label: 0 - LSTM Encoding Similarity = 0.7504
True label: 0 - LSTM Encoding Similarity = 0.4040
True label: 0 - LSTM Encoding Similarity = 0.9986
True label: 0 - LSTM Encoding Similarity = 0.6755
True label: 1 - LSTM Encoding Similarity = 0.4051
True label: 0 - LSTM Encoding Similarity = 0.9503
True label: 1 - LSTM Encoding Similarity = 0.5221
True label: 0 - LSTM Encoding Similarity = 0.3798
True label: 0 - LSTM Encoding Similarity = 0.9722
True label: 0 - LSTM Encoding Similarity = 0.5528
True label: 1 - LSTM Encoding Similarity = 0.9219
True label: 1 - LSTM Encoding Similarity = 0.9948
True label: 1 - LSTM Encoding Similarity = 0.9901
True label: 0 - LSTM Encoding Similarity = 0.1513
True label: 1 - LSTM Encoding Similarity = 0.3399
True label: 1 - LSTM Encoding Similarity = 0.8977
True label: 0 - LSTM Encoding Similarity = 0.9150
True label: 1 - LSTM Encoding Similarity = 0.9412
True label: 0 - LSTM Encoding Similarity = 0.5148
