In [1]:
from time import time
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
from gensim.models import KeyedVectors
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, Layer, Embedding, LSTM, Dense, Flatten, Activation, RepeatVector, Permute, Lambda, \
Bidirectional, TimeDistributed, Dropout, Conv1D, GlobalMaxPool1D
from keras.layers.merge import multiply, concatenate
import keras.backend as K
from util import make_w2v_embeddings, split_and_zero_padding

In [2]:
# Load the saved embedding matrix; shared_model() passes it to the Embedding
# layer as fixed (non-trainable) weights.
import numpy as np
embeddings = np.load('./embeddings.npy')
# Number of rows in the matrix (the Embedding layer's input dimension).
len(embeddings)

11715

In [3]:
# Hyperparameters. Only n_hidden, embedding_dim and max_seq_length are read by
# the code visible in this notebook; batch_size and n_epoch are not referenced
# here (presumably left over from training -- TODO confirm).
batch_size = 1024
n_epoch = 50
# Units per LSTM direction; the Bi-LSTM output width is 2 * n_hidden.
n_hidden = 50
# Width of each word vector (second dimension of the embedding matrix).
embedding_dim = 300
# Every question is padded/truncated to exactly this many token ids.
max_seq_length = 10

In [4]:
class ManDist(Layer):
    """Keras layer that scores how close two tensors are.

    Given inputs ``[a, b]`` it returns ``exp(-sum(|a - b|, axis=1))`` with the
    reduced axis kept (size 1), i.e. the exponentiated negative Manhattan (L1)
    distance. Identical inputs yield 1.0 and the value decays towards 0 as the
    inputs move apart.
    """

    def __init__(self, **kwargs):
        # Most recent output tensor produced by call(); None until call() runs.
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    def build(self, input_shape):
        # No weights are created by this layer; defer to the base class.
        super(ManDist, self).build(input_shape)

    def call(self, x, **kwargs):
        """Compute exp(-L1 distance) between ``x[0]`` and ``x[1]`` along axis 1."""
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    def compute_output_shape(self, input_shape):
        """Return the output shape for a pair of input shapes.

        Once call() has run, the shape is read from the cached result exactly as
        before. If this method is invoked first, the shape is derived from the
        first input shape instead of failing on ``self.result is None``: axis 1
        collapses to size 1 because the sum uses ``keepdims=True``.
        """
        if self.result is not None:
            return K.int_shape(self.result)
        first_shape = tuple(input_shape[0])
        return (first_shape[0], 1) + first_shape[2:]

In [5]:
def shared_model(_input):
    """Build one sentence-encoder tower on top of ``_input`` and return its output.

    Pipeline: frozen pretrained Embedding -> two stacked Bi-LSTMs -> dropout ->
    attention-weighted sum over timesteps -> dropout.

    NOTE(review): despite the name, each call instantiates fresh layers, so
    calling this twice yields two towers with independent weights rather than a
    weight-sharing Siamese encoder. The layer structure is left untouched here
    because weights saved for this architecture are loaded elsewhere in the
    notebook.

    Args:
        _input: Keras tensor of token ids with shape (batch, max_seq_length).

    Returns:
        Keras tensor of shape (batch, 2 * n_hidden) representing the sentence.
    """
    # Vocabulary size comes from the embedding matrix itself, so it cannot drift
    # out of sync with the file that was loaded (it was hard-coded as 11715).
    len_embeddings = len(embeddings)
    embedded = Embedding(len_embeddings, embedding_dim, weights=[embeddings],
                         input_shape=(max_seq_length,), trainable=False)(_input)

    # Bi-LSTM: both layers return the full sequence, (batch, seq, 2 * n_hidden).
    activations = Bidirectional(LSTM(n_hidden, return_sequences=True), merge_mode='concat')(embedded)
    activations = Bidirectional(LSTM(n_hidden, return_sequences=True), merge_mode='concat')(activations)

    # Dropout on the recurrent features.
    activations = Dropout(0.5)(activations)

    # Attention: one tanh score per timestep, softmax-normalised across the
    # sequence, then broadcast back to the feature width so it can scale the
    # activations element-wise.
    attention = TimeDistributed(Dense(1, activation='tanh'))(activations)
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(n_hidden * 2)(attention)
    attention = Permute([2, 1])(attention)
    sent_representation = multiply([activations, attention])
    # Collapse the time axis: attention-weighted sum of the timestep features.
    sent_representation = Lambda(lambda x_lambda: K.sum(x_lambda, axis=1))(sent_representation)

    # Light dropout on the pooled sentence vector.
    sent_representation = Dropout(0.1)(sent_representation)

    return sent_representation

In [6]:
def create_model():
    """Assemble the two-input question-pair similarity network.

    Each question goes through its own shared_model() tower. The two sentence
    vectors and their ManDist similarity are concatenated and passed through a
    stack of Dense layers (16 -> 4 -> 2, no activation) ending in a single
    sigmoid unit.

    Returns:
        An uncompiled keras ``Model`` taking ``[left, right]`` id sequences of
        length ``max_seq_length`` and producing one value per pair.
    """
    question_inputs = [Input(shape=(max_seq_length,), dtype='float32') for _ in range(2)]
    left_vector, right_vector = [shared_model(tensor) for tensor in question_inputs]

    distance_feature = ManDist()([left_vector, right_vector])
    merged = concatenate([left_vector, right_vector, distance_feature])

    # Construct the head output-first: this is the order in which the original
    # nested expression instantiated its layers, so auto-numbered layer names
    # are unaffected by spelling the stack out step by step.
    output_layer = Dense(1, activation='sigmoid')
    hidden_layers = [Dense(2), Dense(4), Dense(16)]

    hidden = merged
    for layer in reversed(hidden_layers):  # apply widest (16) first
        hidden = layer(hidden)
    similarity = output_layer(hidden)

    return Model(inputs=question_inputs, outputs=[similarity])

In [7]:
def load_model(weights_path='./data/SiameseLSTM.h5'):
    """Rebuild the architecture with create_model() and load saved weights into it.

    NOTE(review): this definition shadows ``load_model`` imported from
    ``keras.models`` in the notebook's import cell; after this cell runs, the
    name refers to this function.

    Args:
        weights_path: Path of the weights file to load. Defaults to the
            location this notebook has always used.

    Returns:
        The keras ``Model`` with the weights applied.
    """
    model = create_model()
    model.load_weights(weights_path)
    return model

In [8]:
# Build the network and load its saved weights (uses the load_model defined in
# this notebook, not the keras.models import of the same name).
model = load_model()

In [9]:
# Print the layer-by-layer summary to check the architecture that was built.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 300)      3514500     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 300)      3514500     input_2[0][0]                    
______________________________________________________________________________________________

In [10]:
def split_and_zero_padding(df, max_seq_length):
    """Split the id columns of ``df`` into left/right arrays of fixed length.

    Note: this definition replaces the ``split_and_zero_padding`` imported from
    ``util`` in the notebook's import cell.

    Args:
        df: Frame with 'question1_n' and 'question2_n' columns, each holding a
            sequence of token ids per row.
        max_seq_length: Length every sequence is padded or truncated to.

    Returns:
        dict with keys 'left' and 'right', each the ``pad_sequences`` result for
        the corresponding column (zeros added in front, surplus ids dropped from
        the end).
    """
    from keras.preprocessing.sequence import pad_sequences

    column_for_side = {'left': 'question1_n', 'right': 'question2_n'}
    return {
        side: pad_sequences(df[column], padding='pre', truncating='post', maxlen=max_seq_length)
        for side, column in column_for_side.items()
    }

In [11]:
def text_to_word_list(text):
    """Normalise a piece of text and split it into a list of tokens.

    The input is stringified and lower-cased, then an ordered list of regex
    substitutions expands contractions, spaces out operators and strips
    punctuation before the result is split on whitespace.

    The rules are order-sensitive and are applied exactly as listed. Two
    quirks are kept on purpose so tokenisation does not change:
      * in the first character class, '+-=' is a range, so ':', ';' and '<'
        also survive the initial filter;
      * the NUL-escape rule matches a NUL character followed by 's', not the
        two characters '0s'.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        list of str tokens (empty for blank input).
    """
    import re

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )

    normalised = str(text).lower()
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)

    return normalised.split()

In [12]:
def make_w2v_embeddings(word2vec, df, embedding_dim):
    """Replace question text with integer ids and build a matching embedding matrix.

    Every distinct token (as produced by ``text_to_word_list``) receives a
    1-based id in first-seen order; id 0 is reserved for padding. The id
    sequences are written into ``df`` in place, in the columns 'question1_n'
    and 'question2_n' (created here if they are missing).

    Args:
        word2vec: Lookup supporting ``word in word2vec`` and ``word2vec[word]``
            (e.g. gensim ``KeyedVectors``), returning a vector of length
            ``embedding_dim``.
        df: Frame with 'question1' and 'question2' text columns. Mutated in place.
        embedding_dim: Width of each embedding vector.

    Returns:
        (df, embeddings): the same frame, plus an array of shape
        ``(vocabulary size + 1, embedding_dim)``. Row 0 is all zeros; rows for
        words found in ``word2vec`` hold the pretrained vector; all other rows
        keep their random normal initialisation (unseeded).
    """
    vocabs = {}  # word -> integer id (1-based)

    # Writing a list into a cell via df.at needs an existing object column.
    for question in ['question1', 'question2']:
        id_column = question + '_n'
        if id_column not in df.columns:
            df[id_column] = df[question].astype(object)

    for index, row in df.iterrows():
        # Progress message every 1000 rows.
        if index != 0 and index % 1000 == 0:
            print(str(index) + " sentences embedded.")

        for question in ['question1', 'question2']:
            q2n = []
            for word in text_to_word_list(row[question]):
                if word not in vocabs:
                    vocabs[word] = len(vocabs) + 1
                q2n.append(vocabs[word])
            df.at[index, question + '_n'] = q2n

    embeddings = np.random.randn(len(vocabs) + 1, embedding_dim)
    embeddings[0] = 0  # padding id maps to the zero vector

    # Copy pretrained vectors for known words. The previous loop iterated the
    # dict's keys (words) but used them as row indices and used the ids as
    # words, so no pretrained vector could ever be copied correctly.
    for word, word_id in vocabs.items():
        if word in word2vec:
            embeddings[word_id] = word2vec[word]

    return df, embeddings

In [13]:
# One-row frame holding the question pair to score.
df_ = pd.DataFrame(
    {
        "question1": ["What are some special cares for someone with a nose that gets stuffy during the night?"],
        "question2": ["How can I keep my nose from getting stuffy at night?"],
    }
)
# Pre-create the '<question>_n' columns as copies of the text so that
# make_w2v_embeddings can later overwrite each cell with a list of token ids.
for q in list(df_.columns):
    df_[q + '_n'] = df_[q]
df_.head()

Unnamed: 0,question1,question2,question1_n,question2_n
0,What are some special cares for someone with a...,How can I keep my nose from getting stuffy at ...,What are some special cares for someone with a...,How can I keep my nose from getting stuffy at ...


In [14]:
# Convert the question pair to integer id sequences, then pad both sides.
# The second return value (a freshly generated embedding matrix) is bound to its
# own name instead of `embeddings`, so the pretrained matrix loaded from
# './embeddings.npy' -- which shared_model() reads -- is not overwritten and the
# cells above stay safe to re-run.
# NOTE(review): `word2vec` receives the embedding matrix itself rather than a
# word -> vector lookup, and the ids produced here are assigned per call in
# first-seen order; confirm they line up with the vocabulary the model was
# trained on, otherwise the predicted score is not meaningful.
train_df, input_embeddings = make_w2v_embeddings(word2vec=embeddings, df=df_, embedding_dim=embedding_dim)
split_df = split_and_zero_padding(train_df, max_seq_length)
print(split_df)

{'left': array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]], dtype=int32), 'right': array([[17, 18, 19, 20, 21, 10, 22, 23, 13, 24]], dtype=int32)}


  if word not in word2vec and word not in vocabs_not_w2v:


In [15]:
# Sanity check: both padded inputs must have identical shape before prediction.
assert split_df['left'].shape == split_df['right'].shape

In [16]:
def find_similar_sentence(user_input):
    """Run the loaded model on the question pair prepared in ``split_df``.

    NOTE(review): ``user_input`` is accepted but never used; the prediction is
    always made on the global ``split_df`` with the global ``model``.

    Args:
        user_input: Currently ignored.

    Returns:
        The array returned by ``model.predict`` for the left/right inputs.
    """
    padded_pair = [split_df['left'], split_df['right']]
    return model.predict(padded_pair)

In [17]:
# Score the prepared question pair. The "hi" argument has no effect because
# find_similar_sentence does not read its parameter.
score = find_similar_sentence("hi")

In [18]:
# Display the model output for the pair (one sigmoid value per input row).
score

array([[0.97782946]], dtype=float32)