In [27]:
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from bert_serving.client import BertClient
sess = K.get_session()

mport tensorflow as tf
import tensorflow_hub as hub
import os
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K

# Initialize session
sess = tf.Session()

# Params for bert model and tokenization
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
max_seq_length = 256

In [14]:
bc = BertClient()
#requires a Bert server to be run please check https://github.com/hanxiao/bert-as-service#what-is-it

In [15]:
# File paths
TRAIN_CSV = '/Users/ishitagupta/Desktop/train.csv'  
DEV_CSV = '/Users/ishitagupta/Desktop/test.csv'
MODEL_SAVING_DIR = '/Users/ishitagupta/maLSTM'

In [16]:
# Load training and test set
train_df = pd.read_csv(TRAIN_CSV)
dev_df = pd.read_csv(DEV_CSV)
questions_cols = ['question1', 'question2']

In [17]:
#defining the training and validation/dev set data
X_train = train_df[questions_cols]
Y_train = train_df['is_duplicate']

X_test = dev_df[questions_cols]
Y_test = dev_df['is_duplicate']

# Split into left and right for the 2 LSTM's 
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_test = {'left': X_test.question1, 'right': X_test.question2}

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_test = Y_test.values

# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [None]:
# Create datasets (Only take up to max_seq_length words for memory)
train_text_left = X_train['left'].tolist()
train_text_left = [' '.join(t.split()[0:max_seq_length]) for t in train_text_left]
train_text_left = np.array(train_text_left, dtype=object)[:, np.newaxis]

train_text_right = X_train['right'].tolist()
train_text_right = [' '.join(t.split()[0:max_seq_length]) for t in train_text_right]
train_text_right = np.array(train_text, dtype=object)[:, np.newaxis]

train_label = Y_train['is_duplicate'].tolist()

test_text_left = test_df['sentence'].tolist()
test_text_left = [' '.join(t.split()[0:max_seq_length]) for t in test_text]
test_text_left = np.array(test_text, dtype=object)[:, np.newaxis]
test_label_left = test_df['polarity'].tolist()

test_text_left = test_df['sentence'].tolist()
test_text_left = [' '.join(t.split()[0:max_seq_length]) for t in test_text]
test_text_left = np.array(test_text, dtype=object)[:, np.newaxis]
test_label_left = test_df['polarity'].tolist()



In [35]:
# Model variables for LSTM 
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 25

def BERTEmbedding(x): 
   #x is an input tensor
   # array = sess.run(x)
    encoded= bc.encode(x)
    #data_meta = np.array((len(encoded), 768))
    return encoded

def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# The visible layer QUERY: check the one, its there because the dimension is 1,512 so for us modification possible?
left_input_text = Input(shape=(1,), dtype="string")
right_input_text = Input(shape=(1,), dtype="string")

#Embedding layer for left and right, QUERY: change it from 512 to 768 for sure 
#encoded_left = Lambda(BERTEmbedding, output_shape=(768, ))(left_input_text)
#encoded_right = Lambda(BERTEmbedding, output_shape=(768, ))(right_input_text)
encoded_left = BERTEmbedding(left_input_text)
encoded_right = BERTEmbedding(right_input_text)
# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training
training_start_time = time()
malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

TypeError: "Tensor("input_20:0", shape=(?, 1), dtype=string)" must be <class 'list'>, but received <class 'tensorflow.python.framework.ops.Tensor'>

In [None]:
# Plot accuracy
print('Final training accuracy '+ str(malstm_trained.history.get('acc')[-1]))
print('Final dev/test accuracy '+ str(malstm_trained.history.get('val_acc')[-1]))
plt.plot(malstm_trained.history['acc'])
plt.plot(malstm_trained.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot loss
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()