In [50]:
# Attach Google Drive to the Colab filesystem; the notebook reads its arrays
# and writes its checkpoints under this mount point.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import time
import datetime
import pandas as pd
import numpy as np

from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Bidirectional, Lambda, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers import Concatenate, Add, Subtract, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

import keras.backend as kb
import tensorflow as tf

In [63]:
# Load the training / validation question arrays and their labels from Drive.
# `allow_pickle` takes a boolean: the previous string 'True' only worked because
# any non-empty string is truthy (the string 'False' would have enabled it too).
# NOTE(review): unpickling can execute arbitrary code, so only load files you
# created yourself; if these arrays are plain numeric, allow_pickle can be dropped.
q1_train_final = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/q1_train_final.npy", allow_pickle=True)
q2_train_final = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/q2_train_final.npy", allow_pickle=True)
q1_valid_final = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/q1_valid_final.npy", allow_pickle=True)
q2_valid_final = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/q2_valid_final.npy", allow_pickle=True)
y_train_final = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/y_train_final.npy", allow_pickle=True)
y_valid_final = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/y_valid_final.npy", allow_pickle=True)

In [62]:
# Load the held-out test questions and labels from Drive.
# `allow_pickle` takes a boolean: the previous string 'True' only worked because
# any non-empty string is truthy (the string 'False' would have enabled it too).
# NOTE(review): unpickling can execute arbitrary code, so only load files you
# created yourself; if these arrays are plain numeric, allow_pickle can be dropped.
q1_test_padded = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/q1_test_padded.npy", allow_pickle=True)
q2_test_padded = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/q2_test_padded.npy", allow_pickle=True)
y_test = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/y_test.npy", allow_pickle=True)

In [64]:
# Load the embedding matrix that initialises the (frozen) Embedding layer.
# `allow_pickle` takes a boolean: the previous string 'True' only worked because
# any non-empty string is truthy (the string 'False' would have enabled it too).
# NOTE(review): unpickling can execute arbitrary code, so only load files you
# created yourself; if this array is plain numeric, allow_pickle can be dropped.
embedding_matrix = np.load("/content/drive/My Drive/Text Mining Project/Train_Valid_Split/embedding_matrix.npy", allow_pickle=True)

In [65]:
# Training hyper-parameters passed to model.fit: samples per gradient step and
# the maximum number of epochs (EarlyStopping may end training sooner).
batch_size, n_epochs = 64, 25

In [66]:
def manhattan_dist(q1_lstm_feature, q2_lstm_feature):
  """Return the Manhattan (L1) distance between two feature tensors.

  The element-wise difference is produced by a Keras ``Subtract`` layer, its
  absolute value is taken, and the result is summed over axis 1 with
  ``keepdims=True`` so the summed axis is retained with length 1.
  """
  elementwise_gap = kb.abs(Subtract()([q1_lstm_feature, q2_lstm_feature]))
  return kb.sum(elementwise_gap, axis=1, keepdims=True)

In [96]:
# Length of each question's token-id sequence; it must agree with the second
# dimension of the q1/q2 arrays fed to the model (presumably the padded length
# used when those arrays were built -- confirm against the preprocessing notebook).
MAX_SEQ_LEN = 60

# Take the vocabulary size and vector width from the embedding matrix itself so
# the Embedding layer can never disagree with the weights it is initialised from.
vocab_size, embedding_dim = embedding_matrix.shape

# Input layer: one integer token-id sequence per question.
q1_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')
q2_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')

# Embedding layer, initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=MAX_SEQ_LEN, trainable=False)

# The same Embedding layer instance is shared by question1 and question2.
q1_embed = embedding_layer(q1_input)
q2_embed = embedding_layer(q2_input)

# LSTM layer; with the default settings it outputs only its final hidden state.
lstm_layer = LSTM(units=50)

# The same LSTM layer instance is shared by question1 and question2 (Siamese setup).
q1_lstm = lstm_layer(q1_embed)
q2_lstm = lstm_layer(q2_embed)

# Manhattan distance between the two questions' LSTM features, kept as a
# (batch, 1) tensor so it can feed the Dense layer below.
dist_score = manhattan_dist(q1_lstm, q2_lstm)


# Detect whether the given pair of questions is a duplicate or not.
is_duplicate = Dense(1, activation="sigmoid")(dist_score)

# Combine into the Model.
model = Model(inputs=[q1_input, q2_input], outputs=is_duplicate)

# Compile the model.
# Configuration:
#   Loss function: binary cross-entropy
#   Optimizer:     Nadam
#   Metric:        accuracy
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

# Callbacks:
#   EarlyStopping   - stop once val_loss has not improved for 2 epochs and
#                     restore the best weights, to limit overfitting
#   ModelCheckpoint - save the model whenever val_loss reaches a new minimum
#   TensorBoard     - log training curves for visualization
my_callbacks = [
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    ModelCheckpoint(filepath='/content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.{epoch:02d}-{val_loss:.2f}.h5',
                    monitor='val_loss', mode='min', save_best_only=True, verbose=1),
    TensorBoard(log_dir='/content/drive/My Drive/Text Mining Project/Saved Model/Tensorboard/runs/'),
  ]

In [69]:
# Train the Siamese model, validating on the held-out validation split after
# every epoch, and time the whole run.
start = time.time()
trained_model = model.fit(x=[q1_train_final, q2_train_final], y=y_train_final,
                          validation_data=([q1_valid_final, q2_valid_final], y_valid_final), 
                          batch_size=batch_size, epochs=n_epochs, 
                          callbacks=my_callbacks
                          )
end = time.time()
# EarlyStopping can end training before n_epochs is reached, so report the
# number of epochs actually recorded in the History object instead of the cap.
epochs_run = len(trained_model.history['loss'])
print(f"Time taken to train {epochs_run} epochs is {datetime.timedelta(seconds=int(end-start))}")

Epoch 1/25
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: val_loss improved from inf to 0.49655, saving model to /content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.01-0.50.h5
Epoch 2/25
Epoch 00002: val_loss improved from 0.49655 to 0.46777, saving model to /content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.02-0.47.h5
Epoch 3/25
Epoch 00003: val_loss improved from 0.46777 to 0.44745, saving model to /content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.03-0.45.h5
Epoch 4/25
Epoch 00004: val_loss improved from 0.44745 to 0.43903, saving model to /content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.04-0.44.h5
Epoch 5/25
Epoch 00005: val_loss improved from 0.43903 to 0.43650, saving model to /content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.05-0.44.h5
Epoch 6/25
Epoch 00006: val_loss improved from 0.43650 to 0.43026, saving model to /content/drive/My D

In [None]:
# model.save('/content/drive/My Drive/Text Mining Project/Saved Model/myModel.h5', overwrite=True, include_optimizer=True)
# model.save_weights('/content/drive/My Drive/Text Mining Project/Saved Model/myModel.h5')

In [97]:
# Restore weights from one of the files written by the ModelCheckpoint callback.
# (Alternative left disabled: reload a complete saved model with
#  tf.keras.models.load_model('/content/drive/My Drive/Text Mining Project/Saved Model/myModel.h5').)
model.load_weights(
    filepath="/content/drive/My Drive/Text Mining Project/Saved Model/Callbacks/model.12-0.41.h5"
)

In [98]:
# Score every test question pair; the model's single sigmoid unit yields one
# probability per pair.
y_pred = model.predict(x=[q1_test_padded, q2_test_padded])

In [99]:
def prob_to_classes(y_pred, threshold=0.5):
  """Convert predicted probabilities into hard 0/1 class labels.

  The result depends only on the arguments: earlier versions sized the output
  from the global ``y_test``, which left uninitialised entries whenever the two
  lengths differed and made the function unusable on any other data set.

  Args:
    y_pred: Array-like of probabilities, either 1-D ``(n,)`` or 2-D ``(n, k)``.
      For 2-D input only the first column is used.
    threshold: Probabilities greater than or equal to this value map to 1,
      everything else maps to 0.

  Returns:
    1-D integer numpy array holding one label per row of ``y_pred``.
  """
  scores = np.asarray(y_pred)
  if scores.ndim > 1:
    # Model output is (n, 1); keep just the probability column.
    scores = scores[:, 0]
  return (scores >= threshold).astype(int)

In [100]:
# Turn the predicted probabilities into 0/1 labels using a 0.5 cut-off.
predicted_classes = prob_to_classes(y_pred, 0.5)

In [101]:
# Accuracy = number of predictions that equal the true label / number of test labels.
n_correct = np.count_nonzero(predicted_classes == y_test)
acc = n_correct / len(y_test)
print(f"Accuracy = {acc}")

Accuracy = 0.8252


In [102]:
# Per-class precision / recall / F1 on the test set.
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=predicted_classes))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      6308
           1       0.76      0.77      0.76      3692

    accuracy                           0.83     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.83      0.83      0.83     10000

