In [None]:
#Here are the reference websites used in this code:

#FOR WORD2VEC:
#https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights

#FOR BAYESIAN-OPTIMIZATION:
#https://medium.com/@crawftv/parameter-hyperparameter-tuning-with-bayesian-optimization-7acf42d348e1

#FOR BERT:
#https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX#scrollTo=ex5O1eV-Pfct
#https://www.kaggle.com/sharmilaupadhyaya/20newsgroup-classification-using-keras-bert-in-gpu

#FOR METRICS:
#https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/auto_examples/plot_precision_recall.html

#FOR TPU USAGE:
#https://www.kaggle.com/docs/tpu

Siamese LSTM Neural Network

In [1]:
#import modules:

import pandas as pd
import keras
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras import layers
from keras import Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
from bayes_opt import BayesianOptimization
from tensorflow.python.keras.optimizer_v2 import rmsprop
from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer
import tensorflow as tf
from keras import backend as K
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors
from skopt.plots import plot_convergence
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from keras.utils.vis_utils import plot_model
import pylab as pl
import warnings
warnings.filterwarnings('ignore')
import io
import random
import mxnet as mx
import gluonnlp as nlp
from gluonnlp.calibration import BertLayerCollector
import sys
import numpy as np

In [2]:
#clear session storage:
#reset the Keras backend state so that models built later start from a clean slate
K.clear_session()

In [None]:
#run this if you use TPU:
#NOTE(review): the BERT cells further down enter `tpu_strategy.scope()`
#unconditionally, so they appear to require this cell to have been run - confirm.

# detect and init the TPU:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy:
# (`tpu_strategy` is the name the later `with tpu_strategy.scope():` blocks rely on)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [3]:
#load datasets:
#read the train and test CSV files into the `train` and `test` DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/quora-data/quora_train.csv",
                     "../input/quora-data/quora_test.csv")
)

In [4]:
#handle missing values:
#author's notes from train.isna().sum(): q1: 1; q2: 1
#author's notes from test.isna().sum():  q1: 0; q2: 1

#drop every row that has a missing value in any column; the result is bound
#back to the same names so the following cells keep working unchanged
train = train.dropna(axis = 0, how = "any")
test = test.dropna(axis = 0, how = "any")

In [5]:
#shuffle the datasets and split the training data into train_set and validation_set:
#80% of `train` (sampled with a fixed seed) becomes train_set; the remaining
#rows become validation_set.
train_set = train.sample(frac = 0.8, random_state = 111)
ind_train = train_set.index
validation_set = train.drop(ind_train, axis = 0)
#the test shuffle is seeded as well, so a rerun reproduces the same row order
test = test.sample(frac = 1, random_state = 111)

In [6]:
#preprocess data for train set:
#fit one tokenizer on both question columns of train_set and turn every
#question into a padded sequence of word indices.
max_words = 20000

question_list_train = train_set.question1.tolist() + train_set.question2.tolist()
tokenizer = Tokenizer(
    num_words = max_words,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower = True,
)
tokenizer.fit_on_texts(question_list_train)
word_index = tokenizer.word_index

#no maxlen is given, so X_train.shape[1] is decided by pad_sequences itself;
#later cells reuse that width for every other dataset
sequences_train = tokenizer.texts_to_sequences(question_list_train)
X_train = pad_sequences(sequences_train)

#question1 rows were listed first, question2 rows second: split them back apart
n_train_rows = train_set.shape[0]
train_q1, train_q2 = X_train[:n_train_rows, :], X_train[n_train_rows:, :]

In [7]:
#preprocess data for validation set:
#reuse the tokenizer fitted on train_set and pad to the width of X_train
sequences_validation_q1 = tokenizer.texts_to_sequences(validation_set.question1)
sequences_validation_q2 = tokenizer.texts_to_sequences(validation_set.question2)
validation_q1, validation_q2 = (
    pad_sequences(question_sequences, maxlen = X_train.shape[1])
    for question_sequences in (sequences_validation_q1, sequences_validation_q2)
)


In [8]:
#preprocess for the whole training data
#a second tokenizer is fitted on all of `train` (train_set + validation_set);
#its sequences feed the final re-training and the test-set evaluation.
question_list_training = train.question1.tolist() + train.question2.tolist()
tokenizer_whole = Tokenizer(
    num_words = max_words,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower = True,
)
tokenizer_whole.fit_on_texts(question_list_training)

#pad to the same width as X_train so the model input shape stays the same
sequences_training = tokenizer_whole.texts_to_sequences(question_list_training)
X_training = pad_sequences(sequences_training, maxlen = X_train.shape[1])

#question1 rows come first, question2 rows second
n_training_rows = train.shape[0]
training_q1, training_q2 = X_training[:n_training_rows, :], X_training[n_training_rows:, :]


In [9]:
#preprocess data for test set:
#the test questions are encoded with tokenizer_whole (fitted on all of `train`)
#and padded to the width of X_train
sequences_test_q1 = tokenizer_whole.texts_to_sequences(test.question1)
sequences_test_q2 = tokenizer_whole.texts_to_sequences(test.question2)
test_q1, test_q2 = (
    pad_sequences(question_sequences, maxlen = X_train.shape[1])
    for question_sequences in (sequences_test_q1, sequences_test_q2)
)

In [10]:
#get the targets for train_set, validation_set, training, test data:
#each target is the `is_duplicate` column of the matching DataFrame as a NumPy array
y_train_set, y_validation_set, y_test, y_train = (
    np.asarray(frame.is_duplicate)
    for frame in (train_set, validation_set, test, train)
)

word2vec embeddings

In [11]:
#get the pretrained word embedding vectors and construct a matrix:
#row i of embedding_matrix holds the vector of the word whose index in
#`word_index` is i; rows that are never assigned stay all-zero.
#NOTE(review): `word_index` comes from `tokenizer` (fitted on train_set only),
#while the final model is trained on sequences from `tokenizer_whole`, whose
#indices may differ - confirm whether a second matrix is needed for that model.
word_vectors = KeyedVectors.load_word2vec_format(
    "../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin",
    binary=True,
)

embedding_dim = 300
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    #indices at or beyond max_words have no row in the matrix
    if i < max_words:
        try:
            embedding_vector = word_vectors[word]
        except KeyError:
            #word has no pretrained vector: draw a random one (variance 0.25)
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)
        else:
            embedding_matrix[i] = embedding_vector

#release the pretrained vectors once the matrix has been filled
del word_vectors

In [12]:
#specify parameter & hyperparameter space:
#each dimension's `name` matches a keyword argument of fitness_word2vec, which
#is how @use_named_args maps a sampled point onto that function's arguments.
lstm_output_dim = Integer(low = 20, high = 100, name = "lstm_output_dim")
#the learning rate is searched on a log scale between 1e-4 and 1e-2
learning_rate = Real(low = 1e-4, high = 1e-2, 
                     prior ="log-uniform", name = "learning_rate")
dropout_rate = Real(low = 0.1, high = 0.6, name = "dropout_rate")

#the order here must match the order of the values in default_param
param_range = [lstm_output_dim, learning_rate, dropout_rate]
#starting point passed to gp_minimize as x0
default_param = [64, 1e-3, 0.5]

In [13]:
#define a function to get model structure before training:
def create_model_word2vec(lstm_output_dim, learning_rate, dropout_rate):
    """Build and compile the siamese LSTM model for a question pair.

    Both inputs go through the same Embedding layer and the same LSTM layer.
    The two LSTM outputs are concatenated, passed through dropout and then
    through a single sigmoid unit.

    Reads the notebook globals `X_train` (for the input length), `max_words`
    and `embedding_matrix` (initial, trainable embedding weights).

    Args:
        lstm_output_dim: number of units of the shared LSTM layer.
        learning_rate: learning rate of the RMSProp optimizer.
        dropout_rate: rate of the dropout applied to the merged LSTM outputs.

    Returns:
        The compiled model (binary cross-entropy loss, AUC metric).
    """
    seq_len = X_train.shape[1]
    vector_size = 300

    input1 = Input(shape = (seq_len,), name = "input1")
    input2 = Input(shape = (seq_len,), name = "input2")
    question_inputs = [input1, input2]

    # Layers shared by both branches of the siamese network.
    shared_embedding = layers.Embedding(
        max_words,
        vector_size,
        weights = [embedding_matrix],
        input_length = seq_len,
        trainable = True,
    )
    shared_lstm = layers.LSTM(lstm_output_dim)
    dropout = layers.Dropout(dropout_rate)

    # Embed both questions first, then encode both with the LSTM.
    embedded_questions = [shared_embedding(question) for question in question_inputs]
    encoded_questions = [shared_lstm(embedded) for embedded in embedded_questions]

    merged = layers.concatenate(encoded_questions, axis = -1)
    predictions = layers.Dense(1, activation = "sigmoid")(dropout(merged))

    model = Model(question_inputs, predictions)
    model.compile(
        optimizer = rmsprop.RMSProp(learning_rate = learning_rate),
        loss = "binary_crossentropy",
        metrics = [tf.keras.metrics.AUC()],
    )
    return model

In [14]:
#define a function to do model fitting:
@use_named_args(dimensions = param_range)
def fitness_word2vec(lstm_output_dim, learning_rate, dropout_rate):
    """Objective for the hyperparameter search.

    Trains a fresh model on train_set for 2 epochs and scores it on
    validation_set with average precision.

    Args:
        lstm_output_dim: number of units of the shared LSTM layer.
        learning_rate: learning rate of the optimizer.
        dropout_rate: dropout rate before the output unit.

    Returns:
        The negated average precision, because the optimizer minimizes.
    """
    candidate = create_model_word2vec(lstm_output_dim, learning_rate, dropout_rate)
    candidate.fit(
        [train_q1, train_q2],
        y_train_set,
        epochs = 2,
        batch_size = 128,
    )

    validation_probs = candidate.predict([validation_q1, validation_q2])
    score = average_precision_score(y_validation_set, validation_probs)

    # Drop the model and reset backend state before the next evaluation.
    del candidate
    K.clear_session()
    tf.compat.v1.reset_default_graph()

    return -score

In [15]:
#Gaussian Process:
#minimize fitness_word2vec (the negated validation average precision) over
#param_range with 20 evaluations, starting from default_param
gp_word2vec_result = gp_minimize(func = fitness_word2vec,
                                 dimensions = param_range,
                                 n_calls = 20,
                                 n_jobs = -1,
                                 kappa = 5,
                                 x0 = default_param)

Epoch 1/2

KeyboardInterrupt: 

In [None]:
#get convergence plot:
#requires `gp_word2vec_result` from the gp_minimize cell
plot_convergence(gp_word2vec_result)

In [None]:
#print out the optimum parameters & hyperparameters:
#gp_word2vec_result.x holds the best point in the order of param_range, so its
#values are paired with their names in a dict and printed together
best_params = {
    "lstm_output_dim": gp_word2vec_result.x[0],
    "learning_rate": gp_word2vec_result.x[1],
    "dropout_rate": gp_word2vec_result.x[2],
}
print(best_params)

In [None]:
#print out the optimal objective function value:
#this is the value returned by fitness_word2vec, i.e. the NEGATED average precision
gp_word2vec_result.fun

In [None]:
#epoch = 2, n_calls = 20:
#build and re-train the model on the whole training data (run this code on TPU):
#NOTE(review): the three hyperparameters below are hard-coded, presumably copied
#from an earlier gp_minimize run - confirm they match gp_word2vec_result.x.
#NOTE(review): the model is not created inside tpu_strategy.scope() here -
#confirm whether it actually runs on the TPU.
#NOTE(review): training_q1/training_q2 come from tokenizer_whole, but the
#embedding_matrix used by create_model_word2vec is indexed by the train_set
#tokenizer's word_index - confirm the indices line up.
model_word2vec = create_model_word2vec(100, 0.003458554720057225, 0.1)
model_word2vec.fit([training_q1, training_q2], y_train, epochs = 2, batch_size = 128)

In [None]:
#get predicted probabilities on test set:
test_pred = model_word2vec.predict([test_q1, test_q2])
#average precision of the predicted probabilities against the test labels
test_score = average_precision_score(y_test, test_pred)
test_score

In [None]:
#show model architecture:
plot_model(model_word2vec, show_shapes = True, show_layer_names = True)

In [None]:
#show model details (layers, output shapes, parameter counts):
model_word2vec.summary()

In [None]:
# Compute Precision-Recall and plot curve
# `precision`, `recall` and `thresholds` are reused by the threshold plots below.
precision, recall, thresholds = precision_recall_curve(y_test, test_pred)
area = auc(recall, precision)
print("Area Under Curve: %0.2f" % area)

# start from an empty figure, draw the curve, then decorate the axes
pl.clf()
pl.plot(recall, precision, label='Precision-Recall curve')
pl.title('Precision-Recall Curve for Siamese Model with Word2Vec Embeddings: AUC=%0.2f' % area)
pl.xlabel('Recall')
pl.xlim([0.0, 1.0])
pl.ylabel('Precision')
pl.ylim([0.0, 1.05])
pl.legend(loc="lower left")
pl.show()

In [None]:
#delete the precision and recall without corresponding threshold:
#precision_recall_curve returns one more precision/recall value than thresholds.
#Slicing to len(thresholds) drops that extra last value and, unlike dropping
#"the last element", leaves the arrays unchanged if this cell is run again.
precision = precision[:thresholds.shape[0]]
recall = recall[:thresholds.shape[0]]

In [None]:
#precision-thresholds curve:
#needs `precision` already trimmed to the same length as `thresholds`
pl.clf()
pl.plot(thresholds, precision, label='Precision-Thresholds curve')
pl.title("Precision-Thresholds Curve for Siamese Model with Word2Vec Embeddings")
pl.xlabel('Thresholds')
pl.xlim([0.0, 1.0])
pl.ylabel('Precision')
pl.ylim([0.0, 1.05])
pl.legend(loc="lower left")
pl.show()

In [None]:
#recall-thresholds curve:
#needs `recall` already trimmed to the same length as `thresholds`
pl.clf()
pl.plot(thresholds, recall, label='Recall-Thresholds curve')
pl.title("Recall-Thresholds Curve for Siamese Model with Word2Vec Embeddings")
pl.xlabel('Thresholds')
pl.xlim([0.0, 1.0])
pl.ylabel('Recall')
pl.ylim([0.0, 1.05])
pl.legend(loc="lower left")
pl.show()

In [None]:
#get the predicted class labels:
#a predicted probability of at least 0.5 becomes class 1, anything lower class 0
test_pred_classes = [1 if probability >= 0.5 else 0 for probability in test_pred]

In [None]:
#print out the classification report:
#compares the 0.5-thresholded predictions with the test labels
print(classification_report(y_test, test_pred_classes))

Fine Tuning with BERT 

In [None]:
#install packages and download zip file:
#NOTE(review): none of the packages is version-pinned, so a rerun may install
#different releases than the ones this notebook was written against.
!pip install keras-bert
!pip install keras-rectified-adam
!pip install h5py

#download the pretrained BERT archive and unpack it into
#./uncased_L-12_H-768_A-12 (-o overwrites files from a previous run)
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip -o uncased_L-12_H-768_A-12.zip

In [None]:
#import relevant modules:
import os
import codecs
import tensorflow as tf
from tqdm import tqdm
from chardet import detect
import keras
from keras_radam import RAdam
from keras import backend as K
from keras_bert import load_trained_model_from_checkpoint
#WARNING: this rebinds the name `Tokenizer`, which the first import cell bound
#to keras.preprocessing.text.Tokenizer. Re-running the word2vec preprocessing
#cells after this cell would therefore pick up the keras_bert class instead.
from keras_bert import Tokenizer

In [None]:
#parameters:
SEQ_LEN = 128      #length every encoded question pair is padded/truncated to
BATCH_SIZE = 50    #batch size passed to model.fit
EPOCHS = 1         #number of fine-tuning epochs
LR = 1e-4          #learning rate of the optimizer used for fine-tuning

In [None]:
#Paths to the pretrained BERT model files.
#NOTE(review): `pretrained_path` points to a different directory than the three
#paths below and is not used by any cell visible here - confirm whether it is
#still needed.
pretrained_path = "../input/output/uncased_L-12_H-768_A-12"
#the following files come from the archive unpacked by the install cell
config_path = "./uncased_L-12_H-768_A-12/bert_config.json"
checkpoint_path = "./uncased_L-12_H-768_A-12/bert_model.ckpt"
vocab_path = "./uncased_L-12_H-768_A-12/vocab.txt"

In [None]:
#Loading Pretrained BERT model:
#requires `tpu_strategy` from the TPU initialisation cell.
#NOTE: this binds the global name `model`; a later cell rebuilds `model` on top
#of this network with a new output layer.
with tpu_strategy.scope():
    model = load_trained_model_from_checkpoint(
        config_path,
        checkpoint_path,
        training = True,
        trainable = True,
        seq_len = SEQ_LEN,
    )

In [None]:
#get the details of original BERT model (before the classification head is added):
model.summary()

In [None]:
#Show the architecture of the original BERT model:
plot_model(model, show_shapes = True, show_layer_names = True)

In [None]:
#get the tokenizer for BERT from the `transformers` package:
#NOTE(review): the install is not version-pinned; the encode_plus arguments used
#below (e.g. pad_to_max_length) may behave differently across releases - confirm.
!pip install transformers
from transformers import BertTokenizer

Preprocessing training data:

In [None]:
with tpu_strategy.scope():
    # NOTE: `tokenizer` is rebound here; from this point on it is the BERT
    # tokenizer, no longer the Keras tokenizer fitted on train_set.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

    # Join each question pair into one string with a literal "[SEP]" in between.
    sentences_train = train.question1 + " [SEP] " + train.question2
    sentences_train_values = sentences_train.values
    labels_train = train.is_duplicate.values

    # Options shared by every encode_plus call.
    encode_options = dict(
        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
        max_length = SEQ_LEN,           # Pad & truncate all sentences.
        pad_to_max_length = True,
        return_attention_mask = True,   # Construct attn. masks.
        return_tensors = 'tf',          # Return TensorFlow tensors.
    )

    # Tokenize all of the sentences and map the tokens to their word IDs.
    token_ids_train = []
    attention_masks_train = []

    # For every sentence in training data:
    for sent in sentences_train_values:
        encoded_dict = tokenizer.encode_plus(sent, **encode_options)

        # Keep the encoded sentence and its attention mask
        # (the mask simply differentiates padding from non-padding).
        token_ids_train.append(encoded_dict['input_ids'])
        attention_masks_train.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors (one row per question pair).
    token_ids_train_tensor = tf.concat(token_ids_train, 0)
    attention_masks_train_tensor = tf.concat(attention_masks_train, 0)
    labels_train_tensor = tf.convert_to_tensor(labels_train)

In [None]:
with tpu_strategy.scope():
    # Build the segment (token type) ids for every training question pair:
    # one value for the first question's positions, another for the second's.
    token_type_ids_train_list = []

    for sentence_1, sentence_2 in zip(train.question1, train.question2):
        token_ids_train_s1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence_1))
        token_ids_train_s2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence_2))

        token_type_ids_train = tokenizer.create_token_type_ids_from_sequences(token_ids_train_s1, token_ids_train_s2)
        token_type_ids_train_list.append(token_type_ids_train)

    # pad_sequences pads and truncates at the FRONT by default, whereas the
    # token ids are padded at the end. Pad/truncate at the end here as well so
    # that position k of the segment ids describes position k of the token ids.
    token_type_ids_train_tensor = tf.convert_to_tensor(
        pad_sequences(token_type_ids_train_list,
                      maxlen = SEQ_LEN,
                      padding = "post",
                      truncating = "post")
    )

Preprocessing test data:

In [None]:
with tpu_strategy.scope():
    # Join each test question pair into one string with a literal "[SEP]" in between.
    sentences_test = test.question1 + " [SEP] " + test.question2
    sentences_test_values = sentences_test.values
    labels_test = test.is_duplicate.values

    # Options shared by every encode_plus call.
    encode_options = dict(
        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
        max_length = SEQ_LEN,           # Pad & truncate all sentences.
        pad_to_max_length = True,
        return_attention_mask = True,   # Construct attn. masks.
        return_tensors = 'tf',          # Return TensorFlow tensors.
    )

    # Tokenize all of the sentences and map the tokens to their word IDs.
    token_ids_test = []
    attention_masks_test = []

    # For every sentence in test data:
    for sent in sentences_test_values:
        encoded_dict = tokenizer.encode_plus(sent, **encode_options)

        # Keep the encoded sentence and its attention mask
        # (the mask simply differentiates padding from non-padding).
        token_ids_test.append(encoded_dict['input_ids'])
        attention_masks_test.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors (one row per question pair).
    token_ids_test_tensor = tf.concat(token_ids_test, 0)
    attention_masks_test_tensor = tf.concat(attention_masks_test, 0)
    labels_test_tensor = tf.convert_to_tensor(labels_test)

In [None]:
with tpu_strategy.scope():
    # Build the segment (token type) ids for every test question pair:
    # one value for the first question's positions, another for the second's.
    token_type_ids_test_list = []

    for sentence_1, sentence_2 in zip(test.question1, test.question2):
        token_ids_test_s1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence_1))
        token_ids_test_s2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence_2))

        token_type_ids_test = tokenizer.create_token_type_ids_from_sequences(token_ids_test_s1, token_ids_test_s2)
        token_type_ids_test_list.append(token_type_ids_test)

    # pad_sequences pads and truncates at the FRONT by default, whereas the
    # token ids are padded at the end. Pad/truncate at the end here as well so
    # that position k of the segment ids describes position k of the token ids.
    token_type_ids_test_tensor = tf.convert_to_tensor(
        pad_sequences(token_type_ids_test_list,
                      maxlen = SEQ_LEN,
                      padding = "post",
                      truncating = "post")
    )

In [None]:
#build the fine-tuning classifier on top of the loaded BERT network:
with tpu_strategy.scope():
    #keep only the first two inputs of the loaded model; model.fit below feeds
    #them the token ids and the token type ids, in that order
    inputs = model.inputs[:2]
    #take the output of the layer named 'NSP-Dense' and add one sigmoid unit
    dense = model.get_layer('NSP-Dense').output
    outputs = keras.layers.Dense(units = 1, activation='sigmoid')(dense)
    
    #NOTE: `model` is rebound to the new classifier from here on
    model = keras.models.Model(inputs, outputs)
    model.compile(
        rmsprop.RMSProp(learning_rate = LR),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC()]
    )

In [None]:
#fine-tuning with BERT:
model.fit(
    [token_ids_train_tensor, token_type_ids_train_tensor],
    labels_train_tensor,
    epochs = EPOCHS,
    batch_size = BATCH_SIZE
)



In [None]:
#get the predicted probabilities:
test_pred_bert = model.predict([token_ids_test_tensor, token_type_ids_test_tensor], 
                               verbose = True)
#average precision of the predicted probabilities against the test labels
test_score_bert = average_precision_score(test.is_duplicate, test_pred_bert)
test_score_bert

In [None]:
# Compute Precision-Recall and plot curve for bert model:
# NOTE: this overwrites `precision`, `recall` and `thresholds` from the word2vec section.
precision, recall, thresholds = precision_recall_curve(test.is_duplicate, test_pred_bert)
area = auc(recall, precision)
print("Area Under Curve: %0.2f" % area)

# start from an empty figure, draw the curve, then decorate the axes
pl.clf()
pl.plot(recall, precision, label='Precision-Recall curve')
pl.title('Precision-Recall Curve for BERT Fine Tuning Model: AUC=%0.2f' % area)
pl.xlabel('Recall')
pl.xlim([0.0, 1.0])
pl.ylabel('Precision')
pl.ylim([0.0, 1.05])
pl.legend(loc="lower left")
pl.show()

In [None]:
#delete precision and recall without corresponding threshold:
#precision_recall_curve returns one more precision/recall value than thresholds.
#Slicing to len(thresholds) drops that extra last value and, unlike dropping
#"the last element", leaves the arrays unchanged if this cell is run again.
precision = precision[:thresholds.shape[0]]
recall = recall[:thresholds.shape[0]]

In [None]:
#precision-thresholds curve:
#needs `precision` already trimmed to the same length as `thresholds`
pl.clf()
pl.plot(thresholds, precision, label='Precision-Thresholds curve')
pl.title("Precision-Thresholds Curve for BERT Fine Tuning Model")
pl.xlabel('Thresholds')
pl.xlim([0.0, 1.0])
pl.ylabel('Precision')
pl.ylim([0.0, 1.05])
pl.legend(loc="lower left")
pl.show()

In [None]:
#recall-thresholds curve:
#needs `recall` already trimmed to the same length as `thresholds`
pl.clf()
pl.plot(thresholds, recall, label='Recall-Thresholds curve')
pl.title("Recall-Thresholds Curve for BERT Fine Tuning Model")
pl.xlabel('Thresholds')
pl.xlim([0.0, 1.0])
pl.ylabel('Recall')
pl.ylim([0.0, 1.05])
pl.legend(loc="lower left")
pl.show()

In [None]:
#get the predicted class labels:
#a predicted probability of at least 0.5 becomes class 1, anything lower class 0
test_pred_bert_classes = [1 if probability >= 0.5 else 0 for probability in test_pred_bert]

In [None]:
#print the classification report:
#compares the 0.5-thresholded BERT predictions with the test labels
print(classification_report(test.is_duplicate, test_pred_bert_classes))

Visualize the architecture of the models:

In [None]:
#plot the architecture of `model`, which at this point is the fine-tuned BERT classifier:
plot_model(model, show_shapes = True, show_layer_names = True)

In [None]:
#get the details of the fine-tuned BERT classifier (layers, output shapes, parameter counts):
model.summary()