In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [3]:
!unzip '/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/google-quest-challenge.zip'
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.shape, df_test.shape

Archive:  /content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/google-quest-challenge.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


((6079, 41), (476, 11))

# 3. Data Preprocessing

In [4]:
# Build the English stop-word set used by `stopwrd_removal`.
# The negations "no", "not" and "nor" are taken out of the set, so they are kept in the text.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
for negation in ('no', 'not', 'nor'):
  stop_words.remove(negation)
 
def stopwrd_removal(sent):
  """Return `sent` without the whitespace-separated tokens found in `stop_words`.

  The surviving tokens are re-joined with single spaces.
  """
  kept_tokens = [token for token in sent.split() if token not in stop_words]
  return " ".join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def text_preprocessor(column, remove_stopwords = False, remove_specialchar = False, df = None):
  """Clean one text column in place.

  Args:
    column: name of a column holding strings.
    remove_stopwords: when true, drop stop words via `stopwrd_removal`.
    remove_specialchar: when true, replace every run of non-alphanumeric characters with a space.
    df: DataFrame to modify. Defaults to the notebook-level `df_train`, so existing
        calls behave as before; pass e.g. `df = df_test` to clean another frame the same way.

  Returns:
    None. `df[column]` is overwritten with the cleaned strings.
  """
  if df is None:
    df = df_train

  # Escaped comparison operators are restored AFTER tag stripping, so the restored
  # '<' / '>' characters are never mistaken for html tags.
  html_entities = (('&lt;', '<'), ('&gt;', '>'), ('&le;', '<='), ('&ge;', '>='))

  # Applied strictly in this order: the specific forms ("won't", "can't") must be
  # rewritten before the generic suffixes ("n't", "'t").
  contractions = (("won't", "will not"), ("can't", "can not"), ("n't", " not"), ("'re", " are"),
                  ("'s", " is"), ("'d", " would"), ("'ll", " will"), ("'t", " not"),
                  ("'ve", " have"), ("'m", " am"))

  def _clean(text):
    # 1. remove html tags, then restore escaped comparison operators (plain literal replacement)
    text = re.sub(r'<.*?>', ' ', text)
    for entity, symbol in html_entities:
      text = text.replace(entity, symbol)

    # 2. remove latex, i.e. anything between a pair of '$' on the same line
    text = re.sub(r'\$.*?\$', ' ', text)

    # 3. all lowercase
    text = text.lower()

    # 4. decontractions
    for short_form, long_form in contractions:
      text = text.replace(short_form, long_form)

    # 5. drop every non-ascii character
    text = text.encode("ascii", "ignore").decode()

    # 6. remove all special characters other than alpha-numericals
    if remove_specialchar:
      text = re.sub('[^A-Za-z0-9]+', ' ', text)

    # 8. stop-word removal
    if remove_stopwords:
      text = stopwrd_removal(text)

    # 9. collapse every whitespace run (\n, \t, \r, repeated spaces) into one space
    #    and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

  df[column] = [_clean(text) for text in df[column].values]

In [6]:
# 1. text preprocessing
# Copy each raw text column into a `clean_*` twin, then clean the twins in place;
# the raw columns are left untouched.
raw_to_clean = (('question_title', 'clean_title'),
                ('question_body', 'clean_body'),
                ('answer', 'clean_answer'))
for raw_col, clean_col in raw_to_clean:
  df_train[clean_col] = df_train[raw_col]
for _, clean_col in raw_to_clean:
  text_preprocessor(clean_col, remove_stopwords = False, remove_specialchar = False)

# 4. Train_test_split - Random split

In [7]:
# 1. setting up target features
# Targets that describe the question (21 columns).
question_tar = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
]

# Targets that describe the answer (9 columns).
answer_tar = [
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Question targets first, answer targets after.
tar_features = [*question_tar, *answer_tar]
len(tar_features)

30

In [8]:
# 2. random hold-out split: 12% of the rows go to the cross-validation set
from sklearn.model_selection import train_test_split

text_columns = ['clean_title', 'clean_body', 'clean_answer']
X_train, X_cv, y_train, y_cv = train_test_split(
    df_train[text_columns],
    df_train[tar_features],
    test_size = 0.12,
    random_state = 42,
)
X_train.shape, X_cv.shape, y_train.shape, y_cv.shape

((5349, 3), (730, 3), (5349, 30), (730, 30))

In [9]:
# 3. creating model inputs: "title body" joined with one space, plus the answer on its own
text_columns = ('clean_title', 'clean_body', 'clean_answer')
title_train, body_train, answer_train = (X_train[name].values for name in text_columns)
title_cv, body_cv, answer_cv = (X_cv[name].values for name in text_columns)

# train data
title_body_train = [' '.join(pair) for pair in zip(title_train, body_train)]
y_train_ques, y_train_ans = y_train[question_tar].values, y_train[answer_tar].values

# cv data
title_body_cv = [' '.join(pair) for pair in zip(title_cv, body_cv)]
y_cv_ques, y_cv_ans = y_cv[question_tar].values, y_cv[answer_tar].values

len(title_body_train), len(answer_train), len(title_body_cv), len(answer_cv)

(5349, 5349, 730, 730)

# 4.1. Sentence Embedding : BERT

In [10]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.8MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 48.3MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB

In [11]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig

# Load pretrained model/tokenizer from one checkpoint name.
# output_hidden_states=True makes the model return the hidden states that are pooled further down.
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
config = BertConfig.from_pretrained(bert_checkpoint, output_hidden_states=True)
bert_model = TFBertModel.from_pretrained(bert_checkpoint, config = config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Tokenizing : title_body
# Every text is truncated / padded to exactly 512 tokens.
tb_encode_kwargs = dict(max_length = 512, truncation = True, pad_to_max_length = True, return_tensors = "tf")
title_body_train_tokens = bert_tokenizer.batch_encode_plus(title_body_train, **tb_encode_kwargs)
title_body_cv_tokens = bert_tokenizer.batch_encode_plus(title_body_cv, **tb_encode_kwargs)

# Pull the three model inputs out of each encoding as numpy arrays.
bert_input_keys = ('input_ids', 'attention_mask', 'token_type_ids')
tb_train_input_ids, tb_train_attn_mask, tb_train_token_typ_ids = (
    np.array(title_body_train_tokens[key]) for key in bert_input_keys)
tb_cv_input_ids, tb_cv_attn_mask, tb_cv_token_typ_ids = (
    np.array(title_body_cv_tokens[key]) for key in bert_input_keys)

tb_train_input_ids.shape, tb_train_attn_mask.shape, tb_train_token_typ_ids.shape, tb_cv_input_ids.shape, tb_cv_attn_mask.shape, tb_cv_token_typ_ids.shape

((5349, 512), (5349, 512), (5349, 512), (730, 512), (730, 512), (730, 512))

In [None]:
# Tokenizing : answer
# Every text is truncated / padded to exactly 512 tokens.
ans_encode_kwargs = dict(max_length = 512, truncation = True, pad_to_max_length = True, return_tensors = "tf")
ans_train_tokens = bert_tokenizer.batch_encode_plus(answer_train, **ans_encode_kwargs)
ans_cv_tokens = bert_tokenizer.batch_encode_plus(answer_cv, **ans_encode_kwargs)

# Pull the three model inputs out of each encoding as numpy arrays.
bert_input_keys = ('input_ids', 'attention_mask', 'token_type_ids')
ans_train_input_ids, ans_train_attn_mask, ans_train_token_typ_ids = (
    np.array(ans_train_tokens[key]) for key in bert_input_keys)
ans_cv_input_ids, ans_cv_attn_mask, ans_cv_token_typ_ids = (
    np.array(ans_cv_tokens[key]) for key in bert_input_keys)
seq_len = ans_train_input_ids.shape[1]

ans_train_input_ids.shape, ans_train_attn_mask.shape, ans_train_token_typ_ids.shape, ans_cv_input_ids.shape, ans_cv_attn_mask.shape, ans_cv_token_typ_ids.shape

((5349, 512), (5349, 512), (5349, 512), (730, 512), (730, 512), (730, 512))

In [14]:
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

In [None]:
# BERT embeddings : title_body (train, then cv)
batch_size = 32

def _mean_pooled_last4(input_ids, attn_mask, token_typ_ids):
  """Embed rows with `bert_model` in chunks of `batch_size`.

  For every chunk the last 4 hidden-state tensors are each averaged over axis 1
  (the token axis) and concatenated along the feature axis; the per-chunk results
  are stacked along axis 0.

  NOTE(review): the average runs over all positions, padding included, because
  the attention mask is not used as a weight here -- confirm this is intended.
  """
  pooled_chunks = []
  # Stepping by batch_size covers every row exactly once and never yields an empty
  # trailing chunk (the old `(l//batch_size)+1` count did when l was an exact multiple).
  for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    stop = start + batch_size
    hidden_states = bert_model([input_ids[start:stop], attn_mask[start:stop], token_typ_ids[start:stop]])[2]
    layer_means = [tf.reduce_mean(layer, axis = 1) for layer in hidden_states[-4:]]
    pooled_chunks.append(tf.concat(layer_means, axis = 1))
  return tf.concat(pooled_chunks, axis = 0)

tb_BERT_train = _mean_pooled_last4(tb_train_input_ids, tb_train_attn_mask, tb_train_token_typ_ids)
print(tb_BERT_train.shape)

# The cv pass is sized from the title_body cv ids themselves
# (it used to read its row count from the answer cv array).
tb_BERT_cv = _mean_pooled_last4(tb_cv_input_ids, tb_cv_attn_mask, tb_cv_token_typ_ids)
print(tb_BERT_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 3072)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 3072)


In [None]:
# BERT embeddings : answer (train, then cv)
batch_size = 32

def _first_token_last4(input_ids, attn_mask, token_typ_ids):
  """Embed rows with `bert_model` in chunks of `batch_size`.

  For every chunk the vector at position 0 of the token axis is taken from each of
  the last 4 hidden-state tensors and the four vectors are concatenated along the
  feature axis; the per-chunk results are stacked along axis 0.
  """
  pooled_chunks = []
  # Stepping by batch_size covers every row exactly once and never yields an empty
  # trailing chunk (the old `(l//batch_size)+1` count did when l was an exact multiple).
  for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    stop = start + batch_size
    hidden_states = bert_model([input_ids[start:stop], attn_mask[start:stop], token_typ_ids[start:stop]])[2]
    first_tokens = [layer[:, 0, :] for layer in hidden_states[-4:]]
    pooled_chunks.append(tf.concat(first_tokens, axis = 1))
  return tf.concat(pooled_chunks, axis = 0)

ans_BERT_train = _first_token_last4(ans_train_input_ids, ans_train_attn_mask, ans_train_token_typ_ids)
print(ans_BERT_train.shape)

# cv : ans BERT embeddings
ans_BERT_cv = _first_token_last4(ans_cv_input_ids, ans_cv_attn_mask, ans_cv_token_typ_ids)
print(ans_BERT_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 3072)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 3072)


In [None]:
# saving pretrained BERT embeddings: one compressed .npz archive, one named array per tensor
bert_outputs = {
    'tb_BERT_train': tb_BERT_train.numpy(),
    'tb_BERT_cv': tb_BERT_cv.numpy(),
    'ans_BERT_train': ans_BERT_train.numpy(),
    'ans_BERT_cv': ans_BERT_cv.numpy(),
}
np.savez_compressed('/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/bert_outputs', **bert_outputs)

### predictor Model :

In [12]:
# loading pretrained BERT embeddings
# The archive is opened once and closed by the `with` block; it used to be
# re-opened for every array and never closed.
with np.load('/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/bert_outputs.npz') as bert_outputs:
  tb_BERT_train = bert_outputs['tb_BERT_train']
  tb_BERT_cv = bert_outputs['tb_BERT_cv']
  ans_BERT_train = bert_outputs['ans_BERT_train']
  ans_BERT_cv = bert_outputs['ans_BERT_cv']

In [20]:
tf.keras.backend.clear_session()
seed = 42

def _bert_head_dense(units, activation):
  """Dense layer whose kernel uses a he_normal initializer seeded with `seed`."""
  return Dense(units = units, activation = activation,
               kernel_initializer = tf.keras.initializers.he_normal(seed = seed))

# *-----------------title_body-----------------*
# dropout -> 1024 relu -> dropout -> 512 relu -> dropout -> 21 sigmoid outputs
tb_input = Input(name = 'tb_bert_out', shape = (768*4,), dtype = 'float32')
tb_dropout_1 = Dropout(rate = 0.2, seed = seed)(tb_input)

tb_dense_1 = _bert_head_dense(1024, 'relu')(tb_dropout_1)
tb_dropout_2 = Dropout(rate = 0.2, seed = seed)(tb_dense_1)

tb_dense_2 = _bert_head_dense(512, 'relu')(tb_dropout_2)
tb_dropout_3 = Dropout(rate = 0.1, seed = seed)(tb_dense_2)

tb_out = _bert_head_dense(21, 'sigmoid')(tb_dropout_3)

# *-----------------answer_model-----------------*
# dropout -> 2048 relu -> dropout -> 512 relu -> dropout -> 9 sigmoid outputs
ans_input = Input(name = 'ans_bert_out', shape = (768*4,), dtype = 'float32')
ans_dropout_1 = Dropout(rate = 0.2, seed = seed)(ans_input)

ans_dense_1 = _bert_head_dense(2048, 'relu')(ans_dropout_1)
ans_dropout_2 = Dropout(rate = 0.2, seed = seed)(ans_dense_1)

ans_dense_2 = _bert_head_dense(512, 'relu')(ans_dropout_2)
ans_dropout_3 = Dropout(rate = 0.1, seed = seed)(ans_dense_2)

ans_out = _bert_head_dense(9, 'sigmoid')(ans_dropout_3)

# *--------------------concat--------------------*
# question outputs first, answer outputs after
out = tf.concat([tb_out, ans_out], axis = -1)

model = Model(inputs = [tb_input, ans_input], outputs = out)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tb_bert_out (InputLayer)        [(None, 3072)]       0                                            
__________________________________________________________________________________________________
ans_bert_out (InputLayer)       [(None, 3072)]       0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 3072)         0           tb_bert_out[0][0]                
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 3072)         0           ans_bert_out[0][0]               
_______________________________________________________________________________________

In [16]:
# post processing : binning
def return_bins(arr):
  """Return the midpoints between consecutive distinct values of `arr`.

  The distinct values come from `np.unique` (sorted ascending), so k distinct
  values give k-1 midpoints; fewer than two distinct values give an empty list.
  """
  levels = np.unique(arr)
  return [(lower + upper)/2 for lower, upper in zip(levels[:-1], levels[1:])]
  
# Per-target lookup tables used by `binned_out`: the distinct label values of each
# target column and the midpoints between them.
# The label matrix is materialised once (it used to be rebuilt for every column) and
# the column count is read from it instead of being hard-coded.
target_matrix = df_train[tar_features].values
unique_val_30 = [np.unique(target_matrix[:, i]) for i in range(target_matrix.shape[1])]
bins_30 = [return_bins(target_matrix[:, i]) for i in range(target_matrix.shape[1])]
 
def binned_out(y_pred):
  """Replace every prediction by a label value from the matching `unique_val_30` entry.

  Column t of `y_pred` is bucketed with `np.digitize` against the edges in
  `bins_30[t]`; the bucket index then selects the value from `unique_val_30[t]`.
  Returns a new float array with the shape of `y_pred`.
  """
  snapped = np.zeros(y_pred.shape)
  n_targets = y_pred.shape[1]
  for target in range(n_targets):
    bucket = np.digitize(y_pred[:, target], bins_30[target])
    snapped[:, target] = unique_val_30[target][bucket]
  return snapped

In [21]:
# Defining callbacks
!rm -r '/content/saved models'
!rm -r '/content/logs'
!mkdir '/content/saved models'
!mkdir '/content/logs/'
 
# tensorboard callback
import datetime
log_dir="logs/" + datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq=1, write_graph=True, write_grads=True)
 
# spearman function
from scipy.stats import pearsonr, spearmanr
def compute_spearman(y_true, y_pred, final_pred):
  """Column-wise Spearman correlation between `y_true` and `final_pred`.

  `y_pred` is accepted for call compatibility but is not read.

  Returns:
    (per-column correlations as a numpy array, their NaN-ignoring mean),
    every value rounded to 5 decimals.
  """
  per_target = [
      round(spearmanr(y_true[:, target], final_pred[:, target])[0], 5)
      for target in range(y_true.shape[1])
  ]
  return np.array(per_target), round(np.nanmean(per_target), 5)
 
# Custom spearman metric
class print_spearman(tf.keras.callbacks.Callback):
    """Keras callback that prints train / validation Spearman after every epoch.

    Predictions are first passed through `binned_out`, then scored with
    `compute_spearman`.

    Attributes filled during training:
      all_feat_spearman: one dict per epoch holding the per-target scores.
      spearman_dict: per-epoch mean scores under 'train_spearman' / 'val_spearman'.
    """
    def __init__(self, train_data, validation_data):
        # Zero-argument super() runs the keras Callback initialiser; the previous
        # super(tf.keras.callbacks.Callback, self) form started the lookup after it
        # and therefore skipped it.
        super().__init__()
        self.x, self.y = train_data
        self.val_x, self.val_y = validation_data

    def on_train_begin(self, logs=None):
        # Reset the history so that every fit() call starts from a clean slate.
        self.all_feat_spearman = []
        self.spearman_dict = {'train_spearman' :[], 'val_spearman' :[]}

    def on_epoch_end(self, epoch, logs=None):
        self.epoch = epoch
        # 1. Predict on the full train and validation inputs
        print('\nspearman :')
        y_pred = self.model.predict(x = self.x)
        y_pred_val = self.model.predict(x = self.val_x)

        # 2. Post-process the raw predictions with binned_out
        final_pred = binned_out(y_pred)
        final_pred_val = binned_out(y_pred_val)

        # 3. Score and record
        train_spear_lst, train_spearman = compute_spearman(self.y, y_pred, final_pred)
        val_spear_lst, val_spearman = compute_spearman(self.val_y, y_pred_val, final_pred_val)

        self.all_feat_spearman.append({'train_spearman' : train_spear_lst, 'val_spearman' : val_spear_lst})

        self.spearman_dict['train_spearman'].append(train_spearman)
        self.spearman_dict['val_spearman'].append(val_spearman)
        # Convert to a Python float before rounding: rounding the float32 value
        # directly had no visible effect in the printed log (e.g. 9.999999747378752e-05).
        current_lr = float(tf.keras.backend.eval(self.model.optimizer.lr))
        print("train_spearman : {} | val_spearman : {} | Learning_Rate : {}".format(train_spearman, val_spearman, round(current_lr, 6)))
 
# Multiply the learning rate by sqrt(0.1) when val_loss has not improved for 4 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = np.sqrt(0.1),
    patience = 4,
    verbose = 1,
)

# Save the weights after every epoch (save_best_only is off); epoch and val_loss go into the file name.
checkpt = tf.keras.callbacks.ModelCheckpoint(
    '/content/saved models/weights.-{epoch:03d}-{val_loss:.5f}',
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = False,
    save_weights_only = True,
)

# Spearman reporting on the BERT features.
bert_train_data = ([tb_BERT_train, ans_BERT_train], y_train.values)
bert_cv_data = ([tb_BERT_cv, ans_BERT_cv], y_cv.values)
print_spearman_fn = print_spearman(train_data = bert_train_data, validation_data = bert_cv_data)

callbacks = [print_spearman_fn, reduce_lr, checkpt, tensorboard_callback]



In [22]:
# BERT_model : training a model
# Binary cross-entropy loss with Adam; RMSE is tracked as an extra metric.
tf.keras.backend.clear_session()
opt = tf.keras.optimizers.Adam(learning_rate = 0.0001)
rmse = tf.keras.metrics.RootMeanSquaredError()
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = [rmse])

bert_fit_inputs = [tb_BERT_train, ans_BERT_train]
bert_fit_validation = ([tb_BERT_cv, ans_BERT_cv], y_cv.values)
history = model.fit(
    x = bert_fit_inputs,
    y = y_train.values,
    validation_data = bert_fit_validation,
    batch_size = 64,
    epochs = 40,
    callbacks = callbacks,
)

Epoch 1/40
spearman :
train_spearman : 0.28278 | val_spearman : 0.28746 | Learning_Rate : 9.999999747378752e-05

Epoch 00001: saving model to /content/saved models/weights.-001-0.39854
Epoch 2/40
spearman :
train_spearman : 0.32832 | val_spearman : 0.32837 | Learning_Rate : 9.999999747378752e-05

Epoch 00002: saving model to /content/saved models/weights.-002-0.39364
Epoch 3/40
spearman :
train_spearman : 0.3562 | val_spearman : 0.35006 | Learning_Rate : 9.999999747378752e-05

Epoch 00003: saving model to /content/saved models/weights.-003-0.38994
Epoch 4/40
spearman :
train_spearman : 0.374 | val_spearman : 0.35836 | Learning_Rate : 9.999999747378752e-05

Epoch 00004: saving model to /content/saved models/weights.-004-0.38757
Epoch 5/40
spearman :
train_spearman : 0.38748 | val_spearman : 0.3623 | Learning_Rate : 9.999999747378752e-05

Epoch 00005: saving model to /content/saved models/weights.-005-0.38593
Epoch 6/40
spearman :
train_spearman : 0.39565 | val_spearman : 0.37429 | Learn

In [23]:
# best_results : (val_spearman, epoch_no) -- epoch_no is the 0-based position in the history list
val_spearman_history = print_spearman_fn.spearman_dict['val_spearman']
max(val_spearman_history), np.argmax(val_spearman_history)

(0.39536, 25)

# 4.2. USE Embeddings

In [24]:
import tensorflow as tf

import tensorflow_hub as hub
from tensorflow.keras.layers import Input, Softmax, GRU, LSTM, RNN, Embedding, Dense, RepeatVector, TimeDistributed, Bidirectional, Concatenate
from tensorflow.keras.models import Model

In [None]:
# loading USE model (universal-sentence-encoder-large, version 5) from TF-Hub
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
use_model = hub.load(use_model_url)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.


In [None]:
# title_body
# Each loop steps through the texts in chunks of batch_size. Iterating over the start
# offsets covers every row once and never calls the encoder on an empty chunk
# (the old `int(len/batch_size)+1` count did when the length was an exact multiple).
tb_USE_train = np.zeros((len(title_body_train), 512))
batch_size = 32
for start in tqdm(range(0, len(title_body_train), batch_size)):
  tb_USE_train[start : start + batch_size] = use_model(title_body_train[start : start + batch_size]).numpy()
print(tb_USE_train.shape)

tb_USE_cv = np.zeros((len(title_body_cv), 512))
batch_size = 64
for start in tqdm(range(0, len(title_body_cv), batch_size)):
  tb_USE_cv[start : start + batch_size] = use_model(title_body_cv[start : start + batch_size]).numpy()
print(tb_USE_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=84.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [None]:
# answer
# Each loop steps through the texts in chunks of batch_size. Iterating over the start
# offsets covers every row once and never calls the encoder on an empty chunk
# (the old `int(len/batch_size)+1` count did when the length was an exact multiple).
ans_USE_train = np.zeros((len(answer_train), 512))
batch_size = 32
for start in tqdm(range(0, len(answer_train), batch_size)):
  ans_USE_train[start : start + batch_size] = use_model(answer_train[start : start + batch_size]).numpy()
print(ans_USE_train.shape)

ans_USE_cv = np.zeros((len(answer_cv), 512))
batch_size = 32
for start in tqdm(range(0, len(answer_cv), batch_size)):
  ans_USE_cv[start : start + batch_size] = use_model(answer_cv[start : start + batch_size]).numpy()
print(ans_USE_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 512)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 512)


In [None]:
# saving pretrained USE embeddings: one compressed .npz archive, one named array per matrix
use_outputs = {
    'tb_USE_train': tb_USE_train,
    'tb_USE_cv': tb_USE_cv,
    'ans_USE_train': ans_USE_train,
    'ans_USE_cv': ans_USE_cv,
}
np.savez_compressed('/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/use_outputs', **use_outputs)

### Modeling :

In [25]:
# loading pretrained USE embeddings
# The archive is opened once and closed by the `with` block; it used to be
# re-opened for every array and never closed.
with np.load('/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/use_outputs.npz') as use_outputs:
  tb_USE_train = use_outputs['tb_USE_train']
  tb_USE_cv = use_outputs['tb_USE_cv']
  ans_USE_train = use_outputs['ans_USE_train']
  ans_USE_cv = use_outputs['ans_USE_cv']

In [36]:
tf.keras.backend.clear_session()
seed = 42

def _use_head_dense(units, activation):
  """Dense layer whose kernel uses a he_normal initializer seeded with `seed`."""
  return Dense(units = units, activation = activation,
               kernel_initializer = tf.keras.initializers.he_normal(seed = seed))

# *-----------------title_body-----------------*
# dropout -> 1024 relu -> dropout -> 512 relu -> dropout -> 21 sigmoid outputs
tb_input = Input(name = 'tb_use_out', shape = (512,), dtype = 'float32')
tb_dropout_1 = Dropout(rate = 0.2, seed = seed)(tb_input)

tb_dense_1 = _use_head_dense(1024, 'relu')(tb_dropout_1)
tb_dropout_2 = Dropout(rate = 0.2, seed = seed)(tb_dense_1)

tb_dense_2 = _use_head_dense(512, 'relu')(tb_dropout_2)
tb_dropout_3 = Dropout(rate = 0.2, seed = seed)(tb_dense_2)

tb_out = _use_head_dense(21, 'sigmoid')(tb_dropout_3)

# *-----------------answer_model-----------------*
# The answer branch reads the 21 question outputs concatenated with the answer embedding.
ans_input = Input(name = 'ans_use_out', shape = (512,), dtype = 'float32')
concat = tf.concat([tb_out, ans_input], axis = -1)
ans_dropout_1 = Dropout(rate = 0.2, seed = seed)(concat)

ans_dense_1 = _use_head_dense(1024, 'relu')(ans_dropout_1)
ans_dropout_2 = Dropout(rate = 0.2, seed = seed)(ans_dense_1)

ans_dense_2 = _use_head_dense(512, 'relu')(ans_dropout_2)
ans_dropout_3 = Dropout(rate = 0.2, seed = seed)(ans_dense_2)

ans_out = _use_head_dense(9, 'sigmoid')(ans_dropout_3)

# *--------------------concat--------------------*
# question outputs first, answer outputs after
out = tf.concat([tb_out, ans_out], axis = -1)

model = Model(inputs = [tb_input, ans_input], outputs = out)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tb_use_out (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 512)          0           tb_use_out[0][0]                 
__________________________________________________________________________________________________
dense (Dense)                   (None, 1536)         787968      dropout[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1536)         0           dense[0][0]                      
_______________________________________________________________________________________

In [37]:
# Defining callbacks
!rm -r '/content/saved models'
!rm -r '/content/logs'
!mkdir '/content/saved models'
!mkdir '/content/logs/'
 
# tensorboard callback
import datetime
log_dir="logs/" + datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq=1, write_graph=True, write_grads=True)
 
# Multiply the learning rate by sqrt(0.1) when val_loss has not improved for 7 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = np.sqrt(0.1),
    patience = 7,
    verbose = 1,
)

# Save the weights after every epoch (save_best_only is off); epoch and val_loss go into the file name.
checkpt = tf.keras.callbacks.ModelCheckpoint(
    '/content/saved models/weights.-{epoch:03d}-{val_loss:.5f}',
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = False,
    save_weights_only = True,
)

# Spearman reporting on the USE features.
use_train_data = ([tb_USE_train, ans_USE_train], y_train.values)
use_cv_data = ([tb_USE_cv, ans_USE_cv], y_cv.values)
print_spearman_fn = print_spearman(train_data = use_train_data, validation_data = use_cv_data)

callbacks = [print_spearman_fn, reduce_lr, checkpt, tensorboard_callback]



In [33]:
# USE model : training the dense head on the USE embeddings
# Binary cross-entropy loss with Adam; RMSE is tracked as an extra metric.
tf.keras.backend.clear_session()
opt = tf.keras.optimizers.Adam(learning_rate = 0.00008)
rmse = tf.keras.metrics.RootMeanSquaredError()
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = [rmse])

use_fit_inputs = [tb_USE_train, ans_USE_train]
use_fit_validation = ([tb_USE_cv, ans_USE_cv], y_cv.values)
history = model.fit(
    x = use_fit_inputs,
    y = y_train.values,
    validation_data = use_fit_validation,
    batch_size = 64,
    epochs = 50,
    callbacks = callbacks,
)

Epoch 1/50
spearman :
train_spearman : 0.04865 | val_spearman : 0.05349 | Learning_Rate : 7.999999797903001e-05

Epoch 00001: saving model to /content/saved models/weights.-001-0.45263
Epoch 2/50
spearman :
train_spearman : 0.16816 | val_spearman : 0.16506 | Learning_Rate : 7.999999797903001e-05

Epoch 00002: saving model to /content/saved models/weights.-002-0.41570
Epoch 3/50
spearman :
train_spearman : 0.20048 | val_spearman : 0.21808 | Learning_Rate : 7.999999797903001e-05

Epoch 00003: saving model to /content/saved models/weights.-003-0.40312
Epoch 4/50
spearman :
train_spearman : 0.26575 | val_spearman : 0.26134 | Learning_Rate : 7.999999797903001e-05

Epoch 00004: saving model to /content/saved models/weights.-004-0.39406
Epoch 5/50
spearman :
train_spearman : 0.30975 | val_spearman : 0.3101 | Learning_Rate : 7.999999797903001e-05

Epoch 00005: saving model to /content/saved models/weights.-005-0.38857
Epoch 6/50
spearman :
train_spearman : 0.33698 | val_spearman : 0.3342 | Lea

In [34]:
# best_results : (val_spearman, epoch_no) -- epoch_no is the 0-based position in the history list
val_spearman_history = print_spearman_fn.spearman_dict['val_spearman']
max(val_spearman_history), np.argmax(val_spearman_history)

(0.43661, 48)

# 4.3. Sentence Embedding : RoBERTa

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 14.3MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 20.3MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [None]:
import tensorflow as tf
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaModel
 
# Load the pretrained RoBERTa config, tokenizer and TF model from one checkpoint name.
# output_hidden_states=True is requested because the embedding cells slice the per-layer
# hidden states out of the model output.
roberta_checkpoint = 'roberta-base'
config = RobertaConfig.from_pretrained(roberta_checkpoint, output_hidden_states=True)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)
roberta_model = TFRobertaModel.from_pretrained(roberta_checkpoint, config = config)

- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
def _encode_fixed_length(tokenizer, texts, max_length):
  """Tokenize `texts` so that every sequence is truncated/padded to exactly `max_length` tokens.

  Args:
    tokenizer: a transformers tokenizer exposing `batch_encode_plus`.
    texts: the batch of raw strings to encode.
    max_length: fixed sequence length of the encoded output.

  Returns:
    The tokenizer's batch encoding, holding TensorFlow tensors (`return_tensors = "tf"`).
  """
  # padding = 'max_length' is the current spelling of the deprecated pad_to_max_length = True.
  return tokenizer.batch_encode_plus(texts, max_length = max_length, truncation = True,
                                     padding = 'max_length', return_tensors = "tf")

# Tokenizing : title_body (fixed length 512)
title_body_train_tokens = _encode_fixed_length(roberta_tokenizer, title_body_train, 512)
title_body_cv_tokens = _encode_fixed_length(roberta_tokenizer, title_body_cv, 512)

tb_train_input_ids = np.array(title_body_train_tokens['input_ids'])
tb_train_attn_mask = np.array(title_body_train_tokens['attention_mask'])

tb_cv_input_ids = np.array(title_body_cv_tokens['input_ids'])
tb_cv_attn_mask = np.array(title_body_cv_tokens['attention_mask'])

print(tb_train_input_ids.shape, tb_train_attn_mask.shape, tb_cv_input_ids.shape, tb_cv_attn_mask.shape)

# Tokenizing : answer (fixed length 300)
ans_train_tokens = _encode_fixed_length(roberta_tokenizer, answer_train, 300)
ans_cv_tokens = _encode_fixed_length(roberta_tokenizer, answer_cv, 300)

ans_train_input_ids = np.array(ans_train_tokens['input_ids'])
ans_train_attn_mask = np.array(ans_train_tokens['attention_mask'])

ans_cv_input_ids = np.array(ans_cv_tokens['input_ids'])
ans_cv_attn_mask = np.array(ans_cv_tokens['attention_mask'])
# padded answer length, kept as a notebook-level name
seq_len = ans_train_input_ids.shape[1]

print(ans_train_input_ids.shape, ans_train_attn_mask.shape, ans_cv_input_ids.shape, ans_cv_attn_mask.shape)

(5349, 512) (5349, 512) (730, 512) (730, 512)
(5349, 300) (5349, 300) (730, 300) (730, 300)


In [None]:
import tensorflow as tf
# Explicitly switch eager execution on before any model is called; the later save cell calls
# .numpy() on the extracted features (presumably why this is here - TODO confirm it is still
# needed with the installed TensorFlow version).
tf.compat.v1.enable_eager_execution()
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

In [None]:
# title_body RoBERTa embeddings (train and cv)
batch_size = 32

def _roberta_first_token_features(input_ids, attn_mask, batch_size = 32):
  """Run `roberta_model` over the rows in mini-batches and return one feature vector per row.

  For every batch the hidden states are read from index 2 of the model output; the
  first-token vector (`[:, 0, :]`) of each of the last 4 hidden-state tensors is taken
  (no averaging over timesteps) and the four vectors are concatenated on the feature axis.

  Args:
    input_ids: 2-D array of token ids, one row per text.
    attn_mask: attention mask, same shape as `input_ids`.
    batch_size: number of rows sent through the model per call.

  Returns:
    A tensor with one row per input row, the 4 first-token vectors side by side.
  """
  n_rows = input_ids.shape[0]
  batch_features = []
  # Stepping by batch_size (rather than looping (n_rows // batch_size) + 1 times) avoids
  # feeding an empty trailing batch when n_rows is an exact multiple of batch_size.
  for start in tqdm(range(0, n_rows, batch_size)):
    stop = start + batch_size
    hidden_states = roberta_model([input_ids[start:stop], attn_mask[start:stop]])[2]
    first_tokens = [layer[:, 0, :] for layer in hidden_states[-4:]]
    batch_features.append(tf.concat(first_tokens, axis = 1))
  return tf.concat(batch_features, axis = 0)

# train : title_body RoBERTa embeddings
tb_RoBERTa_train = _roberta_first_token_features(tb_train_input_ids, tb_train_attn_mask, batch_size)
print(tb_RoBERTa_train.shape)

# cv : title_body RoBERTa embeddings
# (the row count now comes from the title_body cv ids themselves, not from the answer ids)
tb_RoBERTa_cv = _roberta_first_token_features(tb_cv_input_ids, tb_cv_attn_mask, batch_size)
print(tb_RoBERTa_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 3072)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 3072)


In [None]:
# answer RoBERTa embeddings (train and cv)
batch_size = 32

def _roberta_first_token_features(input_ids, attn_mask, batch_size = 32):
  """Run `roberta_model` over the rows in mini-batches and return one feature vector per row.

  For every batch the hidden states are read from index 2 of the model output; the
  first-token vector (`[:, 0, :]`) of each of the last 4 hidden-state tensors is taken
  (no averaging over timesteps) and the four vectors are concatenated on the feature axis.

  Args:
    input_ids: 2-D array of token ids, one row per text.
    attn_mask: attention mask, same shape as `input_ids`.
    batch_size: number of rows sent through the model per call.

  Returns:
    A tensor with one row per input row, the 4 first-token vectors side by side.
  """
  n_rows = input_ids.shape[0]
  batch_features = []
  # Stepping by batch_size (rather than looping (n_rows // batch_size) + 1 times) avoids
  # feeding an empty trailing batch when n_rows is an exact multiple of batch_size.
  for start in tqdm(range(0, n_rows, batch_size)):
    stop = start + batch_size
    hidden_states = roberta_model([input_ids[start:stop], attn_mask[start:stop]])[2]
    first_tokens = [layer[:, 0, :] for layer in hidden_states[-4:]]
    batch_features.append(tf.concat(first_tokens, axis = 1))
  return tf.concat(batch_features, axis = 0)

# train : ans RoBERTa embeddings
ans_RoBERTa_train = _roberta_first_token_features(ans_train_input_ids, ans_train_attn_mask, batch_size)
print(ans_RoBERTa_train.shape)

# cv : ans RoBERTa embeddings
ans_RoBERTa_cv = _roberta_first_token_features(ans_cv_input_ids, ans_cv_attn_mask, batch_size)
print(ans_RoBERTa_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 3072)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 3072)


In [None]:
# Persist the four RoBERTa feature matrices (as NumPy arrays) in a single compressed archive;
# the keys are the names used again when the archive is loaded.
roberta_arrays = {
  'tb_RoBERTa_train': tb_RoBERTa_train.numpy(),
  'tb_RoBERTa_cv': tb_RoBERTa_cv.numpy(),
  'ans_RoBERTa_train': ans_RoBERTa_train.numpy(),
  'ans_RoBERTa_cv': ans_RoBERTa_cv.numpy(),
}
np.savez_compressed('/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/roberta_outputs', **roberta_arrays)

### Modelling :

In [35]:
# loading roberta pretrained embeddings
# The archive is opened once (not once per array) and closed again when the block exits.
roberta_outputs_path = '/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/roberta_outputs.npz'
with np.load(roberta_outputs_path) as roberta_outputs:
  tb_RoBERTa_train = roberta_outputs['tb_RoBERTa_train']
  tb_RoBERTa_cv = roberta_outputs['tb_RoBERTa_cv']
  ans_RoBERTa_train = roberta_outputs['ans_RoBERTa_train']
  ans_RoBERTa_cv = roberta_outputs['ans_RoBERTa_cv']

In [44]:
# Dense head on the cached RoBERTa features: a title_body tower with 21 sigmoid outputs and an
# answer tower with 9 sigmoid outputs; the model output is the 30 values side by side.
tf.keras.backend.clear_session()
seed = 42
he_normal = tf.keras.initializers.he_normal

# *-----------------title_body-----------------*
# 3072 features -> dropout -> 1024 -> dropout -> 512 -> dropout -> 21 sigmoid units
tb_input = Input(shape = (4 * 768,), dtype = 'float32', name = 'tb_roberta_out')
tb_dropout_1 = Dropout(0.2, seed = seed)(tb_input)

tb_dense_1 = Dense(1024, activation = 'relu', kernel_initializer = he_normal(seed = seed))(tb_dropout_1)
tb_dropout_2 = Dropout(0.2, seed = seed)(tb_dense_1)

tb_dense_2 = Dense(512, activation = 'relu', kernel_initializer = he_normal(seed = seed))(tb_dropout_2)
tb_dropout_3 = Dropout(0.1, seed = seed)(tb_dense_2)

tb_out = Dense(21, activation = 'sigmoid', kernel_initializer = he_normal(seed = seed))(tb_dropout_3)

# *-----------------answer_model-----------------*
# The answer tower sees the 21 title_body predictions next to the answer features.
ans_input = Input(shape = (4 * 768,), dtype = 'float32', name = 'ans_roberta_out')
concat = tf.concat([tb_out, ans_input], axis = -1)
ans_dropout_1 = Dropout(0.2, seed = seed)(concat)

ans_dense_1 = Dense(1024, activation = 'relu', kernel_initializer = he_normal(seed = seed))(ans_dropout_1)
ans_dropout_2 = Dropout(0.2, seed = seed)(ans_dense_1)

ans_dense_2 = Dense(512, activation = 'relu', kernel_initializer = he_normal(seed = seed))(ans_dropout_2)
ans_dropout_3 = Dropout(0.1, seed = seed)(ans_dense_2)

ans_out = Dense(9, activation = 'sigmoid', kernel_initializer = he_normal(seed = seed))(ans_dropout_3)

# *--------------------concat--------------------*
out = tf.concat([tb_out, ans_out], axis = -1)

model = Model(inputs = [tb_input, ans_input], outputs = out)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tb_roberta_out (InputLayer)     [(None, 3072)]       0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 3072)         0           tb_roberta_out[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         3146752     dropout[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1024)         0           dense[0][0]                      
_______________________________________________________________________________________

In [45]:
# Defining callbacks
!rm -r '/content/saved models'
!rm -r '/content/logs'
!mkdir '/content/saved models'
!mkdir '/content/logs/'
 
# tensorboard callback
import datetime
log_dir="logs/" + datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq=1, write_graph=True, write_grads=True)
 
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor= 'val_loss', factor=np.sqrt(0.1), patience=7, verbose=1)
 
checkpt = tf.keras.callbacks.ModelCheckpoint('/content/saved models/weights.-{epoch:03d}-{val_loss:.5f}', monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=True)
 
print_spearman_fn = print_spearman(train_data = ([tb_RoBERTa_train, ans_RoBERTa_train], y_train.values),
                                 validation_data = ([tb_RoBERTa_cv, ans_RoBERTa_cv], y_cv.values))
callbacks = [print_spearman_fn, reduce_lr, checkpt, tensorboard_callback]



In [47]:
# Train the dense head on the cached RoBERTa features (title_body and answer inputs).
# NOTE(review): clear_session() runs here although `model` was already built in an earlier cell -
# confirm this is intended before reordering anything in this cell.
tf.keras.backend.clear_session()
opt = tf.keras.optimizers.Adam(learning_rate = 0.001)
# RMSE is only reported as a metric; the optimised loss is binary cross-entropy.
rmse = tf.keras.metrics.RootMeanSquaredError()
 
# binary_crossentropy is applied to the sigmoid outputs of the model.
model.compile(loss = 'binary_crossentropy', optimizer = opt,  metrics = [rmse])
# Spearman reporting, LR reduction, checkpointing and TensorBoard logging all come from `callbacks`.
history = model.fit(x = [tb_RoBERTa_train, ans_RoBERTa_train], y =  y_train.values,
                    validation_data = ([tb_RoBERTa_cv, ans_RoBERTa_cv], y_cv.values),
                    batch_size = 64, epochs = 75, callbacks = callbacks)

Epoch 1/75
spearman :
train_spearman : 0.24809 | val_spearman : 0.24901 | Learning_Rate : 0.0010000000474974513

Epoch 00001: saving model to /content/saved models/weights.-001-0.41228
Epoch 2/75
spearman :
train_spearman : 0.27542 | val_spearman : 0.27254 | Learning_Rate : 0.0010000000474974513

Epoch 00002: saving model to /content/saved models/weights.-002-0.40281
Epoch 3/75
spearman :
train_spearman : 0.29459 | val_spearman : 0.29435 | Learning_Rate : 0.0010000000474974513

Epoch 00003: saving model to /content/saved models/weights.-003-0.39780
Epoch 4/75
spearman :
train_spearman : 0.31958 | val_spearman : 0.30907 | Learning_Rate : 0.0010000000474974513

Epoch 00004: saving model to /content/saved models/weights.-004-0.39491
Epoch 5/75
spearman :
train_spearman : 0.32488 | val_spearman : 0.31364 | Learning_Rate : 0.0010000000474974513

Epoch 00005: saving model to /content/saved models/weights.-005-0.39732
Epoch 6/75
spearman :
train_spearman : 0.32311 | val_spearman : 0.31672 | L

In [48]:
# Best validation Spearman recorded by the callback, paired with its position in the history list.
# NOTE(review): np.argmax gives a 0-based list position; whether that equals the epoch number
# depends on how print_spearman fills spearman_dict - confirm.
val_history = print_spearman_fn.spearman_dict['val_spearman']
max(val_history), np.argmax(val_history)

(0.41772, 58)

# 4.4. Sentence Embedding : XLNet

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 14.3MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 20.3MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [61]:
import tensorflow as tf
from transformers import XLNetConfig, XLNetTokenizer, TFXLNetModel
 
# Load the pretrained XLNet config, tokenizer and TF model from one checkpoint name.
# output_hidden_states=True is requested because the embedding cells slice the per-layer
# hidden states out of the model output.
xlnet_checkpoint = 'xlnet-base-cased'
config = XLNetConfig.from_pretrained(xlnet_checkpoint, output_hidden_states=True)
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_checkpoint)
xlnet_model = TFXLNetModel.from_pretrained(xlnet_checkpoint, config = config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=565485600.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFXLNetModel for predictions without further training.


In [62]:
def _encode_fixed_length(tokenizer, texts, max_length):
  """Tokenize `texts` so that every sequence is truncated/padded to exactly `max_length` tokens.

  Args:
    tokenizer: a transformers tokenizer exposing `batch_encode_plus`.
    texts: the batch of raw strings to encode.
    max_length: fixed sequence length of the encoded output.

  Returns:
    The tokenizer's batch encoding, holding TensorFlow tensors (`return_tensors = "tf"`).
  """
  # padding = 'max_length' is the current spelling of the deprecated pad_to_max_length = True.
  return tokenizer.batch_encode_plus(texts, max_length = max_length, truncation = True,
                                     padding = 'max_length', return_tensors = "tf")

# Tokenizing : title_body (fixed length 512)
title_body_train_tokens = _encode_fixed_length(xlnet_tokenizer, title_body_train, 512)
title_body_cv_tokens = _encode_fixed_length(xlnet_tokenizer, title_body_cv, 512)

tb_train_input_ids = np.array(title_body_train_tokens['input_ids'])
tb_train_attn_mask = np.array(title_body_train_tokens['attention_mask'])
tb_train_token_typ_ids = np.array(title_body_train_tokens['token_type_ids'])

tb_cv_input_ids = np.array(title_body_cv_tokens['input_ids'])
tb_cv_attn_mask = np.array(title_body_cv_tokens['attention_mask'])
tb_cv_token_typ_ids = np.array(title_body_cv_tokens['token_type_ids'])

print(tb_train_input_ids.shape, tb_train_attn_mask.shape, tb_train_token_typ_ids.shape, tb_cv_input_ids.shape, tb_cv_attn_mask.shape, tb_cv_token_typ_ids.shape)

# Tokenizing : answer (fixed length 512)
ans_train_tokens = _encode_fixed_length(xlnet_tokenizer, answer_train, 512)
ans_cv_tokens = _encode_fixed_length(xlnet_tokenizer, answer_cv, 512)

ans_train_input_ids = np.array(ans_train_tokens['input_ids'])
ans_train_attn_mask = np.array(ans_train_tokens['attention_mask'])
ans_train_token_typ_ids = np.array(ans_train_tokens['token_type_ids'])

ans_cv_input_ids = np.array(ans_cv_tokens['input_ids'])
ans_cv_attn_mask = np.array(ans_cv_tokens['attention_mask'])
ans_cv_token_typ_ids = np.array(ans_cv_tokens['token_type_ids'])
# padded answer length, kept as a notebook-level name
seq_len = ans_train_input_ids.shape[1]

print(ans_train_input_ids.shape, ans_train_attn_mask.shape, ans_train_token_typ_ids.shape, ans_cv_input_ids.shape, ans_cv_attn_mask.shape, ans_cv_token_typ_ids.shape)

(5349, 512) (5349, 512) (5349, 512) (730, 512) (730, 512) (730, 512)
(5349, 512) (5349, 512) (5349, 512) (730, 512) (730, 512) (730, 512)


In [63]:
import tensorflow as tf
# Explicitly switch eager execution on before any model is called; the later save cell calls
# .numpy() on the extracted features (presumably why this is here - TODO confirm it is still
# needed with the installed TensorFlow version).
tf.compat.v1.enable_eager_execution()
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

In [65]:
# title_body XLNet embeddings (train and cv)
batch_size = 32

def _xlnet_mean_features(input_ids, attn_mask, batch_size = 32):
  """Run `xlnet_model` over the rows in mini-batches and return one feature vector per row.

  For every batch the hidden states are read from index 1 of the model output; each of the
  last 4 hidden-state tensors is averaged over axis 1 and the four averages are concatenated
  on the feature axis.

  NOTE(review): the average runs over every position of axis 1 (presumably the token axis),
  padded positions included - the attention mask is not applied to the mean. Confirm intended.

  Args:
    input_ids: 2-D array of token ids, one row per text.
    attn_mask: attention mask, same shape as `input_ids`.
    batch_size: number of rows sent through the model per call.

  Returns:
    A tensor with one row per input row, the 4 averaged vectors side by side.
  """
  n_rows = input_ids.shape[0]
  batch_features = []
  # Stepping by batch_size (rather than looping (n_rows // batch_size) + 1 times) avoids
  # feeding an empty trailing batch when n_rows is an exact multiple of batch_size.
  for start in tqdm(range(0, n_rows, batch_size)):
    stop = start + batch_size
    hidden_states = xlnet_model([input_ids[start:stop], attn_mask[start:stop]])[1]
    layer_means = [tf.reduce_mean(layer, axis = 1) for layer in hidden_states[-4:]]
    batch_features.append(tf.concat(layer_means, axis = 1))
  return tf.concat(batch_features, axis = 0)

# train : title_body XLNet embeddings
tb_XLNet_train = _xlnet_mean_features(tb_train_input_ids, tb_train_attn_mask, batch_size)
print(tb_XLNet_train.shape)

# cv : title_body XLNet embeddings
tb_XLNet_cv = _xlnet_mean_features(tb_cv_input_ids, tb_cv_attn_mask, batch_size)
print(tb_XLNet_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 3072)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 3072)


In [66]:
# answer XLNet embeddings (train and cv)
batch_size = 32

def _xlnet_mean_features(input_ids, attn_mask, batch_size = 32):
  """Run `xlnet_model` over the rows in mini-batches and return one feature vector per row.

  For every batch the hidden states are read from index 1 of the model output; each of the
  last 4 hidden-state tensors is averaged over axis 1 and the four averages are concatenated
  on the feature axis.

  NOTE(review): the average runs over every position of axis 1 (presumably the token axis),
  padded positions included - the attention mask is not applied to the mean. Confirm intended.

  Args:
    input_ids: 2-D array of token ids, one row per text.
    attn_mask: attention mask, same shape as `input_ids`.
    batch_size: number of rows sent through the model per call.

  Returns:
    A tensor with one row per input row, the 4 averaged vectors side by side.
  """
  n_rows = input_ids.shape[0]
  batch_features = []
  # Stepping by batch_size (rather than looping (n_rows // batch_size) + 1 times) avoids
  # feeding an empty trailing batch when n_rows is an exact multiple of batch_size.
  for start in tqdm(range(0, n_rows, batch_size)):
    stop = start + batch_size
    hidden_states = xlnet_model([input_ids[start:stop], attn_mask[start:stop]])[1]
    layer_means = [tf.reduce_mean(layer, axis = 1) for layer in hidden_states[-4:]]
    batch_features.append(tf.concat(layer_means, axis = 1))
  return tf.concat(batch_features, axis = 0)

# train : ans XLNet embeddings
ans_XLNet_train = _xlnet_mean_features(ans_train_input_ids, ans_train_attn_mask, batch_size)
print(ans_XLNet_train.shape)

# cv : ans XLNet embeddings
ans_XLNet_cv = _xlnet_mean_features(ans_cv_input_ids, ans_cv_attn_mask, batch_size)
print(ans_XLNet_cv.shape)

HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))


(5349, 3072)


HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))


(730, 3072)


In [None]:
# Persist the four XLNet feature matrices (as NumPy arrays) in a single compressed archive;
# the keys are the names used again when the archive is loaded.
xlnet_arrays = {
  'tb_XLNet_train': tb_XLNet_train.numpy(),
  'tb_XLNet_cv': tb_XLNet_cv.numpy(),
  'ans_XLNet_train': ans_XLNet_train.numpy(),
  'ans_XLNet_cv': ans_XLNet_cv.numpy(),
}
np.savez_compressed('/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/xlnet_outputs', **xlnet_arrays)

### Modelling :

In [81]:
# loading pretrained xlnet embeddings
# The archive is opened once (not once per array) and closed again when the block exits.
xlnet_outputs_path = '/content/drive/My Drive/AAIC Course/Personal case study 2  - Google Quest/xlnet_outputs.npz'
with np.load(xlnet_outputs_path) as xlnet_outputs:
  tb_XLNet_train = xlnet_outputs['tb_XLNet_train']
  tb_XLNet_cv = xlnet_outputs['tb_XLNet_cv']
  ans_XLNet_train = xlnet_outputs['ans_XLNet_train']
  ans_XLNet_cv = xlnet_outputs['ans_XLNet_cv']

In [82]:
# Dense head on the cached XLNet features: two independent towers (title_body -> 21 sigmoid
# outputs, answer -> 9 sigmoid outputs) whose outputs are placed side by side.
tf.keras.backend.clear_session()
seed = 42
he_normal = tf.keras.initializers.he_normal

# *-----------------title_body-----------------*
# 3072 features -> dropout -> 1536 -> dropout -> 512 -> dropout -> 21 sigmoid units
# NOTE(review): both Input names still say "roberta" although this model is trained on the
# XLNet features; the names are left unchanged here.
tb_input = Input(shape = (4 * 768,), dtype = 'float32', name = 'tb_roberta_out')
tb_dropout_1 = Dropout(0.2, seed = seed)(tb_input)

tb_dense_1 = Dense(1536, activation = 'relu', kernel_initializer = he_normal(seed = seed))(tb_dropout_1)
tb_dropout_2 = Dropout(0.2, seed = seed)(tb_dense_1)

tb_dense_2 = Dense(512, activation = 'relu', kernel_initializer = he_normal(seed = seed))(tb_dropout_2)
tb_dropout_3 = Dropout(0.1, seed = seed)(tb_dense_2)

tb_out = Dense(21, activation = 'sigmoid', kernel_initializer = he_normal(seed = seed))(tb_dropout_3)

# *-----------------answer_model-----------------*
# The answer tower is fed the answer features only; tb_out is not concatenated into its input.
ans_input = Input(shape = (4 * 768,), dtype = 'float32', name = 'ans_roberta_out')
ans_dropout_1 = Dropout(0.2, seed = seed)(ans_input)

ans_dense_1 = Dense(1024, activation = 'relu', kernel_initializer = he_normal(seed = seed))(ans_dropout_1)
ans_dropout_2 = Dropout(0.2, seed = seed)(ans_dense_1)

ans_dense_2 = Dense(512, activation = 'relu', kernel_initializer = he_normal(seed = seed))(ans_dropout_2)
ans_dropout_3 = Dropout(0.1, seed = seed)(ans_dense_2)

ans_out = Dense(9, activation = 'sigmoid', kernel_initializer = he_normal(seed = seed))(ans_dropout_3)

# *--------------------concat--------------------*
out = tf.concat([tb_out, ans_out], axis = -1)

model = Model(inputs = [tb_input, ans_input], outputs = out)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tb_roberta_out (InputLayer)     [(None, 3072)]       0                                            
__________________________________________________________________________________________________
ans_roberta_out (InputLayer)    [(None, 3072)]       0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 3072)         0           tb_roberta_out[0][0]             
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 3072)         0           ans_roberta_out[0][0]            
_______________________________________________________________________________________

In [83]:
# Defining callbacks
!rm -r '/content/saved models'
!rm -r '/content/logs'
!mkdir '/content/saved models'
!mkdir '/content/logs/'
 
# tensorboard callback
import datetime
log_dir="logs/" + datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq=1, write_graph=True, write_grads=True)
 
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor= 'val_loss', factor=np.sqrt(0.1), patience=7, verbose=1)
 
checkpt = tf.keras.callbacks.ModelCheckpoint('/content/saved models/weights.-{epoch:03d}-{val_loss:.5f}', monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=True)
 
print_spearman_fn = print_spearman(train_data = ([tb_XLNet_train, ans_XLNet_train], y_train.values),
                                 validation_data = ([tb_XLNet_cv, ans_XLNet_cv], y_cv.values))
callbacks = [print_spearman_fn, reduce_lr, checkpt, tensorboard_callback]



In [84]:
# Train the dense head on the cached XLNet features (title_body and answer inputs).
# NOTE(review): clear_session() runs here although `model` was already built in an earlier cell -
# confirm this is intended before reordering anything in this cell.
tf.keras.backend.clear_session()
opt = tf.keras.optimizers.Adam(learning_rate = 0.0001)
# RMSE is only reported as a metric; the optimised loss is binary cross-entropy.
rmse = tf.keras.metrics.RootMeanSquaredError()
 
# binary_crossentropy is applied to the sigmoid outputs of the model.
model.compile(loss = 'binary_crossentropy', optimizer = opt,  metrics = [rmse])
# Spearman reporting, LR reduction, checkpointing and TensorBoard logging all come from `callbacks`.
history = model.fit(x = [tb_XLNet_train, ans_XLNet_train], y =  y_train.values,
                    validation_data = ([tb_XLNet_cv, ans_XLNet_cv], y_cv.values),
                    batch_size = 64, epochs = 75, callbacks = callbacks)

Epoch 1/75
spearman :
train_spearman : 0.16392 | val_spearman : 0.15531 | Learning_Rate : 9.999999747378752e-05

Epoch 00001: saving model to /content/saved models/weights.-001-0.42974
Epoch 2/75
spearman :
train_spearman : 0.17559 | val_spearman : 0.17226 | Learning_Rate : 9.999999747378752e-05

Epoch 00002: saving model to /content/saved models/weights.-002-0.42308
Epoch 3/75
spearman :
train_spearman : 0.19335 | val_spearman : 0.20855 | Learning_Rate : 9.999999747378752e-05

Epoch 00003: saving model to /content/saved models/weights.-003-0.41867
Epoch 4/75
spearman :
train_spearman : 0.21641 | val_spearman : 0.21935 | Learning_Rate : 9.999999747378752e-05

Epoch 00004: saving model to /content/saved models/weights.-004-0.41265
Epoch 5/75
spearman :
train_spearman : 0.24054 | val_spearman : 0.24004 | Learning_Rate : 9.999999747378752e-05

Epoch 00005: saving model to /content/saved models/weights.-005-0.40907
Epoch 6/75
spearman :
train_spearman : 0.2705 | val_spearman : 0.25795 | Le

In [85]:
# Best validation Spearman recorded by the callback, paired with its position in the history list.
# NOTE(review): np.argmax gives a 0-based list position; whether that equals the epoch number
# depends on how print_spearman fills spearman_dict - confirm.
val_history = print_spearman_fn.spearman_dict['val_spearman']
max(val_history), np.argmax(val_history)

(0.37525, 64)

### Final Results :


In [90]:
# Final Results: train / validation Spearman per model, entered by hand from the runs above.
from prettytable import PrettyTable

result_rows = [
  ["baseline_LSTM", 0.4995, 0.34899],
  ["BERT", 0.50081, 0.39536],
  ["USE",  0.52245, 0.4366],
  ["RoBERTa", 0.48059, 0.41772],
  ["XLNet", 0.52237, 0.37525],
]

x = PrettyTable()
x.field_names = ["Model", "train_spearman", "val_spearman"]
for result_row in result_rows:
  x.add_row(result_row)

print(x)

+---------------+----------------+--------------+
|     Model     | train_spearman | val_spearman |
+---------------+----------------+--------------+
| baseline_LSTM |     0.4995     |   0.34899    |
|      BERT     |    0.50081     |   0.39536    |
|      USE      |    0.52245     |    0.4366    |
|    RoBERTa    |    0.48059     |   0.41772    |
|     XLNet     |    0.52237     |   0.37525    |
+---------------+----------------+--------------+
