## Simple BERT test

This notebook tests the preprocessed dataset with the minimalistic BERT model from: https://www.kaggle.com/khoongweihao/bert-base-tf2-0-minimalistic-iii

In [1]:
import tensorflow as tf
import numpy as np
import random
import os


def set_seeds(seed):
    """Seed every random number source used by this notebook.

    Seeds TensorFlow, NumPy's global generator and the stdlib ``random``
    module with ``seed``, then exports ``PYTHONHASHSEED`` with the same value.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so the
    assignment below likely only affects child processes -- confirm.
    """
    seeders = (tf.random.set_seed, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

# Single seed value handed to set_seeds() for all random number generators.
SEED = 21937
set_seeds(SEED)

In [2]:
import pandas as pd

# Load the train/test CSVs and the sample submission from the competition input folder.
# The trailing comments record an optional index column that is currently not used.
train_dset = pd.read_csv("../input/google-quest-challenge/train.csv")   # index_col='qa_id'
test_dset = pd.read_csv("../input/google-quest-challenge/test.csv")   # index_col='qa_id'
df_sub = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')

# Free-text columns (the same three names are listed again as input_categories below).
free_text_columns = ['question_title', 'question_body', 'answer']

# Column groupings kept for reference; neither list is referenced by the cells visible here.
category_columns = ['host', 'category']

discard_columns = ['question_user_name', 'question_user_page',  'answer_user_name', 'answer_user_page', 'url']

# The 30 label columns; reused below as output_categories and for per-column score printing.
target_columns = ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
                  'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
                  'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
                  'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
                  'question_type_compare', 'question_type_consequence', 'question_type_definition',
                  'question_type_entity', 'question_type_instructions', 'question_type_procedure',
                  'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
                  'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
                  'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
                  'answer_type_reason_explanation', 'answer_well_written']

## BERT

In [3]:
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from tensorflow.keras.models import load_model

In [4]:
# Competition data folder (the CSVs above are read with the same path spelled out literally).
PATH = '../input/google-quest-challenge/'
# Local copy of the BERT module; used for both the vocabulary file and hub.KerasLayer.
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
# Module-level tokenizer; _trim_input reads this global directly.
# NOTE(review): the second positional argument is presumably do_lower_case -- confirm against bert_tokenization.
tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)
# Fixed length of every encoded row; also the input width of the Keras model.
MAX_SEQUENCE_LENGTH = 512


# Label columns to predict and text columns fed to the tokenizer.
output_categories = target_columns
input_categories = ['question_title', 'question_body', 'answer']

In [5]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize title/question/answer and trim them to fit one encoded row.

    Tokenization uses the module-level ``tokenizer``. If the three token lists
    plus 4 extra slots already fit in ``max_sequence_length`` they are returned
    untouched. Otherwise each part gets a budget (``t_max_len``, ``q_max_len``,
    ``a_max_len``) and budget a shorter part does not need is handed on:
    spare title budget is split between answer (floor half) and question
    (ceil half), then spare answer budget goes to the question, or else spare
    question budget goes to the answer.

    Returns:
        Tuple ``(title_tokens, question_tokens, answer_tokens)``.

    Raises:
        ValueError: if the final budgets plus 4 do not add up to exactly
            ``max_sequence_length``.
    """
    title_tokens = tokenizer.tokenize(title)
    question_tokens = tokenizer.tokenize(question)
    answer_tokens = tokenizer.tokenize(answer)

    n_title = len(title_tokens)
    n_question = len(question_tokens)
    n_answer = len(answer_tokens)

    # Everything fits (4 slots are reserved on top of the tokens): nothing to trim.
    if n_title + n_question + n_answer + 4 <= max_sequence_length:
        return title_tokens, question_tokens, answer_tokens

    # Title: keep it whole if it is under budget and pass the spare slots on.
    title_spare = t_max_len - n_title
    if title_spare > 0:
        keep_title = n_title
        a_max_len += floor(title_spare / 2)
        q_max_len += ceil(title_spare / 2)
    else:
        keep_title = t_max_len

    # Answer/question: whichever is under budget donates its spare slots to the other.
    if a_max_len > n_answer:
        keep_answer = n_answer
        keep_question = q_max_len + (a_max_len - n_answer)
    elif q_max_len > n_question:
        keep_answer = a_max_len + (q_max_len - n_question)
        keep_question = n_question
    else:
        keep_answer = a_max_len
        keep_question = q_max_len

    trimmed_total = keep_title + keep_answer + keep_question + 4
    if trimmed_total != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d" 
                         % (max_sequence_length, trimmed_total))

    return (title_tokens[:keep_title],
            question_tokens[:keep_question],
            answer_tokens[:keep_answer])

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Turn three token lists into padded ids, masks and segment ids.

    The lists are joined as "[CLS]" title "[SEP]" question "[SEP]" answer "[SEP]"
    and handed to the three ``_get_*`` helpers.

    Returns:
        ``[input_ids, input_masks, input_segments]``.
    """
    stoken = ["[CLS]"]
    for part in (title, question, answer):
        # Every part is closed by its own separator token.
        stoken = stoken + part + ["[SEP]"]

    return [
        _get_ids(stoken, tokenizer, max_sequence_length),
        _get_masks(stoken, max_sequence_length),
        _get_segments(stoken, max_sequence_length),
    ]

def compute_input_arays(df, columns, prefix, tokenizer, max_sequence_length):
    """Encode every row of ``df`` into BERT input arrays.

    Args:
        df: DataFrame holding the text columns.
        columns: columns selected from ``df`` before iterating; must include
            ``prefix + 'question_title'``, ``prefix + 'question_body'`` and
            ``prefix + 'answer'``.
        prefix: string prepended to the three column names when reading a row.
        tokenizer: passed to ``_convert_to_bert_inputs`` for id lookup. Note that
            ``_trim_input`` is called without it and tokenizes with the
            module-level ``tokenizer``.
        max_sequence_length: length of every encoded row.

    Returns:
        ``[input_ids, input_masks, input_segments]``, each an int32 array with
        one row per row of ``df``.
    """
    input_ids, input_masks, input_segments = [], [], []
    # iterrows() is a generator without a length, so give tqdm the row count
    # explicitly; otherwise the progress bar cannot show a total.
    for _, instance in tqdm(df[columns].iterrows(), total=len(df)):
        t = instance[prefix+'question_title']
        q = instance[prefix+'question_body']
        a = instance[prefix+'answer']

        t, q, a = _trim_input(t, q, a, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
        
    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df, columns):
    """Return the ``columns`` of ``df`` as a NumPy array (one row per row of ``df``)."""
    selected = df[columns]
    return np.asarray(selected)

In [6]:
def compute_spearmanr(trues, preds):
    """Mean column-wise Spearman correlation between ``trues`` and ``preds``.

    Args:
        trues: 2-D array of targets, one column per target.
        preds: 2-D array of predictions with the same layout.

    Returns:
        The mean of the per-column correlations, ignoring NaN entries.

    Tiny Gaussian noise (std 1e-7) is added to every prediction column before
    ranking. Columns whose correlation is NaN (e.g. a constant target column)
    are skipped via ``np.nanmean`` -- as ``compute_spearmanr2`` already does --
    so that one undefined column does not turn the whole score into NaN.
    """
    rhos = [
        spearmanr(col_trues, col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])).correlation
        for col_trues, col_pred in zip(trues.T, preds.T)
    ]
    return np.nanmean(rhos)


def compute_spearmanr2(preds, trues):
    """Column-wise Spearman correlations plus their NaN-ignoring mean.

    Note the argument order: predictions first, targets second. Tiny Gaussian
    noise (std 1e-7) is added to every prediction column before ranking.

    Returns:
        ``(mean_rho, rhos)`` where ``rhos`` is the list of per-column values.
    """
    per_column = [
        spearmanr(true_col, pred_col + np.random.normal(0, 1e-7, pred_col.shape[0])).correlation
        for true_col, pred_col in zip(trues.T, preds.T)
    ]
    return np.nanmean(per_column), per_column


class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that scores validation data and predicts the test set each epoch.

    After every epoch the model's validation and test predictions are appended
    to ``valid_predictions`` / ``test_predictions``. The printed validation
    score is computed on the average of all validation predictions collected
    so far, not only on the latest epoch.
    """

    def __init__(self, valid_data, test_data, batch_size=16, fold=None):
        """
        Args:
            valid_data: pair ``(inputs, outputs)`` used for validation.
            test_data: inputs to predict after every epoch.
            batch_size: batch size used for the ``predict`` calls.
            fold: stored on the instance; not used by the callback itself.
        """
        # Let the Keras base class initialise its own attributes first.
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data

        self.batch_size = batch_size
        self.fold = fold

    def on_train_begin(self, logs=None):
        """Start every ``fit`` call with empty prediction lists."""
        self.valid_predictions = []
        self.test_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's predictions and print the running validation score."""
        predicted = self.model.predict(self.valid_inputs, batch_size=self.batch_size)
        self.valid_predictions.append(predicted)

        # Score the mean over all epochs so far.
        rho_val = compute_spearmanr(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))

        print(f'epoch = {epoch}, valid_spearman = {rho_val}') 
        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size))

    def on_train_end(self, logs=None):
        """Print the Spearman correlation of each target column on the epoch-averaged predictions."""
        _, rho_cols = compute_spearmanr2(np.average(self.valid_predictions, axis=0), self.valid_outputs)
        for column, rho in zip(target_columns, rho_cols):
            print(column + " rho: " + str(rho))
       
        
        
def bert_model():
    """Build the Keras model: BERT layer, average pooling, dropout, 30 sigmoid outputs.

    The model takes three int32 inputs of width ``MAX_SEQUENCE_LENGTH`` (word
    ids, masks, segment ids). The second output of the hub layer is averaged
    over its sequence axis, passed through dropout (rate 0.2) and a 30-unit
    sigmoid dense layer. The BERT layer is loaded from ``BERT_PATH`` with
    ``trainable=True``.
    """
    def _token_input(name):
        # All three inputs share shape and dtype; only the layer name differs.
        return tf.keras.layers.Input(
            (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=name)

    word_ids = _token_input('input_word_ids')
    masks = _token_input('input_masks')
    segments = _token_input('input_segments')
    bert_inputs = [word_ids, masks, segments]

    encoder = hub.KerasLayer(BERT_PATH, trainable=True)
    # The first output of the layer is discarded; only the second is used.
    _, token_embeddings = encoder(bert_inputs)

    pooled = tf.keras.layers.GlobalAveragePooling1D()(token_embeddings)
    regularised = tf.keras.layers.Dropout(0.2)(pooled)
    scores = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(regularised)

    return tf.keras.models.Model(inputs=bert_inputs, outputs=scores)
        
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold):
    """Compile and fit ``model``, collecting per-epoch predictions in a callback.

    Args:
        model: Keras model to train.
        train_data: pair ``(inputs, outputs)`` passed to ``model.fit``.
        valid_data: pair ``(inputs, outputs)`` scored by the callback each epoch.
        test_data: inputs the callback predicts each epoch.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of training epochs.
        batch_size: batch size for training and for the callback's predictions.
        loss_function: loss passed to ``model.compile``.
        fold: fold identifier, forwarded to the callback.

    Returns:
        The ``CustomCallback`` instance, which holds ``valid_predictions`` and
        ``test_predictions`` (one entry per epoch).
    """
    custom_callback = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]), 
        test_data=test_data,
        batch_size=batch_size,
        # Forward the caller's fold instead of discarding it.
        fold=fold)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=optimizer)
    model.fit(train_data[0], train_data[1], epochs=epochs, 
              batch_size=batch_size, callbacks=[custom_callback])
    
    return custom_callback

In [7]:
# GroupKFold.split() returns a one-shot generator. Materialise the folds as a
# list so the training cell can be re-run without silently iterating zero folds.
# Rows are grouped by question body, so rows sharing a question body stay in the same fold.
gkf = list(GroupKFold(n_splits=5).split(X=train_dset.question_body, groups=train_dset.question_body))

# Column-name prefix used when reading the text columns ('' = raw columns).
prefix = '' # 'clean_'
outputs = compute_output_arrays(train_dset, output_categories)
inputs = compute_input_arays(train_dset, input_categories, prefix, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(test_dset, input_categories, prefix, tokenizer, MAX_SEQUENCE_LENGTH)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# One entry per fold: the CustomCallback returned by train_and_predict.
histories = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    print(f"fold {fold}")
    # Clear the Keras session, then build a fresh model for this fold.
    K.clear_session()
    model = bert_model()

    # `inputs` is [ids, masks, segments]; slice each of the three arrays by row index.
    train_inputs = [inputs[i][train_idx] for i in range(3)]
    train_outputs = outputs[train_idx]

    valid_inputs = [inputs[i][valid_idx] for i in range(3)]
    valid_outputs = outputs[valid_idx]

    # `history` is the CustomCallback used during fit; its `valid_predictions`
    # and `test_predictions` attributes each hold one prediction array per epoch.
    history = train_and_predict(model, 
                      train_data=(train_inputs, train_outputs), 
                      valid_data=(valid_inputs, valid_outputs),
                      test_data=test_inputs, 
                      learning_rate=3e-5, epochs=3, batch_size=8,
                      loss_function='binary_crossentropy', fold=fold)

    histories.append(history)

fold 0
Train on 4863 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
question_asker_intent_understanding rho: 0.3962614795833214
question_body_critical rho: 0.6255129701076222
question_conversational rho: 0.3860775318581291
question_expect_short_answer rho: 0.30054662483066313
question_fact_seeking rho: 0.3632949828196535
question_has_commonly_accepted_answer rho: 0.4169553828292324
question_interestingness_others rho: 0.3471705354634398
question_interestingness_self rho: 0.4616583557582801
question_multi_intent rho: 0.5425816918929964
question_not_really_a_question rho: 0.060480286448173354
question_opinion_seeking rho: 0.4325403176080822
question_type_choice rho: 0.7172161615453545
question_type_compare rho: 0.35719615202944327
question_type_consequence rho: 0.14642165466807872
question_type_definition rho: 0.37512403508421754
question_type_entity rho: 0.5156850579565658
question_type_instructions rho: 0.7890344331334717
question_type_procedure rho: 0.3702830489013259
question_type_reason_expl

In [9]:
# For every fold, average its test predictions over the epochs ...
fold_test_predictions = [np.average(cb.test_predictions, axis=0) for cb in histories]
# ... then average the per-fold results into the final prediction matrix.
test_predictions = np.mean(fold_test_predictions, axis=0)

# Every column after the first one of the sample submission receives the predictions.
df_sub.iloc[:, 1:] = test_predictions

df_sub.to_csv('submission.csv', index=False)