In [None]:
import math
import glob
import sys
import gc
import os
import re
from math import floor, ceil

import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

#TF&K
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, losses, models, callbacks
from tensorflow.keras.initializers import *
from tensorflow.keras.regularizers import *
from tensorflow.keras.constraints import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.layers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
print(tf.__version__)

import spacy
from transformers import *

from sklearn.model_selection import GroupKFold
from scipy.stats import spearmanr

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm_notebook
tqdm_notebook().pandas()

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import os

In [None]:
def seed_everything(seed=0):
    """Seed every random number generator the notebook can reach.

    Args:
        seed: Integer seed applied to the ``PYTHONHASHSEED`` environment
            variable, the stdlib ``random`` module, NumPy's global RNG and
            TensorFlow's global RNG.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    # Setting PYTHONHASHSEED here only influences interpreters started later
    # (e.g. worker subprocesses); the running kernel has already fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Seed value used for this run; the RNGs are seeded once here, right after
# seed_everything is defined.
SEED = 69
seed_everything(SEED)

In [None]:
# Lower-case contraction / misspelling -> expansion table used by
# replace_typical_misspell. Every key appears exactly once: the former duplicate
# "i'd" entry (whose second value silently overrode the first) and the repeated
# "didn't" entry were removed, and "we'll" now keeps its pronoun.
mispell_dict = {"aren't" : "are not",          "can't" : "cannot",
                "couldn't" : "could not",      "couldnt" : "could not",
                "didn't" : "did not",          "doesn't" : "does not",
                "doesnt" : "does not",         "don't" : "do not",
                "hadn't" : "had not",          "hasn't" : "has not",
                "haven't" : "have not",        "havent" : "have not",
                "he'd" : "he would",           "he'll" : "he will",
                "he's" : "he is",              "i'd" : "I would",
                "i'll" : "I will",             "i'm" : "I am",
                "isn't" : "is not",            "it's" : "it is",
                "it'll" : "it will",           "i've" : "I have",
                "let's" : "let us",            "mightn't" : "might not",
                "mustn't" : "must not",        "shan't" : "shall not",
                "she'd" : "she would",         "she'll" : "she will",
                "she's" : "she is",            "shouldn't" : "should not",
                "shouldnt" : "should not",     "that's" : "that is",
                "thats" : "that is",           "there's" : "there is",
                "theres" : "there is",         "they'd" : "they would",
                "they'll" : "they will",       "they're" : "they are",
                "theyre" : "they are",         "they've" : "they have",
                "we'd" : "we would",           "we're" : "we are",
                "weren't" : "were not",        "we've" : "we have",
                "what'll" : "what will",       "what're" : "what are",
                "what's" : "what is",          "what've" : "what have",
                "where's" : "where is",        "who'd" : "who would",
                "who'll" : "who will",         "who're" : "who are",
                "who's" : "who is",            "who've" : "who have",
                "won't" : "will not",          "wouldn't" : "would not",
                "you'd" : "you would",         "you'll" : "you will",
                "you're" : "you are",          "you've" : "you have",
                "'re" : " are",                "wasn't" : "was not",
                "we'll" : "we will",           "tryin'" : "trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


def replace_typical_misspell(text):
    """Return ``text`` with every key of ``mispell_dict`` swapped for its expansion."""
    lookup, pattern = _get_mispell(mispell_dict)
    # Each match is exactly one dictionary key, so it can be looked up directly.
    return pattern.sub(lambda hit: lookup[hit.group(0)], text)


# Single characters that clean_text isolates by surrounding them with spaces.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
          '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
          '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑',
          '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
          '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫',
          '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・',
          '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# One-pass translation table: each listed character maps to itself padded with a
# space on both sides. Built once instead of running one str.replace per character
# on every call.
_PUNCT_PADDING = str.maketrans({punct: f' {punct} ' for punct in puncts})

def clean_text(x):
    """Turn newlines into spaces and pad every character in ``puncts`` with spaces.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string. Surrounding/double spaces are not collapsed here.
    """
    # A newline becomes a space (not the empty string) so the last word of one
    # line is not glued to the first word of the next.
    return str(x).replace("\n", " ").translate(_PUNCT_PADDING)

def cln_space(x):
    """Collapse each run of whitespace into one space and trim both ends."""
    words = x.split()
    return " ".join(words)

def preprocess(x):
    """Normalise one raw text field before tokenisation.

    Steps, in order: lower-case, expand the contractions listed in
    ``mispell_dict``, pad punctuation with spaces, collapse whitespace.

    Args:
        x: Raw cell value; non-string values are converted with ``str()``.

    Returns:
        The normalised string.
    """
    # str() first so a non-string cell does not crash on .lower().
    x = str(x).lower()
    # Contractions must be expanded BEFORE clean_text runs: clean_text pads "'"
    # with spaces, after which no key containing an apostrophe can match.
    x = replace_typical_misspell(x)
    x = clean_text(x)
    x = cln_space(x)

    return x

In [None]:
# File name of a pretrained checkpoint; not referenced again in the cells visible here.
MODEL = 'roberta-base-tf_model.h5'
# Input directories (relative paths): competition CSVs and the RoBERTa files.
PATH = '../input/google-quest-challenge/'
ROBERTA_PATH = '../input/roberta-base-tf2/'
# NOTE(review): do_lower_case is passed through to the tokenizer; confirm that
# RobertaTokenizer actually honours it (preprocess() lower-cases the text anyway).
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_PATH, do_lower_case=True)
# Fixed length every encoded sequence is padded to; also the model's input width.
MAX_SEQUENCE_LENGTH = 512

df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Targets: every train column from position 11 onwards.
output_categories = list(df_train.columns[11:])
# Text inputs selected by position (1, 2, 5); compute_input_arrays later reads
# them by name as question_title, question_body and answer.
input_categories = list(df_train.columns[[1,2,5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [None]:
# Run preprocess() over the three text columns of the train frame, then the test
# frame, overwriting each column with its normalised version.
for frame in (df_train, df_test):
    for column in ('question_body', 'answer', 'question_title'):
        frame[column] = frame[column].progress_map(preprocess)

In [None]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for transformer (including bert)"""
    
    def return_id(str1, str2, truncation_strategy, length):

        inputs = tokenizer.encode_plus(str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation_strategy=truncation_strategy)
        
        input_ids =  inputs["input_ids"]
        input_masks = [1] * len(input_ids)
        input_segments = inputs["token_type_ids"]
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        #print(input_ids)
        #print(input_masks)

        return [input_ids, input_masks, input_segments]
    
    input_ids_q, input_masks_q, input_segments_q = return_id(
        title + ' ' + question, None, 'longest_first', max_sequence_length)
    
    input_ids_a, input_masks_a, input_segments_a = return_id(
        answer, None, 'longest_first', max_sequence_length)
    
    return [input_ids_q, input_masks_q, input_segments_q,
            input_ids_a, input_masks_a, input_segments_a]



def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Encode every row of ``df[columns]`` into six int32 arrays.

    Each row's ``question_title``, ``question_body`` and ``answer`` fields are
    passed to ``_convert_to_transformer_inputs``.

    Returns:
        List of six ``np.int32`` arrays in the order: question ids, question
        masks, question segments, answer ids, answer masks, answer segments.
    """
    # One accumulator per output array, in output order.
    collected = [[] for _ in range(6)]

    for _, row in tqdm(df[columns].iterrows()):
        encoded = _convert_to_transformer_inputs(
            row.question_title, row.question_body, row.answer,
            tokenizer, max_sequence_length)
        for bucket, feature in zip(collected, encoded):
            bucket.append(feature)

    return [np.asarray(bucket, dtype=np.int32) for bucket in collected]

def compute_output_arrays(df, columns):
    """Return the ``columns`` of ``df`` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)


def compute_spearmanr_ignore_nan(trues, preds):
    """Mean column-wise Spearman correlation, skipping columns whose score is NaN."""
    column_pairs = zip(np.transpose(trues), np.transpose(preds))
    scores = [spearmanr(true_col, pred_col).correlation
              for true_col, pred_col in column_pairs]
    return np.nanmean(scores)

def compute_spearmanr(trues, preds):
    """Mean column-wise Spearman correlation between ``trues`` and ``preds``.

    Gaussian noise with standard deviation 1e-7, drawn from NumPy's global RNG,
    is added to each prediction column before the correlation is computed.
    """
    scores = []
    for true_col, pred_col in zip(trues.T, preds.T):
        jitter = np.random.normal(0, 1e-7, pred_col.shape[0])
        scores.append(spearmanr(true_col, pred_col + jitter).correlation)
    return np.mean(scores)

In [None]:
def step_decay(epoch):
    '''
    Two-level learning-rate schedule.
    :param epoch: epoch index supplied by the scheduler
    :return: 3e-5 while epoch < 2, 3e-6 afterwards
    '''
    return 3e-5 if epoch < 2 else 3e-6


# Scheduler callback wrapping step_decay; it is the LR callback passed to
# model.fit in the training loop.
lrate = LearningRateScheduler(step_decay, verbose=2)

# Discrete values that predictions are snapped to by most_near / repare.
keys = [0, 0.333333333333333, 0.44444444444444444, 0.55555555555555555, 0.666666666666666666666, 0.777777777777777777, 0.888888888888888888888, 1]
def most_near(n):
    """Return the entry of ``keys`` closest to ``n``.

    On an exact tie the entry that comes first in ``keys`` wins. Unlike a
    fixed starting distance, this also works for values far outside [0, 1]:
    the nearest key is returned instead of silently falling back to 0.

    Args:
        n: Number to snap.

    Returns:
        One element of ``keys``.
    """
    return min(keys, key=lambda candidate: abs(candidate - n))

def repare(test_predictions):
    """Snap every prediction to its nearest allowed value via ``most_near``.

    Args:
        test_predictions: Iterable of rows, each exposing ``.tolist()``.

    Returns:
        ``np.ndarray`` with one snapped row per input row.
    """
    snapped = [[most_near(score) for score in row.tolist()]
               for row in test_predictions]
    return np.asarray(snapped)

In [None]:
class CyclicLR(tf.keras.callbacks.Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
        cycle iteration.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail here rather than later in clr() with an AttributeError
                # on the never-assigned scale_fn.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x is the distance from the peak of the current cycle, in units of
        # step_size: 1 at the cycle boundaries, 0 at the peak.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)

    def on_train_begin(self, logs=None):
        """Set the optimizer's lr to base_lr, or to clr() if iterations were already counted."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the lr and batch logs in history, then set the next lr.

        The first positional argument keeps its original name ``epoch``; this
        hook runs after every batch.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

    def on_epoch_end(self, epoch, logs=None):
        """Print the learning rate computed by clr() at the end of each epoch."""
        print('cuurent lr: {}\n'.format(self.clr()))

In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Per-epoch validation scoring with best-checkpoint saving.

    After every epoch the model predicts on the validation inputs, and the
    Spearman score is printed for the raw predictions and for the predictions
    snapped by ``repare``. Whenever the snapped score beats the best score seen
    so far, the weights are saved and a test-set prediction is appended to
    ``test_predictions``.

    Args:
        valid_data: ``(valid_inputs, valid_outputs)`` pair.
        test_data: Test inputs passed to ``model.predict``.
        batch_size: Batch size used for the callback's predictions.
        fold: Fold identifier interpolated into the checkpoint file name.
    """

    def __init__(self, valid_data, test_data, batch_size=16, fold=None):
        # Let the Keras base class initialise its own attributes.
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data
        # Best snapped validation score seen so far.
        self.value = 0

        self.batch_size = batch_size
        self.fold = fold

    def on_train_begin(self, logs=None):
        self.valid_predictions = []
        self.test_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        self.valid_predictions = self.model.predict(self.valid_inputs, batch_size=self.batch_size)
        snapped_predictions = repare(self.valid_predictions)
        rho_val = compute_spearmanr(self.valid_outputs, self.valid_predictions)
        print('\nbefore repare: rho_val = {}'.format(rho_val))
        rho_val1 = compute_spearmanr(self.valid_outputs, snapped_predictions)
        print('after repare: rho_val = {}\n'.format(rho_val1))

        if rho_val1 > self.value:
            self.value = rho_val1
            # Use the fold given to this callback, not whatever global `fold`
            # happens to exist in the notebook namespace.
            self.model.save_weights(f'/kaggle/working/Finetune-Roberta-{self.fold}.h5')

            # One test prediction per improvement. The previous code predicted,
            # discarded that result and predicted again, which had the same net
            # effect on test_predictions at twice the inference cost.
            self.test_predictions.append(
                self.model.predict(self.test_inputs, batch_size=self.batch_size))

def create_model():
    """Build and compile the question/answer RoBERTa model.

    One shared ``TFRobertaModel`` encodes the question inputs and the answer
    inputs separately. For each side the pooler output and the first-token
    vectors of the last three hidden states are concatenated; both sides are
    then joined, passed through dropout and a 30-unit sigmoid layer.

    Returns:
        A compiled ``tf.keras`` model whose inputs are, in order:
        question ids, question mask, question segments,
        answer ids, answer mask, answer segments.
    """
    # Inputs are created in the original order (ids, masks, segments).
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # Despite the "_atn" suffix these two are fed to the encoder as token_type_ids.
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # from_pretrained is a classmethod; no throwaway default config is needed.
    config = RobertaConfig.from_pretrained(ROBERTA_PATH+'config.json') # print(config) to see settings
    config.output_hidden_states = True # Set to True to obtain hidden states
    # Width of the pooler output / hidden states, read from the config instead
    # of being hard-coded.
    hidden_size = config.hidden_size

    # The pretrained files are loaded from the local ROBERTA_PATH directory.
    bert_model = TFRobertaModel.from_pretrained(
        ROBERTA_PATH, config=config)

    # With output_hidden_states = True the hidden states are the last element
    # returned by the encoder.
    _, q_pooler_output, q_hidden_states = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)
    _, a_pooler_output, a_hidden_states = bert_model(a_id, attention_mask=a_mask, token_type_ids=a_atn)

    def sequence_features(pooler_output, hidden_states):
        """Concatenate pooler output with first-token vectors of the last 3 layers."""
        unit_shape = (-1, 1, hidden_size)
        pieces = [tf.reshape(pooler_output, unit_shape)]
        for layer in (-1, -2, -3):
            pieces.append(tf.reshape(hidden_states[layer][:, 0], unit_shape))
        merged = tf.keras.layers.Concatenate(axis=2)(pieces)
        # The pooled axis has length 1, so this only drops that axis.
        return tf.keras.layers.GlobalMaxPooling1D()(merged)

    q_x = sequence_features(q_pooler_output, q_hidden_states)
    a_x = sequence_features(a_pooler_output, a_hidden_states)

    concatenate_qa = tf.keras.layers.Concatenate()([q_x, a_x])
    x = tf.keras.layers.Dropout(0.2)(concatenate_qa)
    out = tf.keras.layers.Dense(30, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn,], outputs=out)
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss=['binary_crossentropy'])
    return model

In [None]:
# Encode targets and the tokenised train / test inputs once, up front.
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arrays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

# Rows sharing the same question_body always land in the same fold.
# split() yields the folds lazily and only once, so the (train_idx, valid_idx)
# pairs are materialised into a list; the training cell can then be re-run
# without re-running this one.
gkf = list(GroupKFold(n_splits=5).split(X=df_train.question_body, groups=df_train.question_body))

In [None]:
# Per-fold results: validation predictions, test predictions, validation scores.
valid_preds = []
test_preds = []
valss = []

# Cyclic LR callback instance; note that the fit call below passes `lrate`, not `clr`.
clr = CyclicLR(base_lr=1e-5, max_lr=3e-5,
               step_size=4863//2)

for fold, (train_idx, valid_idx) in enumerate(gkf):

    # Only folds 0, 1 and 2 are trained; later folds are skipped.
    if fold >= 3:
        continue

    train_inputs = [feature[train_idx] for feature in inputs]
    train_outputs = outputs[train_idx]

    valid_inputs = [feature[valid_idx] for feature in inputs]
    valid_outputs = outputs[valid_idx]

    custom_callback = CustomCallback(
        valid_data=(valid_inputs, valid_outputs),
        test_data=test_inputs,
        batch_size=16,
        fold=fold)

    # Drop the previous fold's graph before building a fresh model.
    K.clear_session()
    model = create_model()
    model.fit(train_inputs, train_outputs, epochs=3, batch_size=8, callbacks=[custom_callback, lrate])

    # Predictions come from the model as it stands after the last epoch.
    valid_preds.append(model.predict(valid_inputs))
    test_preds.append(model.predict(test_inputs))

    rho_val = compute_spearmanr_ignore_nan(valid_outputs, valid_preds[-1])
    print('validation score = ', rho_val)
    valss.append(rho_val)

In [None]:
# Average the per-fold test predictions element-wise and write the result into
# every submission column after the first one.
df_sub.iloc[:, 1:] = np.average(test_preds, axis=0) # for weighted average set weights=[...]
df_sub.to_csv('submission.csv', index=False)
# Mean of the per-fold validation scores collected in the training loop.
print('local cv = {}'.format(np.array(valss).mean()))