In [None]:
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd

#TF&K
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, losses, models, callbacks
from tensorflow.keras.initializers import *
from tensorflow.keras.regularizers import *
from tensorflow.keras.constraints import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.layers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *

#NLP
import bert_tokenization as tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import get_custom_objects
from transformers import TFPreTrainedModel, TFBertMainLayer, BertConfig, TFBertModel, BertTokenizer
import spacy
from spacy.lang.en import English

import math
import glob
import sys
import gc
import os
import re
from math import floor, ceil

from scipy.stats import spearmanr, rankdata
from os.path import join as path_join
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold, GroupKFold, train_test_split
from ml_stratifiers import MultilabelStratifiedShuffleSplit, MultilabelStratifiedKFold

#matplotlib
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.56)

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm_notebook
tqdm_notebook().pandas()

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

In [None]:
def seed_everything(seed=0):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to the Python hash-seed environment
            variable, the ``random`` module, NumPy's global generator and
            TensorFlow's global generator.
    """
    # Local import: the notebook's import cell does not bring in ``random``.
    import random

    # Only affects interpreters started after this point; the hash seed of
    # the already-running kernel was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


SEED = 69
seed_everything(SEED)

In [None]:
# Contraction / misspelling -> expansion, applied to lower-cased text.
# Every key appears exactly once. The literal used to list "i'd" and "didn't"
# twice, so the first "i'd" value was silently discarded; "I would" is kept to
# match the other "...'d" entries ("he'd", "she'd", "they'd", ...).
# "we'll" used to map to " will", which dropped the pronoun.
mispell_dict = {"aren't" : "are not",          "can't" : "cannot",
                "couldn't" : "could not",      "couldnt" : "could not",
                "didn't" : "did not",          "doesn't" : "does not",
                "doesnt" : "does not",         "don't" : "do not",
                "hadn't" : "had not",          "hasn't" : "has not",
                "haven't" : "have not",        "havent" : "have not",
                "he'd" : "he would",           "he'll" : "he will",
                "he's" : "he is",              "i'd" : "I would",
                "i'll" : "I will",             "i'm" : "I am",
                "isn't" : "is not",            "it's" : "it is",
                "it'll" : "it will",           "i've" : "I have",
                "let's" : "let us",            "mightn't" : "might not",
                "mustn't" : "must not",        "shan't" : "shall not",
                "she'd" : "she would",         "she'll" : "she will",
                "she's" : "she is",            "shouldn't" : "should not",
                "shouldnt" : "should not",     "that's" : "that is",
                "thats" : "that is",           "there's" : "there is",
                "theres" : "there is",         "they'd" : "they would",
                "they'll" : "they will",       "they're" : "they are",
                "theyre" : "they are",         "they've" : "they have",
                "we'd" : "we would",           "we're" : "we are",
                "weren't" : "were not",        "we've" : "we have",
                "what'll" : "what will",       "what're" : "what are",
                "what's" : "what is",          "what've" : "what have",
                "where's" : "where is",        "who'd" : "who would",
                "who'll" : "who will",         "who're" : "who are",
                "who's" : "who is",            "who've" : "who have",
                "won't" : "will not",          "wouldn't" : "would not",
                "you'd" : "you would",         "you'll" : "you will",
                "you're" : "you are",          "you've" : "you have",
                "'re" : " are",                "wasn't" : "was not",
                "we'll" : "we will",           "tryin'" : "trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


def replace_typical_misspell(text):
    """Return ``text`` with every ``mispell_dict`` key replaced by its mapped value."""
    lookup, pattern = _get_mispell(mispell_dict)
    # group(0) is the matched text, which is then looked up as a key.
    return pattern.sub(lambda found: lookup[found.group(0)], text)


puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
          '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
          '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑',
          '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
          '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫',
          '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・',
          '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    """Flatten ``x`` to one line and pad every entry of ``puncts`` with spaces.

    Args:
        x: Value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # Newlines become a space, not "": otherwise the last word of a line is
    # glued to the first word of the next one ("foo\nbar" -> "foobar").
    x = str(x).replace("\n", " ")
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def preprocess(x):
    """Lower-case ``x``, expand contractions, then space out punctuation.

    Contractions are expanded before ``clean_text`` runs. ``clean_text`` pads
    the apostrophe with spaces ("don't" -> "don ' t"), so if it ran first no
    apostrophe key of ``mispell_dict`` could match any more.

    Args:
        x: Raw text. Non-string values are converted with ``str()`` instead of
            failing on ``.lower()``.

    Returns:
        The normalised string.
    """
    x = str(x).lower()
    x = replace_typical_misspell(x)
    x = clean_text(x)
    return x

In [None]:
# --- Locations and sizes ---
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
MAX_SEQUENCE_LENGTH = 512

# Tokenizer loaded from the vocab.txt stored under BERT_PATH.
tokenizer2 = BertTokenizer.from_pretrained(BERT_PATH + '/assets/vocab.txt', do_lower_case=True)

# --- Data ---
df_train = pd.read_csv(PATH + 'train.csv')
df_test = pd.read_csv(PATH + 'test.csv')
df_sub = pd.read_csv(PATH + 'sample_submission.csv')
# Train and test rows stacked with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print('train shape =', df_train.shape)
print('test shape =', df_test.shape)
print('submission shape =', df_sub.shape)

# --- Column groups ---
output_categories = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]
input_categories = ['question_title', 'question_body', 'answer']

print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [None]:
# Run ``preprocess`` over the three text columns of both frames, overwriting
# each column in place. Same order as before: train first, then test; within
# each frame body, answer, title.
for frame in (df_train, df_test):
    for column in ('question_body', 'answer', 'question_title'):
        frame[column] = frame[column].progress_map(preprocess)

In [None]:
def _get_masks(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens)+[0]*(max_seq_length-len(tokens))

def _get_segments(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    
    segments=[]
    first_sep=True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == '[SEP]':
            current_segment_id=1
    return segments+[0]*(max_seq_length-len(tokens))

def _trim_input(title, question, answer, max_sequence_length, t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize the three texts and cut them to a shared length budget.

    Tokenization uses the module-level ``tokenizer2``. Four slots are reserved
    for the special tokens added later. If everything fits, the token lists
    are returned untouched. Otherwise each part is limited to its ``*_max_len``
    budget, and budget a part does not need is handed on:

    * unused title budget is split between answer (floor of half) and
      question (ceil of half);
    * unused answer budget goes to the question; only if the answer uses all
      of its budget is unused question budget given to the answer.

    Returns:
        Tuple ``(t, q, a)`` of token lists.

    Raises:
        ValueError: if the computed lengths plus 4 do not add up to exactly
            ``max_sequence_length``.
    """
    t = tokenizer2.tokenize(title)
    q = tokenizer2.tokenize(question)
    a = tokenizer2.tokenize(answer)
    t_len, q_len, a_len = len(t), len(q), len(a)

    # Nothing to trim.
    if t_len + q_len + a_len + 4 <= max_sequence_length:
        return t, q, a

    # Title: keep it whole when it is under budget and pass the rest on.
    title_spare = t_max_len - t_len
    if title_spare > 0:
        t_new_len = t_len
        a_max_len += floor(title_spare / 2)
        q_max_len += ceil(title_spare / 2)
    else:
        t_new_len = t_max_len

    # Answer / question: whichever is under budget donates to the other.
    if a_max_len > a_len:
        a_new_len, q_new_len = a_len, q_max_len + (a_max_len - a_len)
    elif q_max_len > q_len:
        a_new_len, q_new_len = a_max_len + (q_max_len - q_len), q_len
    else:
        a_new_len, q_new_len = a_max_len, q_max_len

    new_total = t_new_len + a_new_len + q_new_len + 4
    if new_total != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d"
                         % (max_sequence_length, new_total))

    return t[:t_new_len], q[:q_new_len], a[:a_new_len]

def _get_ids(tokens, tokenizer, max_seq_length):
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0]*(max_seq_length - len(token_ids))
    return input_ids

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_seq_length):
    """Join the three token lists into one sequence and derive the model inputs.

    Sequence layout: [CLS] title [QBODY] question [SEP] answer [SEP]

    Returns:
        ``[input_ids, input_masks, input_segments]``, each a list of length
        ``max_seq_length``.
    """
    stoken = ['[CLS]', *title, '[QBODY]', *question, '[SEP]', *answer, '[SEP]']
    return [
        _get_ids(stoken, tokenizer, max_seq_length),
        _get_masks(stoken, max_seq_length),
        _get_segments(stoken, max_seq_length),
    ]

def compute_input_array(df, columns, tokenizer, max_sequence_length):
    """Turn every row of ``df`` into the three fixed-length BERT input vectors.

    Args:
        df: Frame holding ``question_title``, ``question_body`` and ``answer``.
        columns: Columns selected from ``df`` before iterating; must include
            the three names above.
        tokenizer: Tokenizer used to convert tokens to ids.
        max_sequence_length: Length of every produced vector.

    Returns:
        ``[input_ids, input_masks, input_segments]`` as int32 arrays with one
        row per row of ``df``.
    """
    input_ids, input_masks, input_segments = [], [], []
    # iterrows() is a generator without a length, so the row count is passed
    # explicitly; otherwise the bar can show neither percentage nor ETA.
    for _, row in tqdm(df[columns].iterrows(), total=len(df)):
        t, q, a = row.question_title, row.question_body, row.answer
        t, q, a = _trim_input(t, q, a, max_sequence_length)
        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.array(input_ids, dtype=np.int32),
            np.array(input_masks, dtype=np.int32),
            np.array(input_segments, dtype=np.int32)]

def compute_output_arrays(df, columns):
    """Return the ``columns`` of ``df`` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

def compute_spearmanr(trues, preds):
    """Mean of the column-wise Spearman correlations between ``trues`` and ``preds``.

    Gaussian noise (mean 0, std 1e-7) is added to each prediction column
    before it is correlated. The result is a plain mean, so it is NaN as soon
    as one column's correlation is NaN.
    """
    rhos = [
        spearmanr(true_col, pred_col + np.random.normal(0, 1e-7, pred_col.shape[0])).correlation
        for true_col, pred_col in zip(trues.T, preds.T)
    ]
    return np.mean(rhos)

In [None]:
# Shared accumulator: every CustomCallback created without an explicit
# ``test_predictions`` argument appends to this one list.
test_predictions = []


class CustomCallback(tf.keras.callbacks.Callback):
    """Per-epoch validation scoring, checkpointing and test-set prediction.

    After every epoch the callback:

    * predicts on the validation inputs and scores the average of all
      validation predictions made so far in this fit with ``compute_spearmanr``;
    * saves the weights when that score is at least as good as the best so far;
    * appends a test-set prediction to ``test_predictions``.
    """

    def __init__(self, valid_data, test_data, test_predictions=test_predictions, batch_size=16, fold=None):
        """
        Args:
            valid_data: ``(inputs, outputs)`` pair used for scoring.
            test_data: Inputs predicted after every epoch.
            test_predictions: List the test predictions are appended to. The
                default is the module-level list, deliberately shared between
                instances.
            batch_size: Batch size for both ``predict`` calls.
            fold: Identifier inserted into the weights file name.
        """
        # Let Keras set up the base-class state.
        super().__init__()
        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data
        self.batch_size = batch_size
        self.test_predictions = test_predictions
        self.fold = fold
        self.value = -1  # best validation rho seen so far
        self.bad_epochs = 0

    def on_train_begin(self, logs=None):
        """Start a fresh list of validation predictions for this fit."""
        self.valid_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score, checkpoint and predict on the test set (see class docstring)."""
        self.valid_predictions.append(self.model.predict(self.valid_inputs, batch_size=self.batch_size))

        rho_val = compute_spearmanr(self.valid_outputs, np.average(self.valid_predictions, axis=0))
        print(f"\nvalidation rho: {round(rho_val,4)}")

        # Use the fold given to this instance, not whatever global ``fold``
        # happens to exist when the callback runs.
        weights_path = f'/kaggle/working/bert-base-{self.fold}.hdf5'

        if rho_val >= self.value:
            self.value = rho_val
            self.model.save_weights(weights_path)

        # A NaN rho never passes the comparison above, so weights are also
        # saved on epochs where epoch % 4 == 1 while the score is NaN.
        if (epoch) % 4 == 1 and math.isnan(rho_val):
            self.model.save_weights(weights_path)

        self.test_predictions.append(self.model.predict(self.test_inputs, batch_size=self.batch_size))
        
def bce(t, p):
    """Binary cross-entropy of targets ``t`` and predictions ``p``."""
    return binary_crossentropy(t, p)


def custom_loss(true, pred):
    """Binary cross-entropy plus log-cosh of the same targets and predictions."""
    return bce(true, pred) + logcosh(true, pred)

In [None]:
# ``from_pretrained`` is a classmethod that builds a new config from the JSON
# file, so the ``BertConfig(unk_token=..., pad_token=...)`` instance it used to
# be called on never influenced the result. Calling it on the class gives the
# same config without the misleading arguments.
bert_config = BertConfig.from_pretrained('../input/bert-tensorflow/bert-base-uncased-config.json', output_hidden_states=False)

def bert_model():
    """Build and compile the BERT model with a 30-output sigmoid head.

    The per-token BERT outputs are mean-pooled, passed through dropout (0.2)
    and a dense sigmoid layer with 30 units. The model is compiled with Adam
    (learning rate 3e-5), ``custom_loss`` as loss and ``bce`` / ``logcosh`` as
    metrics.

    Returns:
        The compiled ``tf.keras`` model taking
        ``[input_word_ids, input_masks, input_segments]``.
    """
    # Shapes are written as real one-element tuples; ``(N)`` is just the int N.
    input_ids = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')

    # Named ``bert`` so the local does not shadow this function's own name.
    bert = TFBertModel.from_pretrained(
        pretrained_model_name_or_path='../input/bert-tensorflow/bert-base-uncased-tf_model.h5',
        config=bert_config,
    )

    # Take element 0 (the per-token outputs) by index rather than unpacking
    # two values: indexing works for a plain tuple and for an output object
    # alike. The pooled output is not used.
    sequence_output = bert([input_ids, input_mask, input_segments])[0]

    avgpool = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(avgpool)
    out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(inputs=[input_ids, input_mask, input_segments], outputs=out)

    model.compile(
        # ``learning_rate`` is the current argument name; ``lr`` is a deprecated alias.
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
        loss=custom_loss,
        metrics=[bce, logcosh],
    )
    return model

In [None]:
# ``split`` returns a one-shot generator. Materialise it so the training cell
# can be re-run without silently iterating over nothing.
# Rows are grouped by question body, so rows with the same question body
# never end up on both sides of a split.
gkf = list(GroupKFold(n_splits=10).split(X=df_train.question_body, groups=df_train.question_body))

# Targets and tokenized model inputs for train and test.
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_array(df_train, input_categories, tokenizer2, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_array(df_test, input_categories, tokenizer2, MAX_SEQUENCE_LENGTH)

In [None]:
histories = []

for fold, (train_idx, valid_idx) in enumerate(gkf):
    # Only folds 0, 2, 4, 6 and 8 are trained; the others are skipped.
    if fold not in [0, 2, 4, 6, 8]:
        continue

    # Fresh Keras state and a fresh model for every trained fold.
    K.clear_session()
    model = bert_model()

    train_inputs = [array[train_idx] for array in inputs]
    train_outputs = outputs[train_idx]

    valid_inputs = [array[valid_idx] for array in inputs]
    valid_outputs = outputs[valid_idx]

    custom_callback = CustomCallback(
        valid_data=(valid_inputs, valid_outputs),
        test_data=test_inputs,
        batch_size=8,
        fold=fold,
    )
    H = model.fit(train_inputs, train_outputs, batch_size=8, epochs=5, callbacks=[custom_callback])
    histories.append(H)

    # Drop the model before the next fold builds a new one.
    del model
    gc.collect()

In [None]:
# Number of test-set prediction arrays collected so far
# (CustomCallback.on_epoch_end appends one after every epoch).
print(len(test_predictions))

In [None]:
# Rank-average the collected test predictions: every prediction array is
# turned into per-column ranks, then the ranks are averaged over all arrays.
# The source list is ``test_predictions`` (filled by CustomCallback); this line
# used to read ``all_predictions``, a name this notebook never defines.
BERT_test_preds = np.array([np.array([rankdata(c) for c in p.T]).T for p in test_predictions]).mean(axis=0)
# Scale into the open interval (0, 1): divide by max + 1, then add a tiny offset.
max_val = BERT_test_preds.max() + 1
BERT_test_preds = BERT_test_preds/max_val + 1e-12

In [None]:
# The final predictions are the BERT predictions as-is (same array object, no copy).
Final_test_preds = BERT_test_preds
# Bare expression: displays the array as the cell output.
Final_test_preds

In [None]:
# ``submission`` and ``targets`` were never defined in this notebook, so this
# cell failed on a fresh kernel. The sample submission loaded as ``df_sub``
# is the template, and ``output_categories`` lists the target columns in the
# order the model was trained on.
submission = df_sub.copy()
submission[output_categories] = Final_test_preds
submission.to_csv("submission.csv", index = False)

In [None]:
# Bare expression: displays the submission frame as the cell output.
submission