### one time installations

In [None]:
# !gdown --id 1pb7gEkctrVrJA79EAIo7H7nuzD6uV1fW
# !gdown --id 1oIeAE9HXXKWPcYa-AZ0ht5ef6sKe_Vh_
# !gdown --id 10rAuIDvsYR2yDiCqP7GmYGPc-UmtLbJb

In [None]:
# !pip install --quiet transformers --target=/kaggle/working/chaii_packages
# !pip install --quiet datasets --target=/kaggle/working/chaii_packages
# !pip install --quiet SentencePiece --target=/kaggle/working/chaii_packages
# !pip install --quiet pytorch-lightning --target=/kaggle/working/chaii_packages 
# !pip install ipdb --target=/kaggle/working/chaii_packages
# import sys
# sys.path.append('/kaggle/working/chaii_packages')

### libraries

In [None]:
# %env PYTHONPATH= 
%env WANDB_DISABLED=True

In [None]:
import os
import ast
import numpy as np
import pandas as pd
import sklearn
import random
from sklearn import model_selection

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator

import pytorch_lightning as pl

### hyperparameters 

In [None]:
class hyperparameters:
    """Central configuration for the notebook: tokenizer/model paths, data paths and decoding limits."""
    # seed
    # NOTE(review): the only visible use (pl.seed_everything) is commented out further down — confirm whether this is still needed
    seed = 2021
    
    # tokenizer
    tokenizer_name = "../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2"
    max_len = 384 # maximum length (passed to the tokenizer as max_length) of question + context in one feature
    overlap_len = 128 # overlap (tokenizer stride) between two consecutive parts of a context when it is split
    
    # model
    model_name = tokenizer_name # the model is loaded from the same path as the tokenizer
    batch_size = 4 # used as per_device_train_batch_size
    epochs = 1
    
    # data
    train_csv = "../input/chaii-hindi-and-tamil-question-answering/train.csv" 
    test_csv = "../input/chaii-hindi-and-tamil-question-answering/test.csv"
    external_mlqa = "../input/external-data-mlqa-xquad-preprocessing/mlqa_hindi.csv"
    # external_csv2 = "../input/external-data-mlqa-xquad-preprocessing/xquad.csv"
    # external_csv3 = "../input/squad-translated-to-tamil-for-chaii/squad_translated_tamil.csv"
    external_google_hi = "../input/google-translated-squad20-to-hindi-and-tamil/squad_hi.csv"
    external_google_ta = "../input/google-translated-squad20-to-hindi-and-tamil/squad_ta.csv"
    
    # prediction
    top_x = 20 # number of top-ranked start and end token candidates considered per feature
    max_tok_in_ans = 30 # maximum number of tokens allowed in a predicted answer span
    output_dir = "model_dir" # Trainer output directory; checkpoints are read back from here for prediction

In [None]:
hyperparams = hyperparameters()

In [None]:
# pl.seed_everything(hyperparams.seed)
# Report the CUDA devices visible to this kernel.
# torch.cuda.device(i) is only a context-manager object (its repr is an opaque
# object address), so print the human-readable device name instead.
print("available gpu count:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"cuda:{i}", torch.cuda.get_device_name(i))

### tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(hyperparams.tokenizer_name)

### Ingredients (data) for Chaii

In [None]:
# Load the external MLQA (Hindi) QA data and give every row a sequential integer id (1..N).
extdata = pd.read_csv(hyperparams.external_mlqa,encoding = 'utf-8')
# extdata2 = pd.read_csv(hyperparams.external_csv2,encoding = 'utf-8')
# extdata3 = pd.read_csv(hyperparams.external_csv3,encoding = 'utf-8')
# extdata3["language"] = ['tamil']*len(extdata3)
# extdata = pd.concat([extdata1, extdata2,extdata3])
extdata['id'] = list(np.arange(1, len(extdata)+1))
# chaii_df_1 = pd.read_csv(hyperparams.train_csv, encoding='utf-8')
# chaii_df = pd.concat([chaii_df_1, extdata]).reset_index(drop=True)
# chaii_df

In [None]:
def convert_answers(row):
    """Reduce a list of answer dicts to a single-answer dict.

    Only the first element of ``row`` is used; its ``answer_start`` and ``text``
    values are each wrapped in a one-element list.
    """
    start = row[0]['answer_start']
    text = row[0]['text']
    return {'answer_start': [start], 'text': [text]}

def process_google_translate(df, hindi=True):
    """Flatten a translated-SQuAD dataframe into answer_text / answer_start / language columns.

    Rows whose ``is_in`` flag is not True are dropped. The ``answers`` column is
    parsed with ``ast.literal_eval`` and reduced to its first answer; the helper
    columns ``is_in``, ``c_id`` and ``answers`` are removed from the result.

    Args:
        df: dataframe with at least the columns ``is_in``, ``c_id`` and ``answers``.
        hindi: when equal to True the ``language`` column is 'hindi', otherwise 'tamil'.

    Returns:
        A new dataframe (fresh 0..n-1 index) with ``answer_text``, ``answer_start``
        and ``language`` appended.
    """
    usable = df.loc[:, 'is_in'] == True
    kept = df.loc[usable].reset_index(drop=True)

    # parse the stringified answers, then keep only the first answer of each row
    kept['answers'] = kept['answers'].apply(ast.literal_eval).apply(convert_answers)
    flat = pd.json_normalize(kept.answers)

    kept = kept.drop(['is_in', 'c_id', 'answers'], axis=1)
    kept['answer_text'] = flat['text'].str.get(0)
    kept['answer_start'] = flat['answer_start'].str.get(0)

    lang = 'hindi' if hindi == True else 'tamil'
    kept['language'] = [lang] * len(kept)
    return kept

In [None]:
# Load the Google-translated SQuAD sets (Hindi and Tamil) and flatten their answers
# into answer_text / answer_start / language columns.
extdata_google_hi = pd.read_csv(hyperparams.external_google_hi,encoding = 'utf-8')
extdata_google_hi = process_google_translate(extdata_google_hi, hindi=True)
extdata_google_ta = pd.read_csv(hyperparams.external_google_ta,encoding = 'utf-8')
extdata_google_ta = process_google_translate(extdata_google_ta, hindi=False)

In [None]:
# take a fixed-size random subset (4500 rows, random_state=42) of each translated set
# NOTE(review): sampling without replacement presumably fails if a set has fewer than 4500 rows — confirm
extdata_1 = extdata_google_hi.sample(4500, random_state=42).reset_index(drop=True)
extdata_2 = extdata_google_ta.sample(4500, random_state=42).reset_index(drop=True)

In [None]:
# Append the sampled translated sets to the MLQA frame.
# NOTE: this rebinds `extdata`, so re-running this cell alone appends the samples again.
extdata = pd.concat([extdata, extdata_1, extdata_2])

In [None]:
chaii_df = pd.read_csv(hyperparams.train_csv, encoding='utf-8')
# chaii_df = sklearn.utils.shuffle(chaii_df, random_state=4).reset_index(drop=True)

In [None]:
# train_df, val_df = model_selection.train_test_split(chaii_df, test_size=0.1) #, random_state=hyperparams.seed) # hyperparams.seed
# train_df = train_df.reset_index(drop=True)
# val_df = val_df.reset_index(drop=True)

In [None]:
# train_df = pd.concat([train_df, extdata]).reset_index(drop=True)
# Train on every chaii row plus the external data; no rows are held out for validation here.
# reset_index gives the 0..n-1 row labels that prepare_chaii looks up with .loc.
train_df = pd.concat([chaii_df, extdata]).reset_index(drop=True)

In [None]:
# Shuffle with a fixed random_state so chaii and external rows are interleaved the same way on every run.
train_df = sklearn.utils.shuffle(train_df, random_state=42).reset_index(drop=True)

In [None]:
# train_df = chaii_df.reset_index(drop=True) # complete data for training the model
# train_df = train_df.loc[:5]

In [None]:
print(len(train_df))#, len(val_df))

### Data pipeline

In [None]:
def prepare_chaii(data_df, tokenizer, test=False):
    """Tokenize question/context pairs and, for training data, label each feature with its answer span.

    A context that does not fit in ``hyperparams.max_len`` is split into several
    overlapping features (stride ``hyperparams.overlap_len``), so the output can
    contain more entries than ``data_df`` has rows.

    Args:
        data_df: dataframe with ``question`` and ``context`` columns and, unless
            ``test`` is True, ``answer_text`` and ``answer_start`` columns.
            The text columns are rewritten in place (cast to str; question left-stripped).
            Rows are looked up with ``.loc`` using the tokenizer's sample index —
            assumes row labels are 0..n-1 (TODO confirm for every caller).
        tokenizer: callable tokenizer; must expose ``cls_token_id`` and return an
            object supporting ``sequence_ids(i)``.
        test: when True, return the tokenized data without span labels.

    Returns:
        The tokenizer output. For training data it additionally carries
        ``start_positions`` and ``end_positions`` (one token index per feature;
        both are 0 when the answer is not fully inside that feature).
    """
    # Cast the text columns to str in place. Only the question is left-stripped;
    # context and answer_text are NOT stripped (answer_start is used below as a
    # character offset into the context).
    data_df.loc[:, 'context'] = data_df.loc[:, 'context'].apply(lambda sen : str(sen))
    data_df.loc[:, 'question'] = data_df.loc[:, 'question'].apply(lambda sen : str(sen).lstrip())
    if not test:
        data_df.loc[:, 'answer_text'] = data_df.loc[:, 'answer_text'].apply(lambda sen : str(sen))
    
    # question; context -- order is important, and is used in prediction stage to find whether predicted tokens seq_id is 0 (question) or 1 (context)
    data_tok = tokenizer(
        list(data_df['question']), list(data_df['context']),
        max_length=hyperparams.max_len, 
        truncation='only_second',
        stride=hyperparams.overlap_len,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    # test data has no answers, so there is nothing to label
    if test:
        return data_tok
    
    # data_df contains original raw data having question, context
    # data_tok contains tokenized data, where context might have split into multiple sentences 
    # data_tok is a dict, containing keys : dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
    # every value is a list, and no tensors here
    
    # adding two more keys that will contain the position of first token and last token in the answer_text
    data_tok['start_positions'], data_tok['end_positions'] = [], []
    
    n_sents = len(data_tok['input_ids'])
    map_id_sent2context = data_tok['overflow_to_sample_mapping'] # id means index! since input_ids means various inputs to the model
    map_offsets = data_tok['offset_mapping']
    assert len(map_offsets) == len(map_id_sent2context) == n_sents
    
    for input_id in range(n_sents):
        sent = data_tok['input_ids'][input_id]
        
        # get the answer_start and answer_text for this input_id using the id in data_df
        context_id = map_id_sent2context[input_id]
        answer_text = data_df.loc[context_id, 'answer_text']
        answer_start = data_df.loc[context_id, 'answer_start']
        answer_end = answer_start + len(answer_text) # character offset just past the answer; compared against token offsets below
        
        # check whether the answer is present in the current input_id or not using offsets
        qn_context_id = data_tok.sequence_ids(input_id)
        
        # first: get the start_idx_token and end_idx_token of context
        # (first and last position whose sequence id is 1)
        start_idx_token = qn_context_id.index(1)
        end_idx_token = len(qn_context_id) - qn_context_id[::-1].index(1) - 1
        
        # second: use the offsets for input_id to find if answer_start and answer_end are inside this chunk of context or not
        offset_map = map_offsets[input_id]

        if answer_start >= offset_map[start_idx_token][0] and answer_end <= offset_map[end_idx_token][1]:
            # now finally get the idx_token for the first and last token in the answer_text
            # walk forward past every token that starts at or before answer_start ...
            while start_idx_token < len(sent) and answer_start >= offset_map[start_idx_token][0]:
                start_idx_token += 1
            # ... and backward past every token that ends at or after answer_end
            while answer_end <= offset_map[end_idx_token][1]:
                end_idx_token -= 1
            
            # both walks overshoot by one token, so step back by one
            data_tok['start_positions'].append(start_idx_token - 1)
            data_tok['end_positions'].append(end_idx_token + 1)
        
        else:
            # answer not fully inside this feature: label it with the CLS position (index 0)
            cls_token_idx = sent.index(tokenizer.cls_token_id)
            assert cls_token_idx == 0
            data_tok['start_positions'].append(0) # cls token index
            data_tok['end_positions'].append(0) # cls token index

    return data_tok

In [None]:
class chaii_ka_data(Dataset):
    """Torch dataset over the tokenized question/context features.

    Tokenization happens once in the constructor (and not in ``__getitem__``)
    because a single row of ``data_df`` may produce several features when its
    context overflows the maximum length.
    """

    def __init__(self, data_df, tokenizer, test=False):
        """Tokenize ``data_df`` and remember which keys to hand to the model.

        Args:
            data_df: pandas dataframe containing context, question, ...
            tokenizer: tokenizer forwarded to ``prepare_chaii``.
            test: True means ``data_df`` has no answer_text / answer_start, so
                no start/end position labels are produced or returned.
        """
        super().__init__()

        # Keys returned per item; the label keys exist only for training data.
        self.reqd_keys = ['input_ids', 'attention_mask']
        if not test:
            self.reqd_keys += ['start_positions', 'end_positions']

        # Tokenize question;context pairs up front (one row may become many features).
        self.data_tok = prepare_chaii(data_df, tokenizer, test=test)

    def __getitem__(self, input_id):
        """Return the required keys of feature ``input_id`` as long tensors.

        ``input_id`` is the feature index as used in ``prepare_chaii``.
        """
        return {
            key: torch.tensor(values[input_id], dtype=torch.long)
            for key, values in self.data_tok.items()
            if key in self.reqd_keys
        }

    def __len__(self):
        """Number of tokenized features (not the number of dataframe rows)."""
        return len(self.data_tok['input_ids'])
In [None]:
trainset = chaii_ka_data(train_df, tokenizer)
# valset = chaii_ka_data(val_df, tokenizer)

#### model predictions (start and end token indices) to answer_text
This transformation requires the original (raw) data_df, the tokenized data_tok, and the start and end logits (unnormalized scores, not probabilities).

In [None]:
def serve_chaii(test_df, testset, logits, top_x=None, max_tok_in_ans=None):
    """Convert per-feature start/end logits into one answer string per example.

    For every example, all of its features are scanned. In each feature the
    ``top_x`` highest-scoring start and end tokens are paired; pairs that are
    reversed, out of range, not fully inside the context, or longer than
    ``max_tok_in_ans`` tokens are skipped. The pair with the highest
    ``start_logit + end_logit`` across all features wins (on ties, the
    candidate found last wins).

    Args:
        test_df: dataframe with ``id`` and ``context`` columns. Rows are looked
            up with ``.loc[example_idx]`` — assumes row labels are 0..n-1.
        testset: object exposing ``data_tok`` (tokenizer output with
            ``offset_mapping``, ``overflow_to_sample_mapping`` and
            ``sequence_ids(i)``) and ``__len__`` (number of features).
        logits: pair ``(start_logits, end_logits)``, each indexable by feature.
        top_x: candidates kept per feature for start and for end; defaults to
            ``hyperparams.top_x``.
        max_tok_in_ans: maximum answer length in tokens; defaults to
            ``hyperparams.max_tok_in_ans``.

    Returns:
        dict with two parallel lists, ``"id"`` and ``"PredictionString"``,
        holding one entry per row of ``test_df``. The prediction is ``""``
        when no valid span exists.
    """
    if top_x is None:
        top_x = hyperparams.top_x
    if max_tok_in_ans is None:
        max_tok_in_ans = hyperparams.max_tok_in_ans

    assert len(logits[0]) == len(logits[1]) == len(testset)

    data_tok = testset.data_tok
    map_id_sent2context = data_tok['overflow_to_sample_mapping']
    assert len(map_id_sent2context) == len(testset)

    # Group feature indices by example in a single pass instead of searching
    # the whole mapping once per example.
    features_of_example = {}
    for feature_idx, owner_idx in enumerate(map_id_sent2context):
        features_of_example.setdefault(owner_idx, []).append(feature_idx)

    submission = {
        "id" : [],
        "PredictionString" : []
    }
    for example_idx in range(len(test_df)):
        example_id = test_df.loc[example_idx, 'id']
        example_context = test_df.loc[example_idx, 'context']

        best_score = None
        predicted_answer = ""
        for feature_idx in features_of_example.get(example_idx, []):
            offset_mp = data_tok['offset_mapping'][feature_idx]
            # one unnormalized score (logit) per token position
            start_lgts, end_lgts = logits[0][feature_idx], logits[1][feature_idx]
            # per token: 0 = question, 1 = context, None = special token (CLS, SEP, padding)
            seq_ids = data_tok.sequence_ids(feature_idx)
            n_tokens = len(offset_mp)

            # the top_x most confident start and end token indices
            ranked_strt_tok_idxs = np.argsort(start_lgts)[::-1][:top_x].tolist()
            ranked_end_tok_idxs = np.argsort(end_lgts)[::-1][:top_x].tolist()

            for start_tok_idx in ranked_strt_tok_idxs:
                # the start must be a valid index and lie inside the context
                if start_tok_idx >= n_tokens or seq_ids[start_tok_idx] != 1:
                    continue
                for end_tok_idx in ranked_end_tok_idxs:
                    # meaningless prediction: reversed span or index out of range
                    if end_tok_idx < start_tok_idx or end_tok_idx >= n_tokens:
                        continue
                    # the end must lie inside the context as well
                    if seq_ids[end_tok_idx] != 1:
                        continue
                    if end_tok_idx - start_tok_idx + 1 > max_tok_in_ans:
                        continue

                    score = start_lgts[start_tok_idx] + end_lgts[end_tok_idx]
                    # ">=" keeps the candidate found last among equal scores
                    if best_score is None or score >= best_score:
                        best_score = score
                        # slice the raw context with character offsets rather than decoding tokens
                        predicted_answer = example_context[offset_mp[start_tok_idx][0]: offset_mp[end_tok_idx][1]]

        if best_score is None:
            print("empty answer predicted!!!")

        submission['id'].append(example_id)
        submission['PredictionString'].append(predicted_answer)

    assert len(submission['id']) == len(test_df)
    return submission

### model

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(hyperparams.model_name)

### training

In [None]:
# Fine-tuning configuration. With per-device batch size hyperparams.batch_size (4) and
# gradient_accumulation_steps=8, each optimizer step sees 32 features per device.
# The evaluation arguments stay commented out because no validation set is built above.
training_args = TrainingArguments(
    output_dir=hyperparams.output_dir, 
    overwrite_output_dir=True, 
    per_device_train_batch_size=hyperparams.batch_size,
#     per_device_eval_batch_size=hyperparams.batch_size,
#     evaluation_strategy="epoch", 
    learning_rate=3e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    num_train_epochs=hyperparams.epochs,
    save_strategy="epoch",
    warmup_ratio=0.1,
)

# trainset items are already fixed-length tensors (padding="max_length"), so the default collator is used
data_collator = default_data_collator

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=trainset, 
#     eval_dataset=valset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# checkpoints are written under hyperparams.output_dir (save_strategy="epoch") and read back in the prediction section
trainer.train()

### Evaluation
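Compute the Jaccard score on the train and validation sets using the model checkpoint saved at each epoch. The cells below are currently commented out because no validation split is created above.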

In [None]:
# def jaccard(str1, str2): 
#     a = set(str1.lower().split()) 
#     b = set(str2.lower().split())
#     c = a.intersection(b)
#     return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
# def compute_jaccard(pred_df, gt_df):
#     num_examples = 0
#     score = 0
#     for idx, example_id in enumerate(gt_df.loc[:,'id']):
#         gt_answer = gt_df.loc[idx, 'answer_text']
#         pred_answer = pred_df.loc[pred_df.loc[:, 'id'] == example_id].reset_index(drop=True).loc[0, 'PredictionString']
#         # print(gt_answer, pred_answer)
#         score += jaccard(gt_answer, pred_answer)
#         num_examples += 1
        
#     score /= num_examples
#     return score

In [None]:
# model_checkpoints = os.listdir(hyperparams.output_dir)
# model_checkpoints1 = sorted(model_checkpoints, key=lambda element : int(element[11:]))[::-1]
# model_checkpoint = model_checkpoints1[0]
# train_scores, val_scores = [], []
# for cp_id, model_checkpoint in enumerate(model_checkpoints):
#     if model_checkpoint[:5] != "check":
#         continue
#     # load the model
#     model = AutoModelForQuestionAnswering.from_pretrained(os.path.join(hyperparams.output_dir, model_checkpoint))
#     trainer = Trainer(
#         model=model
#     )
    
# #     logits = trainer.predict(trainset).predictions
# #     submission = serve_chaii(train_df, trainset, logits)
# #     pred_df = pd.DataFrame(submission)
# #     train_score = compute_jaccard(pred_df, train_df)
# #     train_scores.append(train_score)

#     logits = trainer.predict(valset).predictions
#     submission = serve_chaii(val_df, valset, logits)
#     pred_df = pd.DataFrame(submission)
#     val_score = compute_jaccard(pred_df, val_df)
#     val_scores.append(val_score)
    
    
#     # print(model_checkpoint, "train:", train_score, "val:", val_score)
# print(model_checkpoints)
# # print(train_scores)
# print(val_scores)

### Prediction of textual answers
1. generate submission.csv containing "id", "PredictionString" columns
2. Use Trainer API for predict instead of trainer.model(\*\*batch) as it handles batching, and CPU-GPU on its own
3. trainer.predict(testset) gives the start and end logits for all the input features in the test set
4. for each example in the test_df, select the best answer from its features

In [None]:
# Pick the checkpoint with the highest step number.
# Only entries named "checkpoint-<step>" are considered: any other file or directory
# inside output_dir would make the int() parsing below raise ValueError.
_checkpoint_prefix = "checkpoint-"
model_checkpoints = [
    entry for entry in os.listdir(hyperparams.output_dir)
    if entry.startswith(_checkpoint_prefix) and entry[len(_checkpoint_prefix):].isdigit()
]
if not model_checkpoints:
    raise FileNotFoundError(
        f"no '{_checkpoint_prefix}<step>' entry found in {hyperparams.output_dir!r}; run training first"
    )
# sort by numeric step (not lexicographically), newest first
model_checkpoints1 = sorted(model_checkpoints, key=lambda element : int(element[len(_checkpoint_prefix):]))[::-1]
model_checkpoint = model_checkpoints1[0]

In [None]:
# load the model from the selected checkpoint directory
model = AutoModelForQuestionAnswering.from_pretrained(os.path.join(hyperparams.output_dir, model_checkpoint))
# a fresh Trainer with no TrainingArguments; it is used below only for predict()
# NOTE(review): this rebinds `trainer`, replacing the training Trainer — presumably intended
trainer = Trainer(
    model=model
)

In [None]:
# load the dataset
test_df = pd.read_csv(hyperparams.test_csv, encoding='utf-8') # uncomment this for submission
# test_df = pd.read_csv(hyperparams.train_csv, encoding='utf-8').loc[:10] # comment this for submission
# test=True: tokenize only, no start/end position labels
# (prepare_chaii also rewrites test_df's context/question columns in place as str)
testset = chaii_ka_data(test_df, tokenizer, test=True)
# testloader = DataLoader(testset, batch_size=16)
# next(iter(testloader))

In [None]:
# pass the complete testset in trainer API
# the API will automatically do batch-wise prediction
# start, end logits are accessible using model_output.predictions[0],[1]
# if the testset has labels, then model_output.label_ids contains them
# model_output.metrics = {'test_runtime': 10.6385, 'test_samples_per_second': 128.684, 'test_steps_per_second': 16.168}
model_output = trainer.predict(testset)
# (start_logits, end_logits), one row per tokenized feature; consumed by serve_chaii
logits = model_output.predictions

In [None]:
# Pick the best answer span per test example and write the two-column
# ("id", "PredictionString") submission file.
submission = serve_chaii(test_df, testset, logits)
submission_df = pd.DataFrame(submission)
# submission_df.loc[:, 'PredictionString'] = submission_df.loc[:, 'PredictionString'].apply(lambda ans: "\"" + str(ans) + "\"")
submission_df.to_csv('submission.csv', index=False)

### references
1. https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
2. https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase.__call__