In [None]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator
import random
import os
import torch

In [None]:
seed = 2021


def fix_all_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds NumPy and Python's ``random`` module, exports ``PYTHONHASHSEED``
    into the process environment, then seeds PyTorch on CPU and CUDA.
    """
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_rng(seed)


fix_all_seeds(seed)

In [None]:
# Local path of the pretrained checkpoint; passed to AutoTokenizer and
# AutoModelForQuestionAnswering further down.
model_name="../input/huggingface-question-answering-models/multilingual/xlm-roberta-large-squad2"

In [None]:
# Load the competition train/test CSVs and preview the first training rows.
train=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
test=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
train.head()

In [None]:
# Constants
# NOTE(review): `batch_size` is not referenced anywhere else in the visible
# notebook; the trainer reads config_args['batch_size'] instead.
batch_size=4

In [None]:
# Extra Tamil QA rows; the file name indicates SQuAD translated to Tamil.
tamil_train=pd.read_csv('../input/squad-translated-to-tamil-for-chaii/squad_translated_tamil.csv')
# Disabled alternative Hindi source; hindi_train is loaded from other files in the next cell.
# hindi_train=pd.read_csv('../input/squad-translated-to-hindi/squad_translated_to_hindi_5k.csv')

In [None]:
# Two extra Hindi QA files (named mlqa_hindi and xquad); they are combined in the next cell.
hindi_train=pd.read_csv('../input/mlqa-hindi-processed/mlqa_hindi.csv')
hindi_train2=pd.read_csv('../input/mlqa-hindi-processed/xquad.csv')

In [None]:
# Stack the two Hindi frames. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in 2.0; the result is the same
# (original index labels kept, columns unioned without sorting).
hindi_train = pd.concat([hindi_train, hindi_train2])

In [None]:
tamil_train['language']='tamil'

In [None]:
import random, string
def getid(d):
    """Return a random 9-character id made of lowercase letters and digits.

    The argument ``d`` is accepted so the function can be used with ``map``,
    but its value is ignored.
    """
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choices(alphabet, k=9))

In [None]:
# Give every external row a fresh random id (Tamil frame first, then Hindi).
# The mapped `language` value only drives one getid call per row.
for qa_frame in (tamil_train, hindi_train):
    qa_frame['id'] = qa_frame['language'].map(getid)

In [None]:
# Keep only the columns used downstream, in a fixed order, for both frames.
qa_columns = ['id','context', 'question', 'answer_text', 'answer_start', 'language']
tamil_train = tamil_train[qa_columns]
hindi_train = hindi_train[qa_columns]

In [None]:
# Convert each Tamil answer offset to an integer.
tamil_train['answer_start'] = tamil_train['answer_start'].map(int)

In [None]:
# Merge the external Hindi and Tamil rows, then add them to the competition
# training frame. pd.concat replaces DataFrame.append (removed in pandas 2.0)
# and yields the same rows in the same order.
hindi_train = pd.concat([hindi_train, tamil_train])
train = pd.concat([train, hindi_train])

In [None]:
# Shuffle all rows reproducibly (frac=1 keeps every row) and display the frame.
train=train.sample(frac=1, random_state=seed)
train

In [None]:
# Hindi and Tamil SQuAD files from the google-translated dataset; their
# `answers` column is parsed in the next cell.
google_hindi_train = pd.read_csv("../input/google-translated-squad20-to-hindi-and-tamil/squad_hi.csv")
google_tamil_train = pd.read_csv("../input/google-translated-squad20-to-hindi-and-tamil/squad_ta.csv")

In [None]:
import ast

# For each translated frame: parse the stringified `answers` column into Python
# objects, expose the first answer's text and start offset as flat columns,
# and tag the rows with their language.
for qa_frame, language in ((google_tamil_train, "tamil"), (google_hindi_train, "hindi")):
    qa_frame["answers"] = qa_frame["answers"].apply(ast.literal_eval)
    qa_frame["answer_text"] = qa_frame["answers"].apply(lambda parsed: parsed[0]["text"])
    qa_frame["answer_start"] = qa_frame["answers"].apply(lambda parsed: parsed[0]["answer_start"])
    qa_frame["language"] = language

In [None]:
# Keep only rows flagged `is_in == True`, reduced to the shared QA columns.
qa_columns = ['id','context', 'question', 'answer_text', 'answer_start', 'language']
google_tamil_train = google_tamil_train.loc[google_tamil_train.is_in == True, qa_columns]
google_hindi_train = google_hindi_train.loc[google_hindi_train.is_in == True, qa_columns]

In [None]:
google_tamil_train.columns, google_hindi_train.columns

In [None]:
external_data = pd.DataFrame()

In [None]:
# Take a reproducible 50% sample of each translated frame and stack them
# (Tamil first, then Hindi). The frame is built directly with pd.concat:
# DataFrame.append was removed in pandas 2.0, and appending onto the existing
# `external_data` would add another copy of the rows each time the cell is re-run.
external_data = pd.concat([
    google_tamil_train.sample(frac=0.5, random_state=seed),
    google_hindi_train.sample(frac=0.5, random_state=seed),
])

In [None]:
# Keep the first row for every id, then show the resulting size.
external_data = external_data.drop_duplicates(subset=["id"])
external_data.shape

In [None]:
external_data.isna().sum()

In [None]:
# Shuffle the external rows reproducibly (frac=1 keeps every row) and display them.
external_data=external_data.sample(frac=1, random_state=seed)
external_data

In [None]:
def preprocess_data(df):
    """Strip leading whitespace from the ``question`` and ``context`` columns.

    Only columns that are present are touched. The frame is modified in place
    and the same object is returned.
    """
    present_columns = df.columns.tolist()
    for text_column in ("question", "context"):
        if text_column not in present_columns:
            continue
        df[text_column] = df[text_column].apply(lambda text: text.lstrip())
    return df

In [None]:
# Left-strip question/context in the external rows, then add them to the
# training frame. pd.concat replaces DataFrame.append (removed in pandas 2.0)
# and yields the same rows in the same order.
external_data = preprocess_data(external_data)
train = pd.concat([train, external_data])

In [None]:
train

In [None]:
def adjust_answer_start(df):
    """Re-align ``answer_start`` with where ``answer_text`` occurs in ``context``.

    For every row, if the context slice at ``answer_start`` already equals
    ``answer_text`` the row gets ``answer_end`` added. Otherwise the span is
    shifted to the left one character at a time (at most 9999 steps, and never
    before the beginning of the context) until it matches; on a match both
    ``answer_start`` and ``answer_end`` are updated. Rows for which no match is
    found are returned unchanged, without an ``answer_end`` value.

    Args:
        df: frame with ``context``, ``answer_text`` and ``answer_start`` columns.

    Returns:
        A new DataFrame built from the (possibly adjusted) rows.
    """
    adjusted_rows = []
    for row in df.to_dict("records"):
        context = row['context']
        answer = row['answer_text']
        start_idx = int(row['answer_start'])
        end_idx = start_idx + len(answer)

        # Negative offsets are never accepted: Python would silently interpret
        # them as positions counted from the end of the context.
        if start_idx >= 0 and context[start_idx:end_idx] == answer:
            row['answer_end'] = end_idx
        else:
            # Shift 0 was just tested. Capping the shift at start_idx keeps the
            # slice start at or above 0, so the search cannot wrap around and
            # record a negative answer_start.
            for n in range(1, min(start_idx, 9999) + 1):
                if context[start_idx - n:end_idx - n] == answer:
                    row['answer_start'] = start_idx - n
                    row['answer_end'] = end_idx - n
                    break
        adjusted_rows.append(row)
    return pd.DataFrame(adjusted_rows)

train_data_cleaned = adjust_answer_start(train)

In [None]:
# Keep only rows whose answer span really occurs at answer_start in the context.
# c counts the rows kept, inc the rows dropped.
c = 0
inc = 0
final_data = []
for row in train_data_cleaned.to_dict("records"):
    start_idx = int(row['answer_start'])
    end_idx = start_idx + len(row['answer_text'])
    # A negative start would make the slice count from the end of the context
    # and could match by accident, so such rows are rejected explicitly.
    if start_idx >= 0 and row['context'][start_idx:end_idx] == row['answer_text']:
        c += 1
        final_data.append(row)
    else:
        inc += 1

c, inc

In [None]:
train = pd.DataFrame(final_data)

In [None]:
train.shape

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Windowing settings passed to every tokenizer call in this notebook:
# doc_stride is given as `stride`, max_length as `max_length`.
doc_stride = 128
max_length = 380

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples into fixed-length windows and label the answer span.

    Question/context pairs are encoded with overflow enabled, so one example may
    produce several windows (features). For every window the token indices of the
    answer start and end are appended to ``start_positions`` / ``end_positions``.
    Examples with no answer, and windows that do not fully contain the answer,
    are labelled with the index of the CLS token for both positions.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"answers"`` entries;
            each answer is read as a dict with ``"answer_start"`` and ``"text"`` lists.
            The ``"question"`` entry is overwritten with left-stripped strings.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions`` added.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized_examples=tokenizer(examples["question"],examples["context"],truncation=True, max_length=max_length,stride=doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True,padding="max_length",)

    # sample_mapping[i] is the index of the example that window i was cut from;
    # offset_mapping[i] holds the (start_char, end_char) pair of every token in window i.
    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        # The CLS position is the label used when the window has no answer.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Character span of the (first) answer inside the context.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # First and last token of the second sequence (the context) in this window.
            token_start_index = sequence_ids.index(1)
            token_end_index = len(sequence_ids) - sequence_ids[::-1].index(1) - 1
            # Answer not fully inside this window: label with CLS.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Walk forward past every token starting at or before start_char;
                # the token just before the stopping point is the answer's first token.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                # Walk backward past every token ending at or after end_char;
                # the token just after the stopping point is the answer's last token.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
    return tokenized_examples

In [None]:
def convert_answers(r):
    """Pack an (answer_start, answer_text) pair into the ``answers`` dict format.

    Args:
        r: a two-element row; either a pandas Series (as passed by
            ``DataFrame.apply(..., axis=1)``) or any sequence indexable with 0 and 1.

    Returns:
        ``{'answer_start': [start], 'text': [text]}``.
    """
    # A Series with string labels must be indexed positionally through .iloc;
    # plain r[0] on such a Series is deprecated label-as-position indexing.
    values = r.iloc if hasattr(r, "iloc") else r
    start = values[0]
    text = values[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

In [None]:
# Shuffle the cleaned rows again, then build the `answers` column (one dict per row,
# produced by convert_answers) that prepare_train_features reads.
train = train.sample(frac=1, random_state=seed)
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

In [None]:
# Hold out the last 64 shuffled rows for validation; everything before them trains.
holdout_rows = 64
df_train = train.iloc[:-holdout_rows].reset_index(drop=True)
df_valid = train.iloc[-holdout_rows:].reset_index(drop=True)

In [None]:
%env WANDB_DISABLED=True

In [None]:
data_collator = default_data_collator

In [None]:
# Wrap both pandas frames as `datasets.Dataset` objects so they can be tokenized with .map.
train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)

In [None]:
# Tokenize in batches; the original columns are removed so only the outputs of
# prepare_train_features remain. The validation split is tokenized with the same
# function, so it carries start/end labels too.
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=valid_dataset.column_names)

In [None]:
# Training hyper-parameters consumed by the TrainingArguments cell below.
# "checkpoint" is the output directory name; "strategy" is used for both the
# evaluation and the save strategy.
# NOTE(review): "num_train_epochs" is never read - the trainer takes its epoch
# count from "epochs".
config_args={"checkpoint":"CuttingChaii-masala",
            "learning_rate":3e-5,
            "warmup_ratio":0.1,
            "gradient_accumulation_steps":8,
            "num_train_epochs":1,
            "weight_decay":0.01,
            "strategy":"epoch",
            "epochs":1,
            "batch_size":4
}

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Build the training configuration from config_args; the first positional
# argument is the checkpoint/output name, and the same batch size is used for
# training and evaluation.
args = TrainingArguments(
    config_args['checkpoint'],
    evaluation_strategy = config_args['strategy'],
    num_train_epochs=config_args['epochs'],
    save_strategy = config_args['strategy'],
    learning_rate=config_args['learning_rate'],
    weight_decay=config_args['weight_decay'],
    warmup_ratio=config_args['warmup_ratio'],
    gradient_accumulation_steps=config_args['gradient_accumulation_steps'],
    per_device_train_batch_size=config_args['batch_size'],
    per_device_eval_batch_size=config_args['batch_size'],
)
# Same collator that an earlier cell already assigned to this name.
data_collator = default_data_collator
# The trainer evaluates on the 64-row validation split tokenized above.
trainer = Trainer(model,args,train_dataset=tokenized_train_ds,eval_dataset=tokenized_valid_ds,\
                  data_collator=data_collator,tokenizer=tokenizer)

In [None]:
trainer.train()

In [None]:
trainer.save_model("google-dataset-extra-chaii-2021")

In [None]:
!zip -r 'model.zip' 'google-dataset-extra-chaii-2021' -i '*'

In [None]:
from IPython.display import FileLink
FileLink(r'model.zip')

In [None]:
val_raw_predictions = trainer.predict(tokenized_valid_ds)

In [None]:
import collections

def postprocess_qa_predictions(examples, features, all_start_logits, all_end_logits, n_best_size = 20, max_answer_length = 30):
    """Turn per-feature start/end logits into one answer string per example.

    For every example, the ``n_best_size`` highest start and end logits of each
    of its features are paired up. A pair is kept only if both tokens belong to
    the context (``sequence_ids == 1``), the end is not before the start, and
    the span is shorter than ``max_answer_length`` tokens. The kept pair with
    the highest ``start_logit + end_logit`` across all features of the example
    gives the predicted text, cut out of the example's context by character
    offsets.

    Args:
        examples: DataFrame with ``id`` and ``context`` columns.
        features: list of dicts with ``id``, ``offset_mapping`` and
            ``sequence_ids`` entries; position ``i`` must correspond to row
            ``i`` of both logit arrays.
        all_start_logits: per-feature start logits, indexable by feature index.
        all_end_logits: per-feature end logits, indexable by feature index.
        n_best_size: number of top start/end candidates considered per feature.
        max_answer_length: exclusive upper bound on ``end_index - start_index``.

    Returns:
        Dict mapping example id to predicted answer text; the text is ``""``
        when an example has no features or no valid span.
    """
    print("Started Post-processing")

    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}

    # Group feature indices by the example they were cut from.
    features_per_example = {}
    for i, feature in enumerate(features):
        features_per_example.setdefault(example_id_to_index[feature["id"]], []).append(i)

    predictions = {}
    for _, example in examples.iterrows():
        # An example without any feature yields an empty prediction instead of a KeyError.
        feature_indices = features_per_example.get(example_id_to_index[example["id"]], [])
        context = example["context"]

        possible_answers = []
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            feature = features[feature_index]

            # Keep character offsets only for context tokens; question, special
            # and padding positions become None so they can never be selected.
            sequence_ids = feature["sequence_ids"]
            offsets = [
                offset if sequence_ids[idx] == 1 else None
                for idx, offset in enumerate(feature["offset_mapping"])
            ]

            # Indices of the n_best_size largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (start_index >= len(offsets) or end_index >= len(offsets)
                            or offsets[start_index] is None or offsets[end_index] is None
                            or end_index < start_index
                            or end_index - start_index >= max_answer_length):
                        continue

                    start_char = offsets[start_index][0]
                    end_char = offsets[end_index][1]
                    score = start_logits[start_index] + end_logits[end_index]
                    possible_answers.append((score, context[start_char:end_char]))

        if possible_answers:
            # max() returns the first of equally scored candidates, exactly like
            # taking element 0 of a stable descending sort.
            best_text = max(possible_answers, key=lambda candidate: candidate[0])[1]
        else:
            best_text = ""

        predictions[example["id"]] = best_text

    return predictions

In [None]:
def get_test_features(examples):
    """Tokenize one example row into the feature dicts used for post-processing.

    The row's question/context pair is encoded with the same settings as
    prepare_train_features and prepare_test_features, so one row may produce
    several overlapping windows. One dict is returned per window.

    Args:
        examples: a single row exposing ``id``, ``context`` and ``question``.

    Returns:
        List of dicts, one per window, holding the row's ``id``, ``context``
        and ``question`` plus that window's ``input_ids``, ``attention_mask``,
        ``offset_mapping`` and ``sequence_ids`` (with ``None`` replaced by 0).
    """
    # Left-strip the question exactly as prepare_train_features and
    # prepare_test_features do. The logits are produced from datasets tokenized
    # by those functions and are matched to these features by position, so the
    # tokenizer input must be identical here.
    question = examples["question"].lstrip()
    tokenized_examples = tokenizer(
        question,
        examples["context"],
        truncation=True,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    all_features_per_example = []
    data_keys = ['id','context','question']
    tokenised_data_keys = ['input_ids','attention_mask','offset_mapping']
    for i in range(len(tokenized_examples["input_ids"])):
        # Row-level fields are copied as stored in the row (question unstripped).
        feature = {key: examples[key] for key in data_keys}
        for key in tokenised_data_keys:
            feature[key] = tokenized_examples[key][i]
        # None is mapped to 0, so only context tokens keep sequence id 1.
        feature["sequence_ids"] = [0 if j is None else j for j in tokenized_examples.sequence_ids(i)]
        all_features_per_example.append(feature)
    return all_features_per_example

In [None]:
# Flatten the per-row feature lists into one list, in df_valid row order.
val_features = [
    feature
    for _, valid_row in df_valid.iterrows()
    for feature in get_test_features(valid_row)
]

In [None]:
from string import punctuation

val_start_logits = val_raw_predictions.predictions[0]
val_end_logits = val_raw_predictions.predictions[1]
fin_preds = postprocess_qa_predictions(df_valid, val_features, val_start_logits, val_end_logits)

# Collapse every run of whitespace in a predicted answer into a single space.
final_predictions = {
    example_id: " ".join(answer.split()).strip()
    for example_id, answer in fin_preds.items()
}
submission = list(final_predictions.items())

sample = pd.DataFrame(submission, columns=["id", "PredictionString"])

In [None]:
references = [{"id": ex["id"], "answer": ex["answers"]['text'][0]} for ex in valid_dataset]

In [None]:
test_data =pd.merge(left=df_valid,right=sample,on='id')
test_data[["id","answer_text","PredictionString"]]
def jaccard(row): 
    """Word-level Jaccard similarity between the first two entries of ``row``.

    Both strings are lower-cased and split on whitespace; the score is the size
    of the intersection of the two token sets divided by the size of their union.

    Args:
        row: a two-element row; either a pandas Series (as passed by
            ``DataFrame.apply(..., axis=1)``) or any sequence indexable with 0 and 1.

    Returns:
        A float in [0, 1]. When neither string contains a token the union is
        empty and 0.0 is returned instead of dividing by zero.
    """
    # A Series with string labels must be indexed positionally through .iloc;
    # plain row[0] on such a Series is deprecated label-as-position indexing.
    values = row.iloc if hasattr(row, "iloc") else row
    str1 = values[0]
    str2 = values[1]
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return float(len(a & b)) / union_size

test_data['jaccard'] = test_data[['answer_text', 'PredictionString']].apply(jaccard, axis=1)
np.mean(test_data.jaccard)

In [None]:
# Load the sample submission, which is filled with predictions further down.
# `test` is read again here; it was first loaded next to `train` near the top.
sub_data=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv')
test=pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')

In [None]:
# Flatten the per-row feature lists into one list, in test row order.
test_features = [
    feature
    for _, test_row in test.iterrows()
    for feature in get_test_features(test_row)
]

In [None]:
test_dataset = Dataset.from_pandas(test)

In [None]:
def prepare_test_features(examples):
    """Tokenize a batch of unlabeled question/context pairs.

    The ``"question"`` entry of ``examples`` is overwritten with left-stripped
    strings, then the pairs are encoded with overflow enabled, so one example
    may produce several windows. The tokenizer output is returned as is.
    """
    stripped_questions = [question.lstrip() for question in examples["question"]]
    examples["question"] = stripped_questions
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

In [None]:
tokenized_test_ds = test_dataset.map(prepare_test_features, batched=True, remove_columns=test_dataset.column_names)

In [None]:
test_raw_predictions = trainer.predict(tokenized_test_ds)

In [None]:
from string import punctuation

test_start_logits = test_raw_predictions.predictions[0]
test_end_logits = test_raw_predictions.predictions[1]
fin_preds = postprocess_qa_predictions(test, test_features, test_start_logits, test_end_logits)

# Collapse every run of whitespace in a predicted answer into a single space.
final_predictions = {
    example_id: " ".join(answer.split())
    for example_id, answer in fin_preds.items()
}
submission = list(final_predictions.items())

sample = pd.DataFrame(submission, columns=["id", "PredictionString"])

In [None]:
# Fill the submission from the whitespace-normalised predictions built in the
# previous cell. The raw `fin_preds` strings can still contain newlines or
# repeated spaces taken from the context, and the validation score above was
# computed on normalised text as well.
sub_data['PredictionString'] = sub_data['id'].apply(lambda r: final_predictions[r])
sub_data.head()

In [None]:
sub_data.to_csv('submission.csv', index=False)