In [1]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

In [2]:
# Competition inputs: the test questions to answer, and the sample submission
# whose `id` column is reused when the final predictions are written out.
test = pd.read_csv(
    '../input/chaii-hindi-and-tamil-question-answering/test.csv'
)
sub_data = pd.read_csv(
    '../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv'
)

In [3]:
# Directory of the question-answering checkpoint. The tokenizer is loaded from
# it here, and the model is loaded from the same `model_name` in a later cell.
# Alternative checkpoint, currently disabled:
# model_name = "../input/huggingface-question-answering-models/multilingual/xlm-roberta-large-squad2"
model_name = "../input/greenchaii/extra-chaii-2021"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Token length of every tokenized window: passed to the tokenizer as
# `max_length` together with truncation and padding="max_length".
max_length = 380
# Passed to the tokenizer as `stride` when a long context overflows into
# several windows.
doc_stride = 128

def get_train_features(examples):
    """Tokenize one training example into fixed-length windows with answer labels.

    The question/context pair is encoded with the module-level ``tokenizer``
    using ``max_length`` and ``doc_stride``. Overflowing tokens are returned as
    additional windows, and one feature dict is produced per window.

    Args:
        examples: A single example supporting ``[]`` lookup (for instance a
            DataFrame row or a dict) with the keys ``question``, ``context``,
            ``answer_text``, ``answer_start`` and ``answer_end``. The answer
            bounds are compared directly with the tokenizer's character
            offsets; ``answer_end`` is presumably the exclusive end offset
            (``answer_start + len(answer_text)``) - TODO confirm against the
            code that builds this column.

    Returns:
        list[dict]: One dict per window with the keys ``input_ids``,
        ``attention_mask``, ``offset_mapping``, ``start_positions`` and
        ``end_positions``. Windows that do not contain the whole answer, and
        examples without an answer, are labelled with the index of the CLS
        token for both positions.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_examples["offset_mapping"]
    answer_text = examples["answer_text"]
    answer_start = examples["answer_start"]
    answer_end = examples["answer_end"]

    # A missing answer read from a DataFrame shows up as a float NaN rather
    # than None, so treat both as "no answer".
    answer_missing = answer_text is None or (
        isinstance(answer_text, float) and answer_text != answer_text
    )

    example_wise_features = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        temp_feature = {
            "input_ids": input_ids,
            "attention_mask": tokenized_examples["attention_mask"][i],
            "offset_mapping": offsets,
        }

        sequence_ids = tokenized_examples.sequence_ids(i)
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Default label: the CLS token, used whenever this window cannot be
        # labelled with the real answer span.
        start_position = cls_index
        end_position = cls_index

        if not answer_missing:
            # First and last token of this window that belong to the context
            # (sequence id 1).
            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

            answer_outside_window = (
                offsets[context_start][0] > answer_start
                or offsets[context_end][1] < answer_end
            )
            if not answer_outside_window:
                # Advance to the first context token that starts after the
                # answer start; the token before it is the answer's first
                # token. The scan is bounded by the context span so it cannot
                # run on into trailing special/padding tokens (which typically
                # carry (0, 0) offsets) when the answer begins on the last
                # context token.
                token_start = context_start
                while token_start <= context_end and offsets[token_start][0] <= answer_start:
                    token_start += 1
                start_position = token_start - 1

                # Walk back to the last context token that ends before the
                # answer end; the token after it is the answer's last token.
                token_end = context_end
                while token_end >= context_start and offsets[token_end][1] >= answer_end:
                    token_end -= 1
                end_position = token_end + 1

        temp_feature["start_positions"] = start_position
        temp_feature["end_positions"] = end_position
        example_wise_features.append(temp_feature)
    return example_wise_features


def get_test_features(examples):
    """Tokenize one test example into windows ready for inference.

    The question/context pair is encoded with the module-level ``tokenizer``
    using ``max_length`` and ``doc_stride``; one feature dict is produced per
    returned window.

    Args:
        examples: A single example supporting ``[]`` lookup with the keys
            ``id``, ``context`` and ``question``.

    Returns:
        list[dict]: One dict per window holding the example's ``id``,
        ``context`` and ``question``, the window's ``input_ids``,
        ``attention_mask`` and ``offset_mapping``, and ``sequence_ids`` in
        which every ``None`` entry has been replaced by ``0``.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    example_fields = ('id', 'context', 'question')
    window_fields = ('input_ids', 'attention_mask', 'offset_mapping')

    window_features = []
    for window in range(len(encoded["input_ids"])):
        # Fields copied unchanged from the example come first, then the
        # per-window tokenizer outputs.
        record = {field: examples[field] for field in example_fields}
        for field in window_fields:
            record[field] = encoded[field][window]
        # Entries without a sequence id are folded into sequence 0, so only
        # the value 1 is left to mark a separate group of tokens.
        record["sequence_ids"] = [
            0 if seq_id is None else seq_id
            for seq_id in encoded.sequence_ids(window)
        ]
        window_features.append(record)
    return window_features

In [5]:
import torch

class ChaiiDataset(torch.utils.data.Dataset):
    """Map-style dataset over per-window feature dicts for the QA model.

    In ``"train"`` mode every item is a dict of tensors for the keys listed in
    ``train_keys``. In ``"test"`` mode ``input_ids`` and ``attention_mask`` are
    converted to tensors, while ``id``, ``context`` and ``question`` are passed
    through unchanged.

    Args:
        encodings: Indexable, sized collection of feature dicts (for example
            the lists returned by ``get_train_features`` /
            ``get_test_features``).
        run_type: Either ``"train"`` or ``"test"``.

    Raises:
        ValueError: If ``run_type`` is not one of the supported modes. An
            unknown mode previously made ``__getitem__`` return ``None``
            silently.
    """

    # Supported values of `run_type`.
    _RUN_TYPES = ("train", "test")
    # Test-mode keys that are converted to tensors; the rest are passed through.
    _TENSOR_KEYS = ("input_ids", "attention_mask")

    def __init__(self, encodings, run_type="train"):
        super().__init__()
        if run_type not in self._RUN_TYPES:
            raise ValueError(
                f"run_type must be one of {self._RUN_TYPES}, got {run_type!r}"
            )
        self.input_data = encodings
        self.run_type = run_type
        self.train_keys = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']
        self.test_keys = ['input_ids', 'attention_mask', 'id', 'context', 'question']

    def __getitem__(self, idx):
        """Return the feature at ``idx`` in the layout selected by ``run_type``."""
        row = self.input_data[idx]
        if self.run_type == "train":
            return {key: torch.tensor(row[key]) for key in self.train_keys}
        if self.run_type == "test":
            return {
                key: torch.tensor(row[key]) if key in self._TENSOR_KEYS else row[key]
                for key in self.test_keys
            }
        # Only reachable if `run_type` was reassigned after construction.
        raise ValueError(
            f"run_type must be one of {self._RUN_TYPES}, got {self.run_type!r}"
        )

    def __len__(self):
        """Return the number of feature dicts held by the dataset."""
        return len(self.input_data)

In [6]:
import collections

def postprocess_qa_predictions(examples, features, all_start_logits, all_end_logits, n_best_size = 20, max_answer_length = 30):
    """Turn per-window start/end logits into one answer string per example.

    For every example, all windows (features) carrying the same ``id`` are
    scanned. Within a window the ``n_best_size`` highest-scoring start and end
    token indices are paired, pairs that fall outside the context or form an
    invalid span are dropped, and the remaining spans are scored by
    ``start_logit + end_logit``. The text of the best-scoring span across all
    of an example's windows is the prediction.

    Args:
        examples: DataFrame with at least the columns ``id`` and ``context``
            (rows are read through ``iterrows``).
        features: Sequence of feature dicts with the keys ``id``,
            ``offset_mapping`` and ``sequence_ids``. Tokens whose sequence id
            is ``1`` are treated as context tokens, and their offsets are used
            as character positions into the example's ``context``.
        all_start_logits: Start logits, indexed first by feature position and
            then by token position.
        all_end_logits: End logits with the same layout.
        n_best_size: Number of top start and top end indices paired per window.
        max_answer_length: Spans with ``end - start >= max_answer_length``
            (in tokens) are discarded.

    Returns:
        dict: Maps each example ``id`` to the predicted answer text, or to
        ``""`` when no valid span exists (including when the example has no
        features at all).
    """
    print("Started Post-processing")

    # Group the positions of the windows by the id of the example they belong to.
    feature_indices_by_id = collections.defaultdict(list)
    for feature_index, feature in enumerate(features):
        feature_indices_by_id[feature["id"]].append(feature_index)

    predictions = {}
    for _, example in examples.iterrows():
        context = example["context"]
        best_score = None
        best_text = ""

        for feature_index in feature_indices_by_id.get(example["id"], ()):
            feature = features[feature_index]
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            # Keep offsets only for context tokens; everything else becomes
            # None so it can never be picked as part of an answer.
            sequence_ids = feature["sequence_ids"]
            context_offsets = [
                offset if sequence_ids[idx] == 1 else None
                for idx, offset in enumerate(feature["offset_mapping"])
            ]
            n_tokens = len(context_offsets)

            # Token indices of the highest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                if start_index >= n_tokens or context_offsets[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if (end_index >= n_tokens or context_offsets[end_index] is None
                            or end_index < start_index
                            or end_index - start_index >= max_answer_length):
                        continue

                    score = start_logits[start_index] + end_logits[end_index]
                    # Strict comparison keeps the first span seen on ties.
                    if best_score is None or score > best_score:
                        start_char = context_offsets[start_index][0]
                        end_char = context_offsets[end_index][1]
                        best_score = score
                        best_text = context[start_char: end_char]

        predictions[example["id"]] = best_text

    return predictions

In [7]:
# Expand every test row into its tokenized windows and collect them in one
# flat list, then wrap that list for inference.
test_features = []
for i, row in test.iterrows():
    test_features += get_test_features(row)

test_dataset = ChaiiDataset(
    test_features,
    run_type='test',
)

In [8]:
# Earlier checkpoint path, currently disabled:
# trained_model = AutoModelForQuestionAnswering.from_pretrained("./chaii-qa/checkpoint-36")
# Load the question-answering model from the same `model_name` directory the
# tokenizer was loaded from.
trained_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Wrap the model in a Trainer built with default arguments; in the cells shown
# here it is only used for `.predict`.
pred_model = Trainer(trained_model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# Run the model over every test window. Elements 0 and 1 of the result's
# `predictions` are later passed to post-processing as start and end logits.
raw_predictions = pred_model.predict(test_dataset)

In [10]:
# Rebuild the per-window feature dicts for the test set (the same computation
# as in the earlier cell). Post-processing reads `id`, `offset_mapping` and
# `sequence_ids` from these dicts.
test_features = []
for i, row in test.iterrows():
    test_features += get_test_features(row)

In [11]:
from string import punctuation

# Convert the raw start/end logits into one answer string per example id.
fin_preds = postprocess_qa_predictions(
    test,
    test_features,
    raw_predictions.predictions[0],
    raw_predictions.predictions[1],
)

# Collapse every run of whitespace in an answer to a single space and drop
# leading/trailing whitespace.
final_predictions = {
    example_id: " ".join(answer.split())
    for example_id, answer in fin_preds.items()
}

# The same (id, answer) pairs as a list of rows, and as a DataFrame for display.
submission = list(final_predictions.items())
sample = pd.DataFrame(submission, columns=["id", "PredictionString"])

Started Post-processing


In [12]:
# Display the cleaned predictions table built in the previous cell.
sample

Unnamed: 0,id,PredictionString
0,22bff3dec,येलन
1,282758170,28 नवम्बर 2007
2,d60987e0e,१२ मार्च १८२४
3,f99c770dc,13
4,40dec1964,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்


In [13]:
# Look up each submission id in the cleaned predictions; ids without a
# prediction get None.
predicted_answers = sub_data['id'].map(final_predictions.get)
sub_data['PredictionString'] = predicted_answers
sub_data.head()

Unnamed: 0,id,PredictionString
0,22bff3dec,येलन
1,282758170,28 नवम्बर 2007
2,d60987e0e,१२ मार्च १८२४
3,f99c770dc,13
4,40dec1964,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்


In [14]:
# Write the submission file without the DataFrame index column.
sub_data.to_csv('submission.csv', index=False)