# Chaii data training

Trainig using XLM-Roberta

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 8.8 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 390 kB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 46.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 48.8 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 45.3 MB/s 
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)
[K    

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[?25l[K     |                                | 10 kB 27.3 MB/s eta 0:00:01[K     |▏                               | 20 kB 28.8 MB/s eta 0:00:01[K     |▎                               | 30 kB 21.8 MB/s eta 0:00:01[K     |▍                               | 40 kB 17.8 MB/s eta 0:00:01[K     |▌                               | 51 kB 9.9 MB/s eta 0:00:01[K     |▋                               | 61 kB 9.3 MB/s eta 0:00:01[K     |▊                               | 71 kB 9.6 MB/s eta 0:00:01[K     |▉                               | 81 kB 10.7 MB/s eta 0:00:01[K     |█                               | 92 kB 11.0 MB/s eta 0:00:01[K     |█                               | 102 kB 9.2 MB/s eta 0:00:01[K     |█▏                              | 112 kB 9.2 MB/s eta 0:00:01[K     |█▎                              | 122 kB 9.2 MB/s eta 0:00:01[K     |█▍                              | 133 kB 9.2 MB/s e

In [3]:
# importing the dependencies
EXP_NAME = 'pretraining_mbert'
FOLDER_NAME = EXP_NAME
SEED = 68

In [4]:
import torch
torch.cuda.is_available()

True

In [5]:
import os

import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler

from datasets import Dataset
from sklearn import model_selection 
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

from tqdm.autonotebook import tqdm

import gc

import re
import random
random.seed(SEED)

import warnings
warnings.filterwarnings("ignore")

In [7]:
model_checkpoint = 'bert-base-multilingual-cased'
batch_size = 8
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_length = 384
doc_stride = 128
pad_on_right = tokenizer.padding_side == "right"
n_folds = 3

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

## Processing the features

In [8]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


In [9]:
# Create data folds
def create_folds(data, num_splits):
    data["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=2021)
    for fold_num, (t_, v_) in enumerate(kf.split(X=data, y=data.language.values)):
        data.loc[v_, "kfold"] = fold_num
    return data

def convert_answers(row):
    return {"answer_start": [row[0]], "text": [row[1]]}


In [10]:
def negative_sampling(examples, ratio=0.1):
    
    def _sample(pos):
        if pos != 0: return True
        else: return random.random() < ratio
                
    indices = [i for i,x in enumerate(examples['start_positions']) if _sample(x)]

    for key in examples.keys():
        examples[key] = [x for i,x in enumerate(examples[key]) if i in indices]
        
    return examples

## Create training and validation

In [11]:
!mkdir datasets

In [13]:
# Import the files
extended_hi = '/content/datasets/xquad.csv'
hi_data = pd.read_csv(extended_hi)

# Tamil
extended_ta = '/content/datasets/squad_translated_tamil.csv'
ta_data = pd.read_csv(extended_ta)

# Edit the tamil dataset:
ta_data['language'] = 'tamil'

In [14]:
# concatenate the data:
ext_data = pd.concat([hi_data, ta_data])
ext_data.head()

Unnamed: 0,context,question,answer_text,answer_start,language
0,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,पैंथर्स डिफ़ेंस ने कितने अंक दिए?,308,35.0,hindi
1,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,जेरेड एलन के पास कितने करियर सैक थे?,136,380.0,hindi
2,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,ल्यूक कुएक्ली ने कितने टैकल रजिस्टर किए?,118,743.0,hindi
3,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,जोश नॉर्मन ने कितने बॉल को इंटरसेप्ट किया?,चार,90.0,hindi
4,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,इस सीज़न में टीम से किसने सबसे अधिक सैक रजिस्टर...,कावन शॉर्ट,169.0,hindi


In [15]:
# ad an id column:
index = 0
col = list(ext_data.index)
ext_data.insert(loc=index, column='id', value=col)
ext_data.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,0,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,पैंथर्स डिफ़ेंस ने कितने अंक दिए?,308,35.0,hindi
1,1,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,जेरेड एलन के पास कितने करियर सैक थे?,136,380.0,hindi
2,2,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,ल्यूक कुएक्ली ने कितने टैकल रजिस्टर किए?,118,743.0,hindi
3,3,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,जोश नॉर्मन ने कितने बॉल को इंटरसेप्ट किया?,चार,90.0,hindi
4,4,पैंथर्स की डिफ़ेन्स ने लीग में केवल 308 अंक दिए...,इस सीज़न में टीम से किसने सबसे अधिक सैक रजिस्टर...,कावन शॉर्ट,169.0,hindi


In [16]:
# Split into training and validation
# Shuffle the data
idx = np.arange(len(ext_data))
np.random.seed(111)
np.random.shuffle(idx)

ext_data = ext_data.iloc[idx[:]]

# Reset the indices:
ext_data = ext_data.reset_index(drop=True)


In [17]:
# Process the dataset
# Convert the answers:
def convert_answers(row):
    return {'answer_start': [row[0]], 'text': [row[1]]}

# Helper functions
def create_folds(data, num_splits):
    data["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=2021)
    for fold_num, (t_, v_) in enumerate(kf.split(X=data, y=data.language.values)):
        data.loc[v_, "kfold"] = fold_num
    return data

ext_data['answers'] = ext_data[['answer_start', 'answer_text']].apply(convert_answers, axis=1)
ext_data = create_folds(ext_data, 3)

fold = 0

train_set = ext_data[ext_data.kfold != fold]
val_set = ext_data[ext_data.kfold == fold]

In [18]:
#
train_data = Dataset.from_pandas(train_set)
valid_data = Dataset.from_pandas(val_set)

## Tokenize the features

In [19]:
# Tokenizing 
train_features = train_data.map(prepare_train_features, batched=True, remove_columns=train_data.column_names)
valid_features = valid_data.map(prepare_train_features, batched=True, remove_columns=valid_data.column_names)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [20]:
# Negative sampling
train_str1 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.1}, batched=True, batch_size=8)
train_str2 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.2}, batched=True, batch_size=8)

  0%|          | 0/513 [00:00<?, ?ba/s]

  0%|          | 0/513 [00:00<?, ?ba/s]

In [21]:
from datasets import concatenate_datasets
tokenized_train_all = concatenate_datasets([train_str1, train_str2])

## Actual training of the model

In [22]:
from datasets import load_metric
metric = load_metric("f1")

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

In [23]:
from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler

from transformers.file_utils import is_datasets_available

class MyTrainer(Trainer): 

    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        if is_datasets_available() and isinstance(train_dataset, Dataset):
            train_dataset = self._remove_unused_columns(train_dataset, description="training")

        train_sampler = SequentialSampler(self.train_dataset)

        return DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

In [24]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-bas

In [26]:
args = TrainingArguments(
    f"chaii-qa-{EXP_NAME}",
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    logging_steps = 100,
    save_steps = 1000,
    save_strategy = "steps",
    learning_rate=3e-4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
    save_total_limit=15
)

data_collator = default_data_collator

trainer = MyTrainer(
    model,
    args,
    train_dataset=tokenized_train_all,
    eval_dataset=valid_features,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices


In [28]:
trainer.train()
trainer.save_model(f"{FOLDER_NAME}/final")

***** Running training *****
  Num examples = 6921
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 648


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

## Preparing validation data

In [29]:
import collections

def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples



In [30]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        predictions[example["id"]] = best_answer["text"]

    return predictions

In [31]:
def jaccard(row): 
    str1 = row[0]
    str2 = row[1]
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [32]:
from string import punctuation

def postuning(s):
    s = " ".join(s.split())
    s = s.strip(punctuation)
    return s

In [33]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(row):
    truth = row[0]
    prediction = row[1]
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(row):
    truth = row[0]
    prediction = row[1]
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [34]:
train_data

Dataset({
    features: ['id', 'context', 'question', 'answer_text', 'answer_start', 'language', 'answers', 'kfold', '__index_level_0__'],
    num_rows: 3171
})

In [35]:
validation_features = valid_data.map(prepare_validation_features,batched=True, remove_columns=valid_data.column_names)

valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

max_answer_length = 30

examples = valid_data
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

references = [{"id": ex["id"], "context": ex["context"], "question": ex["question"], "answer": ex["answer_text"]} for ex in valid_data]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2047 [00:00<?, ?ex/s]

In [36]:
# final
raw_predictions = trainer.predict(valid_feats_small)
final_predictions = postprocess_qa_predictions(valid_data, validation_features, raw_predictions.predictions)
res = pd.DataFrame(references)
res['prediction'] = res['id'].apply(lambda r: final_predictions[r])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res['postuned'] = res['prediction'].apply(postuning)
res['pjaccard'] = res[['answer', 'postuned']].apply(jaccard, axis=1)
res.jaccard.mean(), res.pjaccard.mean()

***** Running Prediction *****
  Num examples = 2047
  Batch size = 8


Step,Training Loss,Validation Loss


Post-processing 1586 example predictions split into 2047 features.


  0%|          | 0/1586 [00:00<?, ?it/s]

(0.3471113839572225, 0.3378832784491673)

In [37]:
res['em'] = res[['answer', 'prediction']].apply(compute_exact_match, axis=1)
res.em.mean()

0.2931904161412358

In [38]:
res['f1'] = res[['answer', 'prediction']].apply(compute_f1, axis=1)
res.f1.mean()

0.3734037899028879

## 5. Repeat the same for next 2 folds

In [39]:
# Fold 1
from datasets import Dataset
fold = 1

train_set = ext_data[ext_data.kfold != fold]
val_set = ext_data[ext_data.kfold == fold]
train_data = Dataset.from_pandas(train_set)
valid_data = Dataset.from_pandas(val_set)

In [41]:
# Change the model checkpoint
model_checkpoint = '/content/pretraining_mbert/final'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
EXP_NAME = 'pretraining_mbert_2'
FOLDER_NAME = EXP_NAME


Didn't find file /content/pretraining_mbert/final/added_tokens.json. We won't load it.
loading file /content/pretraining_mbert/final/vocab.txt
loading file /content/pretraining_mbert/final/tokenizer.json
loading file None
loading file /content/pretraining_mbert/final/special_tokens_map.json
loading file /content/pretraining_mbert/final/tokenizer_config.json


In [42]:
# Repeat the same
# Tokenizing 
train_features = train_data.map(prepare_train_features, batched=True, remove_columns=train_data.column_names)
valid_features = valid_data.map(prepare_train_features, batched=True, remove_columns=valid_data.column_names)

# Negative sampling
train_str1 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.1}, batched=True, batch_size=8)
train_str2 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.2}, batched=True, batch_size=8)

from datasets import concatenate_datasets
tokenized_train_all = concatenate_datasets([train_str1, train_str2])

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"chaii-qa-{EXP_NAME}",
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    logging_steps = 100,
    save_steps = 600,
    save_strategy = "steps",
    learning_rate=3e-4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
    save_total_limit=15
)

data_collator = default_data_collator

trainer = MyTrainer(
    model,
    args,
    train_dataset=tokenized_train_all,
    eval_dataset=valid_features,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/511 [00:00<?, ?ba/s]

  0%|          | 0/511 [00:00<?, ?ba/s]

loading configuration file /content/pretraining_mbert/final/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

loading weights file /content/pretraining_mbert/final/pytorch_

In [43]:
trainer.train()
trainer.save_model(f"{FOLDER_NAME}/final")

***** Running training *****
  Num examples = 6880
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 645


Step,Training Loss,Validation Loss
100,2.009,1.51367
200,1.8303,1.438463
300,1.3726,1.852664
400,0.8582,1.760857
500,0.5089,1.836333
600,0.2336,1.84526


***** Running Evaluation *****
  Num examples = 2064
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2064
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2064
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2064
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2064
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2064
  Batch size = 8
Saving model checkpoint to chaii-qa-pretraining_mbert_2/checkpoint-600
Configuration saved in chaii-qa-pretraining_mbert_2/checkpoint-600/config.json
Model weights saved in chaii-qa-pretraining_mbert_2/checkpoint-600/pytorch_model.bin
tokenizer config file saved in chaii-qa-pretraining_mbert_2/checkpoint-600/tokenizer_config.json
Special tokens file saved in chaii-qa-pretraining_mbert_2/checkpoint-600/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to pretraining_mbert_2/final
Configuratio

In [44]:
validation_features = valid_data.map(prepare_validation_features,batched=True, remove_columns=valid_data.column_names)

valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

max_answer_length = 30

examples = valid_data
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

references = [{"id": ex["id"], "context": ex["context"], "question": ex["question"], "answer": ex["answer_text"]} for ex in valid_data]

# Round 2 results
# final
raw_predictions = trainer.predict(valid_feats_small)
final_predictions = postprocess_qa_predictions(valid_data, validation_features, raw_predictions.predictions)
res = pd.DataFrame(references)
res['prediction'] = res['id'].apply(lambda r: final_predictions[r])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res['postuned'] = res['prediction'].apply(postuning)
res['pjaccard'] = res[['answer', 'postuned']].apply(jaccard, axis=1)
res['em'] = res[['answer', 'prediction']].apply(compute_exact_match, axis=1)
res['f1'] = res[['answer', 'prediction']].apply(compute_f1, axis=1)

print("Model results:\n")
print(f"Jaccard Score: {res.jaccard.mean()}\nJaccard Score after post-tuning:{res.pjaccard.mean()}\nEM Score: {res.em.mean()}\nF1 Score: {res.f1.mean()}")



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2064 [00:00<?, ?ex/s]

***** Running Prediction *****
  Num examples = 2064
  Batch size = 8


Post-processing 1586 example predictions split into 2064 features.


  0%|          | 0/1586 [00:00<?, ?it/s]

Model results:

Jaccard Score: 0.5811668096977056
Jaccard Score after post-tuning:0.5659348194657153
EM Score: 0.5245901639344263
F1 Score: 0.606713574045935


## Fold 3

In [45]:
from datasets import Dataset
fold = 2

train_set = ext_data[ext_data.kfold != fold]
val_set = ext_data[ext_data.kfold == fold]
train_data = Dataset.from_pandas(train_set)
valid_data = Dataset.from_pandas(val_set)

In [46]:
# Change the model checkpoint
model_checkpoint = '/content/pretraining_mbert_2/final'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
EXP_NAME = 'pretraining_mbert_3'
FOLDER_NAME = EXP_NAME


Didn't find file /content/pretraining_mbert_2/final/added_tokens.json. We won't load it.
loading file /content/pretraining_mbert_2/final/vocab.txt
loading file /content/pretraining_mbert_2/final/tokenizer.json
loading file None
loading file /content/pretraining_mbert_2/final/special_tokens_map.json
loading file /content/pretraining_mbert_2/final/tokenizer_config.json


In [47]:
# Repeat the same
# Tokenizing 
train_features = train_data.map(prepare_train_features, batched=True, remove_columns=train_data.column_names)
valid_features = valid_data.map(prepare_train_features, batched=True, remove_columns=valid_data.column_names)

# Negative sampling
train_str1 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.1}, batched=True, batch_size=8)
train_str2 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.2}, batched=True, batch_size=8)

from datasets import concatenate_datasets
tokenized_train_all = concatenate_datasets([train_str1, train_str2])

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"chaii-qa-{EXP_NAME}",
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    logging_steps = 100,
    save_steps = 600,
    save_strategy = "steps",
    learning_rate=3e-4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
    save_total_limit=15
)

data_collator = default_data_collator

trainer = MyTrainer(
    model,
    args,
    train_dataset=tokenized_train_all,
    eval_dataset=valid_features,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/514 [00:00<?, ?ba/s]

  0%|          | 0/514 [00:00<?, ?ba/s]

loading configuration file /content/pretraining_mbert_2/final/config.json
Model config BertConfig {
  "_name_or_path": "/content/pretraining_mbert/final",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

loading weights file /content/pretraining_mbert_2/final/

In [None]:
trainer.train()
trainer.save_model(f"{FOLDER_NAME}/final")

***** Running training *****
  Num examples = 6910
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 648


Step,Training Loss,Validation Loss
100,1.216,0.943776
200,1.1603,0.905501
300,0.805,1.45308
400,0.4868,0.899347
500,0.2661,1.188287
600,0.1374,0.870463


***** Running Evaluation *****
  Num examples = 2038
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2038
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2038
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2038
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2038
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2038
  Batch size = 8
Saving model checkpoint to chaii-qa-pretraining_mbert_3/checkpoint-600
Configuration saved in chaii-qa-pretraining_mbert_3/checkpoint-600/config.json
Model weights saved in chaii-qa-pretraining_mbert_3/checkpoint-600/pytorch_model.bin
tokenizer config file saved in chaii-qa-pretraining_mbert_3/checkpoint-600/tokenizer_config.json
Special tokens file saved in chaii-qa-pretraining_mbert_3/checkpoint-600/special_tokens_map.json


In [None]:
validation_features = valid_data.map(prepare_validation_features,batched=True, remove_columns=valid_data.column_names)

valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

max_answer_length = 30

examples = valid_data
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

references = [{"id": ex["id"], "context": ex["context"], "question": ex["question"], "answer": ex["answer_text"]} for ex in valid_data]

# Round 2 results
# final
raw_predictions = trainer.predict(valid_feats_small)
final_predictions = postprocess_qa_predictions(valid_data, validation_features, raw_predictions.predictions)
res = pd.DataFrame(references)
res['prediction'] = res['id'].apply(lambda r: final_predictions[r])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res['postuned'] = res['prediction'].apply(postuning)
res['pjaccard'] = res[['answer', 'postuned']].apply(jaccard, axis=1)
res['em'] = res[['answer', 'prediction']].apply(compute_exact_match, axis=1)
res['f1'] = res[['answer', 'prediction']].apply(compute_f1, axis=1)

print("Model results:\n")
print(f"Jaccard Score: {res.jaccard.mean()}\nJaccard Score after post-tuning:{res.pjaccard.mean()}\nEM Score: {res.em.mean()}\nF1 Score: {res.f1.mean()}")

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1702 [00:00<?, ?ex/s]

***** Running Prediction *****
  Num examples = 1702
  Batch size = 8


Post-processing 1585 example predictions split into 1702 features.


  0%|          | 0/1585 [00:00<?, ?it/s]

Model results:

Jaccard Score: 0.7353704163823703
Jaccard Score after post-tuning:0.7269389407426927
EM Score: 0.7141955835962145
F1 Score: 0.7623966071788132


## Working on the chaii dataset

In [None]:
# Run only once
from google.colab import files
uploaded = files.upload()
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# Dataset
!kaggle competitions download -c chaii-hindi-and-tamil-question-answering

Downloading test.csv to /content
  0% 0.00/137k [00:00<?, ?B/s]
100% 137k/137k [00:00<00:00, 69.5MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/75.0 [00:00<?, ?B/s]
100% 75.0/75.0 [00:00<00:00, 75.1kB/s]
Downloading train.csv.zip to /content
  0% 0.00/6.78M [00:00<?, ?B/s]
100% 6.78M/6.78M [00:00<00:00, 59.8MB/s]


In [None]:
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [None]:
# Import the chaii dataset
chaii_data = pd.read_csv('/content/train.csv.zip')

# Preprocess the data
chaii_data['answers'] = chaii_data[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

# Shuffle the data
chaii_data.head()


Unnamed: 0,id,context,question,answer_text,answer_start,language,answers
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,"{'answer_start': [53], 'text': ['206']}"
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,"{'answer_start': [2358], 'text': ['காசுமீரில்']}"
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil,"{'answer_start': [0], 'text': ['சர் அலெக்ஸாண்ட..."
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,"{'answer_start': [68], 'text': ['தாலாட்டு']}"
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil,"{'answer_start': [585], 'text': ['சூரியனும்']}"


In [None]:
# Split the data into training test:
# Setting the ratios
from sklearn.model_selection import train_test_split
full_train, test = train_test_split(chaii_data, test_size = 0.3)


In [None]:
# Model
fold = 0
model_checkpoint = '/content/pretraining_mbert_3/final'
FOLDER = f"chaii_fold_{fold}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Didn't find file /content/pretraining_xlm_3/final/sentencepiece.bpe.model. We won't load it.
Didn't find file /content/pretraining_xlm_3/final/added_tokens.json. We won't load it.
loading file None
loading file /content/pretraining_xlm_3/final/tokenizer.json
loading file None
loading file /content/pretraining_xlm_3/final/special_tokens_map.json
loading file /content/pretraining_xlm_3/final/tokenizer_config.json


## Cross validation

In [None]:
full_train = create_folds(full_train, 2)
full_train.head()

In [None]:
# Process the data
train = full_train[full_train.kfold1 != fold]
valid = full_train[full_train.kfold1 == fold]

train_data = Dataset.from_pandas(train)
valid_data = Dataset.from_pandas(val)
test_data = Dataset.from_pandas(test)

# Prepare features
train_features = train_data.map(prepare_train_features, batched=True, remove_columns=train_data.column_names)
valid_features = valid_data.map(prepare_train_features, batched=True, remove_columns=valid_data.column_names)

# Negative sampling
train_str1 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.1}, batched=True, batch_size=8)
train_str2 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.2}, batched=True, batch_size=8)

from datasets import concatenate_datasets
tokenized_train_all = concatenate_datasets([train_str1, train_str2])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1325 [00:00<?, ?ba/s]

  0%|          | 0/1325 [00:00<?, ?ba/s]

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"chaii-qa-{EXP_NAME}",
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    logging_steps = 100,
    save_steps = 600,
    save_strategy = "steps",
    learning_rate=3e-4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
    save_total_limit=15
)

data_collator = default_data_collator

trainer = MyTrainer(
    model,
    args,
    train_dataset=tokenized_train_all,
    eval_dataset=valid_features,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

loading configuration file /content/pretraining_xlm_3/final/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "/content/pretraining_xlm_2/final",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file /content/pretraining_xlm_3/final/pytorch_model.bin
All model checkpoint weights were used when initializing XLMRobertaForQuestionAnswering.

All

In [None]:
trainer.train()
trainer.save_model(FOLDER)

***** Running training *****
  Num examples = 4794
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 450


Step,Training Loss,Validation Loss
100,1.7339,0.81092
200,0.8678,1.071206
300,0.4559,0.6583
400,0.1858,0.824028


***** Running Evaluation *****
  Num examples = 1900
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1900
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1900
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1900
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=450, training_loss=0.7313121096293131, metrics={'train_runtime': 715.0441, 'train_samples_per_second': 20.113, 'train_steps_per_second': 0.629, 'total_flos': 2818477466532864.0, 'train_loss': 0.7313121096293131, 'epoch': 3.0})

In [None]:
validation_features = valid_data.map(prepare_validation_features,batched=True, remove_columns=valid_data.column_names)

valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

max_answer_length = 30

examples = valid_data
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

references = [{"id": ex["id"], "context": ex["context"], "question": ex["question"], "answer": ex["answer_text"]} for ex in valid_data]

# Round 2 results
# final
raw_predictions = trainer.predict(valid_feats_small)
final_predictions = postprocess_qa_predictions(valid_data, validation_features, raw_predictions.predictions)
res = pd.DataFrame(references)
res['prediction'] = res['id'].apply(lambda r: final_predictions[r])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res['postuned'] = res['prediction'].apply(postuning)
res['pjaccard'] = res[['answer', 'postuned']].apply(jaccard, axis=1)
res['em'] = res[['answer', 'prediction']].apply(compute_exact_match, axis=1)
res['f1'] = res[['answer', 'prediction']].apply(compute_f1, axis=1)

print("Model results:\n")
print(f"Jaccard Score: {res.jaccard.mean()}\nJaccard Score after post-tuning:{res.pjaccard.mean()}\nEM Score: {res.em.mean()}\nF1 Score: {res.f1.mean()}")

### Fold 1

In [None]:
# Model
fold = 1
model_checkpoint = '/content/pretraining_mbert_3/final'
FOLDER = f"chaii_fold_{fold}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Process the data
fold = 0
train = full_train[full_train.kfold1 != fold]
valid = full_train[full_train.kfold1 == fold]

train_data = Dataset.from_pandas(train)
valid_data = Dataset.from_pandas(val)
test_data = Dataset.from_pandas(test)

# Prepare features
train_features = train_data.map(prepare_train_features, batched=True, remove_columns=train_data.column_names)
valid_features = valid_data.map(prepare_train_features, batched=True, remove_columns=valid_data.column_names)

# Negative sampling
train_str1 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.1}, batched=True, batch_size=8)
train_str2 = train_features.map(negative_sampling, fn_kwargs={'ratio':0.2}, batched=True, batch_size=8)

from datasets import concatenate_datasets
tokenized_train_all = concatenate_datasets([train_str1, train_str2])

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"chaii-qa-{EXP_NAME}",
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    logging_steps = 100,
    save_steps = 600,
    save_strategy = "steps",
    learning_rate=3e-4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
    save_total_limit=15
)

data_collator = default_data_collator

trainer = MyTrainer(
    model,
    args,
    train_dataset=tokenized_train_all,
    eval_dataset=valid_features,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()
trainer.save_model(FOLDER)

In [None]:
validation_features = valid_data.map(prepare_validation_features,batched=True, remove_columns=valid_data.column_names)

valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

max_answer_length = 30

examples = valid_data
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

references = [{"id": ex["id"], "context": ex["context"], "question": ex["question"], "answer": ex["answer_text"]} for ex in valid_data]

# Round 2 results
# final
raw_predictions = trainer.predict(valid_feats_small)
final_predictions = postprocess_qa_predictions(valid_data, validation_features, raw_predictions.predictions)
res = pd.DataFrame(references)
res['prediction'] = res['id'].apply(lambda r: final_predictions[r])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res['postuned'] = res['prediction'].apply(postuning)
res['pjaccard'] = res[['answer', 'postuned']].apply(jaccard, axis=1)
res['em'] = res[['answer', 'prediction']].apply(compute_exact_match, axis=1)
res['f1'] = res[['answer', 'prediction']].apply(compute_f1, axis=1)

print("Model results:\n")
print(f"Jaccard Score: {res.jaccard.mean()}\nJaccard Score after post-tuning:{res.pjaccard.mean()}\nEM Score: {res.em.mean()}\nF1 Score: {res.f1.mean()}")

## Test on unknown data

In [None]:
# On test data:
test_features = test_data.map(prepare_validation_features,batched=True, remove_columns=valid_data.column_names)

test_feats_small = test_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

max_answer_length = 30

examples = test_data
features = test_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

references = [{"id": ex["id"], "context": ex["context"], "question": ex["question"], "answer": ex["answer_text"]} for ex in valid_data]

# Round 2 results
# final
raw_predictions = trainer.predict(test_feats_small)
final_predictions = postprocess_qa_predictions(test_data, test_features, raw_predictions.predictions)
res = pd.DataFrame(references)
res['prediction'] = res['id'].apply(lambda r: final_predictions[r])
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res['postuned'] = res['prediction'].apply(postuning)
res['pjaccard'] = res[['answer', 'postuned']].apply(jaccard, axis=1)
res['em'] = res[['answer', 'prediction']].apply(compute_exact_match, axis=1)
res['f1'] = res[['answer', 'prediction']].apply(compute_f1, axis=1)

print("Model results:\n")
print(f"Jaccard Score: {res.jaccard.mean()}\nJaccard Score after post-tuning:{res.pjaccard.mean()}\nEM Score: {res.em.mean()}\nF1 Score: {res.f1.mean()}")