In [None]:
import pickle
import time
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from datasets import load_metric

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast
from transformers import Trainer, TrainingArguments

In [None]:
# Notebook-wide configuration.
# Batch size used for the train/eval DataLoaders and as the Trainer's per-device batch size.
BATCH_SIZE = 32
# Fraction of unique premises held out for evaluation (the rest go to training).
EVAL_PART_SIZE = 0.02
# Output directory handed to TrainingArguments.
MODELS_PATH = './models/'
# Directory that contains train.csv and test.csv.
DATASETS_PATH = '../input/contradictory-my-dear-watson/'
# Checkpoint name used for both the tokenizer and the model.
CUR_MODEL_NAME = 'xlm-roberta-base'

In [None]:
def init_random_seed(value=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        value: integer seed shared by all generators.
    """
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    # Seed every visible GPU, not only the current one (safe to call without CUDA).
    torch.cuda.manual_seed_all(value)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels from run to run; keep it off.
    torch.backends.cudnn.benchmark = False


init_random_seed()

# Data Explore

In [None]:
train_eval_df = pd.read_csv(DATASETS_PATH + 'train.csv')
test_df = pd.read_csv(DATASETS_PATH + 'test.csv')

In [None]:
train_eval_df

## Explore train/test intersection

In [None]:
train_eval_df.loc[train_eval_df.premise.isin(test_df.premise)]

In [None]:
train_eval_df.loc[train_eval_df.hypothesis.isin(test_df.hypothesis)]

## Explore label/language distribution

In [None]:
lang_agg = train_eval_df.groupby('language').count()

In [None]:
# Pie chart of how many rows each language contributes.
# lang_agg comes from groupby('language').count(), so its first column
# (presumably `id`) holds a per-language count of non-null values.
sizes = lang_agg.iloc[:, 0]
labels = sizes.index

fig1, ax1 = plt.subplots(figsize=(10, 10))
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
ax1.set_title('Share of examples per language')
plt.show()

In [None]:
# Share of each label over the whole frame first, then one Series per language.
lang_stats_list = [
    (train_eval_df.groupby('label')['id'].count() / train_eval_df.shape[0]).rename('general')
]
for lang in train_eval_df.lang_abv.unique():
    # Rows written in the current language.
    lang_rows = train_eval_df.loc[train_eval_df.lang_abv == lang]
    lang_share = lang_rows.groupby('label')['id'].count() / lang_rows.shape[0]
    lang_stats_list.append(lang_share.rename(lang))

In [None]:
# Label proportion per language
pd.concat(lang_stats_list, axis=1)

In [None]:
train_eval_df.loc[train_eval_df[['premise', 'hypothesis', 'label']].duplicated()]

In [None]:
train_eval_df.premise.str.len().describe()

In [None]:
train_eval_df.hypothesis.str.len().describe()

In [None]:
# duplicated(keep=False) flags every row whose premise appears at least twice,
# so the sum is the number of rows that share their premise with another row.
n_unique_premises = train_eval_df.premise.nunique()
n_rows_with_repeated_premise = train_eval_df.premise.duplicated(keep=False).sum()
print('Overall number of unique premises:', n_unique_premises)
print('Number of premises occure more than once:', n_rows_with_repeated_premise)

In [None]:
train_eval_df.groupby('premise')['hypothesis'].count().describe()

### Quick thoughts
* Label distribution is balanced across all languages, so the **accuracy** metric remains representative.
* The dataset is quite small, so **overfitting** is to be expected; it will mostly affect predictions of the '1' label.
* Open question: should the train and eval datasets share some premises?

# Data splitting


In [None]:
# Create an index for each unique premise.
# pd.factorize numbers the premises 0..n-1 in order of first appearance, which
# matches enumerating `premise.unique()`. Using assign() instead of merge+rename
# keeps the cell idempotent: re-running it overwrites `premise_idx` rather than
# adding a second column with the same name.
# NOTE(review): assumes there are no missing premises (factorize codes NaN as -1).
train_eval_df = train_eval_df.assign(premise_idx=pd.factorize(train_eval_df.premise)[0])
train_eval_df

In [None]:
# premise_idx is zero-based, so the number of distinct premises is max + 1.
# Using max() alone dropped the last premise from the shuffled index list,
# which silently forced it into the eval split every time.
N_UNIQUE_PREMISES = int(train_eval_df.premise_idx.max()) + 1
# Number of unique premises assigned to the training split.
TRAIN_PREMISES_NUMB = int((1 - EVAL_PART_SIZE) * N_UNIQUE_PREMISES)

In [None]:
# Shuffle the premise indices reproducibly: re-seed, then permute in place.
premise_idxes_list = list(range(N_UNIQUE_PREMISES))
init_random_seed()
random.shuffle(premise_idxes_list)

In [None]:
train_premise_idxes = premise_idxes_list[:TRAIN_PREMISES_NUMB]

In [None]:
# Split by premise so that no premise is shared between train and eval.
is_train_row = train_eval_df.premise_idx.isin(train_premise_idxes)
# .copy() makes both frames independent of train_eval_df, so columns can be
# added to them later without pandas' SettingWithCopyWarning.
train_df = train_eval_df.loc[is_train_row].copy()
eval_df = train_eval_df.loc[~is_train_row].copy()
assert train_df.shape[0] + eval_df.shape[0] == train_eval_df.shape[0]
print('Train dataset size:', train_df.shape[0])
print('Eval dataset size:', eval_df.shape[0])
print('Test dataset size:', test_df.shape[0])

# Data Preprocess

 A RoBERTa sequence has the following format:

<p>single sequence: &lt;s> X &lt;/s></p>
<p>pair of sequences: &lt;s> A &lt;/s>&lt;/s> B &lt;/s></p>

In [None]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained(CUR_MODEL_NAME)

In [None]:
# Turn off the tokenizers library's internal parallelism via its environment switch.
# NOTE(review): presumably meant to avoid the fork/parallelism warning once
# tokenized data is used from DataLoader workers - confirm. The variable is set
# after the tokenizer object has already been created; consider moving this
# (and the import) to the top of the notebook.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
print('Vocab size:', tokenizer.vocab_size)

In [None]:
BOS_TOKEN_ID = tokenizer.bos_token_id
EOS_TOKEN_ID = tokenizer.eos_token_id
PAD_TOKEN_ID = tokenizer.pad_token_id

In [None]:
# Encode (premise, hypothesis) pairs into token ids; only input_ids are kept,
# the attention mask is rebuilt at collate time.
# truncation=True caps every pair at the tokenizer's model_max_length, so an
# unusually long example cannot overflow the model's position embeddings.
train_features_encoded = tokenizer(
    train_df[['premise', 'hypothesis']].values.tolist(),
    truncation=True,
).input_ids
train_labels = train_df.label.values.tolist()

eval_features_encoded = tokenizer(
    eval_df[['premise', 'hypothesis']].values.tolist(),
    truncation=True,
).input_ids
eval_labels = eval_df.label.values.tolist()

In [None]:
print('Max seq len in train/eval selections:', max(len(doc) for doc in train_features_encoded), max(len(doc) for doc in eval_features_encoded))

In [None]:
class SeqClassDataset(Dataset):
    """Map-style dataset over pre-tokenized sequences with optional labels.

    Args:
        features: indexable collection of token-id sequences.
        targets: optional indexable collection of integer labels aligned with
            ``features``; leave as ``None`` for unlabeled data.
    """

    def __init__(self, features, targets=None) -> None:
        super().__init__()
        self.features, self.targets = features, targets

    def __len__(self):
        """Return the number of examples (feature sequences)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)``; ``label`` is ``None`` when unlabeled.

        ``token_ids`` is a 1-D LongTensor and ``label`` a LongTensor of shape (1,).
        """
        token_ids = torch.LongTensor(self.features[idx])
        if self.targets is None:
            return token_ids, None
        return token_ids, torch.LongTensor([self.targets[idx]])


In [None]:
train_dataset = SeqClassDataset(train_features_encoded, train_labels)
eval_dataset = SeqClassDataset(eval_features_encoded, eval_labels)

In [None]:
def custom_collate(data):
    """Collate ``(token_ids, label)`` pairs into a padded model-input batch.

    Args:
        data: sequence of ``(features, target)`` tuples as produced by
            ``SeqClassDataset``; ``target`` may be ``None`` for unlabeled data.

    Returns:
        dict with ``input_ids`` (right-padded with ``PAD_TOKEN_ID``),
        ``attention_mask`` (1 where the id differs from ``PAD_TOKEN_ID``, else 0)
        and ``labels`` (concatenated targets, or ``None`` if the first target
        is ``None``).
    """
    token_seqs, targets = zip(*data)
    input_ids = torch.nn.utils.rnn.pad_sequence(
        token_seqs, batch_first=True, padding_value=PAD_TOKEN_ID
    )
    batch = {
        'input_ids': input_ids,
        'attention_mask': (input_ids != PAD_TOKEN_ID).int(),
    }
    if targets[0] is None:
        batch['labels'] = None
    else:
        batch['labels'] = torch.cat(targets)
    return batch

In [None]:
init_random_seed()
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=custom_collate, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, collate_fn=custom_collate, shuffle=False)

In [None]:
next(iter(train_dataloader))

# Model loading

In [None]:
# git-lfs is installed here ahead of pushing the model to the Hub later on.
# -y answers apt's confirmation prompt, which a notebook cell cannot do interactively.
!sudo apt-get install -y git-lfs
!git lfs install

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
import wandb

wandb.login()

In [None]:
wandb.init(project="XLMRobertaBase NLI CMDW", entity="remeris")

In [None]:
# The classification head is newly (randomly) initialised on top of the
# pretrained encoder. Re-seed right before loading so that its initial weights
# do not depend on how much randomness earlier cells consumed.
init_random_seed()
model = XLMRobertaForSequenceClassification.from_pretrained(CUR_MODEL_NAME, num_labels=len(set(train_labels)))

# Train/Eval stage

In [None]:
# Metric object for the MNLI configuration of GLUE; used by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate
# `evaluate` package in recent `datasets` releases - confirm against the
# installed version.
glue_metric = load_metric('glue', 'mnli')

In [None]:
def compute_metrics(data):
    """Score one evaluation pass with the GLUE/MNLI metric.

    Args:
        data: pair ``(logits, labels)``; ``logits`` has one row per example and
            one column per class, ``labels`` holds the reference class ids.

    Returns:
        The dict produced by ``glue_metric.compute``.
    """
    logits, labels = data
    predictions = logits.argmax(1).tolist()
    # reshape(-1) always yields a 1-D array; squeeze() would collapse a
    # single-example eval set to a 0-d array whose tolist() is a bare scalar.
    references = labels.reshape(-1).tolist()
    return glue_metric.compute(predictions=predictions, references=references)

In [None]:
# Fine-tuning configuration handed to the Trainer below.
training_args = TrainingArguments(
    # Output directory and batch sizes come from the notebook-level constants.
    output_dir=MODELS_PATH,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    do_train=True,
    do_eval=True,
    
    # Setting arguments for early-stopping-ish
    # Evaluate, checkpoint and log once per epoch; the model is selected on the
    # 'accuracy' value reported by compute_metrics.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_strategy='epoch',
    
    # Couldn't find hyperparameters for XLM-R, so they were taken from the original RoBERTa paper
    learning_rate=1e-5,
    weight_decay=0.1,
    warmup_ratio=0.06,
    num_train_epochs=10,
    save_total_limit=1,
    
    # Publish the model to the Hugging Face Hub under this repository id.
    # NOTE(review): hub_strategy='end' presumably pushes only once training has finished - confirm.
    push_to_hub=True,
    hub_model_id='Remeris/XLMRobertaBase_NLI_CMDW',
    hub_strategy='end'
)

In [None]:
# Wire model, data, batching and metric computation into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data and batching
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=custom_collate,
    # Tokenizer is passed along with the model; metrics are computed by compute_metrics
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.push_to_hub()

In [None]:
# Predict a label for every eval example (eval_dataloader is not shuffled, so
# predictions line up with the rows of eval_df).
model.eval()  # make sure dropout is disabled
predicted_labels = []
with torch.no_grad():  # inference only: no autograd graph needed
    for batch in eval_dataloader:
        batch = {key: tensor.to(model.device) for key, tensor in batch.items()}
        # Labels are part of the batch, so the positional output is (loss, logits);
        # access the logits by name instead of by index.
        out_logits = model(**batch).logits
        predicted_labels += out_logits.argmax(1).tolist()
# assign() builds a new frame, which avoids SettingWithCopyWarning on the
# eval_df slice and keeps the cell safe to re-run.
eval_df = eval_df.assign(predicted_labels=predicted_labels)

In [None]:
print(classification_report(eval_df.label, eval_df.predicted_labels))

# Test predict and submit

In [None]:
test_df

In [None]:
# Encode the unlabeled test pairs the same way as train/eval.
# truncation=True caps every pair at the tokenizer's model_max_length; the test
# set's sequence lengths were never inspected, so an over-long pair would
# otherwise crash the forward pass.
test_features_encoded = tokenizer(
    test_df[['premise', 'hypothesis']].values.tolist(),
    truncation=True,
).input_ids
# No targets: the dataset yields (features, None) and the collate function sets labels to None.
test_dataset = SeqClassDataset(test_features_encoded)
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=custom_collate, shuffle=False)

In [None]:
# Predict a label for every test example (test_dataloader is not shuffled, so
# predictions line up with the rows of test_df).
model.eval()  # make sure dropout is disabled
predicted_labels = []
with torch.no_grad():  # inference only: no autograd graph needed
    for batch in test_dataloader:
        # The collate function sets 'labels' to None for unlabeled data; drop
        # such entries (equivalent to passing labels=None) and move the rest
        # to the model's device.
        batch = {
            key: tensor.to(model.device)
            for key, tensor in batch.items()
            if tensor is not None
        }
        # Access the logits by name rather than by position in the output.
        out_logits = model(**batch).logits
        predicted_labels += out_logits.argmax(1).tolist()
# assign() returns a new frame, so re-running the cell simply overwrites the column.
test_df = test_df.assign(prediction=predicted_labels)

In [None]:
test_df[['id', 'prediction']]

In [None]:
test_df[['id', 'prediction']].to_csv("submission.csv", index = False)