In [None]:
!nvidia-smi

In [None]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/Colab/kaggle/kaggle.json ~/.kaggle

!pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
!cp -r /content/drive/MyDrive/Colab/kaggle/chaii-hindi-and-tamil-question-answering/input /content
!mkdir /content/checkpoint

In [None]:
!pip install -U pytorch-lightning transformers wandb sentencepiece torchsummaryX

In [1]:
import os
import random
from dataclasses import dataclass
import copy
import gc

import numpy as np
import pandas as pd
import math
import sklearn.model_selection as sms

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
import torch.optim as optim
from torchmetrics import MeanMetric
from torch.autograd import Variable

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger
import wandb

from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from torchsummaryX import summary

from collections import defaultdict, OrderedDict
from tqdm.notebook import tqdm

import json
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

os.environ["TOKENIZERS_PARALLELISM"] = "false"

## config

In [2]:
@dataclass
class Config:
    #########################
    # Globals #
    #########################
    exp_name = "exp015"
    debug = False
    gpus = 1
    num_workers = 8
    num_epochs = 2
    grad_accumulate = 3
    fp16 = False
    seed = 1234
    #########################
    # Data #
    #########################
    train_csv = "input/train.csv"
    test_csv = "input/test.csv"
    sample_submission_csv = "input/sample_submission.csv"
    mlqa_hindi_csv = "input/external_data/mlqa_hindi.csv"
    xquad_hindi_csv = "input/external_data/xquad_hindi.csv"
    xquad_tamil_csv = "input/external_data/squad_translated_tamil.csv"
    checkpoint_dir = "drive/MyDrive/Colab/kaggle/chaii-hindi-and-tamil-question-answering/checkpoint"
    #########################
    # Split #
    #########################
    split_name = "StratifiedKFold"
    split_params = {
        "n_splits": 5,
        "random_state": 1234,
        "shuffle": True,
        }
    #########################
    # Tokenizer #
    #########################
    tokenizer_name = "deepset/xlm-roberta-large-squad2"
    max_seq_length = 400
    doc_stride = 135
    truncation = "only_second"
    padding = "max_length"
    #########################
    # Dataset #
    #########################
    #########################
    # DataLoader #
    #########################
    train_batch_size = 3
    valid_batch_size = 8
    test_batch_size = 128
    #########################
    # Model #
    #########################
    base_model_name = "deepset/xlm-roberta-large-squad2"
    base_model_config = "deepset/xlm-roberta-large-squad2"
    num_classes = 2
    init_layers = 1
    #########################
    # Criterion #
    #########################
    loss_name = "CrossEntropyLoss"
    loss_params = {
        "ignore_index": -1,
    }
    #########################
    # Optimizer #
    #########################
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 1.5e-5,
        "weight_decay": 1e-2,
        "eps": 1e-8,
        "correct_bias": True
    }
    no_decay = ["bias", "LayerNorm.weight"]
    header_weight_decay = 1e-2
    header_lr = 1e-3
    lr_decay = 0.98
    ######################
    # Scheduler #
    ######################
    scheduler_name = "linear-warmup"
    scheduler_params = {
        "warmup_ratio": 0.1,
    }
    ######################
    # Callbacks #
    ######################
    model_checkpoint_params = {
        "monitor": "val/loss_epoch",
        "save_top_k": 1,
        "save_weights_only": True,
        "mode": "min",
    }
    early_stopping_params = {
        "monitor": "val/jaccard_epoch",
        "min_delta": 0.0,
        "patience": 5,
        "verbose": False,
        "mode": "max",
    }
    wandb_logger_params = {
        "project": "kaggle-chaii-hindi-and-tamil-question-answering"
    }


cfg = Config()

## utils

In [3]:
def load_dataset(cfg):
    train_df = pd.read_csv(cfg.train_csv, nrows= 100 if cfg.debug else None)
    test_df = pd.read_csv(cfg.test_csv)
    external_mlqa_df = pd.read_csv(cfg.mlqa_hindi_csv)
    external_xquad_df = pd.read_csv(cfg.xquad_hindi_csv)
    external_tamil_xquad_df = pd.read_csv(cfg.xquad_tamil_csv)
    external_tamil_xquad_df['language'] = "tamil"
    external_tamil_xquad_df["answer_start"] = external_tamil_xquad_df["answer_start"].astype(int)
    external_train_df = pd.concat([
                                   external_mlqa_df, 
                                   external_xquad_df, 
                                   external_tamil_xquad_df
                                   ], axis=0).reset_index(drop=True)
    sample_submission_df = pd.read_csv(cfg.sample_submission_csv)

    return train_df, test_df, external_train_df, sample_submission_df


def get_split(cfg):
    split_name = cfg.split_name
    split_params = cfg.split_params

    return sms.__getattribute__(split_name)(**split_params)


def get_fold(cfg, train_df: pd.DataFrame, y_train: pd.DataFrame):
    splitter = get_split(cfg)
    train_df["fold"] = -1
    for fold_id, (train_idx, valid_idx) in enumerate(splitter.split(train_df, y_train)):
        train_df.loc[valid_idx, "fold"] = int(fold_id)

    return train_df


def convert_answers(row):
    return {"answer_start": [row[0]], "text": [row[1]]}


def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)

    return float(len(c)) / (len(a) + len(b) - len(c))

## load data

In [4]:
train_df, test_df, external_train_df, sample_submission_df = load_dataset(cfg)

train_df = get_fold(cfg, train_df, train_df["language"])

external_train_df["fold"] = -1
external_train_df["id"] = list(np.arange(1, len(external_train_df) + 1))
train_df = pd.concat([train_df, external_train_df], axis=0).reset_index(drop=True)

# 改行文字の削除
# train_df["context"] = train_df["context"].apply(lambda x: " ".join(x.split()))
# train_df["question"] = train_df["question"].apply(lambda x: " ".join(x.split()))

train_df["answers"] = train_df[["answer_start", "answer_text"]].apply(convert_answers, axis=1)
train_df

Unnamed: 0,id,context,question,answer_text,answer_start,language,fold,answers
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,2,"{'answer_start': [53], 'text': ['206']}"
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,2,"{'answer_start': [2358], 'text': ['காசுமீரில்']}"
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil,3,"{'answer_start': [0], 'text': ['சர் அலெக்ஸாண்ட..."
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,1,"{'answer_start': [68], 'text': ['தாலாட்டு']}"
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil,0,"{'answer_start': [585], 'text': ['சூரியனும்']}"
...,...,...,...,...,...,...,...,...
11291,10178,எமிட் அரசாங்கத்தின் பணி மற்றும் நோக்கங்களை ஆதர...,மாணவர்களின் எண்ணிக்கை என்ன?,50,421,tamil,-1,"{'answer_start': [421], 'text': ['50']}"
11292,10179,"தஜிகிஸ்தான்(I /tɑːdʒiːkᵻstɑːn/, /tədʒiːkᵻstæn/...",தஜிகிஸ்தானில் எத்தனை பேர் கணக்கிடப்படுகிறார்கள்?,8 மில்லியன்,353,tamil,-1,"{'answer_start': [353], 'text': ['8 மில்லியன்']}"
11293,10180,டீனேஜர் சஞ்சய மலகார் தனது அசாதாரணமான ஹேர்டோவிற...,சஞ்சய மலகார் அமெரிக்க சிலை மீது நீக்கப்பட்டதை ...,ஏப்ரல் 18,143,tamil,-1,"{'answer_start': [143], 'text': ['ஏப்ரல் 18']}"
11294,10181,"""இஸ்லாமிய தத்துவத்திற்கான"" பொதுவான வரையறைகளில்...",இபின் சினாவால் எத்தனை புத்தகங்கள் எழுதப்பட்டுள...,450,334,tamil,-1,"{'answer_start': [334], 'text': ['450']}"


## preprocess

In [5]:
def prepare_train_features(cfg, example, tokenizer):
    example["question"] = example["question"].lstrip()

    tokenized_example = tokenizer(
        example["question"],
        example["context"],
        truncation=cfg.truncation,
        max_length=cfg.max_seq_length,
        stride=cfg.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=cfg.padding,
        # return_tensors="pt"
    )

    sample_mapping = tokenized_example.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_example.pop("offset_mapping")

    features = []
    for i, offsets in enumerate(offset_mapping):
        feature = {}

        input_ids  = tokenized_example["input_ids"][i]
        attention_mask = tokenized_example["attention_mask"][i]

        feature["input_ids"] = input_ids
        feature["attention_mask"] = attention_mask
        feature["offset_mapping"] = offsets
        feature["example_id"] = example["id"]
        feature["sequence_ids"] = [0 if i is None else i for i in tokenized_example.sequence_ids(i)]

        cls_index = input_ids.index(tokenizer.cls_token_id) # cls_token_id = 0, pad_token_id = 1
        sequence_ids = tokenized_example.sequence_ids(i)

        # sample_indx = sample_mapping[i]
        answers = example["answers"]

        if len(answers["answer_start"]) == 0: # sampleにanswerがなければ開始・終了位置をCLS(=[CLS], <s>)にする
            feature["start_position"] = cls_index
            feature["end_position"] = cls_index
        else:
            start_char = answers["answer_start"][0] # answer部分の文字レベルの開始位置
            end_char = start_char + len(answers["text"][0]) # answer部分の文字レベルの終了位置

            token_start_index = 0 # sequenceにおけるcontext部分のトークンレベルの開始位置
            while sequence_ids[token_start_index] != 1: # sequence_idsが1(=context部分)になるまで足す(<s>,</s>, querstion部分を飛ばすイメージ)
                token_start_index += 1

            token_end_index = len(input_ids) - 1 # sequenceにおけるcontext部分のトークンレベルの終了位置
            while sequence_ids[token_end_index] != 1: # sequence_idsが1(=context部分)になるまで引く(paddingはNoneなので<pad>部分を引くイメージ)
                token_end_index -= 1

            # token_start_index, token_end_indexがanswer部分の範囲外にあればpositionを0(=[CLS], <s>)とする(truncationされてる場合はこっち)
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                feature["start_position"] = cls_index
                feature["end_position"] = cls_index
            # star_char, end_charを含むtokenまでtoken_start_index, token_end_indexを調整
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                feature["start_position"] = token_start_index - 1 # 行き過ぎた1token分戻す

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                feature["end_position"] = token_end_index + 1 # 行き過ぎた1token分戻す

        features.append(feature)

    return features


def prepare_test_features(cfg, example, tokenizer):
    example["question"] = example["question"].lstrip()

    tokenized_example = tokenizer(
        example["question"],
        example["context"],
        truncation=cfg.truncation,
        max_length=cfg.max_seq_length,
        stride=cfg.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=cfg.padding,
        # return_tensors="pt"
    )

    features = []
    for i in range(len(tokenized_example["input_ids"])):
        feature = {}
        feature["example_id"] = example["id"]
        feature["context"] = example["context"]
        feature["question"] = example["question"]
        feature["input_ids"] = tokenized_example["input_ids"][i]
        feature["attention_mask"] = tokenized_example["attention_mask"][i]
        feature["offset_mapping"] = tokenized_example["offset_mapping"][i]
        feature["sequence_ids"] = [0 if i is None else i for i in tokenized_example.sequence_ids(i)]
        features.append(feature)

    return features


def postprocess_qa_predictions(tokenizer, examples: pd.DataFrame, features, raw_predictions, n_best_size=20, max_answer_length=30):
    '''予測値の後処理関数
    '''
    all_start_logits, all_end_logits = raw_predictions

    example_id_to_index = {k: i for i, k in enumerate(examples["id"])} # dataframeのidをkeyとしたindexのmapping用dict
    features_per_example = defaultdict(list) # dataframeのidに対応するexample_id_to_indexのidをkeyとしたdict。valueはfeaturesに対応するidのリスト
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = OrderedDict()
    # print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in examples.iterrows():
        feature_indices = features_per_example[example_index] # 対応するfeaturesのindexを取り出す

        # min_null_score = None
        valid_answers = []

        context = example["context"]
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            sequence_ids = features[feature_index]["sequence_ids"]
            context_index = 1 # contextのsequence_id(questionは0)
            
            # contextのみoffset_mappingを保持(questionのoffset_mappingをNoneに)
            features[feature_index]["offset_mapping"] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(features[feature_index]["offset_mapping"])
            ]
            offset_mapping = features[feature_index]["offset_mapping"]
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)

            # feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            # if min_null_score is None or min_null_score < feature_null_score:
            #     min_null_score = feature_null_score

            # start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
            # end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()
            start_indexes = np.argsort(start_logits)[::-1][:n_best_size].tolist()
            end_indexes = np.argsort(end_logits)[::-1][:n_best_size].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        # 各レコード・チャンク(feature)におけるstart+end出力値のスコアが最も大きいペアを最終的な予測値とする
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions


def postprocess_cleaned_predictions(input_df: pd.DataFrame):
    bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
    bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

    tamil_ad = "கி.பி"
    tamil_bc = "கி.மு"
    tamil_km = "கி.மீ"
    hindi_ad = "ई"
    hindi_bc = "ई.पू"


    cleaned_preds = []
    for pred, context in input_df[["PredictionString", "context"]].to_numpy():
        if pred == "":
            cleaned_preds.append(pred)
            continue
        while any([pred.startswith(y) for y in bad_starts]):
            pred = pred[1:]
        while any([pred.endswith(y) for y in bad_endings]):
            if pred.endswith("..."):
                pred = pred[:-3]
            else:
                pred = pred[:-1]
        if pred.endswith("..."):
                pred = pred[:-3]

        if any([pred.endswith(tamil_ad), pred.endswith(tamil_bc), pred.endswith(tamil_km), pred.endswith(hindi_ad), pred.endswith(hindi_bc)]) and pred+"." in context:
            pred = pred+"."

        cleaned_preds.append(pred)
        
    return cleaned_preds

## Dataset

In [6]:
class ChaiiDataset(Dataset):
    def __init__(self, features, phase):
        super(ChaiiDataset, self).__init__()
        self.features = features
        self.phase = phase
        
    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        feature = self.features[index]
        segment_target = torch.zeros(400, dtype=torch.long)
        segment_target[feature["start_position"]: feature["end_position"] + 1] = 1

        if self.phase == "train":
            return {
                "input_ids": torch.tensor(feature["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(feature["attention_mask"], dtype=torch.long),
                "offset_mapping": torch.tensor(feature["offset_mapping"], dtype=torch.long),
                "start_position": torch.tensor(feature["start_position"], dtype=torch.long),
                "end_position": torch.tensor(feature["end_position"], dtype=torch.long),
                "segment_target": segment_target,
            }
        else:
            return {
                "input_ids": torch.tensor(feature["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(feature["attention_mask"], dtype=torch.long),
                "offset_mapping": torch.tensor(feature["offset_mapping"], dtype=torch.long),
                "sequence_ids": feature["sequence_ids"],
                "id": feature["example_id"],
                "context": feature["context"],
                "question": feature["question"],
            }


class ChaiiDataModule(pl.LightningDataModule):
    def __init__(self, cfg, tokenizer, input_df: pd.DataFrame, phase: str, fold: int = 0):
        super(ChaiiDataModule, self).__init__()
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.input_df = input_df
        self.phase = phase
        self.fold = fold

    def get_train_features(self, cfg, input_df, tokenizer):
        features = []
        for i, row in tqdm(input_df.iterrows(), total=len(input_df), desc="[get features]"):
            features += prepare_train_features(cfg, row, tokenizer)
        
        return features

    def get_test_features(self, cfg, input_df, tokenizer):
        features = []
        for i, row in tqdm(input_df.iterrows(), total=len(input_df), desc="[get features]"):
            features += prepare_test_features(cfg, row, tokenizer)
        
        return features

    def setup(self, stage=None):
        assert self.phase in ("train", "test"), "Input phase is not exist."
        if self.phase == "train":
            self.train_df = self.input_df[self.input_df["fold"] != self.fold].reset_index(drop=True)
            self.valid_df = self.input_df[self.input_df["fold"] == self.fold].reset_index(drop=True)

            self.train_features = self.get_train_features(self.cfg, self.train_df, self.tokenizer)
            self.valid_features = self.get_train_features(self.cfg, self.valid_df, self.tokenizer)

            self.train_dataset = ChaiiDataset(self.train_features, self.phase)
            self.valid_dataset = ChaiiDataset(self.valid_features, self.phase)
            print(f"Number of train features: {len(self.train_dataset)}, Number of valid features: {len(self.valid_dataset)}")
        elif self.phase == "test":
            self.test_features = self.get_test_features(self.cfg, self.input_df, self.tokenizer)
            self.test_dataset = ChaiiDataset(self.test_features, self.phase)
            print(f"Number of test features: {len(self.test_dataset)}")
        else:
            raise NotImplementedError
        
    def train_dataloader(self):
        train_sampler = RandomSampler(self.train_dataset)
        return DataLoader(self.train_dataset, batch_size=self.cfg.train_batch_size, num_workers=self.cfg.num_workers, pin_memory=True, drop_last=False, sampler=train_sampler)

    def val_dataloader(self):
        valid_sampler = SequentialSampler(self.valid_dataset)
        return DataLoader(self.valid_dataset, batch_size=self.cfg.valid_batch_size, num_workers=self.cfg.num_workers, pin_memory=True, drop_last=False, sampler=valid_sampler)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.cfg.test_batch_size, num_workers=self.cfg.num_workers, pin_memory=True, shuffle=False, drop_last=False)

## Model

In [7]:
class ChaiiModel(nn.Module):
    def __init__(self, cfg):
        super(ChaiiModel, self).__init__()
        self.cfg = cfg
        self.model_config = AutoConfig.from_pretrained(self.cfg.base_model_config)
        self.encoder = AutoModel.from_pretrained(self.cfg.base_model_name, config=self.model_config)
        self.classifier = nn.Linear(self.model_config.hidden_size, cfg.num_classes)
        self._init_header_weights(self.classifier)
        # self._init_roberta_weights(self.encoder)

    def _init_header_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def _init_roberta_weights(self, model):
        for layer in model.encoder.layer[-cfg.init_layers:]:
            for module in layer.modules():
                if isinstance(module, nn.Linear):
                    module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
                    if module.bias is not None:
                        module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                    module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
                    if module.padding_idx is not None:
                        module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask=None):
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        x = self.classifier(output[0]) # (batch_size, hidden_size, num_classes)
        x0, x1 = x.split(1, dim=-1) # (batch_size, hidden_size, 1)
        start_logits, end_logits = x0.squeeze(-1), x1.squeeze(-1) # (batch_size, hidden_size)

        return start_logits, end_logits


# model = ChaiiModel(cfg)
# summary(model, torch.zeros(1, 400, dtype=torch.long))

In [8]:
def lovasz_grad(gt_sorted):
    """
    Computes gradient of the Lovasz extension w.r.t sorted errors
    See Alg. 1 in paper
    """
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - intersection / union
    if p > 1: # cover 1-pixel case
        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
    return jaccard

# --------------------------- BINARY LOSSES ---------------------------
def lovasz_hinge(logits, labels, per_image=True, ignore=None):
    """
    Binary Lovasz hinge loss
      logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
      labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
      per_image: compute the loss per image instead of per batch
      ignore: void class id
    """
    if per_image:
        loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore))
                          for log, lab in zip(logits, labels))
    else:
        loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore))
    return loss


def lovasz_hinge_flat(logits, labels):
    """
    Binary Lovasz hinge loss
      logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
      labels: [P] Tensor, binary ground truth labels (0 or 1)
      ignore: label to ignore
    """
    if len(labels) == 0:
        # only void pixels, the gradients should be 0
        return logits.sum() * 0.
    signs = 2. * labels.float() - 1.
    errors = (1. - logits * Variable(signs))
    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
    perm = perm.data
    gt_sorted = labels[perm]
    grad = lovasz_grad(gt_sorted)
    loss = torch.dot(F.relu(errors_sorted), Variable(grad))
    return loss


def flatten_binary_scores(scores, labels, ignore=None):
    """
    Flattens predictions in the batch (binary case)
    Remove labels equal to 'ignore'
    """
    scores = scores.view(-1)
    labels = labels.view(-1)
    if ignore is None:
        return scores, labels
    valid = (labels != ignore)
    vscores = scores[valid]
    vlabels = labels[valid]
    return vscores, vlabels


class StableBCELoss(torch.nn.modules.Module):
    def __init__(self):
         super(StableBCELoss, self).__init__()
    def forward(self, input, target):
         neg_abs = - input.abs()
         loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
         return loss.mean()


def binary_xloss(logits, labels, ignore=None):
    """
    Binary Cross entropy loss
      logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
      labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
      ignore: void class id
    """
    logits, labels = flatten_binary_scores(logits, labels, ignore)
    loss = StableBCELoss()(logits, Variable(labels.float()))
    return loss


# --------------------------- HELPER FUNCTIONS ---------------------------
def isnan(x):
    return x != x
    
    
def mean(l, ignore_nan=False, empty=0):
    """
    nanmean compatible with generators.
    """
    l = iter(l)
    if ignore_nan:
        l = ifilterfalse(isnan, l)
    try:
        n = 1
        acc = next(l)
    except StopIteration:
        if empty == 'raise':
            raise ValueError('Empty mean')
        return empty
    for n, v in enumerate(l, 2):
        acc += v
    if n == 1:
        return acc
    return acc / n


class LovaszHingeLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(LovaszHingeLoss, self).__init__()

    def forward(self, inputs, targets):
        # inputs = torch.sigmoid(inputs)
        Lovasz = lovasz_hinge(inputs, targets, per_image=True)

        return Lovasz

## training

In [9]:
def get_optimizer_grouped_parameters(cfg, model):
    no_decay = cfg.no_decay
    # header layerのweight_decay, lr
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "classifier" in n],
            "weight_decay": cfg.header_weight_decay,
            "lr": cfg.header_lr
        },
    ]

    # num_layers = model.base_model.config.num_hidden_layers
    layers = [getattr(model, "base_model").embeddings] + list(getattr(model, "base_model").encoder.layer)
    layers.reverse()
    lr = cfg.optimizer_params["lr"]
    
    for layer in layers:
        lr *= cfg.lr_decay
        optimizer_grouped_parameters += [
            # no_decayのリストに含まれないパラメータはweight decayを設定
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": cfg.optimizer_params["weight_decay"],
                "lr": lr,
            },
            # no_decayのリストに含まれるパラメータはweight decayを設定しない
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            }
        ]
    
    return optimizer_grouped_parameters


def get_criterion(cfg):
    loss_name = cfg.loss_name
    loss_params = cfg.loss_params
    return nn.__getattribute__(loss_name)(**loss_params)


def get_optimizer(cfg, model=None, optimizer_grouped_parameters=None):
    optimizer_name = cfg.optimizer_name
    optimizer_params = cfg.optimizer_params

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": cfg.optimizer_params["weight_decay"],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    if optimizer_name == "AdamW":
        return AdamW(
            optimizer_grouped_parameters,
            **optimizer_params
        )
    else:
        return optim.__getattribute__(optimizer_name)(model.parameters(), **optimizer_params)


def get_scheduler(cfg, optimizer, num_warmup_steps=None, num_training_steps=None):
    scheduler_name = cfg.scheduler_name
    scheduler_params = cfg.scheduler_params

    if scheduler_name == "cosine-warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
        )
    elif scheduler_name == "linear-warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
        )
    # elif scheduler_name not in ("cosine-warmup", "linear-warmup"):
    #     return optim.lr_scheduler.__getattribute__(scheduler_name)(optimizer, **scheduler_params)


class ChaiiLightningModule(pl.LightningModule):
    def __init__(self, cfg):
        super(ChaiiLightningModule, self).__init__()
        self.cfg = cfg
        self.model = ChaiiModel(self.cfg)
        self.start_criterion = get_criterion(self.cfg)
        self.end_criterion = get_criterion(self.cfg)
        self.train_mean_metric1 = MeanMetric()
        self.train_mean_metric2 = MeanMetric()
        self.valid_mean_metric1 = MeanMetric()
        self.valid_mean_metric2 = MeanMetric()
        self.lovasz_hinge = LovaszHingeLoss()

    def forward(self, input_ids, attention_mask):
        output_start, output_end = self.model(input_ids, attention_mask)
        return output_start, output_end

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, target_start, target_end, segment_target = batch["input_ids"], batch["attention_mask"], batch["start_position"], batch["end_position"], batch["segment_target"]
        output_start, output_end = self.forward(input_ids, attention_mask)

        # CrossEntropyLoss
        loss_start = self.start_criterion(output_start, target_start)
        loss_end = self.end_criterion(output_end, target_end)
        bce_loss = (loss_start + loss_end) / 2

        # Lovasz-hingeLoss
        cum_start_prob = torch.cumsum(torch.sigmoid(output_start), axis=1)
        cum_end_prob = torch.fliplr(torch.cumsum(torch.fliplr(torch.sigmoid(output_end)), axis=1))
        pred_prob = cum_start_prob * cum_end_prob
        lovaszloss = self.lovasz_hinge(pred_prob, segment_target)

        loss = bce_loss + lovaszloss * 0.5

        self.train_mean_metric1.update(loss)
        loss_avg = self.train_mean_metric1.compute()

        self.train_mean_metric2.update(bce_loss)
        bce_avg = self.train_mean_metric2.compute()

        self.log("train/loss", loss_avg, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train/bce_loss", bce_avg, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train/loss_start_epoch", loss_start.item(), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("train/loss_end_epoch", loss_end.item(), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("train/lovasz_hinge_epoch", lovaszloss.item(), on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, target_start, target_end, segment_target = batch["input_ids"], batch["attention_mask"], batch["start_position"], batch["end_position"], batch["segment_target"]
        output_start, output_end = self.forward(input_ids, attention_mask)

        # CrossEntropyLoss
        loss_start = self.start_criterion(output_start, target_start)
        loss_end = self.end_criterion(output_end, target_end)
        bce_loss = (loss_start + loss_end) / 2

        # Lovasz-hingeLoss
        cum_start_prob = torch.cumsum(torch.sigmoid(output_start), axis=1)
        cum_end_prob = torch.fliplr(torch.cumsum(torch.fliplr(torch.sigmoid(output_end)), axis=1))
        pred_prob = cum_start_prob * cum_end_prob
        lovaszloss = self.lovasz_hinge(pred_prob, segment_target)

        loss = bce_loss + lovaszloss * 0.5

        self.valid_mean_metric1.update(loss)
        loss_avg = self.valid_mean_metric1.compute()

        self.valid_mean_metric2.update(bce_loss)
        bce_avg = self.valid_mean_metric2.compute()

        self.log("val/loss", loss_avg, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("val/bce_loss", bce_avg, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("val/loss_start_epoch", loss_start.item(), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("val/loss_end_epoch", loss_end.item(), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log("val/lovasz_hinge_epoch", lovaszloss.item(), on_step=False, on_epoch=True, prog_bar=True, logger=True)

        outputs = OrderedDict({
            "start_logits": output_start.detach(),
            "end_logits": output_end.detach(),
        }
        )

        return outputs

    def validation_epoch_end(self, outputs):
        pred_start = torch.cat([output["start_logits"] for output in outputs]).cpu().numpy()
        pred_end = torch.cat([output["end_logits"] for output in outputs]).cpu().numpy()

        preds = postprocess_qa_predictions(
            self.trainer.datamodule.tokenizer,
            self.trainer.datamodule.valid_df,
            copy.deepcopy(self.trainer.datamodule.valid_features[:pred_start.shape[0]]),
            (pred_start, pred_end)
        )
        jaccard_score = np.mean([jaccard(x, y) for x, y in zip(self.trainer.datamodule.valid_df["answer_text"].values, preds.values())], axis=0)

        self.log("val/jaccard_epoch", jaccard_score, on_step=False, on_epoch=True, prog_bar=True, logger=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        output_start, output_end = self.forward(input_ids, attention_mask)

        outputs = OrderedDict({
            "start_logits": output_start,
            "end_logits": output_end,
        }
        )

        return outputs

    def configure_optimizers(self):
        # optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.cfg, self.model)
        optimizer = get_optimizer(
            self.cfg,
            self.model,
            # optimizer_grouped_parameters=optimizer_grouped_parameters
        )

        num_training_steps = math.ceil(len(self.trainer.datamodule.train_dataloader()) / self.trainer.accumulate_grad_batches) * self.trainer.max_epochs
        # num_training_steps = len(self.trainer.datamodule.train_dataloader()) * self.trainer.max_epochs
        if self.cfg.scheduler_params["warmup_ratio"] > 0:
            num_warmup_steps = int(num_training_steps * self.cfg.scheduler_params["warmup_ratio"])
        else:
            num_warmup_steps = 0
        scheduler = get_scheduler(self.cfg, optimizer, num_warmup_steps, num_training_steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        print(f"Total Training Steps: {num_training_steps}, Total Warmup Steps: {num_warmup_steps}")

        return [optimizer], [scheduler]


def run_fold(cfg, train_df, fold, tokenizer):
    seed_everything(cfg.seed)
    checkpoint_path = cfg.checkpoint_dir + "/" + cfg.exp_name

    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    checkpoint_callback = ModelCheckpoint(
        dirpath=checkpoint_path,
        filename=f"{cfg.exp_name}-fold-{fold}" + "-{epoch}",
        **cfg.model_checkpoint_params,
    )

    early_stopping_callback = EarlyStopping(**cfg.early_stopping_params)

    lr_monitor = LearningRateMonitor(logging_interval="step")

    wandb_logger = WandbLogger(
        name=f"{cfg.exp_name}_fold_{fold}",
        **cfg.wandb_logger_params,
    )

    trainer = Trainer(
        default_root_dir=cfg.checkpoint_dir,
        gpus=cfg.gpus,
        max_epochs=cfg.num_epochs,
        accumulate_grad_batches=cfg.grad_accumulate,
        precision=16 if cfg.fp16 else 32,
        callbacks=[
            checkpoint_callback,
            # early_stopping_callback,
            lr_monitor,
        ],
        logger=[
            wandb_logger,
        ],
        log_every_n_steps=10,
    )

    model = ChaiiLightningModule(cfg)
    datamodule = ChaiiDataModule(cfg, tokenizer=tokenizer, input_df=train_df, phase="train", fold=fold)
    trainer.fit(model, datamodule=datamodule)

    wandb.finish()

    del trainer, model, datamodule
    gc.collect()

    return checkpoint_callback.best_model_path, checkpoint_callback.best_model_score.item()


def run_training(cfg, train_df):
    tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer_name)

    checkpoint_path_list = []
    oof_score = 0

    for fold_id in range(cfg.split_params["n_splits"]):
        checkpoint_path, best_score = run_fold(cfg, train_df, fold_id, tokenizer)
        checkpoint_path_list.append(checkpoint_path)
        oof_score += best_score / int(cfg.split_params["n_splits"])

    print("CV jaccard score :", oof_score)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer_name)
# _, _ = run_fold(cfg, train_df, 0, tokenizer)
run_training(cfg, train_df)

Global seed set to 1234
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[get features]:   0%|          | 0/11073 [00:00<?, ?it/s]

[get features]:   0%|          | 0/223 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of train features: 22982, Number of valid features: 3106
Total Training Steps: 5108, Total Warmup Steps: 510


[34m[1mwandb[0m: Currently logged in as: [33mazupero[0m (use `wandb login --relogin` to force relogin)



  | Name               | Type             | Params
--------------------------------------------------------
0 | model              | ChaiiModel       | 559 M 
1 | start_criterion    | CrossEntropyLoss | 0     
2 | end_criterion      | CrossEntropyLoss | 0     
3 | train_mean_metric1 | MeanMetric       | 0     
4 | train_mean_metric2 | MeanMetric       | 0     
5 | valid_mean_metric1 | MeanMetric       | 0     
6 | valid_mean_metric2 | MeanMetric       | 0     
7 | lovasz_hinge       | LovaszHingeLoss  | 0     
--------------------------------------------------------
559 M     Trainable params
0         Non-trainable params
559 M     Total params
2,239.570 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1234


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
lr-AdamW/pg1,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
lr-AdamW/pg2,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/bce_loss_epoch,█▁
train/bce_loss_step,███▇▇▆▆▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss_end_epoch,█▁
train/loss_epoch,█▁
train/loss_start_epoch,█▁
train/loss_step,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/lovasz_hinge_epoch,█▁

0,1
epoch,1.0
lr-AdamW/pg1,0.0
lr-AdamW/pg2,0.0
train/bce_loss_epoch,2.33904
train/bce_loss_step,2.00088
train/loss_end_epoch,1.26012
train/loss_epoch,53.82728
train/loss_start_epoch,1.06947
train/loss_step,39.34948
train/lovasz_hinge_epoch,1.0902


Global seed set to 1234
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[get features]:   0%|          | 0/11073 [00:00<?, ?it/s]

[get features]:   0%|          | 0/223 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of train features: 23279, Number of valid features: 2809
Total Training Steps: 5174, Total Warmup Steps: 517



  | Name               | Type             | Params
--------------------------------------------------------
0 | model              | ChaiiModel       | 559 M 
1 | start_criterion    | CrossEntropyLoss | 0     
2 | end_criterion      | CrossEntropyLoss | 0     
3 | train_mean_metric1 | MeanMetric       | 0     
4 | train_mean_metric2 | MeanMetric       | 0     
5 | valid_mean_metric1 | MeanMetric       | 0     
6 | valid_mean_metric2 | MeanMetric       | 0     
7 | lovasz_hinge       | LovaszHingeLoss  | 0     
--------------------------------------------------------
559 M     Trainable params
0         Non-trainable params
559 M     Total params
2,239.570 Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1234


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
lr-AdamW/pg1,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
lr-AdamW/pg2,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/bce_loss_epoch,█▁
train/bce_loss_step,███▇▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss_end_epoch,█▁
train/loss_epoch,█▁
train/loss_start_epoch,█▁
train/loss_step,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/lovasz_hinge_epoch,█▁

0,1
epoch,1.0
lr-AdamW/pg1,0.0
lr-AdamW/pg2,0.0
train/bce_loss_epoch,2.3095
train/bce_loss_step,1.97595
train/loss_end_epoch,1.24808
train/loss_epoch,53.86238
train/loss_start_epoch,1.06224
train/loss_step,39.34191
train/lovasz_hinge_epoch,1.0904


Global seed set to 1234
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[get features]:   0%|          | 0/11073 [00:00<?, ?it/s]

[get features]:   0%|          | 0/223 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of train features: 23409, Number of valid features: 2679
Total Training Steps: 5202, Total Warmup Steps: 520



  | Name               | Type             | Params
--------------------------------------------------------
0 | model              | ChaiiModel       | 559 M 
1 | start_criterion    | CrossEntropyLoss | 0     
2 | end_criterion      | CrossEntropyLoss | 0     
3 | train_mean_metric1 | MeanMetric       | 0     
4 | train_mean_metric2 | MeanMetric       | 0     
5 | valid_mean_metric1 | MeanMetric       | 0     
6 | valid_mean_metric2 | MeanMetric       | 0     
7 | lovasz_hinge       | LovaszHingeLoss  | 0     
--------------------------------------------------------
559 M     Trainable params
0         Non-trainable params
559 M     Total params
2,239.570 Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1234


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
lr-AdamW/pg1,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
lr-AdamW/pg2,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/bce_loss_epoch,█▁
train/bce_loss_step,████▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss_end_epoch,█▁
train/loss_epoch,█▁
train/loss_start_epoch,█▁
train/loss_step,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/lovasz_hinge_epoch,█▁

0,1
epoch,1.0
lr-AdamW/pg1,0.0
lr-AdamW/pg2,0.0
train/bce_loss_epoch,2.28323
train/bce_loss_step,1.95851
train/loss_end_epoch,1.24155
train/loss_epoch,53.53659
train/loss_start_epoch,1.05054
train/loss_step,39.09648
train/lovasz_hinge_epoch,1.0869


Global seed set to 1234
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[get features]:   0%|          | 0/11073 [00:00<?, ?it/s]

[get features]:   0%|          | 0/223 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of train features: 23156, Number of valid features: 2932
Total Training Steps: 5146, Total Warmup Steps: 514



  | Name               | Type             | Params
--------------------------------------------------------
0 | model              | ChaiiModel       | 559 M 
1 | start_criterion    | CrossEntropyLoss | 0     
2 | end_criterion      | CrossEntropyLoss | 0     
3 | train_mean_metric1 | MeanMetric       | 0     
4 | train_mean_metric2 | MeanMetric       | 0     
5 | valid_mean_metric1 | MeanMetric       | 0     
6 | valid_mean_metric2 | MeanMetric       | 0     
7 | lovasz_hinge       | LovaszHingeLoss  | 0     
--------------------------------------------------------
559 M     Trainable params
0         Non-trainable params
559 M     Total params
2,239.570 Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1234


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
lr-AdamW/pg1,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
lr-AdamW/pg2,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/bce_loss_epoch,█▁
train/bce_loss_step,████▇▆▆▆▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss_end_epoch,█▁
train/loss_epoch,█▁
train/loss_start_epoch,█▁
train/loss_step,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/lovasz_hinge_epoch,█▁

0,1
epoch,1.0
lr-AdamW/pg1,0.0
lr-AdamW/pg2,0.0
train/bce_loss_epoch,2.31663
train/bce_loss_step,1.98601
train/loss_end_epoch,1.25172
train/loss_epoch,53.6848
train/loss_start_epoch,1.06698
train/loss_step,39.2356
train/lovasz_hinge_epoch,1.08987


Global seed set to 1234
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[get features]:   0%|          | 0/11074 [00:00<?, ?it/s]

[get features]:   0%|          | 0/222 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of train features: 23426, Number of valid features: 2662
Total Training Steps: 5206, Total Warmup Steps: 520



  | Name               | Type             | Params
--------------------------------------------------------
0 | model              | ChaiiModel       | 559 M 
1 | start_criterion    | CrossEntropyLoss | 0     
2 | end_criterion      | CrossEntropyLoss | 0     
3 | train_mean_metric1 | MeanMetric       | 0     
4 | train_mean_metric2 | MeanMetric       | 0     
5 | valid_mean_metric1 | MeanMetric       | 0     
6 | valid_mean_metric2 | MeanMetric       | 0     
7 | lovasz_hinge       | LovaszHingeLoss  | 0     
--------------------------------------------------------
559 M     Trainable params
0         Non-trainable params
559 M     Total params
2,239.570 Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1234


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
lr-AdamW/pg1,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
lr-AdamW/pg2,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/bce_loss_epoch,█▁
train/bce_loss_step,████▇▆▆▆▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss_end_epoch,█▁
train/loss_epoch,█▁
train/loss_start_epoch,█▁
train/loss_step,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/lovasz_hinge_epoch,█▁

0,1
epoch,1.0
lr-AdamW/pg1,0.0
lr-AdamW/pg2,0.0
train/bce_loss_epoch,2.28216
train/bce_loss_step,1.95059
train/loss_end_epoch,1.22092
train/loss_epoch,52.86434
train/loss_start_epoch,1.03308
train/loss_step,38.63187
train/lovasz_hinge_epoch,1.08877


CV jaccard score : 23.58043899536133


In [None]:
ID = "azupero"
DATASET_ID = f"chaii-qa-checkpoint-{cfg.exp_name}"
checkpoint_path = cfg.checkpoint_dir + "/" + cfg.exp_name
UPLOAD_DIR = Path(checkpoint_path)

def dataset_create_new():
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    with open(UPLOAD_DIR / 'dataset-metadata.json', 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=UPLOAD_DIR, convert_to_csv=False, dir_mode='tar')

dataset_create_new()

Starting upload for file exp015-fold-0-epoch=1.ckpt


100%|██████████| 2.09G/2.09G [01:15<00:00, 29.6MB/s]


Upload successful: exp015-fold-0-epoch=1.ckpt (2GB)
Starting upload for file exp015-fold-1-epoch=1.ckpt


100%|██████████| 2.09G/2.09G [01:16<00:00, 29.2MB/s]


Upload successful: exp015-fold-1-epoch=1.ckpt (2GB)
Starting upload for file exp015-fold-2-epoch=1.ckpt


100%|██████████| 2.09G/2.09G [00:57<00:00, 38.9MB/s]


Upload successful: exp015-fold-2-epoch=1.ckpt (2GB)
Starting upload for file exp015-fold-3-epoch=1.ckpt


100%|██████████| 2.09G/2.09G [01:16<00:00, 29.3MB/s]


Upload successful: exp015-fold-3-epoch=1.ckpt (2GB)
Starting upload for file exp015-fold-4-epoch=1.ckpt


100%|██████████| 2.09G/2.09G [01:15<00:00, 29.5MB/s]


Upload successful: exp015-fold-4-epoch=1.ckpt (2GB)
