In [1]:
!pip install multiprocesspandas
!pip install sentencepiece

Collecting multiprocesspandas
  Downloading multiprocesspandas-1.1.3-py3-none-any.whl (4.6 kB)
Collecting multiprocess==0.70.11.1
  Downloading multiprocess-0.70.11.1-py37-none-any.whl (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m844.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: multiprocess, multiprocesspandas
  Attempting uninstall: multiprocess
    Found existing installation: multiprocess 0.70.14
    Uninstalling multiprocess-0.70.14:
      Successfully uninstalled multiprocess-0.70.14
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pathos 0.3.0 requires multiprocess>=0.70.14, but you have multiprocess 0.70.11.1 which is incompatible.[0m[31m
[0mSuccessfully installed multiprocess-0.70.11.1 multiprocesspandas-1.1.3
[0m

# CV Split

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from multiprocesspandas import applyparallel
from tqdm import tqdm

In [3]:
N_SPLITS = 5
DATA_PATH = '/kaggle/input/learning-equality-curriculum-recommendations/'

In [21]:
topic_df = pd.read_csv(DATA_PATH + 'topics.csv')
content_df = pd.read_csv(DATA_PATH + 'content.csv')
corr_df = pd.read_csv(DATA_PATH + 'correlations.csv')

# cause test set does not have data from `source`
topic_df_non_source = topic_df[topic_df['category']!='source'].reset_index(drop=True)
topic_df_non_source['stratify'] = topic_df_non_source['category'] + \
topic_df_non_source['language'] + topic_df_non_source['description'].apply(lambda x: str(isinstance(x, str))) + \
topic_df_non_source['has_content'].apply(str)

In [22]:
topic_df_non_source.head()

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,stratify
0,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False,alignedenFalseFalse
1,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True,supplementalguTrueTrue
2,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True,supplementalenFalseTrue
3,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True,alignedenFalseTrue
4,t_00102869fbcb,Triangles and polygons,Learning outcomes: students must be able to so...,a91e32,aligned,3,en,t_039cecc12bb8,True,alignedenTrueTrue


In [23]:
kf = StratifiedGroupKFold(n_splits=N_SPLITS)
folds = list(kf.split(topic_df_non_source, y=topic_df_non_source['stratify'], groups=topic_df_non_source["channel"]))
topic_df_non_source['fold'] = -1

for fold, (train_idx, val_idx) in enumerate(folds):
    topic_df_non_source.loc[val_idx, 'fold'] = fold



In [24]:
fold_df = topic_df.merge(topic_df_non_source[['id', 'fold']], on='id', how='left').reset_index(drop=True)[['id', 'fold']].fillna(-1).rename(columns={'id': 'topic_id'})
fold_df['fold'] = fold_df['fold'].astype(int)

In [25]:
fold_df.head()

Unnamed: 0,topic_id,fold
0,t_00004da3a1b2,-1
1,t_000095e03056,0
2,t_00068291e9a4,-1
3,t_00069b63a70a,-1
4,t_0006d41a73a8,-1


In [26]:
corr_df['content_ids'] = corr_df['content_ids'].apply(lambda x:x.split())
corr_df = corr_df.explode('content_ids').reset_index(drop=True)

In [27]:
corr_df.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00068291e9a4,c_639ea2ef9c95


In [28]:
topic_df = topic_df.fillna('')
topic_df['topic_full_text'] = topic_df['title'] + '[SEP]' + topic_df['description']
topic_df = topic_df[['id', 'topic_full_text', 'language']]
topic_df.head()

Unnamed: 0,id,topic_full_text,language
0,t_00004da3a1b2,Откриването на резисторите[SEP]Изследване на м...,bg
1,t_000095e03056,Unit 3.3 Enlargements and Similarities[SEP],en
2,t_00068291e9a4,Entradas e saídas de uma função[SEP]Entenda um...,pt
3,t_00069b63a70a,Transcripts[SEP],en
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,bg


In [29]:
df = corr_df.merge(topic_df, left_on='topic_id', right_on='id', how='left')
df = df[['topic_id', 'content_ids', 'topic_full_text', 'language']]
df = df.rename(columns={'language': 'topic_language'})
df.head()

Unnamed: 0,topic_id,content_ids,topic_full_text,topic_language
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите[SEP]Изследване на м...,bg
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите[SEP]Изследване на м...,bg
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите[SEP]Изследване на м...,bg
3,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите[SEP]Изследване на м...,bg
4,t_00068291e9a4,c_639ea2ef9c95,Entradas e saídas de uma função[SEP]Entenda um...,pt


In [30]:
content_df = content_df.fillna('')
content_df['content_full_text'] =  content_df['title'] + ' [SEP] ' + content_df['description'] + ' [SEP] ' + content_df['text']
content_df = content_df[['id', 'content_full_text', 'language']]
content_df.head()

Unnamed: 0,id,content_full_text,language
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,93...",es
1,c_000087304a9e,Trovare i fattori di un numero [SEP] Sal trova...,it
2,c_0000ad142ddb,Sumar curvas de demanda [SEP] Cómo añadir curv...,es
3,c_0000c03adc8d,Nado de aproximação [SEP] Neste vídeo você vai...,pt
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf [SEP] geometr...,es


In [31]:
df = df.merge(content_df, left_on='content_ids', right_on='id', how='left')
df = df.rename(columns={'language':'content_language'})
df['label'] = 1
df.head()

Unnamed: 0,topic_id,content_ids,topic_full_text,topic_language,id,content_full_text,content_language,label
0,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите[SEP]Изследване на м...,bg,c_1108dd0c7a5d,Молив като резистор [SEP] Моливът причинява пр...,bg,1
1,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите[SEP]Изследване на м...,bg,c_376c5a8eb028,Да чуем променливото съпротивление [SEP] Тук ч...,bg,1
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите[SEP]Изследване на м...,bg,c_5bc0e1e2cba0,Променлив резистор (реостат) с графит от молив...,bg,1
3,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите[SEP]Изследване на м...,bg,c_76231f9d0b5e,Последователно свързване на галваничен елемент...,bg,1
4,t_00068291e9a4,c_639ea2ef9c95,Entradas e saídas de uma função[SEP]Entenda um...,pt,c_639ea2ef9c95,Dados e resultados de funções: gráficos [SEP] ...,pt,1


# Random Sample According to Language

In [33]:
neg_df = []
sample_n = 5

def negative_sample(x, candidates):
    topic_language = x['topic_language'][0]
    candidates = candidates[candidates['content_language'] == topic_language]
    return candidates[['topic_full_text', 'content_full_text']].sample(n=sample_n)

for topic_id in tqdm(df['topic_id'].unique()):
    sub_df = df[df['topic_id'] == topic_id]
    topic_language = sub_df['topic_language'].unique()[0]
    candidates = df[df['content_language'] == topic_language]
    sample_neg = candidates[['topic_full_text', 'content_full_text']].sample(n=sample_n)
    sample_neg['topic_id'] = topic_id
    sample_neg['label'] = 0
    neg_df.append(sample_neg)
    
neg_df = pd.concat(neg_df)
neg_df

100%|██████████| 61517/61517 [58:42<00:00, 17.46it/s]  


Unnamed: 0,topic_full_text,content_full_text,topic_id,label
254659,"Десетични знаци, дроби и проценти[SEP]Нека да ...",Преобразуване на обикновени дроби в десетични ...,t_00004da3a1b2,0
190772,Въведение в отрицателните числа[SEP]Какво са о...,Обяснение на отрицателните числа [SEP] Упражне...,t_00004da3a1b2,0
89535,Показателни и логаритмични уравнения и неравен...,Текстова задача с експоненциален модел: размно...,t_00004da3a1b2,0
273798,Структури на алканите [SEP]Чувстваш несигурнос...,Анализ на конформациите на пропана [SEP] Как с...,t_00004da3a1b2,0
49248,Свойство на ъглополовящите в триъгълник[SEP]В ...,Златното съотношение [SEP] Въведение към едно ...,t_00004da3a1b2,0
...,...,...,...,...
263708,رِحْلَةُ البَحْثِ عنْ خطِّ الاسْتِواءِ[SEP],القواعد [SEP] [SEP] ممّ تألّفت شبه الجملة هذه...,t_fffe811a6da9,0
1456,تَحَدّي العَصْرِ[SEP],"الاستيعاب [SEP] [SEP] حلّل شخصيّة المعلّمة ""ر...",t_fffe811a6da9,0
6354,يميّز معادلات خطّية بمتغيّرين ويكوّنها ويحلّها...,يميّز معادلات خطّية بمتغيّرين ويكوّنها ويحلّها...,t_fffe811a6da9,0
268062,"""سَلامُ"" في الهِنْدِ[SEP]",القواعد [SEP] [SEP] حدّد النّعت في الجملة الآ...,t_fffe811a6da9,0


In [34]:
df = df[['topic_id', 'topic_full_text', 'content_full_text', 'label']]
df = pd.concat([df, neg_df])
df = df.drop_duplicates()

In [35]:
df = df.merge(fold_df, left_on='topic_id', right_on='topic_id', how='left')
df = df[['topic_full_text', 'content_full_text', 'label' ,'fold']]
df = df[df['fold'].isin([0, 1, 2, 3, 4])]
df.head()

Unnamed: 0,topic_full_text,content_full_text,label,fold
14,100 સુધીનો સરવાળો[SEP]37 અને 49 જેવી બે-અંકની ...,સમૂહ બનાવ્યા વિના 2-અંકની સંખ્યા ઉમેરવી 2 [SEP...,1,2
15,100 સુધીનો સરવાળો[SEP]37 અને 49 જેવી બે-અંકની ...,સમૂહ બનાવીને ઉમેરવું [SEP] સલ સ્થાન કિંમત વિશ...,1,2
16,100 સુધીનો સરવાળો[SEP]37 અને 49 જેવી બે-અંકની ...,સ્થાનકિંમતના બ્લોકનો ઉપયોગ કરી 100 સુધીની સંખ્...,1,2
17,12. 20: Bird Reproduction[SEP],12. 20: Bird Reproduction [SEP] [SEP] Is this...,1,3
18,12. 20: Bird Reproduction[SEP],Astounding Mating Dance Birds of Paradise -- H...,1,3


In [36]:
df.to_csv('train_folds.csv', index=None)

In [None]:
df = pd.read_csv('train_folds.csv')
df = df[df['fold'].isin([0, 1, 2, 3, 4])]

# Create CFG

In [37]:
import numpy as np
import pandas as pd
import time
import math
from sklearn.metrics import f1_score
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from transformers import BertTokenizer,AutoModel,AdamW,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import copy
import torch.nn as nn
import os
import json
import gc
import random
from torch.cuda.amp import autocast, GradScaler

In [38]:
class CFG:
    input_path = '/kaggle/input/learning-equality-curriculum-recommendations/'
    model_path = 'microsoft/mdeberta-v3-base' 
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 124
    epochs = 5  # 5
    encoder_lr = 20e-6
    decoder_lr = 1e-3
    min_lr = 0.5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0
    num_fold = 5
    batch_size = 16
    seed = 22
    OUTPUT_DIR = '/kaggle/working/'
    num_workers = 2
    device='cuda'
    print_freq = 100
    apex=False
    start_awp_epoch = 2 # 开始AWP epoch
    adv_lr = 1e-5 # AWP学习率
    adv_eps = 1e-3 # AWP epsilon
    adv_step = 1 # AWP step

In [39]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(CFG.seed)

In [40]:
class TrainDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.topic = df['topic_full_text'].values
        self.content = df['content_full_text'].values
        self.label = df['label'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
    def __len__(self):
        return len(self.topic)
    def __getitem__(self, item):
        topic = self.topic[item].replace('[SEP]', self.sep_token)
        content = self.content[item].replace('[SEP]', self.sep_token)
        label = int(self.label[item])

        
        inputs_topic = self.tokenizer(topic, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        inputs_content = self.tokenizer(content, truncation=True, max_length=CFG.max_input_length, padding='max_length')
        return torch.as_tensor(inputs_topic['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_topic['attention_mask'], dtype=torch.long), \
            torch.as_tensor(inputs_content['input_ids'], dtype=torch.long), \
            torch.as_tensor(inputs_content['attention_mask'], dtype=torch.long), \
            torch.as_tensor(label, dtype=torch.float)

# Build Model

In [41]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

class Custom_Bert_Simple(nn.Module):
    def __init__(self):
        super().__init__()

        self.base = AutoModel.from_pretrained(CFG.model_path)
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.linear = nn.Linear(self.config.hidden_size*3, 1)

    def forward(self,
        topic_input_ids,
        content_input_ids,
        topic_attention_mask=None,
        content_attention_mask=None, 
        labels=None):
        topic_output = self.base(input_ids=topic_input_ids,attention_mask=topic_attention_mask)
        topic_output = topic_output.last_hidden_state
        topic_output = torch.mean(topic_output, dim=1)

        content_output = self.base(input_ids=content_input_ids,attention_mask=content_attention_mask)
        content_output = content_output.last_hidden_state
        content_output = torch.mean(content_output, dim=1)

        diff = torch.abs(topic_output-content_output)

        sentence_embedding = torch.cat([topic_output, content_output, diff], 1)

        output = self.linear(sentence_embedding)
        
        loss = None
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(output.view(-1), labels.view(-1))
        
        return loss

# Build Logger

In [42]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [43]:
def get_logger(filename=CFG.OUTPUT_DIR+ 'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
LOGGER.info('===============lr_{}==============='.format(CFG.encoder_lr))
LOGGER.info('===============seed_{}==============='.format(CFG.seed))
LOGGER.info('===============total_epochs_{}==============='.format(CFG.epochs))
LOGGER.info('===============num_warmup_steps_{}==============='.format(CFG.num_warmup_steps))



# Build Pipeline

In [44]:
def train_fn(train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, batch in enumerate(train_loader):
        batch = [i.to(device) for i in batch]
        topic_input_ids, topic_attention_mask, content_input_ids, content_attention_mask, label = batch
        batch_size = label.size(0)
        loss = model(topic_input_ids, content_input_ids, topic_attention_mask, content_attention_mask, label)
        losses.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
        optimizer.step()
        global_step += 1
        scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, step, len(train_loader),
                          remain=timeSince(start, float(step + 1) / len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    labels = []
    start = end = time.time()
    for step, batch in enumerate(valid_loader):
        label = batch[2].to(device)
        mask = batch[1].to(device)
        input_ids = batch[0].to(device)
        batch_size = label.size(0)
        with torch.no_grad():
            output = model(input_ids, mask, labels=label)
        loss = output.loss
        y_preds = output.logits.argmax(dim=-1)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        labels.append(label.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step + 1) / len(valid_loader))))
    predictions = np.concatenate(preds)
    labels = np.concatenate(labels)
    #print(predictions)
    return losses.avg, predictions, labels

def train_loop(fold, model, train_dataset, valid_dataset):
    LOGGER.info(f"========== training ==========")

    # ====================================================
    # loader
    # ====================================================

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    #model = Custom_Bert_Simple()
    #model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    model.to(CFG.device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
        ]
        return optimizer_parameters

    def get_optimizer(model):

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': CFG.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                'lr': CFG.encoder_lr, 'weight_decay': 0.0}
            
        ]
        optimizer = AdamW(optimizer_parameters, lr = CFG.encoder_lr, eps = CFG.eps, betas = CFG.betas)
        return optimizer

    
    optimizer = get_optimizer(model)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        cfg.num_warmup_steps = cfg.num_warmup_steps * num_train_steps
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # criterion = torch.nn.CrossEntropyLoss(ignore_index=- 1)

    # criterion = LabelSmoothingLoss()
    best_score = float('inf')

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        #avg_loss = train_fn_awp(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        
        avg_loss = train_fn(train_loader, model, optimizer, epoch, scheduler, CFG.device)
        # eval
        #avg_val_loss, predictions, valid_labels = valid_fn(valid_loader, model, CFG.device)

        # scoring
        #score = get_score(predictions, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch + 1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        #LOGGER.info(f'Epoch {epoch + 1} - Score: {score:.4f}')


        if best_score > avg_loss:
            best_score = avg_loss
            #best_predictions = predictions
            LOGGER.info(f'Epoch {epoch + 1} - Save Best Score: {best_score:.4f} Model')
            torch.save(model.state_dict(),
                       CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'),fold))



    torch.cuda.empty_cache()
    gc.collect()
    del scheduler, optimizer, model
    return 

In [None]:
model = Custom_Bert_Simple()
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
fold = 0
tr_data = df[df['fold']!=fold].reset_index(drop=True)
va_data = df[df['fold']==fold].reset_index(drop=True)
tr_dataset = TrainDataset(tr_data,tokenizer)
va_dataset = TrainDataset(va_data,tokenizer)
val_result = train_loop(fold, model,tr_dataset, va_dataset)

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/534M [00:00<?, ?B/s]

# What to do next?
* Learn KFold