In [None]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Mar  2 21:36:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    25W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive; the training script's default
# data, model and output paths all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install Hugging Face `datasets` and `transformers` (with the sentencepiece extra).
# Versions are not pinned; the recorded run below resolved datasets 2.10.1 and
# transformers 4.26.1 — pin those to reproduce this environment.
%pip install datasets transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
%%writefile 2nd_stage_model_training.py

# ===============================================================
#  Library
# ===============================================================
import os
import gc
import math
import time
import json
import pickle
import random
import requests
import argparse
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.simplefilter("ignore")
from collections import OrderedDict


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader


import transformers
transformers.logging.set_verbosity_error()
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"device: {device}")

# ===============================================================
#  args
# ===============================================================

def parse_args():
    """Parse the command line and derive the per-fold paths.

    After parsing, three attributes are rewritten/added:
      * ``input_dir``  -> ``<input_dir>1st/<filename>/fold<trn_fold>/``
      * ``output_dir`` -> ``<output_dir>2nd/<filename>/fold<trn_fold>/`` (created if missing)
      * ``model``      -> ``<input_dir><base_model with '/' replaced by '-'>_fine-tuned/``

    Paths are joined by plain string concatenation, so ``--input_dir`` and
    ``--output_dir`` must end with a path separator.

    Returns:
        argparse.Namespace: the parsed and post-processed arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42, required=False)
    parser.add_argument("--input_dir", type=str, default="/content/drive/MyDrive/KAGGLE-LECR/last_data/", 
                        required=False, help=" oof data path ")
    parser.add_argument("--data_dir", type=str, default="/content/drive/MyDrive/KAGGLE-LECR/", 
                        required=False, help=" competition data path ")
    parser.add_argument("--output_dir", type=str, default="/content/drive/MyDrive/KAGGLE-LECR/last_data/", required=False)
    parser.add_argument("--trn_fold", type=int, default=0, required=False)
    parser.add_argument("--filename", type=str, default="exp004", required=False)
    parser.add_argument("--batch_size", type=int, default=96, required=False)
    parser.add_argument("--max_len", type=int, default=256, required=False)
    parser.add_argument("--num_workers", type=int, default=0, required=False)
    parser.add_argument("--base_model", type=str, default="sentence-transformers/all-MiniLM-L6-v2", required=False)
    parser.add_argument("--target_cols", type=str, nargs="*", default=["target"], required=False)
    parser.add_argument("--encoder_lr", type=float, default=2e-5, required=False)
    parser.add_argument("--decoder_lr", type=float, default=2e-5, required=False)
    parser.add_argument("--eps", type=float, default=1e-6, required=False)
    # Two floats are required: with a bare type=float a CLI value would have
    # been parsed into a single number, which AdamW cannot use as betas.
    parser.add_argument("--betas", type=float, nargs=2, default=(0.9, 0.999), metavar=("BETA1", "BETA2"),
                        required=False, help=" AdamW betas, e.g. --betas 0.9 0.999 ")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, required=False)
    parser.add_argument("--max_grad_norm", type=float, default=0.012, required=False)
    parser.add_argument("--num_cycles", type=float, default=0.5, required=False)
    parser.add_argument("--epochs", type=int, default=4, required=False)
    parser.add_argument("--scheduler", type=str, default="cosine", choices=["cosine", "linear"], required=False)
    parser.add_argument("--print_freq", type=int, default=2000, required=False)
    parser.add_argument("--weight_decay", type=float, default=0.01, required=False)
    parser.add_argument("--num_warmup_steps_ratio", type=float, default=0.1, required=False)
    parser.add_argument("--patience", type=int, default=3, required=False)
    parser.add_argument("--steps_per_epoch", type=int, default=None, required=False)
    parser.add_argument("--save_freq", type=int, default=2, required=False)
    parser.add_argument("--debug", action="store_true", required=False)
    # store_false: mixed precision is ON by default and passing --apex turns it OFF.
    parser.add_argument("--apex", action="store_false", required=False,
                        help=" mixed precision is enabled by default; pass this flag to DISABLE it ")
    parser.add_argument("--resume", action="store_true", required=False)
    args = parser.parse_args()
    # argparse yields a list for nargs=2; keep the tuple form of the default.
    args.betas = tuple(args.betas)
    args.input_dir = args.input_dir + f"1st/{args.filename}/fold{args.trn_fold}/"
    args.output_dir = args.output_dir + f"2nd/{args.filename}/fold{args.trn_fold}/"
    args.model = args.input_dir + args.base_model.replace("/", "-")  + "_fine-tuned/"
    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(args.output_dir, exist_ok=True)
    return args


# ===============================================================
#  Utils
# ===============================================================

# Send an arbitrary message as a notification.
def send_slack_message_notification(message):
    """Post ``message`` to the webhook as a JSON ``{'text': message}`` payload.

    Notifications are best-effort: a request failure (unreachable host,
    timeout, malformed/placeholder webhook URL) is reported on stdout and
    swallowed so that it can never abort a training run.

    Args:
        message: text to send.
    """
    webhook_url = ' [URL] '  
    data = json.dumps({'text': message})
    headers = {'content-type': 'application/json'}
    try:
        # The timeout keeps a stalled connection from blocking the training loop.
        requests.post(webhook_url, data=data, headers=headers, timeout=10)
    except requests.RequestException as err:
        print(f"Slack notification failed: {err}")


def seed_everything(cfg):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) from ``cfg.seed``.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic kernels.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


class AverageMeter(object):
    """Running weighted mean.

    Attributes:
        val:   most recent value passed to ``update``.
        sum:   sum of ``val * n`` over all updates.
        count: sum of ``n`` over all updates.
        avg:   ``sum / count``.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <remaining>)'`` for a job started at ``since``.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done; the total duration is
            extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(estimated_total - elapsed))


def collate(inputs):
    """Trim every tensor in ``inputs`` to the longest attended length in the batch.

    The length is the largest row sum of ``inputs["attention_mask"]``. The
    mapping is modified in place and also returned.
    NOTE: this file defines ``collate`` a second time further down with the
    same body; that later definition is the one in effect at call time.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs



class EarlyStopping:
    """
    Patience-based early stopping on a score where higher is better.

    ref: https://qiita.com/ku_a_i/items/ba33c9ce3449da23b503
    """
    def __init__(self, cfg):
        self.cfg = cfg
        self.patience = self.cfg.patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        """Record ``score``; set ``early_stop`` after ``patience`` non-improving calls in a row."""
        # First observation: just remember it (the counter is left untouched).
        if self.best_score is None:
            self.best_score = score
            return

        # Strict improvement: new best, streak of bad epochs is over.
        if not score <= self.best_score:
            self.best_score = score
            self.counter = 0
            return

        # No improvement: count it, report it, and stop once patience runs out.
        self.counter += 1
        print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        send_slack_message_notification(f'[{self.cfg.filename}:fold{self.cfg.trn_fold}] EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True


def fbeta_score(y_true_ids: pd.Series, y_pred_ids: pd.Series, beta=2, eps=1e-15):
    """Mean per-row F-beta between whitespace-separated id strings.

    Args:
        y_true_ids: true labels, one string of space-separated ids per row.
        y_pred_ids: predictions, same format.
        beta: weight of recall relative to precision (2 -> F2).
        eps: small constant added to the denominator.

    It is assumed that the above two are in the same topic order.

    Returns:
        float: average of the per-row scores. A row scores 0 when either side
        is missing or empty, or when the two sides share no id; an empty
        input scores 0.0.
    """
    true_ids = y_true_ids.str.split()
    pred_ids = y_pred_ids.str.split()
    score_list = []
    for true, pred in zip(true_ids, pred_ids):
        # .str.split() yields a list for strings and a non-list (NaN/None) for
        # missing values; blank strings become []. None of those can score.
        if not isinstance(true, list) or not isinstance(pred, list) or not true or not pred:
            score_list.append(0.0)
            continue
        TP = set(true) & set(pred)
        if not TP:
            # precision == recall == 0; also avoids 0/0 when eps is 0.
            score_list.append(0.0)
            continue
        precision = len(TP) / len(pred)
        recall = len(TP) / len(true)
        f2 = (1+beta**2) * (precision*recall) / ((beta**2)*precision+recall+eps)
        score_list.append(f2)
    if not score_list:
        return 0.0
    return sum(score_list) / len(score_list)


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def get_f2_score(predictions, valid_folds, corr_path="/content/drive/MyDrive/KAGGLE-LECR/correlations.csv"):
    """Search the probability threshold that maximises the mean F2 score.

    Args:
        predictions: raw model outputs (logits), one per row of ``valid_folds``.
        valid_folds: frame with at least ``topic_id`` and ``predictions``
            (one candidate content id per row). It is modified in place:
            the columns ``sigmoid`` and ``pred`` are added/overwritten.
        corr_path: CSV with the ground truth; it is merged on ``topic_id`` and
            its ``content_ids`` column is scored against.

    Returns:
        tuple: ``(best_score, best_threshold)`` over thresholds
        0.001, 0.002, ..., 0.499.
    """
    valid_folds["sigmoid"] = sigmoid(predictions)
    best_score = -np.inf
    best_threshold = 0
    df_corr = pd.read_csv(corr_path)
    # topic_ids present in valid_folds
    topic = valid_folds[["topic_id"]].drop_duplicates().reset_index(drop=True)

    with tqdm(np.arange(0.001, 0.5, 0.001), desc="Search best threshold") as pbar:
        for thre in pbar:
            valid_folds["pred"] = np.where(valid_folds["sigmoid"] > thre, 1, 0)        
            pred_1 = valid_folds[valid_folds["pred"] == 1].reset_index(drop=True)        
            topic_true = pd.DataFrame(pred_1.groupby("topic_id")["predictions"].agg(list)).reset_index()
            # Left-merge so topics without any accepted prediction are kept (filled with " ").
            topic_true = pd.merge(topic, topic_true, on="topic_id", how="left").fillna(" ")
            topic_true["predictions"] = topic_true["predictions"].apply(lambda x: " ".join(x))
            topic_true = pd.merge(topic_true, df_corr, on="topic_id", how="left")
            score = fbeta_score(topic_true["content_ids"], topic_true["predictions"])        
            if score > best_score:
                best_score = score
                best_threshold = thre
                pbar.set_postfix(OrderedDict(best_score=score, best_threshold = thre))
    return best_score, best_threshold


def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build the three optimizer parameter groups.

    Args:
        model: module exposing the encoder as ``model.model``.
        encoder_lr: learning rate for the encoder groups.
        decoder_lr: learning rate for everything outside the encoder.
        weight_decay: decay applied to the first group only.

    Returns:
        list[dict]: in order
          1. encoder parameters whose name matches none of the no-decay
             patterns (``encoder_lr``, ``weight_decay``);
          2. encoder biases / LayerNorm parameters (``encoder_lr``, no decay);
          3. parameters whose full name does not contain ``"model"``
             (``decoder_lr``, no decay).
    """
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decayed, undecayed = [], []
    for n, p in model.model.named_parameters():
        if any(nd in n for nd in no_decay):
            undecayed.append(p)
        else:
            decayed.append(p)
    # Substring test on the qualified name, as in the original grouping.
    head = [p for n, p in model.named_parameters() if "model" not in n]
    return [
        {'params': decayed, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': undecayed, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]


def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: needs ``scheduler`` ('linear' or 'cosine') and
            ``num_warmup_steps``; 'cosine' additionally reads ``num_cycles``.
        optimizer: optimizer to schedule.
        num_train_steps: total number of scheduler steps.

    Raises:
        ValueError: if ``cfg.scheduler`` is neither 'linear' nor 'cosine'.
    """
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, 
            num_training_steps=num_train_steps, 
            num_cycles=cfg.num_cycles
        )
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise ValueError(f"Unknown scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'.")
    return scheduler


# ===============================================================
#  DataLoading
# ===============================================================

def check_contains(row):
    """Check whether the content_id is included in the label. Set to 1 if included, 0 if not.

    Args:
        row: mapping/Series with ``content_ids`` (iterable of ids) and
            ``predictions`` (a single id).

    Returns:
        int: 1 when ``row["predictions"]`` is one of ``row["content_ids"]``;
        0 otherwise, including when ``content_ids`` is not iterable (e.g. a
        missing value) or the prediction is unhashable.
    """
    try:
        ground_truth = set(row["content_ids"])
        return 1 if row["predictions"] in ground_truth else 0
    except TypeError:
        # content_ids is empty/missing (not iterable) or the ids are unhashable.
        return 0
    

def determine(df_topics):
    """
    Determine if the extracted content is correct

    Args:
        df_topics: frame with ``topic_id``, ``predictions`` and
            ``content_ids``; the latter two are whitespace-separated id
            strings. The frame itself is not modified.

    Returns:
        DataFrame with one row per (topic_id, predicted id) and the columns
        ``topic_id``, ``predictions`` and ``target`` (1 if the predicted id
        is among the topic's ``content_ids``, else 0).
    """
    # Build on a derived frame instead of writing into a column slice of the input.
    df = (
        df_topics[["topic_id", "predictions"]]
        .assign(predictions=lambda frame: frame["predictions"].str.split())
        .explode("predictions", ignore_index=True)
    )
    df = pd.merge(df,
                  df_topics[["topic_id", "content_ids"]],
                  on="topic_id", how="left")
    # A missing content_ids value becomes an empty label list rather than an error.
    df["content_ids"] = df["content_ids"].apply(lambda x: x.split() if isinstance(x, str) else [])
    tqdm.pandas()
    df["target"] = df.progress_apply(check_contains, axis=1)
    return df.drop(columns="content_ids")


def tokenize(cfg, text, text_pair):
    """Encode a (text, text_pair) pair and return only its ``input_ids``.

    The pair is truncated to ``cfg.max_len`` and not padded, so the length of
    the result can be used to sort samples by token length.
    """
    encode_options = dict(
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )
    encoding = cfg.tokenizer.encode_plus(text, text_pair, **encode_options)
    return encoding["input_ids"]


def prepare_df(cfg, data_type):
    """Load the 1st-stage output CSV for the requested split.

    Args:
        cfg: needs ``filename``, ``trn_fold`` and ``debug``.
        data_type: ``"train"`` or ``"validation"``. The validation frame is
            sorted by its ``tokenize_length`` column (ascending).

    Returns:
        DataFrame for the split; at most 200 sampled rows when ``cfg.debug``.

    Raises:
        ValueError: if ``data_type`` is not one of the two supported values.
    """
    print(f"{data_type} loading...")
    # NOTE(review): these paths are hard-coded and do not follow --input_dir,
    # although with the default --input_dir they point at the same folder — confirm intent.
    if data_type == "train":
        df = pd.read_csv(f"/content/drive/MyDrive/KAGGLE-LECR/last_data/1st/{cfg.filename}/fold{cfg.trn_fold}/df_train_for_{cfg.filename}.csv")
        
    elif data_type == "validation":
        df = pd.read_csv(f"/content/drive/MyDrive/KAGGLE-LECR/last_data/1st/{cfg.filename}/fold{cfg.trn_fold}/df_valid_for_{cfg.filename}.csv")
        df = df.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"Unknown data_type {data_type!r}; expected 'train' or 'validation'.")

    if cfg.debug:
        # Cap at the frame size so small files can still be used in debug mode.
        df = df.sample(n=min(200, len(df)), random_state=42).reset_index(drop=True)
    
    # Show up to two sample pairs (fewer if the frame is shorter).
    for i in range(min(2, len(df))):
        print(f'Input Example[{i}]: \n  topic  {df["topic_sentence"].values[i]}\n content  {df["content_sentence"].values[i]}')
    return df


def load_data(cfg):
    """Load the train and validation frames and print their ``target`` counts.

    Returns:
        tuple: ``(df_train, df_valid)``.
    """
    df_train = prepare_df(cfg, data_type="train")
    df_valid = prepare_df(cfg, data_type="validation")

    for label, frame in (("train: \n", df_train), ("valid: \n", df_valid)):
        print(label, frame["target"].value_counts())
    return df_train, df_valid


# ===============================================================
#  tokenizer
# ===============================================================
def tokenizer(cfg):
    """Load the tokenizer stored with ``cfg.model`` and attach it to ``cfg``.

    Side effects: sets ``cfg.tokenizer`` and saves a copy of the tokenizer to
    ``<cfg.output_dir>tokenizer/``. Returns nothing.
    """
    # `use_fast` is the documented switch for the fast tokenizer; the former
    # `is_fast=True` keyword is not a from_pretrained option.
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model, use_fast=True)
    cfg.tokenizer.save_pretrained(cfg.output_dir+'tokenizer/')
    return 

# ===============================================================
#  Dataset
# ===============================================================
def prepare_input(cfg, text, text_pair):
    """Encode a (text, text_pair) pair into fixed-length ``torch.long`` tensors.

    The pair is truncated and padded to exactly ``cfg.max_len`` tokens.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer.encode_plus(
        text, 
        text_pair,
        return_tensors = None, 
        add_special_tokens = True, 
        max_length = cfg.max_len,
        # Replaces the deprecated `pad_to_max_length=True`; same padding result.
        padding = "max_length",
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

class TrainDataset(Dataset):
    """Topic/content sentence pairs with a float ``target`` label.

    Each item is ``(inputs, label)``: the tokenized pair produced by
    ``prepare_input`` and the target as a ``torch.float`` tensor.
    """
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.topic_sentence, self.content_sentence, self.target = (
            df[column].values for column in ("topic_sentence", "content_sentence", "target")
        )

    def __len__(self):
        return len(self.target)

    def __getitem__(self, item):
        topic, content = self.topic_sentence[item], self.content_sentence[item]
        inputs = prepare_input(self.cfg, topic, text_pair=content)
        label = torch.tensor(self.target[item], dtype = torch.float)
        return inputs, label

def collate(inputs):
    """Cut padding columns shared by the whole batch.

    Every tensor in ``inputs`` is sliced to the largest number of attended
    tokens found in any row of ``inputs["attention_mask"]``. The mapping is
    updated in place and returned.
    NOTE: an identical ``collate`` is also defined earlier in this file; this
    later definition is the one that stays bound.
    """
    attended = inputs["attention_mask"].sum(axis=1)
    keep = int(attended.max())
    for name, tensor in list(inputs.items()):
        inputs[name] = tensor[:, :keep]
    return inputs


# ===============================================================
#  Model
# ===============================================================

class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over the sequence dimension (dim 1)."""
    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked positions contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = torch.sum(last_hidden_state * mask, 1)
        # Clamp the token count so a fully masked row does not divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts


class CustomModel(nn.Module):
    """Transformer encoder + masked mean pooling + linear head.

    Args:
        cfg: needs ``model`` (checkpoint path/name) and ``target_cols``
            (one output unit per entry).
        config_path: optional path to a config object saved with
            ``torch.save``; when ``None`` the config is loaded from
            ``cfg.model`` and all dropout probabilities are set to 0.
        pretrained: load encoder weights from ``cfg.model`` when True,
            otherwise build the encoder from the config with fresh weights.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn dropout off under each attribute name it may be stored as.
            self.config.hidden_dropout = 0
            self.config.hidden_dropout_prob = 0
            self.config.attention_dropout = 0
            # Was misspelled "attetnion_probs_dropout_prob", which left the
            # real attention dropout setting untouched.
            self.config.attention_probs_dropout_prob = 0
        else:
            # NOTE: torch.load unpickles the file; only load configs you trust.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        try:
            self.model.gradient_checkpointing_enable()
        except Exception:
            # Best-effort: continue without gradient checkpointing.
            print(f'{cfg.model} does not support gradient checkpoint.')
        self.fc = nn.Linear(self.config.hidden_size, len(cfg.target_cols))
        self._init_weights(self.fc)
        self.pooling = MeanPooling()

    def _init_weights(self, module):
        """Initialise ``module`` in place using ``config.initializer_range`` as the normal std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # Was misspelled "paddding_idx", which raised AttributeError.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
    def forward(self, inputs):
        """Return the head output for the mean-pooled last hidden state of ``inputs``."""
        transformer_out = self.model(**inputs)
        last_hidden_state = transformer_out.last_hidden_state
        feature = self.pooling(last_hidden_state, inputs['attention_mask'])
        output = self.fc(feature)
        return output


# ===============================================================
#  _loop_fn
# ===============================================================
def train_fn(cfg, fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    Uses autocast/GradScaler when ``cfg.apex`` is true, clips gradients to
    ``cfg.max_grad_norm`` and steps the scheduler once per optimizer step.
    Gradients are accumulated over ``cfg.gradient_accumulation_steps``
    batches (and flushed on the last batch); with the default of 1 every
    batch is an optimizer step.

    Progress is printed and sent to Slack every ``cfg.print_freq`` steps and
    on the last step.

    Returns:
        float: batch-size-weighted average of the recorded loss values.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = cfg.apex)
    losses = AverageMeter()
    start = time.time()
    global_step = 0
    num_batches = len(train_loader)
    accumulation_steps = max(1, cfg.gradient_accumulation_steps)
    # Reported as 0 until the first optimizer step has produced a gradient norm.
    grad_norm = 0.0
    for step, (inputs, label) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        label = label.to(device)
        batch_size = label.size(0)
        with torch.cuda.amp.autocast(enabled = cfg.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), label)
            if accumulation_steps > 1:
                loss = loss / accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Step only at accumulation boundaries; the loss was already divided
        # for accumulation, so stepping on every batch would shrink updates.
        if (step + 1) % accumulation_steps == 0 or (step + 1) == num_batches:
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_batches - 1):
            # One learning rate per parameter group; report the extremes.
            lrs = scheduler.get_last_lr()
            progress = ('Epoch: [{0}][{1}/{2}] '
                        'Elapsed {remain:s} '
                        'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                        'Grad: {grad_norm:.4f}  '
                        'maxLR: {maxlr:.8f}  '
                        'minLR: {minlr:.8f}  '
                        .format(epoch + 1,
                                step,
                                num_batches,
                                remain = timeSince(start, float(step + 1) / num_batches),
                                loss = losses,
                                grad_norm = grad_norm,
                                maxlr = max(lrs),
                                minlr = min(lrs)))
            print(progress)
            send_slack_message_notification(f"[{cfg.filename}:fold{cfg.trn_fold}] " + progress)
    return losses.avg


def valid_fn(cfg, valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Progress is printed and sent to Slack every ``cfg.print_freq`` steps and
    on the last step.

    Returns:
        tuple: ``(average loss, predictions)`` where the average is weighted
        by batch size and ``predictions`` is a flat NumPy array of the raw
        model outputs in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    num_batches = len(valid_loader)
    for step, (inputs, target) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            # Reported as-is: gradient accumulation is a training-only
            # scaling and must not shrink the validation loss.
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.squeeze().to('cpu').numpy().reshape(-1))
        if step % cfg.print_freq == 0 or step == (num_batches - 1):
            progress = ('EVAL: [{0}/{1}] '
                        'Elapsed {remain:s} '
                        'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                        .format(step,
                                num_batches,
                                loss = losses,
                                remain = timeSince(start, float(step + 1) / num_batches)))
            print(progress)
            send_slack_message_notification(f"[{cfg.filename}:fold{cfg.trn_fold}] " + progress)
    predictions = np.concatenate(preds, axis = 0)
    return losses.avg, predictions

# ===============================================================
#  loop_fn
# ===============================================================

def train_loop(cfg, df_train, df_valid, fold, device):
    """Build loaders, model, optimizer and scheduler, then train and checkpoint.

    Per trained epoch the following files are written under ``cfg.output_dir``:
      * ``<base_model>_fold<fold>_epoch<epoch>_best.pth`` - model weights and
        validation predictions;
      * ``oof_df_fold<trn_fold>_epoch<epoch>.csv`` - ``df_valid`` with a
        ``valid_pred`` column (``df_valid`` is modified in place);
      * ``model_tmp.pth`` - resume checkpoint (epoch, model, optimizer,
        scheduler, best score, early-stopping counter).
    ``config.pth`` is written once before training starts.

    With ``cfg.resume`` the state in ``model_tmp.pth`` is restored and epochs
    up to and including the stored one are skipped.

    Side effect: sets ``cfg.num_warmup_steps``. Returns nothing.
    """
    
    print(f"========== fold: {fold} training ==========")

    print("train.shape: ", df_train.shape)
    print("valid.shape: ", df_valid.shape)

    # ====================================================
    # loader
    # ====================================================
    train_dataset = TrainDataset(cfg, df_train)
    valid_dataset = TrainDataset(cfg, df_valid)

    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              num_workers=cfg.num_workers,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=False,
                              num_workers=cfg.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(cfg, config_path=None, pretrained=True)
    torch.save(model.config, cfg.output_dir+'config.pth')
    
    model.to(device)

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)

    optimizer = AdamW(optimizer_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)

    # ====================================================
    # scheduler
    # ====================================================    
    steps_per_epoch = len(train_loader) if cfg.steps_per_epoch is None else cfg.steps_per_epoch
    num_train_steps = steps_per_epoch * cfg.epochs // cfg.gradient_accumulation_steps

    # Warmup is a fraction of the schedule actually used, so it also follows
    # --steps_per_epoch (identical to the previous value when that is unset).
    cfg.num_warmup_steps = int(num_train_steps * cfg.num_warmup_steps_ratio)

    callback = EarlyStopping(cfg)
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")

    best_score = -np.inf

    if cfg.resume:
        checkpoint = torch.load(cfg.output_dir+'model_tmp.pth')
        _epoch = checkpoint['epoch']
        best_score = checkpoint['best_score']
        callback.counter = checkpoint['early_stopping']
        callback.best_score = best_score
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # Move the restored optimizer state onto the training device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        print(f"Resume from Epoch {_epoch+1}")    
    else:
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================

    for epoch in range(cfg.epochs):
        # Skip epochs already covered by the resumed checkpoint.
        if cfg.resume and epoch <= _epoch:
            continue

        start_time = time.time()

        # train
        avg_loss = train_fn(cfg, fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(cfg, valid_loader, model, criterion, device)

        # scoring (currently disabled, so best_score and the early-stopping
        # callback are never updated inside this loop)
        #score, thres = get_f2_score(predictions, df_valid)

        elapsed = time.time() - start_time

        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')

        checkpoint_path = cfg.output_dir+f"{cfg.base_model.replace('/', '-')}_fold{fold}_epoch{epoch}_best.pth"
        torch.save({'model': model.state_dict(),
                    'predictions': predictions},
                    checkpoint_path)

        # The predictions are still in memory; the checkpoint that was just
        # written does not need to be read back to obtain them.
        df_valid["valid_pred"] = predictions
        df_valid.to_csv(cfg.output_dir+f"oof_df_fold{cfg.trn_fold}_epoch{epoch}.csv", index=False)

        # Intermediate checkpoint used by --resume.
        torch.save({'epoch': epoch, 
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    "best_score":best_score,
                    "early_stopping":callback.counter,
                    'scheduler_state_dict':scheduler.state_dict()},
                cfg.output_dir+'model_tmp.pth')
        print(f"Epoch {epoch+1} Model save is finished.")
        # NOTE(review): the loop exits after one trained epoch per invocation;
        # presumably later epochs are run by relaunching with --resume — confirm.
        break

    torch.cuda.empty_cache()
    gc.collect()
    
    return 


# ===============================================================
#  Main
# ===============================================================
def main(cfg):
    """Run the pipeline for one fold: seed, tokenizer, data, training."""
    seed_everything(cfg)
    tokenizer(cfg)
    frames = load_data(cfg)
    train_frame, valid_frame = frames
    train_loop(cfg, train_frame, valid_frame, cfg.trn_fold, device)

# ===============================================================
#  Execute
# ===============================================================
if __name__ == "__main__":
    # Parse the CLI, echo every resolved setting, then train.
    args = parse_args()
    for option, value in vars(args).items():
        print(f"{option}: {value}")
    main(args)

Writing 2nd_stage_model_training.py


In [None]:
# Launch 2nd-stage training for experiment exp006 with the all-mpnet-base-v2
# encoder, batch size 32 and progress logging every 5000 steps. All other
# options, including the fold (0), keep the script defaults.
!python 2nd_stage_model_training.py\
--print_freq 5000\
--base_model sentence-transformers/all-mpnet-base-v2\
--filename exp006\
--batch_size 32

2023-03-02 21:37:18.249027: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-02 21:37:19.246722: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-02 21:37:19.246858: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
device: cuda
seed: 42
input_dir: /content/drive/MyDrive/KAGGLE-LECR/last_data/1st/exp006/fold0

In [None]:
# Release the Colab runtime once the cells above have finished.
from google.colab import runtime
runtime.unassign()

In [None]:
# Exact duplicate of the previous cell: releases the Colab runtime.
from google.colab import runtime
runtime.unassign()