In [1]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb = True
    DEBUG = False
    DL = False
    file_name = "002"
    model="mpnet-base"
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    model_config_path = f"/home/jupyter/models/{model}"
    model_bin_path = f"/home/jupyter/models/{model}"
    competition='FB3'
    apex=True
    print_freq=20
    num_workers=4
    gradient_checkpointing=False
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=6
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    train=True
    
if CFG.DEBUG:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [2]:
# ========================================
# library
# ========================================
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import MPNetModel, MPNetTokenizer
import logging
import sys
from contextlib import contextmanager
import time
import random
from tqdm import tqdm
import os


# ==================
# Constant
# ==================
ex = "292"

TRAIN_PATH = '/home/jupyter/feedback-prize-english-language-learning/train.csv'


import os
import datetime
import pickle

# ====================================================
# datetime
# ====================================================
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)
date = now.strftime('%Y%m%d')
date2 = now.strftime('%Y%m%d%H%M')


# ====================================================
# file_path
# ====================================================
if "/" in CFG.model:
    model_name = CFG.model.split("/")[1]
else:
    model_name = CFG.model

path ="/home/jupyter/feedback-prize-english-language-learning/"
if CFG.DEBUG:
    OUTPUT_DIR = f'/home/jupyter/output/ex/DEBUG/{model_name}/{CFG.file_name}/{date2}/'
else:
    OUTPUT_DIR = f'/home/jupyter/output/ex/{model_name}/{CFG.file_name}/{date2}/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)



MODEL_PATH_BASE = OUTPUT_DIR+f"ex{ex}"
OOF_SAVE_PATH = OUTPUT_DIR+f"ex{ex}_oof.npy"
LOGGER_PATH = OUTPUT_DIR+f"/ex{ex}.txt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ===============
# Settings
# ===============
SEED = 0
num_workers = 4
BATCH_SIZE = 8
n_epochs = 5
es_patience = 10
max_len = 256
weight_decay = 0.1
lr = 3e-5
num_warmup_steps_rate = 0
eval_steps = 40

MODEL_PATH = "/home/jupyter/models/mpnet-base/"
tokenizer = MPNetTokenizer.from_pretrained(MODEL_PATH)




In [3]:
class CommonLitDataset(Dataset):
    def __init__(self, excerpt, tokenizer, max_len, target=None):
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target = target

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        text = str(self.excerpt[item])
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        if self.target is not None:
            return {
                "input_ids": torch.tensor(ids, dtype=torch.long),
                "attention_mask": torch.tensor(mask, dtype=torch.long),
                "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                "target": torch.tensor(self.target[item], dtype=torch.float32)
            }
        else:
            return {
                "input_ids": torch.tensor(ids, dtype=torch.long),
                "attention_mask": torch.tensor(mask, dtype=torch.long),
                "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long)
            }


class mpnet_model(nn.Module):
    def __init__(self):
        super(mpnet_model, self).__init__()
        self.mpnet = MPNetModel.from_pretrained(
            MODEL_PATH,
            hidden_dropout_prob=0,
            attention_probs_dropout_prob=0
        )

        # self.dropout = nn.Dropout(p=0.2)
        self.ln = nn.LayerNorm(768)
        self.out = nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        # pooler
        emb = self.mpnet(ids, attention_mask=mask, token_type_ids=token_type_ids)[
            "last_hidden_state"]
        emb = torch.mean(emb, axis=1)
        output = self.ln(emb)
        # output = self.dropout(output)
        output = self.out(output)
        return output, emb


def calc_loss(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    LOGGER.handlers = []
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s')


LOGGER = logging.getLogger()
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
setup_logger(out_file=LOGGER_PATH)


2022-10-16 07:26:10,628 - INFO - logger set up


<RootLogger root (DEBUG)>

In [4]:
# ================================
# Main
# ================================
train = pd.read_csv(TRAIN_PATH)
y = train[CFG.target_cols]
#fold_df = pd.read_csv(FOLD_PATH)
fold_array = np.load("/home/jupyter/output/fold/4fold.npy")

if CFG.DEBUG:
    train = pd.read_csv(TRAIN_PATH,nrows=100)
    y = train[CFG.target_cols]
    #fold_df = pd.read_csv(FOLD_PATH)
    fold_array = np.load("/home/jupyter/output/fold/4fold.npy")[:100]

In [5]:
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


In [6]:
with timer("mpnet"):
    set_seed(SEED)
    oof = np.zeros([len(train),len(CFG.target_cols)])
    for fold in range(4):
        x_train, y_train = train.iloc[fold_array !=
                                      fold], y.iloc[fold_array != fold]
        x_val, y_val = train.iloc[fold_array ==
                                  fold], y.iloc[fold_array == fold]

        # dataset
        train_ = CommonLitDataset(
            x_train[CFG.target_cols].values, tokenizer, max_len, y_train.values.reshape(-1, 1))
        val_ = CommonLitDataset(
            x_val[CFG.target_cols].values, tokenizer, max_len, y_val.values.reshape(-1, 1))

        # loader
        train_loader = DataLoader(
            dataset=train_, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers)
        val_loader = DataLoader(
            dataset=val_, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers)

        # model
        model = mpnet_model()
        model = model.to(device)

        # optimizer, scheduler
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay)], 'weight_decay': weight_decay},
            {'params': [p for n, p in param_optimizer if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=lr,
                          betas=(0.9, 0.98),
                          weight_decay=weight_decay,
                          )
        num_train_optimization_steps = int(len(train_loader) * n_epochs)
        num_warmup_steps = int(
            num_train_optimization_steps * num_warmup_steps_rate)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=num_warmup_steps,
                                                    num_training_steps=num_train_optimization_steps)

        criterion = nn.SmoothL1Loss(reduction='mean')
        best_val = None
        patience = es_patience
        for epoch in tqdm(range(n_epochs)):
            with timer(f"model_fold:{epoch}"):

                # train
                model.train()
                train_losses_batch = []

                epoch_loss = 0

                for i, d in enumerate(train_loader):

                    input_ids = d['input_ids']
                    mask = d['attention_mask']
                    token_type_ids = d["token_type_ids"]
                    target = d["target"]

                    input_ids = input_ids.to(device)
                    mask = mask.to(device)
                    token_type_ids = token_type_ids.to(device)
                    target = target.to(device)
                    optimizer.zero_grad()
                    output, _ = model(input_ids, mask, token_type_ids)
                    loss = criterion(output, target)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()
                    train_losses_batch.append(loss.item())

                    if i % eval_steps == 0:
                        # val
                        val_losses_batch = []
                        model.eval()  # switch model to the evaluation mode
                        val_preds = np.ndarray((0, 6))
                        with torch.no_grad():
                            # Predicting on validation set
                            for d in val_loader:
                                # =========================
                                # data loader
                                # =========================
                                input_ids = d['input_ids']
                                mask = d['attention_mask']
                                token_type_ids = d["token_type_ids"]
                                target = d["target"]

                                input_ids = input_ids.to(device)
                                mask = mask.to(device)
                                token_type_ids = token_type_ids.to(device)
                                target = target.to(device)
                                output, _ = model(
                                    input_ids, mask, token_type_ids)

                                loss = criterion(output, target)
                                val_preds = np.concatenate(
                                    [val_preds, output.detach().cpu().numpy()], axis=0)
                                val_losses_batch.append(loss.item())

                        val_loss = np.mean(val_losses_batch)
                        val_rmse = get_score(y_val.to_numpy(), val_preds)[0]
                        LOGGER.info(
                            f'{fold},{epoch}:{i},val_loss:{val_loss},val_rmse:{val_rmse}')
                        # ===================
                        # early stop
                        # ===================

                        if not best_val:
                            best_val = val_rmse
                
                            oof[fold_array == fold] = val_preds
                            # Saving the model
                            torch.save(model.state_dict(),
                                       MODEL_PATH_BASE + f"_{fold}.pth")
                            continue

                        if val_rmse <= best_val:
                            best_val = val_rmse
                            oof[fold_array == fold] = val_preds
                            patience = es_patience
                            # Saving current best model
                            torch.save(model.state_dict(),
                                       MODEL_PATH_BASE + f"_{fold}.pth")
                        # else:
                        #    patience -= 1
                        #    if patience == 0:
                        #        LOGGER.info(f'Early stopping. Best Val : {best_val} Best Rmse : {best_rmse}')
                        #        break
                        model.train()

                train_loss = np.mean(train_losses_batch)
        break

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
2022-10-16 07:26:33,996 - INFO - 0,0:0,val_loss:1.567530895151743,val_rmse:2.2084815796330974
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
2022-10-16 07:27:17,528 - INFO - 0,0:40,val_loss:0.21617305560446368,val_rmse:0.6629622877971874
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
2022-10-16 07:27:51,902 - INFO - 0,0:80,val_loss:0.2250841147440478,val_rmse:0.6562950915428752
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
2022-10-16 07:28:26,191 - INFO - 0,0:120,val_loss

In [7]:
get_score(y_val.to_numpy(),oof[fold_array == fold])

(0.6424162115720078,
 [0.6396375489962369,
  0.6429080496628525,
  0.5610570605951414,
  0.6517650150222908,
  0.701356709073021,
  0.6577728860825043])

In [8]:
np.save(OOF_SAVE_PATH, oof)