# import

In [1]:
import gc
import os
import re
import sys
import time
import warnings

import numpy as np
import pandas as pd
import torch
from ignite.handlers import create_lr_scheduler_with_warmup
from matplotlib import pyplot as plt
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (StratifiedGroupKFold, StratifiedKFold,
                                     train_test_split)
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
from torch.utils.checkpoint import checkpoint
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (AutoConfig, AutoModel, AutoTokenizer,
                          DataCollatorWithPadding)

#import bitsandbytes as bnb

In [2]:
# Use the full option names. pandas resolves abbreviated keys by pattern matching,
# which stops working as soon as another option matches the same abbreviation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# パラメータの設定

In [3]:
# Parameter settings.
# Start from the pretrained model's own configuration (as a plain dict) and layer
# the experiment settings on top of it.
MODELNAME = "microsoft/deberta-v3-base"
config = AutoConfig.from_pretrained(MODELNAME).to_dict()
config.update(
    {
        # model / data
        "model_name": MODELNAME,
        "max_token_len": 512,
        "drop_rate": 0.4,
        "output_size": 3,
        "fold_split": 3,
        # loop sizes
        "train_batch_size": 16,
        "valid_batch_size": 16,
        "num_epochs": 4,
        # optimizer / scheduler
        "learning_rate": 1e-5,
        "lr_T_max": 500,
        "min_lr": 1e-6,
        "weight_decay": 0.005,
        # Warm-up settings, currently disabled:
        # "warmup_start_value": 0.0,
        # "warmup_end_value": 0.1,
        # "warmup_duration": 3,
        # loss
        "label_smoothing": 0.2,
        # memory / fine-tuning switches and head selection
        "gradient_checkpoint": True,
        "freezing": True,
        "header_type": "Concatenate",
    }
)

# データの読み込み

In [4]:
essay_data_path = "/home/jovyan/work/data/train/"


def get_essay(essay_id):
    """Return the full text of the essay stored as ``<essay_id>.txt``.

    The file is looked up under the module-level ``essay_data_path``.

    Args:
        essay_id: Identifier used as the file stem.

    Returns:
        The whole file content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    essay_path = os.path.join(essay_data_path, f"{essay_id}.txt")
    # The context manager closes the handle deterministically; this function is
    # called once per essay, so leaked handles would pile up quickly.
    with open(essay_path, "r") as essay_file:
        return essay_file.read()

In [5]:
# Load the discourse-level training table, using the discourse_id column as the index.
df = pd.read_csv(
    "/home/jovyan/work/data/train.csv",
    index_col="discourse_id",
)

In [6]:
# Attach the full essay text to every discourse row.
# Several rows can share one essay_id, so read each essay file once and map the
# text back instead of re-opening the same file for every row.
_essay_text_by_id = {
    essay_id: get_essay(essay_id) for essay_id in df["essay_id"].unique()
}
df["essay_text"] = df["essay_id"].map(_essay_text_by_id)

## データのラベル定義

In [7]:
# Discourse type names. Not referenced elsewhere in the visible notebook.
discourse_types = ["Lead", "Position", "Claim", "Evidence", "Counterclaim", "Concluding Statement", "Rebuttal"]
# Effectiveness class names. The training cell selects the one-hot target columns
# with this list, so its order defines the order of the training label columns.
dicourse_effectiveness_cols = ["Ineffective", "Effective", "Adequate"]

## データ整形

In [8]:
# Preprocessing: join discourse_type, discourse_text and the essay text into one
# string, separated by the tokenizer's sep token.
sep = AutoTokenizer.from_pretrained(config["model_name"]).sep_token
df["inputs"] = df.discourse_type + sep + df.discourse_text + sep + df.essay_text

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# One-hot encode the target variable and append the indicator columns to df.
df = pd.concat([df, pd.get_dummies(df.discourse_effectiveness)], axis=1)

In [10]:
# Split the data: hold out roughly 20% of the rows as a test set.
# Every row's `inputs` string embeds the whole essay, so rows of one essay must
# not end up on both sides of the split; a plain row-level split leaks test essays
# into training. The first fold of a shuffled 5-way StratifiedGroupKFold gives a
# ~20% hold-out that is grouped by essay and approximately stratified by label.
_holdout_splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=0)
_train_valid_pos, _test_pos = next(
    _holdout_splitter.split(
        X=df,
        y=df["discourse_effectiveness"],
        groups=df["essay_id"],
    )
)
# split() yields positional indices, hence iloc.
df_train_valid = df.iloc[_train_valid_pos]
df_test = df.iloc[_test_pos]
assert set(df_train_valid["essay_id"]).isdisjoint(df_test["essay_id"]), "essay leaked into test"

In [11]:
# Replace the index with consecutive integers; drop=False keeps the previous index
# (discourse_id) as a regular column.
# NOTE(review): re-running this cell on its own output adds another index column.
df_train_valid = df_train_valid.reset_index(drop=False)

In [12]:
# Build the stratified, essay-grouped K folds and record each row's validation fold.
sgkf = StratifiedGroupKFold(n_splits=config["fold_split"], random_state=None)
fold_splits = sgkf.split(
    X=df_train_valid,
    y=df_train_valid.discourse_effectiveness,
    groups=df_train_valid.essay_id,
)

# split() yields positional indices; they equal the row labels because the frame
# carries a fresh 0..n-1 index at this point.
# -1 means "unassigned"; every row falls in exactly one validation fold, so no -1 survives.
kfold_ids = np.full(len(df_train_valid), -1, dtype=int)
for fold, (_train_index, _valid_index) in enumerate(fold_splits):
    kfold_ids[_valid_index] = fold

df_train_valid["kfold"] = kfold_ids

# definition

## data processing

In [13]:
# Class that tokenizes text (preprocessing)
class tokenizer(object):
    """Callable wrapper that encodes one text to a fixed-length token dict.

    Args:
        tokenizer: Object exposing ``encode_plus``, ``padding_side`` and
            ``pad_token_id``.
        max_len: Length every text is truncated / padded to.
    """

    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Mirrored here so padding code can treat this wrapper like the raw tokenizer.
        self.padding_side = tokenizer.padding_side
        self.pad_token_id = tokenizer.pad_token_id

    def __call__(self, text):
        """Encode ``text`` and return only input_ids, attention_mask and token_type_ids."""
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        wanted_keys = ("input_ids", "attention_mask", "token_type_ids")
        return {key: encoded[key] for key in wanted_keys}

In [14]:
# Dynamic Padding (Collate)
class Collate:
    """Collate function that pads every sample to the longest sequence in the batch.

    Args:
        tokenizer: Object exposing ``padding_side`` and ``pad_token_id``.
        isTrain: When True, each sample's ``labels`` are batched as well.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        """Turn a list of per-sample dicts (lists of ints) into a dict of tensors."""
        # Longest token sequence in this batch: the padding target.
        batch_max = max(len(sample["input_ids"]) for sample in batch)

        pad_on_left = self.tokenizer.padding_side != "right"
        fill_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
            "token_type_ids": 0,
        }

        output = dict()
        for key, fill in fill_values.items():
            rows = []
            for sample in batch:
                seq = sample[key]
                filler = (batch_max - len(seq)) * [fill]
                rows.append(filler + seq if pad_on_left else seq + filler)
            output[key] = torch.tensor(rows, dtype=torch.long)

        if self.isTrain:
            label_rows = [sample["labels"].tolist() for sample in batch]
            output["labels"] = torch.tensor(label_rows, dtype=torch.float)

        return output

In [15]:
# Dataset definition
class CreateDataset(Dataset):
    """Map-style dataset pairing transformed inputs with their labels.

    Args:
        X: Indexable collection of raw inputs.
        y: Indexable collection of labels; its length is the dataset length.
        transform: Callable that turns one raw input into a dict.
    """

    def __init__(self, X, y, transform):
        self.X, self.y = X, y
        self.transform = transform

    def __len__(self):
        """Number of samples, taken from the labels."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the transformed input at ``index`` with its label under "labels"."""
        sample = self.transform(self.X[index])
        sample["labels"] = self.y[index]
        return sample

In [16]:
# Subset definition
class CreateSubset(Dataset):
    """View onto ``X`` / ``y`` restricted to the given ``indices``.

    Args:
        X: Indexable collection of raw inputs.
        y: Indexable collection of labels.
        transform: Callable that turns one raw input into a dict.
        indices: Keys into ``X`` and ``y``; its length is the subset length.
    """

    def __init__(self, X, y, transform, indices):
        self.X, self.y = X, y
        self.transform = transform
        self.indices = indices

    def __len__(self):
        """Number of samples in the subset."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return the ``idx``-th subset sample with its label under "labels"."""
        source_key = self.indices[idx]
        sample = self.transform(self.X[source_key])
        sample["labels"] = self.y[source_key]
        return sample

## utils

In [17]:
# freezing
def freeze(module):
    """Exclude every parameter of ``module`` from gradient computation.

    Args:
        module: Any ``torch.nn.Module`` (including a ``ModuleList`` slice).

    Note:
        The attribute is ``requires_grad``. Assigning to the misspelled
        ``require_grad`` only creates an unused attribute and freezes nothing.
        When frozen layers feed a segment that uses reentrant activation
        checkpointing, that segment only receives gradients if one of its tensor
        inputs requires grad.
    """
    for parameter in module.parameters():
        parameter.requires_grad = False

In [18]:
# 8-bit optimizer
def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer-precision override for the embedding weights.

    For each of ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings`` that exists on ``embeddings_path``, registers an
    override on its ``weight`` with ``{'optim_bits': optim_bits}`` through the
    bitsandbytes ``GlobalOptimManager``.

    Args:
        embeddings_path: Object holding the embedding sub-modules.
        optim_bits: Value registered under ``'optim_bits'``.

    Raises:
        ImportError: If a matching embedding exists but bitsandbytes is not installed.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"

        if hasattr(embeddings_path, attr_name):
            # The notebook's top-level bitsandbytes import is commented out, so
            # `bnb` would be undefined here. Import it only where it is needed.
            import bitsandbytes as bnb

            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [19]:
def calculate_loss_f1(model, criterion, loader, device):
    """Compute the mean batch loss and the macro F1 over the whole loader.

    The model is switched to eval mode and left in it; callers that keep
    training must call ``model.train()`` again afterwards.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        loader: Iterable with ``len()`` yielding dicts with the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``labels``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, f1)``: the loss averaged over batches and the macro F1 of
        the thresholded predictions over all samples.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data in loader:
            # Move the batch to the target device
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)
            labels = data["labels"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, token_type_ids)

            # Loss
            loss += criterion(outputs, labels).item()

            # Probabilities and thresholded predictions
            prob = torch.sigmoid(outputs)
            pred = torch.where(prob > 0.5, 1, 0)

            # Keep every batch: scoring inside the loop would report the last batch only.
            all_preds.append(pred.cpu())
            all_labels.append(labels.cpu())

    if not all_preds:
        raise ValueError("calculate_loss_f1 received an empty loader")

    # scikit-learn's signature is f1_score(y_true, y_pred).
    f1 = f1_score(
        torch.cat(all_labels, dim=0).numpy(),
        torch.cat(all_preds, dim=0).numpy(),
        average="macro",
        zero_division=0,
    )

    return loss / len(all_preds), f1

In [20]:
# EarlyStopping class
class EarlyStopping:
    """Track the best validation loss, checkpoint on improvement and flag when to stop.

    Args:
        patience: Number of consecutive non-improving calls before ``early_stop``
            is set.
        threshold: Relative improvement required; a loss counts as better only if
            it is at most ``(1 - threshold) * best``.
        verbose: Print progress messages when True.
        path: File the best model's ``state_dict`` is saved to.
    """

    def __init__(
        self,
        patience,
        threshold,
        verbose=False,
        path="/home/jovyan/work/data/checkpoint/checkpoint_model.pth",
    ):
        self.patience = patience  # calls without improvement tolerated before stopping
        self.threshold = threshold  # required relative improvement (ratio)
        self.verbose = verbose  # whether to print progress
        self.counter = 0  # current number of consecutive non-improving calls
        self.early_stop = False  # stop flag read by the training loop
        self.val_loss_min = np.inf  # best loss so far (np.Inf was removed in NumPy 2.0)
        self.path = path  # where the best model is stored

    def __call__(self, val_loss, model):
        """Record ``val_loss``; save ``model`` if it improved, otherwise count towards stopping."""
        # Written as "<=" so that a NaN loss compares False and is treated as
        # "no improvement" instead of overwriting the best checkpoint.
        improved = val_loss <= (1 - self.threshold) * self.val_loss_min

        if not improved:
            self.counter += 1
            if self.verbose:
                print(
                    f"EarlyStopping counter: {self.counter} out of {self.patience}"
                )
            if self.counter >= self.patience:
                self.early_stop = True
            return

        if self.verbose:
            print(
                f"Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ..."
            )
        # torch.save does not create missing parent directories.
        checkpoint_dir = os.path.dirname(self.path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), self.path)

        self.val_loss_min = val_loss
        self.counter = 0
## model

In [21]:
# BERT classification model
class BERTClass(torch.nn.Module):
    """Pretrained transformer encoder with a selectable classification head.

    ``config`` is a dict. Keys read here: ``model_name``, ``drop_rate``,
    ``header_type``, ``hidden_size``, ``output_size``, ``gradient_checkpoint``
    and ``freezing``.

    ``header_type`` selects the head:
        * ``"Linear"``: dropout on the last hidden state, linear layer on the first token.
        * ``"Pooling"``: dropout on the last hidden state, max over the sequence, linear layer.
        * ``"Couvolution"`` (historical spelling) or ``"Convolution"``: two Conv1d
          layers over the sequence followed by a max over positions.
        * ``"Concatenate"``: first-token vectors of the last four hidden states,
          concatenated and fed to a linear layer.

    Raises:
        NotImplementedError: For any other ``header_type``.

    NOTE(review): dropout is applied only in the "Linear" and "Pooling" heads;
    confirm whether that is intended for the other two.
    """

    # The misspelled key is what existing configs use, so keep accepting it.
    _CONV_HEADERS = ("Couvolution", "Convolution")

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = AutoModel.from_pretrained(
            self.config["model_name"], output_hidden_states=True
        )
        self.drop = nn.Dropout(self.config["drop_rate"])
        if self.config["header_type"] == "Linear":
            self.fc = nn.Linear(self.config["hidden_size"], self.config["output_size"])
        elif self.config["header_type"] == "Pooling":
            # Not used by forward(), which pools with Tensor.max; kept so the
            # module's attributes stay as before.
            self.pooling = nn.AdaptiveMaxPool1d(1)
            self.fc = nn.Linear(
                self.config["hidden_size"], self.config["output_size"]
            )
        elif self.config["header_type"] in self._CONV_HEADERS:
            self.cnn1 = nn.Conv1d(
                self.config["hidden_size"], 256, kernel_size=2, padding=1
            )
            # One output channel per class, so the max over positions in forward()
            # yields (batch, output_size) like the other heads.
            self.cnn2 = nn.Conv1d(
                256, self.config["output_size"], kernel_size=2, padding=1
            )
        elif self.config["header_type"] == "Concatenate":
            self.fc = nn.Linear(
                self.config["hidden_size"] * 4, self.config["output_size"]
            )
        else:
            raise NotImplementedError

        # Gradient Checkpointing
        if self.config["gradient_checkpoint"]:
            self.bert.gradient_checkpointing_enable()
        # Freeze
        if self.config["freezing"]:
            freeze(self.bert.embeddings)
            freeze(self.bert.encoder.layer[:2])
            # With reentrant checkpointing a checkpointed layer only gets gradients
            # if one of its tensor inputs requires grad. Once the bottom of the
            # network is frozen that is no longer true, so make the embedding
            # output require grad explicitly.
            if self.config["gradient_checkpoint"] and hasattr(
                self.bert, "enable_input_require_grads"
            ):
                self.bert.enable_input_require_grads()

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the head's output for a batch (one row per sample)."""
        x = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        if self.config["header_type"] == "Linear":
            x = self.drop(x.hidden_states[-1])
            x = self.fc(x[:, 0, :])
        elif self.config["header_type"] == "Pooling":
            x = self.drop(x.hidden_states[-1])
            x, _ = x.max(1)
            x = self.fc(x)
        elif self.config["header_type"] in self._CONV_HEADERS:
            # Conv1d convolves over the last axis, so move the sequence axis there.
            # permute returns a new tensor and its result must be kept.
            x = x.hidden_states[-1].permute(0, 2, 1)
            x = nn.functional.relu(self.cnn1(x))
            x = self.cnn2(x)
            x, _ = torch.max(x, 2)
        elif self.config["header_type"] == "Concatenate":
            x = torch.cat(
                [x["hidden_states"][-1 * i][:, 0] for i in range(1, 4 + 1)], dim=1
            )  # concatenate the first-token vectors of the last four hidden states
            x = self.fc(x)
        else:
            raise NotImplementedError
        return x

# training

In [22]:
def train_model(
    X_train_valid,
    y_train_valid,
    df_fold,
    model,
    criterion,
    optimizer,
    schedular,
    config,
    device=None,
):
    """Train ``model`` and return the per-epoch loss / F1 log.

    Each epoch uses fold ``epoch_num % config["fold_split"]`` for validation and
    the remaining folds for training. After the epoch's optimisation steps the
    loss and macro F1 are measured once on both loaders, and from the second
    epoch on early stopping is evaluated on the validation loss.

    Args:
        X_train_valid: Inputs, indexable by the index labels of ``df_fold``.
        y_train_valid: Labels, indexable the same way.
        df_fold: pandas Series holding each row's fold id.
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        schedular: LR scheduler stepped once per epoch.
        config: Dict; keys read here are ``model_name``, ``max_token_len``,
            ``num_epochs``, ``fold_split``, ``train_batch_size`` and
            ``valid_batch_size``.
        device: Device for the model and the batches.

    Returns:
        ``{"train": [[loss, f1], ...], "valid": [[loss, f1], ...]}`` with one
        entry per completed epoch.

    NOTE(review): because the validation fold rotates, rows validated in one
    epoch are trained on in the next, so later validation losses are optimistic.
    """
    # Move the model to the target device
    model.to(device)

    # Tokenizer wrapper
    _tokenizer = tokenizer(
        AutoTokenizer.from_pretrained(config["model_name"]), config["max_token_len"]
    )
    # collate_fn (dynamic padding)
    collate_fn = Collate(_tokenizer)

    torch.backends.cudnn.benchmark = True

    log_train = []
    log_valid = []

    # Automatic Mixed Precision: only meaningful when the model lives on CUDA.
    # One scaler for the whole run, so the loss scale it has adapted is not reset
    # at every epoch.
    first_param = next(model.parameters(), None)
    use_amp = first_param is not None and first_param.is_cuda
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    earlystopping = EarlyStopping(patience=15, threshold=0.001, verbose=True)
    for epoch_num in range(config["num_epochs"]):
        # Start time of the epoch
        s_time = time.time()

        # Select this epoch's validation fold
        fold = epoch_num % config["fold_split"]
        train_index = df_fold[df_fold != fold].index
        valid_index = df_fold[df_fold == fold].index

        # Build the dataloaders
        dataset_train = CreateSubset(
            X_train_valid, y_train_valid, _tokenizer, train_index
        )
        dataloader_train = DataLoader(
            dataset_train,
            batch_size=config["train_batch_size"],
            collate_fn=collate_fn,
            shuffle=True,
            num_workers=4,
            pin_memory=True,
        )
        dataset_valid = CreateSubset(
            X_train_valid, y_train_valid, _tokenizer, valid_index
        )
        dataloader_valid = DataLoader(
            dataset_valid,
            batch_size=config["valid_batch_size"],
            collate_fn=collate_fn,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
        )

        # Training mode. The per-epoch evaluation below switches to eval mode, so
        # this must be set again at the start of every epoch.
        model.train()

        for data in tqdm(dataloader_train, leave=False):
            # Move the batch to the target device
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)
            labels = data["labels"].to(device)

            # Reset gradients
            optimizer.zero_grad()

            # Forward + backward + weight update
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(input_ids, attention_mask, token_type_ids)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Release cached blocks once, before the evaluation passes.
        torch.cuda.empty_cache()

        # Loss and F1, measured once per epoch. Doing this inside the batch loop
        # would run two full passes after every optimisation step and leave the
        # model in eval mode for the rest of the epoch.
        loss_train, f1_train = calculate_loss_f1(
            model, criterion, dataloader_train, device
        )
        loss_valid, f1_valid = calculate_loss_f1(
            model, criterion, dataloader_valid, device
        )
        log_train.append([loss_train, f1_train])
        log_valid.append([loss_valid, f1_valid])

        # End time of the epoch
        e_time = time.time()

        # Early stopping is checked every epoch after the first
        if epoch_num > 0:
            earlystopping(loss_valid, model)

        schedular.step()

        # Print the epoch summary
        print(
            f"epoch: {epoch_num + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}, {(e_time - s_time):.4f}sec"
        )
        if earlystopping.early_stop:
            print("Early Stopping!")
            break
    return {"train": log_train, "valid": log_valid}

In [None]:
# Define the model
model = BERTClass(config)

# Define the loss function
criterion = torch.nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])

# Hand only the parameters that still require gradients to the optimizer.
# A list (not a one-shot filter iterator) so the collection can be inspected or reused.
model_parameters = [
    parameter for parameter in model.parameters() if parameter.requires_grad
]

# Define the optimizer and the learning-rate schedule
optimizer = torch.optim.AdamW(
    params=model_parameters,
    lr=config["learning_rate"],
    weight_decay=config["weight_decay"],
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=config["lr_T_max"], eta_min=config["min_lr"]
)
# Warm-up wrapper, currently disabled:
# scheduler = create_lr_scheduler_with_warmup(
#     torch_lr_scheduler,
#     warmup_start_value=config["warmup_start_value"],
#     warmup_end_value=config["warmup_end_value"],
#     warmup_duration=config["warmup_duration"],
# )

# Select the device: first GPU when available, otherwise CPU, so the cell does
# not fail outright on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Train the model
log = train_model(
    df_train_valid["inputs"],
    df_train_valid[dicourse_effectiveness_cols].to_numpy(),
    df_train_valid["kfold"],
    model,
    criterion,
    optimizer,
    scheduler,
    config,
    device=device,
)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have b

In [None]:
# Visualise the training log: loss on the left, F1 on the right.
x_axis = [x for x in range(1, len(log["train"]) + 1)]
train_log = np.array(log["train"]).T  # row 0: loss, row 1: F1
valid_log = np.array(log["valid"]).T
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# The second logged value is the macro F1 returned by calculate_loss_f1, not
# accuracy, so label the axis accordingly.
for panel, row, ylabel in ((ax[0], 0, "loss"), (ax[1], 1, "macro F1")):
    panel.plot(x_axis, train_log[row], label="train")
    panel.plot(x_axis, valid_log[row], label="valid")
    panel.set_xlabel("epoch")
    panel.set_ylabel(ylabel)
    panel.legend()
plt.show()

# infer

In [None]:
# Build the test Dataset
_tokenizer = tokenizer(AutoTokenizer.from_pretrained(config["model_name"]), config["max_token_len"])
# Take the one-hot columns in the same order as the training targets.
# pd.get_dummies orders its columns by category, which differs from
# dicourse_effectiveness_cols and would misalign labels and model outputs.
y_test = df_test[dicourse_effectiveness_cols].to_numpy(dtype=np.float32)
# Feed the same "inputs" text the model was trained on. Plain arrays make
# Dataset[i] positional, since df_test still carries the discourse_id index.
dataset_test = CreateDataset(df_test["inputs"].to_numpy(), y_test, _tokenizer)

In [None]:
def prediction(model, dataset, device, batch_size=100):
    """Run ``model`` over ``dataset`` and collect predictions, labels and probabilities.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        dataset: Map-style dataset whose samples are dicts with the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``labels``.
            The token fields may be tensors or equal-length lists of ints.
        device: Device the batches are moved to.
        batch_size: Number of samples per forward pass.

    Returns:
        ``(pred, label, prob)``, each concatenated over all batches: ``prob`` is
        the element-wise sigmoid of the model output, ``pred`` is 1 where
        ``prob > 0.5`` and 0 elsewhere, ``label`` holds the dataset labels.

    NOTE(review): the visible training cell uses CrossEntropyLoss, for which
    softmax would be the matching normalisation. Sigmoid is kept here to match
    calculate_loss_f1; confirm which is intended.
    """
    # Build the DataLoader
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    def _as_batch_tensor(field):
        # The default collate turns per-sample lists of ints into a list of
        # per-position tensors; stack them back into one (batch, length) tensor.
        if torch.is_tensor(field):
            return field
        return torch.stack(list(field), dim=1)

    model.eval()

    prob = []
    pred = []
    label = []
    with torch.no_grad():
        for data in loader:
            # Move the batch to the target device
            input_ids = _as_batch_tensor(data["input_ids"]).to(device)
            attention_mask = _as_batch_tensor(data["attention_mask"]).to(device)
            token_type_ids = _as_batch_tensor(data["token_type_ids"]).to(device)
            label.append(data["labels"].to(device))

            # Forward pass and predictions
            outputs = model(input_ids, attention_mask, token_type_ids)
            batch_prob = torch.sigmoid(outputs)
            prob.append(batch_prob)
            # Threshold the probability, not the raw logit.
            pred.append(torch.where(batch_prob > 0.5, 1, 0))

        pred = torch.cat(pred, dim=0)
        prob = torch.cat(prob, dim=0)
        label = torch.cat(label, dim=0)

        del loader
        gc.collect()
        return pred, label, prob

In [None]:
# Run inference on the test dataset; prediction returns (pred, label, prob).
y_pred, y_label, y_prob = prediction(model, dataset_test, device)

In [None]:
# Compute precision and recall
def calculate_pr_rc(pred, label):
    """Return ``[precision, recall]`` for every column of ``pred``.

    Args:
        pred: DataFrame of predictions, one column per class.
        label: DataFrame of ground-truth values with the same columns.

    Returns:
        A list with one ``[precision, recall]`` pair per column, in column order.
    """
    pr_rc = []
    for col in pred.columns:
        y_true = label.loc[:, col].values
        y_hat = pred.loc[:, col].values
        # scikit-learn expects (y_true, y_pred); passing them the other way round
        # silently swaps precision and recall.
        pr = precision_score(y_true, y_hat)
        rc = recall_score(y_true, y_hat)
        pr_rc.append([pr, rc])
    return pr_rc

In [None]:
# Macro precision on the test set. scikit-learn's signature is (y_true, y_pred);
# passing the predictions first would report recall under the name precision.
precision_score(y_label.to('cpu').numpy(), y_pred.to('cpu').numpy(), average="macro")

In [None]:
# Macro recall on the test set. scikit-learn's signature is (y_true, y_pred);
# passing the predictions first would report precision under the name recall.
recall_score(y_label.to('cpu').numpy(), y_pred.to('cpu').numpy(), average="macro")

In [None]:
def calc_cross_entropy(y_prob, y_label, eps=1e-15):
    """Mean cross-entropy of ``y_prob`` against ``y_label``.

    Computes ``-mean(sum(y_label * log(y_prob), axis=1))``.

    Args:
        y_prob: 2-D array of predicted probabilities, one row per sample.
        y_label: Array of the same shape holding the target weights.
        eps: Lower bound applied to the probabilities before the logarithm.

    Returns:
        The mean loss as a NumPy float.
    """
    # A probability of exactly 0 gives log(0) = -inf, and 0 * -inf is NaN, so a
    # single saturated prediction would make the whole mean NaN. Clip first.
    clipped = np.clip(np.asarray(y_prob, dtype=np.float64), eps, 1.0)
    loss = -1 * np.mean(np.sum(y_label * np.log(clipped), axis=1))
    return loss

In [None]:
# Cross-entropy of the predicted probabilities against the test labels.
calc_cross_entropy(y_prob.to('cpu').numpy(), y_label.to('cpu').numpy())

In [None]:
# Save the weights currently held in memory. This file name differs from
# EarlyStopping's default checkpoint path, so that checkpoint is left untouched.
torch.save(model.state_dict(), "/home/jovyan/work/data/checkpoint/checkpoint_model_.pth")

In [None]:
def plot_performance(performances):
    """Draw precision and recall as side-by-side bars, one group per column.

    Args:
        performances: DataFrame with rows labelled "precision" and "recall";
            the column labels become the x tick labels.

    Sets the global matplotlib font size to 16 as a side effect.
    """
    plt.rcParams["font.size"] = 16

    n_groups = len(performances.columns)
    group_positions = np.arange(n_groups)
    tick_values = np.arange(11) * 0.1

    plt.figure(figsize=(25, 10))

    # One bar per metric, shifted half a bar width to each side of the group position.
    for shift, metric in ((-0.2, "precision"), (0.2, "recall")):
        plt.bar(
            group_positions + shift,
            performances.loc[metric, :],
            width=0.4,
            label=metric,
        )

    plt.xticks(np.arange(n_groups), performances.columns.tolist(), rotation=90)
    plt.yticks(tick_values, np.round(tick_values, 1))
    plt.grid(axis="y")
    plt.title("precision  vs recall ")
    plt.legend(loc="upper left")
    plt.show()

In [None]:
# NOTE(review): `test_pr_rc` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh Restart & Run All. plot_performance reads
# the rows "precision" and "recall" via .loc and uses .columns as tick labels, so
# it presumably expects a DataFrame of that shape - TODO build it before this call.
plot_performance(test_pr_rc)