# import

In [1]:
import gc
import os
import re
import sys
import time
import warnings

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [2]:
#warnings.filterwarnings("ignore")

In [3]:
pd.set_option("display.max_column", 100)
pd.set_option("display.max_row", 100)

# データの読み込み

In [4]:
df = pd.read_csv(
    "/home/jovyan/work/data/train.csv",
    index_col="discourse_id"
)

## データのラベル定義

In [5]:
discourse_types = ["Lead", "Position", "Claim", "Evidence", "Counterclaim", "Concluding Statement", "Rebuttal"]
target_col_names = ["Adequate", "Effective", "Ineffective"]

## データ整形

In [6]:
# データの抽出
df = df.loc[
    df["discourse_type"]==discourse_types[0]
]

# データの分割
train, valid_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=0, stratify=df["discourse_effectiveness"]
)
valid, test = train_test_split(
    valid_test,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=valid_test["discourse_effectiveness"],
)

## インデックスの割り当て

In [7]:
def set_index(df, id_col_name):
    df.reset_index(inplace=True)
    index_id = dict(zip(df.index, df[id_col_name]))
    df.drop(id_col_name, inplace=True, axis=1)
    return df, index_id

In [8]:
train, train_index2id = set_index(train, "discourse_id")
valid, valid_index2id = set_index(valid, "discourse_id")
test, test_index2id = set_index(test, "discourse_id")

In [9]:
# 正解ラベルのone-hot化
y_train = pd.get_dummies(train["discourse_effectiveness"]).to_numpy()
y_valid = pd.get_dummies(valid["discourse_effectiveness"]).to_numpy()
y_test = pd.get_dummies(test["discourse_effectiveness"]).to_numpy()

In [10]:
# Datasetの定義
class CreateDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):  # len(Dataset)で返す値を指定
        return len(self.y)

    def __getitem__(self, index):  # Dataset[index]で返す値を指定
        text = self.X[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "input_ids": torch.LongTensor(input_ids),
            "attention_mask": torch.LongTensor(attention_mask),
            "token_type_ids": torch.LongTensor(token_type_ids),
            "labels": torch.Tensor(self.y[index]),
        }

In [11]:
# Datasetの作成
max_len = 100
MODELNAME = "microsoft/deberta-xlarge-mnli"
tokenizer = AutoTokenizer.from_pretrained(MODELNAME)

dataset_train = CreateDataset(train["discourse_text"], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid["discourse_text"], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test["discourse_text"], y_test, tokenizer, max_len)

In [12]:
# BERT分類モデルの抽象クラス
class BERTClass(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = AutoModel.from_pretrained(
            MODELNAME, output_hidden_states=True
        )

In [13]:
# BERT分類モデルの定義(Linear)
class BERTLinearClass(BERTClass):
    def __init__(self, config):
        super().__init__(config)
        self.drop = nn.Dropout(self.config["drop_rate"])
        self.fc = nn.Linear(
            self.config["hidden_size"], self.config["output_size"]
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        out = self.drop(out.hidden_states[-1])
        out = self.fc(out[:, 0, :])
        return out

In [14]:
# BERT分類モデルの定義(Pooling)
class BERTPoolingClass(BERTClass):
    def __init__(self, config):
        super().__init__(config)
        self.drop = nn.Dropout(self.config["drop_rate"])
        self.drop = nn.Dropout(self.config["drop_rate"])
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(
            self.config["hidden_size"], self.config["output_size"]
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        out = self.drop(out.hidden_states[-1])
        out, _ = out.max(1)
        out = self.fc(out)
        return out

In [15]:
# BERT分類モデルの定義(Couvolution)
class BERTConvolutionClass(BERTClass):
    def __init__(self, config):
        super().__init__(config)
        self.drop = nn.Dropout(self.config["drop_rate"])
        self.cnn1 = nn.Conv1d(
            self.config["hidden_size"], 256, kernel_size=2, padding=1
        )
        self.cnn2 = nn.Conv1d(256, 1, kernel_size=2, padding=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = (
            self.bert(
                input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            )
            .hidden_states[-1]
            .permute(0, 2, 1)
        )
        out = nn.functional.relu(self.cnn1(out))
        out = self.cnn2(out)
        out, _ = torch.max(out, 2)
        return out

In [16]:
# BERT分類モデルの定義(Concatenate)
class BERTConcatenateClass(BERTClass):
    def __init__(self, config):
        super().__init__(config)
        self.drop = nn.Dropout(self.config["drop_rate"])
        self.fc = nn.Linear(
            self.config["hidden_size"] * 4, self.config["output_size"]
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        out = torch.cat(
            [out["hidden_states"][-1 * i][:, 0] for i in range(1, 4 + 1)], dim=1
        )  # concatenate
        out = self.fc(out)

        return out

In [17]:
def calculate_loss_f1(model, criterion, loader, device):
    """損失・正解率を計算"""
    model.eval()
    loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for data in loader:
            # デバイスの指定
            ids = data["ids"].to(device)
            mask = data["mask"].to(device)
            labels = data["labels"].to(device)

            # 順伝播
            outputs = model(ids, mask)

            # 損失計算
            loss += criterion(outputs, labels).item()

            # 確率計算
            prob = torch.sigmoid(outputs)
            pred = torch.where(prob > 0.5, 1, 0)

            # f1スコア計算
            f1 = f1_score(pred.cpu().numpy(), labels.cpu().numpy(), average="macro")

    return loss / len(loader), f1

In [18]:
# EralyStopクラス
class EarlyStopping:
    def __init__(
        self, patience=3, threshold=0.1, verbose=False, path="checkpoint_model.pth"
    ):
        """引数：最小値の非更新数カウンタ、最小値判定の閾値, 表示設定、モデル格納path"""

        self.patience = patience  # 設定ストップカウンタ
        self.threshold = threshold  # 最小値判定の閾値。比率で指定
        self.verbose = verbose  # 表示の有無
        self.counter = 0  # 現在のカウンタ値
        self.early_stop = False  # ストップフラグ
        self.val_loss_min = np.Inf  # 前回のベストスコア記憶用
        self.path = path  # ベストモデル格納path

    def __call__(self, val_loss, model):
        if val_loss > (1 - self.threshold) * self.val_loss_min:  # ベストスコアを更新できなかった場合
            self.counter += 1  # ストップカウンタを+1
            if self.verbose:  # 表示を有効にした場合は経過を表示
                print(
                    f"EarlyStopping counter: {self.counter} out of {self.patience}"
                )  # 現在のカウンタを表示する
            if self.counter >= self.patience:  # 設定カウントを上回ったらストップフラグをTrueに変更
                self.early_stop = True
                
        else:  # ベストスコアを更新した場合
            if self.verbose:  # 表示を有効にした場合は、前回のベストスコアからどれだけ更新したか？を表示
                print(
                    f"Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ..."
                )
            torch.save(model.state_dict(), self.path)  # ベストモデルを指定したpathに保存
            
            self.val_loss_min = val_loss
            self.counter = 0  # ストップカウンタリセット

In [16]:
def train_model(
    dataset_train,
    dataset_valid,
    model,
    criterion,
    optimizer,
    config,
    device=None,
):
    """モデルの学習を実行し、損失・正解率のログを返す"""
    # デバイスの指定
    model.to(device)

    # dataloaderの作成
    dataloader_train = DataLoader(
        dataset_train, batch_size=config["train_batch_size"], shuffle=True
    )
    dataloader_valid = DataLoader(
        dataset_valid, batch_size=config["valid_batch_size"], shuffle=False
    )

    # 学習
    log_train = []
    log_valid = []

    earlystopping = EarlyStopping(patience=2, threshold=0.05, verbose=True)
    for epoch in range(config["num_epochs"]):
        # 開始時刻の記録
        s_time = time.time()

        # 訓練モードに設定
        model.train()
        for data in tqdm(dataloader_train):
            # デバイスの指定
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)
            labels = data["labels"].to(device)

            # 勾配をゼロで初期化
            optimizer.zero_grad()

            # 順伝播 + 誤差逆伝播 + 重み更新
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            torch.cuda.empty_cache()

            # 損失と正解率の算出
            loss_train, f1_train = calculate_loss_f1(
                model, criterion, dataloader_train, device
            )
            loss_valid, f1_valid = calculate_loss_f1(
                model, criterion, dataloader_valid, device
            )
            log_train.append([loss_train, f1_train])
            log_valid.append([loss_valid, f1_valid])

            # 終了時刻の記録
            e_time = time.time()

        # ログを出力
        print(
            f"epoch: {epoch + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}, {(e_time - s_time):.4f}sec"
        )

        # 毎エポックearlystoppingの判定をさせる
        earlystopping(loss_valid, model)  # callメソッド呼び出し
        if earlystopping.early_stop:  # ストップフラグがTrueの場合、breakでforループを抜ける
            print("Early Stopping!")
            break

    return {"train": log_train, "valid": log_valid}

In [19]:
# パラメータの設定
config = AutoConfig.from_pretrained(MODELNAME).to_dict()
config["drop_rate"] = 0.4
config["output_size"] = 3
config["train_batch_size"] = 4
config["valid_batch_size"] = 4
config["num_epochs"] = 4
config["learning_rate"] = 5e-3

In [17]:
# モデルの定義
model = BERTLinearClass(config)

# 損失関数の定義
criterion = torch.nn.BCEWithLogitsLoss()

# オプティマイザの定義
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# デバイスの指定
device = torch.device("cuda:0")

# モデルの学習
log = train_model(
    dataset_train,
    dataset_valid,
    BATCH_SIZE,
    model,
    criterion,
    optimizer,
    NUM_EPOCHS,
    device=device,
)

  0%|          | 0/229 [00:00<?, ?it/s]


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# ログの可視化
x_axis = [x for x in range(1, len(log["train"]) + 1)]
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(x_axis, np.array(log["train"]).T[0], label="train")
ax[0].plot(x_axis, np.array(log["valid"]).T[0], label="valid")
ax[0].set_xlabel("epoch")
ax[0].set_ylabel("loss")
ax[0].legend()
ax[1].plot(x_axis, np.array(log["train"]).T[1], label="train")
ax[1].plot(x_axis, np.array(log["valid"]).T[1], label="valid")
ax[1].set_xlabel("epoch")
ax[1].set_ylabel("accuracy")
ax[1].legend()
plt.show()

In [None]:
def prediction(model, dataset, device):
    # Dataloaderの作成
    loader = DataLoader(dataset, batch_size=100, shuffle=False)

    model.eval()
  
    prob = []
    pred = []
    label = []
    with torch.no_grad():
        for data in loader:
            # デバイスの指定
            ids = data["ids"].to(device)
            mask = data["mask"].to(device)
            label.append(data["labels"].to(device))

            # 順伝播 + 予測値の取得 
            outputs = model.forward(ids, mask)
            prob.append(torch.sigmoid(outputs))
            pred.append(torch.where(outputs>0.5, 1, 0))
            
        pred = torch.cat(pred, dim=0)
        prob = torch.cat(prob, dim=0)
        label = torch.cat(label, dim=0)
        
        del model, loader
        gc.collect()
        return pred, label, prob

In [None]:
pred_train, label_train, prob_train = prediction(model, dataset_train, device)
pred_valid, label_valid, prob_valid = prediction(model, dataset_valid, device)
pred_test, label_test, prob_test = prediction(model, dataset_test, device)

In [None]:
dataTypes = ["pred", "label", "prob"]
trains = [pred_train, label_train, prob_train]
valids = [pred_valid, label_valid, prob_valid]
tests = [pred_test, label_test, prob_test ]

In [None]:
trains = dict(
    zip(
        dataTypes,
        [pd.DataFrame(_.cpu().numpy(), columns=target_col_names) for _ in trains],
    )
)
valids = dict(
    zip(
        dataTypes,
        [pd.DataFrame(_.cpu().numpy(), columns=target_col_names) for _ in valids],
    )
)
tests = dict(
    zip(
        dataTypes,
        [pd.DataFrame(_.cpu().numpy(), columns=target_col_names) for _ in tests],
    )
)

In [None]:
#precision, recallの算出
def calculate_pr_rc(pred, label):
    pr_rc = []
    for col in pred.columns:
        pr = precision_score(pred.loc[:, col].values, label.loc[:, col].values)
        rc = recall_score(pred.loc[:, col].values, label.loc[:, col].values)
        pr_rc.append([pr, rc])
    return pr_rc

In [None]:
test_pr_rc = pd.DataFrame(
    calculate_pr_rc(tests["pred"], tests["label"]),
    index=target_col_names,
    columns=["precision", "recall"]
).T

In [None]:
def plot_performance(performances):
    plt.rcParams["font.size"] = 16
    
    length = len(performances.columns)
    
    plt.figure(figsize=(25, 10))
    left = np.arange(length)
    
    plt.bar(left - 0.2, performances.loc["precision", :], width=0.4, label="precision")
    plt.bar(left + 0.2, performances.loc["recall", :], width=0.4, label="recall")
    plt.xticks(np.arange(length), performances.columns.tolist(), rotation=90)
    plt.yticks(np.arange(11)*0.1, np.round(np.arange(11)*0.1, 1))
    plt.grid(axis="y")
    plt.title("precision  vs recall ")
    plt.legend(loc="upper left")
    plt.show()

In [None]:
plot_performance(test_pr_rc)