# import

In [1]:
import gc
import os
import re
import sys
import time
import warnings

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from torch import optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

In [2]:
warnings.filterwarnings("ignore")

In [3]:
pd.set_option("display.max_column", 100)
pd.set_option("display.max_row", 100)

# データの読み込み

In [4]:
df = pd.read_csv(
    "/home/jovyan/work/data/train.csv",
    index_col="discourse_id"
)

## データのラベル定義

In [5]:
discourse_types = ["Lead", "Position", "Claim", "Evidence", "Counterclaim", "Concluding Statement", "Rebuttal"]
target_col_names = ["Adequate", "Effective", "Ineffective"]

## データ整形

In [6]:
# データの抽出
df = df.loc[
    df["discourse_type"]==discourse_types[0]
]

# データの分割
train, valid_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=0, stratify=df["discourse_effectiveness"]
)
valid, test = train_test_split(
    valid_test,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=valid_test["discourse_effectiveness"],
)

## インデックスの割り当て

In [7]:
def set_index(df, id_col_name):
    df.reset_index(inplace=True)
    index_id = dict(zip(df.index, df[id_col_name]))
    df.drop(id_col_name, inplace=True, axis=1)
    return df, index_id

In [8]:
train, train_index2id = set_index(train, "discourse_id")
valid, valid_index2id = set_index(valid, "discourse_id")
test, test_index2id = set_index(test, "discourse_id")

In [9]:
# 正解ラベルのone-hot化
y_train = pd.get_dummies(train["discourse_effectiveness"]).to_numpy()
y_valid = pd.get_dummies(valid["discourse_effectiveness"]).to_numpy()
y_test = pd.get_dummies(test["discourse_effectiveness"]).to_numpy()

In [32]:
# Datasetの定義
class CreateDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):  # len(Dataset)で返す値を指定
        return len(self.y)

    def __getitem__(self, index):  # Dataset[index]で返す値を指定
        text = self.X[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "input_ids": torch.LongTensor(input_ids),
            "attention_mask": torch.LongTensor(attention_mask),
            "token_type_ids": torch.LongTensor(token_type_ids),
            "labels": torch.Tensor(self.y[index]),
        }

In [33]:
# Datasetの作成
max_len = 50
MODELNAME = "microsoft/deberta-xlarge-mnli"
tokenizer = AutoTokenizer.from_pretrained(MODELNAME)

dataset_train = CreateDataset(train["discourse_text"], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid["discourse_text"], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test["discourse_text"], y_test, tokenizer, max_len)

In [34]:
# BERT分類モデルの定義(Linear)
class BERTLinearClass(torch.nn.Module):
    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        self.config = AutoConfig.from_pretrained(MODELNAME)
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            MODELNAME, output_hidden_states=True
        )
        self.drop = torch.nn.Dropout(drop_rate)
        self.fc = torch.nn.Linear(self.config.hidden_size, otuput_size)

    def forward(self, ids, attention_mask, token_type_ids):
        out = self.bert(
            ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        out = self.drop(out.hidden_states[-1])
        out = self.fc(out[:, 0, :])
        return out

In [85]:
# BERT分類モデルの定義(Pooling)
class BERTPoolingClass(torch.nn.Module):
    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        self.config = AutoConfig.from_pretrained(MODELNAME)
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            MODELNAME, output_hidden_states=True
        )
        self.drop = torch.nn.Dropout(drop_rate)
        self.pooling = torch.nn.AdaptiveMaxPool1d(1)
        self.fc = torch.nn.Linear(self.config.hidden_size, otuput_size)
    def forward(self, ids, attention_mask, token_type_ids):
        out = self.bert(
            ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        out = self.drop(out.hidden_states[-1])
        out, _ = out.max(1)
        out = self.fc(out)
        return out

In [None]:
# BERT分類モデルの定義(Couvolution)
class BERTConvolutionClass(torch.nn.Module):
    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        self.config = AutoConfig.from_pretrained(MODELNAME)
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            MODELNAME, output_hidden_states=True
        )
        self.drop = torch.nn.Dropout(drop_rate)
        self.pooling = torch.nn.AdaptiveMaxPool1d(1)
        self.fc = torch.nn.Linear(self.config.hidden_size, otuput_size)
    def forward(self, ids, attention_mask, token_type_ids):
        out = self.bert(
            ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        out = self.drop(out.hidden_states[-1])
        out, _ = out.max(1)
        out = self.fc(out)
        return out

In [35]:
def calculate_loss_f1(model, criterion, loader, device):
    """損失・正解率を計算"""
    model.eval()
    loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for data in loader:
            # デバイスの指定
            ids = data["ids"].to(device)
            mask = data["mask"].to(device)
            labels = data["labels"].to(device)

            # 順伝播
            outputs = model(ids, mask)

            # 損失計算
            loss += criterion(outputs, labels).item()

            # 確率計算
            prob = torch.sigmoid(outputs)
            pred = torch.where(prob > 0.5, 1, 0)

            # f1スコア計算
            f1 = f1_score(pred.cpu().numpy(), labels.cpu().numpy(), average="macro")

    return loss / len(loader), f1

In [36]:
class EarlyStopping:
    def __init__(
        self, patience=3, threshold=0.1, verbose=False, path="checkpoint_model.pth"
    ):
        """引数：最小値の非更新数カウンタ、最小値判定の閾値, 表示設定、モデル格納path"""

        self.patience = patience  # 設定ストップカウンタ
        self.threshold = threshold  # 最小値判定の閾値。比率で指定
        self.verbose = verbose  # 表示の有無
        self.counter = 0  # 現在のカウンタ値
        self.early_stop = False  # ストップフラグ
        self.val_loss_min = np.Inf  # 前回のベストスコア記憶用
        self.path = path  # ベストモデル格納path

    def __call__(self, val_loss, model):
        if val_loss > (1 - self.threshold) * self.val_loss_min:  # ベストスコアを更新できなかった場合
            self.counter += 1  # ストップカウンタを+1
            if self.verbose:  # 表示を有効にした場合は経過を表示
                print(
                    f"EarlyStopping counter: {self.counter} out of {self.patience}"
                )  # 現在のカウンタを表示する
            if self.counter >= self.patience:  # 設定カウントを上回ったらストップフラグをTrueに変更
                self.early_stop = True
                
        else:  # ベストスコアを更新した場合
            if self.verbose:  # 表示を有効にした場合は、前回のベストスコアからどれだけ更新したか？を表示
                print(
                    f"Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ..."
                )
            torch.save(model.state_dict(), self.path)  # ベストモデルを指定したpathに保存
            
            self.val_loss_min = val_loss
            self.counter = 0  # ストップカウンタリセット

In [37]:
# パラメータの設定
DROP_RATE = 0.4
OUTPUT_SIZE = 3
BATCH_SIZE = 2
NUM_EPOCHS = 4
LEARNING_RATE = 5e-3

# モデルの定義
model = BERTClass(DROP_RATE, OUTPUT_SIZE)

# 損失関数の定義
criterion = torch.nn.BCEWithLogitsLoss()

# オプティマイザの定義
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# デバイスの指定
device = torch.device("cuda:0")

# モデルの学習

batch_size=BATCH_SIZE
num_epochs=NUM_EPOCHS
device=device

In [38]:
torch.cuda.empty_cache()

In [40]:
# デバイスの指定
model.to(device)
# dataloaderの作成
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_valid = DataLoader(
    dataset_valid, batch_size=len(dataset_valid), shuffle=False
)
# 学習
log_train = []
log_valid = []
earlystopping = EarlyStopping(patience=2, threshold=0.05, verbose=True)
for epoch in range(num_epochs):
    # 開始時刻の記録
    s_time = time.time()

    # 訓練モードに設定
    model.train()
    for data in tqdm(dataloader_train):
        # デバイスの指定
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        token_type_ids = data["token_type_ids"].to(device)
        labels = data["labels"].to(device)

        # 勾配をゼロで初期化
        optimizer.zero_grad()

        # 順伝播 + 誤差逆伝播 + 重み更新
        outputs = model(input_ids, attention_mask, token_type_ids)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
        break
    break


  0%|          | 0/916 [00:00<?, ?it/s]


In [52]:
outputs.shape

torch.Size([2, 50, 1024])

In [107]:
cnn1 = torch.nn.Conv1d(1024, 256, kernel_size=2, padding=1, device=device)
cnn2 = torch.nn.Conv1d(256, 3, kernel_size=2, padding=1, device=device)

In [108]:
out = outputs.permute(0, 2, 1)

In [109]:
out = torch.nn.functional.relu(cnn1(out))

In [110]:
out = cnn2(out)

In [111]:
o, _ = torch.max(out, 2)

In [124]:
out.shape

torch.Size([2, 3, 52])

In [126]:
pooling = torch.nn.AdaptiveAvgPool1d(1)

In [128]:
pooling(out).shape

torch.Size([2, 3, 1])

In [56]:
fc = torch.nn.Linear(
            1024, 3, device=device
        ) 