# import

In [1]:
import gc
import os
import re
import sys
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import StratifiedKFold, train_test_split
from torch import nn, optim
from torch.utils.checkpoint import checkpoint
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [2]:
pd.set_option("display.max_column", 100)
pd.set_option("display.max_row", 100)

# パラメータの設定

In [3]:
# パラメータの設定
MODELNAME="microsoft/deberta-v3-base"
config = AutoConfig.from_pretrained(MODELNAME).to_dict()
config["model_name"] = MODELNAME
config["max_token_len"] = 110
config["drop_rate"] = 0.4
config["output_size"] = 3
config["fold_split"] = 5

config["train_batch_size"] = 100
config["valid_batch_size"] = 32
config["num_epochs"] = 5

config["learning_rate"] = 1e-5
config["lr_T_max"] = 500
config["min_lr"] = 1e-6
config["weight_decay"] = 0.005
# config["warmup_start_value"] = 0.0
# config["warmup_end_value"] = 0.1
# config["warmup_duration"] = 3

config["gradient_checkpoint"] = True
config["freezing"] = True
config["header_type"] = "Concatenate"

In [4]:
# # パラメータの設定
# MODELNAME="bert-base-uncased"
# config = AutoConfig.from_pretrained(MODELNAME).to_dict()
# config["drop_rate"] = 0.4
# config["output_size"] = 3
# config["train_batch_size"] = 180
# config["valid_batch_size"] = 60
# config["num_epochs"] = 1
# config["learning_rate"] = 1e-4
# config["model_name"] = MODELNAME
# config["max_token_len"] = 128
# config["fold_split"] = 5
# config["gradient_checkpoint"] = True
# config["freezing"] = True
# config["header_type"] = "Linear"

## definition

In [5]:
#textをtokenizeするクラス(前処理)
class tokenize(object):
    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, text):
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "input_ids": torch.LongTensor(input_ids),
            "attention_mask": torch.LongTensor(attention_mask),
            "token_type_ids": torch.LongTensor(token_type_ids),
        }

In [26]:
# DatasetTestの定義
class CreateDatasetTest(Dataset):
    def __init__(self, X, transform):
        self.X = X
        self.transform=transform
    def __len__(self):  # len(Dataset)で返す値を指定
        return len(self.X)

    def __getitem__(self, index):  # Dataset[index]で返す値を指定
        text = self.X[index]
        output_dict = self.transform(text)
        return output_dict

## utils

In [7]:
def freeze(module):
    for parameter in module.parameters():
        parameter.require_grad = False

## model

In [8]:
# BERT分類モデル
class BERTClass(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = AutoModel.from_pretrained(
            self.config["model_name"], output_hidden_states=True
        )
        self.drop = nn.Dropout(self.config["drop_rate"])
        if self.config["header_type"] == "Linear":
            self.fc = nn.Linear(self.config["hidden_size"], self.config["output_size"])
        elif self.config["header_type"] == "Pooling":
            self.pooling = nn.AdaptiveMaxPool1d(1)
            self.fc = nn.Linear(
                self.config["hidden_size"], self.config["output_size"]
            )
        elif self.config["header_type"] == "Couvolution":
            self.cnn1 = nn.Conv1d(
                self.config["hidden_size"], 256, kernel_size=2, padding=1
            )
            self.cnn2 = nn.Conv1d(256, 1, kernel_size=2, padding=1)
        elif self.config["header_type"] == "Concatenate":
            self.fc = nn.Linear(
                self.config["hidden_size"] * 4, self.config["output_size"]
            )
        else:
            raise NotImplementedError

        # Gradient Checkpointing
        if self.config["gradient_checkpoint"]:
            self.bert.gradient_checkpointing_enable()
        # Freeze
        if self.config["freezing"]:
            freeze(self.bert.embeddings)
            freeze(self.bert.encoder.layer[:2])

    def forward(self, input_ids, attention_mask, token_type_ids):
        x = self.bert(
                input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            )
        if self.config["header_type"] == "Linear":
            x = self.drop(x.hidden_states[-1])
            x = self.fc(x[:, 0, :])
        elif self.config["header_type"] == "Pooling":
            x = self.drop(x.hidden_states[-1])
            x, _ = x.max(1)
            x = self.fc(x)
        elif self.config["header_type"] == "Couvolution":
            x.hidden_states[-1].permute(0, 2, 1)
            x = nn.functional.relu(self.cnn1(x))
            x = self.cnn2(x)
            x, _ = torch.max(x, 2)
        elif self.config["header_type"] == "Concatenate":
            x = torch.cat(
                [x["hidden_states"][-1 * i][:, 0] for i in range(1, 4 + 1)], dim=1
            )  # concatenate
            x = self.fc(x)
        else:
            raise NotImplementedError
        return x

# read model

In [9]:
# デバイスの指定
device = torch.device("cuda:0")
# モデルの定義
model = BERTClass(config)
model.load_state_dict(torch.load("/home/jovyan/work/data/checkpoint/checkpoint_model_.pth"))
model.to(device)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropout): Stab

# Read data

In [10]:
df_test = pd.read_csv(
    "/home/jovyan/work/data/test.csv",
    index_col="discourse_id"
)

## データのラベル定義

In [19]:
discourse_types = ["Lead", "Position", "Claim", "Evidence", "Counterclaim", "Concluding Statement", "Rebuttal"]
discourse_effectiveness = ["Ineffective", "Adequate", "Effective"]

## データ整形

In [20]:
#下処理:discourse_typeとdiscourse_textを結合する
sep = AutoTokenizer.from_pretrained(config["model_name"]).sep_token
df_test["inputs"] = df_test.discourse_type + sep + df_test.discourse_text

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# Datasetの作成
tokenizer = tokenize(AutoTokenizer.from_pretrained(config["model_name"]), config["max_token_len"])
dataset = CreateDatasetTest(df_test["inputs"], tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
def prediction(model, dataset, device):
    # Dataloaderの作成
    loader = DataLoader(dataset, batch_size=100, shuffle=False)

    model.eval()
  
    prob = []
    with torch.no_grad():
        for data in loader:
            # デバイスの指定
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            token_type_ids = data["token_type_ids"].to(device)

            # 順伝播 + 予測値の取得 
            outputs = model.forward(input_ids, attention_mask, token_type_ids)
            prob.append(torch.sigmoid(outputs))
            
        prob = torch.cat(prob, dim=0)
        
        del model, loader
        gc.collect()
        return prob

In [23]:
prob = prediction(model, dataset, device)

In [24]:
submission_df = pd.DataFrame(
    prob.to("cpu").numpy(), index=df_test.index, columns=discourse_effectiveness
)

In [25]:
submission_df

Unnamed: 0_level_0,Ineffective,Adequate,Effective
discourse_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a261b6e14276,0.448783,0.52586,0.033811
5a88900e7dc1,0.688143,0.225496,0.058268
9790d835736b,0.751171,0.117726,0.115207
75ce6d68b67b,0.75221,0.141448,0.098241
93578d946723,0.725802,0.109388,0.120662
2e214524dbe3,0.309576,0.69796,0.021057
84812fc2ab9f,0.284802,0.708937,0.026075
c668ff840720,0.73855,0.141009,0.088919
739a6d00f44a,0.398511,0.569287,0.032219
bcfae2c9a244,0.403965,0.591043,0.030592


In [18]:
submission_df.to_csv('submission.csv')