In [36]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold,train_test_split
from sklearn.metrics import log_loss,f1_score,accuracy_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# 各種パスの設定
######################################################################
base_path = '/workspace' # ベースとなるパスを指定してください。#######
######################################################################
os.makedirs(os.path.join(base_path,'model'), exist_ok=True)  # 学習済みモデルの保存するディレクトリを作成
os.makedirs(os.path.join(base_path,'output'), exist_ok=True)  # 提出用ファイルを出力するディレクトリを作成
train_data_path = os.path.join(base_path,'data/train_data.csv') # 訓練データのパスを指定
test_data_path = os.path.join(base_path,'data/test_data.csv') # テストデータのパスを指定
submit_data_path = os.path.join(base_path,'data/submission.csv') # 提出用サンプルfileのパスを指定
model_file_path = os.path.join(base_path,'model') # 学習済みモデルのパスを指定
output_file_path = os.path.join(base_path,'output/20230303_submission.csv') # 提出用ファイルのパスを指定

In [15]:
class CFG:
    wandb = False
    apex = True
    model = 'microsoft/deberta-v3-large'
    seed = 42
    n_splits = 5
    max_len = 512
    dropout = 0.2
    target_size=4
    n_accumulate=1
    print_freq = 50
    min_lr=1e-6
    scheduler = 'cosine'
    batch_size = 16
    num_workers = 2
    lr = 3e-5
    weigth_decay = 0.01
    epochs = 10
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    train = True 
    num_warmup_steps = 0
    num_cycles=0.5
    debug = False
    debug_ver2 = False
    gradient_checkpointing = True
    freezing = True

In [2]:
#　データセットを作成するクラスを定義します。
class PaperDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        title = self.data.iloc[index]['title']
        abstract = self.data.iloc[index]['abstract']
        text = title + ' ' + abstract
        labels = self.data.iloc[index]['y']
        
        # inputsの大きさをmodelが扱えるmax_length(512)に指定
        inputs = self.tokenizer(text, max_length=512, truncation=True)
       
        # print(len(inputs['input_ids']))
        
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'targets': torch.tensor(labels, dtype=torch.float)
        }

In [3]:
# datasetで定義されたバッチ形式の__getitem__を処理して、オリジナルのtorch.Tensorにする関数
def collate_fn(batch):
    input_ids = pad_sequence([torch.tensor(item["input_ids"]) for item in batch], batch_first=True)
    attention_mask = pad_sequence([torch.tensor(item["attention_mask"]) for item in batch], batch_first=True)
    # labels_0 = ([item['targets'][0] for item in batch])
    # labels_1 = ([item['targets'][1] for item in batch])
    # labels = torch.tensor([labels_0, labels_1])
    labels = torch.tensor([item['targets'] for item in batch])


    return {"input_ids": input_ids, "attention_mask": attention_mask, 'targets': labels}


In [17]:
# 各種データの読込
train = pd.read_csv(train_data_path) # 訓練データの読込
test_df = pd.read_csv(test_data_path) # テストデータの読込
test_df['y'] = 0 # テストデータのＹの値を初期化
submit_df = pd.read_csv(submit_data_path) # 提出用散布リファイルの読込

In [18]:
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

#使用する学習済みモデル

#model_checkpoint = "distilbert-base-uncased"
model_checkpoint = "microsoft/deberta-v3-base"
config = AutoConfig.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, config = config)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)


Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [8]:
#criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [9]:
def train(model, dataloader, optimizer):
    model.train()
    train_loss = 0
    for batch in dataloader:
        
        #分類問題なので、目的変数の型をfloatからintに変更
        batch['targets'] = batch['targets'].type(torch.LongTensor)

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #targets = batch['targets'].view(-1, 1).to(device)

        # debaerta用
        targets = batch['targets'].to(device)
        train_correct = 0

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, labels=targets)

        # 損失の値
        loss = outputs.loss
        # それぞれのラベルの確率
        logits = outputs.logits
        # 確率の大きい方を取る
        _pred= logits.argmax(-1)
        pred = _pred.argmax(-1)

        # ラベルとpredが一致しているものの数だけtrain_correctを加算
        train_correct += pred.eq(targets).sum() 

        # if labels is not None:
        #     loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]
        # train_total += len(pred)
        
        #train_acc = train_correct/8 #正解率
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    return train_loss / len(dataloader)

In [10]:
def validate(model, dataloader, criterion):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            batch['targets'] = batch['targets'].type(torch.LongTensor)

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            #targets = batch['targets'].view(-1, 1).to(device)

            # debaerta用
            targets = batch['targets'].to(device)

            outputs = model(input_ids, attention_mask, labels=targets)

            loss = outputs.loss 

            val_loss += loss.item()

    return val_loss / len(dataloader)


In [11]:
def predict(model, dataloader):
    outputs = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # モデルによる予測
            output = model(input_ids, attention_mask=attention_mask)
            logits = output.logits
            # 確率の大きい方を取る
            _pred= logits.argmax(-1)
            #pred = _pred.argmax(-1)
            outputs.extend(_pred.tolist())
    return outputs

In [23]:
# #１バッチ(8データ)取り出す
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
for i in train_loader:
    batch = i
    break

In [22]:
train_dataset = PaperDataset(train, tokenizer)

In [25]:
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
targets = batch['targets'].to(device)

In [26]:
outputs = model(input_ids, attention_mask=attention_mask)

In [45]:
 F.softmax(torch.tensor(outputs.logits)).numpy()

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [37]:
F.softmax(outputs.logits)

tensor([[0.4973, 0.5027],
        [0.4978, 0.5022],
        [0.4979, 0.5021],
        [0.4961, 0.5039],
        [0.4974, 0.5026],
        [0.4980, 0.5020],
        [0.5000, 0.5000],
        [0.4977, 0.5023]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [67]:
outputs.logits.to('cpu').detach().numpy()

array([[-0.02172697, -0.01087348],
       [-0.02263962, -0.01391098],
       [-0.01471355, -0.0063681 ],
       [-0.0197665 , -0.00428472],
       [-0.01404355, -0.00348078],
       [-0.01907133, -0.01088125],
       [-0.01841294, -0.01827809],
       [-0.02060701, -0.01136107]], dtype=float32)

In [70]:
train.tail()

Unnamed: 0,id,title,year,abstract,keywords,y,kfold
4969,4970,Convolutional Conditional Neural Processes,2020,We introduce the Convolutional Conditional Neu...,"Neural Processes, Deep Sets, Translation Equiv...",1,4
4970,4971,Gradient Descent Maximizes the Margin of Homog...,2020,"In this paper, we study the implicit regulariz...","margin, homogeneous, gradient descent",1,3
4971,4972,Adversarial Training and Provable Defenses: Br...,2020,"We present COLT, a new method to train neural ...","adversarial examples, adversarial training, pr...",1,3
4972,4973,Differentiable Reasoning over a Virtual Knowle...,2020,We consider the task of answering complex mult...,"Question Answering, Multi-Hop QA, Deep Learnin...",1,2
4973,4974,Federated Learning with Matched Averaging,2020,Federated learning allows edge devices to coll...,federated learning,1,1
