In [19]:
import os
import pandas as pd
import itertools
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


# ---------------------------------------------------------------------
# Path configuration
# Set `base_path` to the project root; every other path is derived from it.
# ---------------------------------------------------------------------
base_path = ''

# Directory that holds the trained model weights.
model_file_path = os.path.join(base_path, 'model')

# Make sure the model directory and the submission output directory exist.
for _dir_name in ('model', 'output'):
    os.makedirs(os.path.join(base_path, _dir_name), exist_ok=True)

# Input files: training data, test data and the sample submission file.
train_data_path = os.path.join(base_path, 'data/StratifiedKFold_train_data.csv')
test_data_path = os.path.join(base_path, 'data/test_data.csv')
submit_data_path = os.path.join(base_path, 'data/submission.csv')

# Submission file written by this run; rename it for each fold.
output_file_path = os.path.join(base_path, 'output/20230314_submission_fold0.csv')

In [20]:
#　データセットを作成するクラスを定義します。
class PaperDataset(Dataset):
    """Dataset that tokenizes the concatenated title and abstract of each row.

    Args:
        dataframe: table with 'title', 'abstract' and 'y' columns.
        tokenizer: callable invoked as ``tokenizer(text, max_length=512,
            truncation=True)`` that returns a mapping with 'input_ids' and
            'attention_mask'.
    """

    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
        return '' if pd.isna(value) else str(value)

    def __getitem__(self, index):
        """Return the un-padded token ids, attention mask and float label of one row."""
        # Fetch the row once instead of once per column.
        row = self.data.iloc[index]

        # A missing title or abstract would otherwise raise TypeError on `+`.
        text = self._as_text(row['title']) + ' ' + self._as_text(row['abstract'])

        # Truncate to max_length=512 tokens; padding is left to the collate function.
        inputs = self.tokenizer(text, max_length=512, truncation=True)

        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'targets': torch.tensor(row['y'], dtype=torch.float)
        }

In [21]:
# datasetで定義されたバッチ形式の__getitem__を処理して、オリジナルのtorch.Tensorにする関数
def collate_fn(batch):
    """Collate a list of dataset items into one batch of tensors.

    Args:
        batch: list of dicts with 'input_ids', 'attention_mask' and 'targets'.

    Returns:
        dict with 'input_ids' and 'attention_mask' padded to the longest
        sequence in the batch (batch-first layout) and 'targets' as a 1-D tensor.
    """
    def _pad(key):
        # pad_sequence fills the shorter sequences with its default padding value.
        rows = [torch.tensor(sample[key]) for sample in batch]
        return pad_sequence(rows, batch_first=True)

    padded_ids = _pad("input_ids")
    padded_mask = _pad("attention_mask")
    target_tensor = torch.tensor([sample['targets'] for sample in batch])

    return {"input_ids": padded_ids, "attention_mask": padded_mask, 'targets': target_tensor}


In [28]:
# Load the training data, the test data and the sample submission file.
df, test_df, submit_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, test_data_path, submit_data_path)
)

# Initialise a placeholder `y` column on the test data.
test_df['y'] = 0

In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Pretrained checkpoint to fine-tune.
#model_checkpoint = "distilbert-base-uncased"
model_checkpoint = "microsoft/deberta-v3-base"

config = AutoConfig.from_pretrained(model_checkpoint)

# Tokenizer for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    config=config,
)

# Sequence-classification model with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a Be

In [23]:
# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Adam over all model parameters with a small fine-tuning learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

# Move the model to the chosen device (the cell displays the returned module).
model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 

In [24]:
def train(model, dataloader, optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``loss``.
        dataloader: sized iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'targets' tensors.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        float: sum of the batch losses divided by the number of batches.
    """
    model.train()
    # Run on the device the model lives on rather than on a notebook global.
    device = next(model.parameters()).device

    train_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Classification labels must be integer class ids, so cast the float
        # targets to int64 (without modifying the caller's batch).
        targets = batch['targets'].long().to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=targets)

        # The model computes the loss itself when `labels` is supplied.
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    return train_loss / len(dataloader)

In [25]:
def validate(model, dataloader, criterion=None):
    """Evaluate the model and return the mean per-batch loss.

    Args:
        model: module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``loss``.
        dataloader: sized iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'targets' tensors.
        criterion: unused; the loss comes from the model output. Kept (and
            made optional) so existing three-argument calls keep working.

    Returns:
        float: sum of the batch losses divided by the number of batches.
    """
    model.eval()
    # Run on the device the model lives on rather than on a notebook global.
    device = next(model.parameters()).device

    val_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Classification labels must be integer class ids, so cast the
            # float targets to int64 (without modifying the caller's batch).
            targets = batch['targets'].long().to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            val_loss += outputs.loss.item()

    return val_loss / len(dataloader)


In [26]:
def predict(model, dataloader):
    """Return the predicted class index for every sample in ``dataloader``.

    The model is put in evaluation mode for the duration of the call, so that
    layers such as dropout do not randomise the predictions. Its previous
    train/eval mode is restored afterwards.

    Args:
        model: module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``logits``.
        dataloader: iterable of batches; each batch is a dict with
            'input_ids' and 'attention_mask' tensors.

    Returns:
        list[int]: argmax over the last logits dimension, in dataloader order.
    """
    # Run on the device the model lives on rather than on a notebook global.
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    outputs = []
    try:
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                logits = model(input_ids, attention_mask=attention_mask).logits
                # Pick the class with the larger logit.
                outputs.extend(logits.argmax(-1).tolist())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return outputs

In [29]:
df["y_pred"] = 0
model_name = "debarta"

# Choose which fold is held out for validation.
fold = 0

val_mask = df["folds"] == fold
# Remember the validation rows' labels in `df` BEFORE the index is reset, so
# out-of-fold predictions are written back to the rows they belong to.
val_index = df.index[val_mask]

train_df = df[~val_mask].reset_index(drop=True)
val_df = df[val_mask].reset_index(drop=True)

train_dataset = PaperDataset(train_df, tokenizer)
val_dataset = PaperDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=collate_fn)
best_val_loss = float('inf')

for epoch in tqdm(range(4)):
    train_loss = train(model, train_loader, optimizer)
    # validate() does not use its third argument; pass None rather than the optimizer.
    val_loss = validate(model, val_loader, None)

    print(f"{fold}_fold")
    # Predict on the validation fold (val_loader is not shuffled, so the
    # predictions are in the same order as val_df / val_index).
    outputs = predict(model, val_loader)
    acc = accuracy_score(val_df["y"], outputs)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), os.path.join(model_file_path,f"{model_name}_model.pth")) # keep the best model
        # Store out-of-fold predictions only for the checkpoint that was saved,
        # so df["y_pred"] always matches the weights on disk.
        df.loc[val_index, "y_pred"] = outputs

    print(f"acc:{acc}")
    print(f'Epoch {epoch+1} - train loss: {train_loss:.3f}, val loss: {val_loss:.3f}')


  0%|          | 0/4 [00:00<?, ?it/s]

0_fold


 25%|██▌       | 1/4 [02:19<06:57, 139.26s/it]

acc:0.6944723618090453
Epoch 1 - train loss: 0.620, val loss: 0.602
0_fold


 50%|█████     | 2/4 [04:37<04:37, 138.88s/it]

acc:0.6954773869346733
Epoch 2 - train loss: 0.592, val loss: 0.582
0_fold


 75%|███████▌  | 3/4 [06:56<02:18, 138.62s/it]

acc:0.7135678391959799
Epoch 3 - train loss: 0.532, val loss: 0.575
0_fold


100%|██████████| 4/4 [09:11<00:00, 137.82s/it]

acc:0.671356783919598
Epoch 4 - train loss: 0.425, val loss: 0.669





In [30]:
# Reload the saved checkpoint and build the test-set loader.
model_path = os.path.join(model_file_path,f"{model_name}_model.pth")
# map_location remaps the stored tensors onto the device in use, so a
# checkpoint saved on a GPU can still be restored in a CPU-only session.
model.load_state_dict(torch.load(model_path, map_location=device))
test_dataset = PaperDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False,collate_fn=collate_fn)

In [31]:
# Run prediction on the test set with the reloaded model.
outputs = predict(model, test_loader)

In [32]:
# Assigning through pd.Series aligns on the index and would silently write NaN
# for unmatched rows, so check the length and assign positionally instead.
if len(outputs) != len(submit_df):
    raise ValueError(
        f"expected {len(submit_df)} predictions for the submission file, got {len(outputs)}"
    )
submit_df['y'] = outputs
submit_df.to_csv(output_file_path,index = False)

In [33]:
# Per-fold submission files written by the five single-fold runs.
fold_paths = [
    os.path.join(base_path, f'output/20230314_submission_fold{fold_no}.csv')
    for fold_no in range(5)
]
fold0_path, fold1_path, fold2_path, fold3_path, fold4_path = fold_paths

# Read each fold's predictions, in fold order.
fold0, fold1, fold2, fold3, fold4 = (pd.read_csv(path) for path in fold_paths)

In [34]:
# Start from a fresh copy of the sample submission file.
submit_df = pd.read_csv(submit_data_path)

# Put the five folds' `y` columns side by side and take the row-wise mode.
fold_votes = [fold_df['y'] for fold_df in (fold0, fold1, fold2, fold3, fold4)]
fold = pd.concat(fold_votes, axis=1).mode(axis=1)

In [35]:
# Write the row-wise mode of the fold predictions as the final submission.
output_file_path = os.path.join(base_path, 'output/20230314_finalsubmission.csv')

# Name the single mode column `y` so it can be copied into the submission frame.
fold.columns = ["y"]
submit_df['y'] = fold['y']
submit_df.to_csv(output_file_path, index=False)

# 検証用

In [None]:
# #１バッチ(8データ)取り出す
# for i in train_loader:
#     batch = i
#     break

In [None]:
# batch['targets'].dtype

torch.float32

In [None]:
# NOTE(review): at notebook level `batch` is only assigned in commented-out code
# in this section, so this cell raises NameError on a fresh kernel unless that
# code is restored.
# batch['targets'] = batch['targets'].type(torch.LongTensor)
batch['targets'].dtype

torch.int64

In [None]:
# batch['targets']

tensor([1, 1, 0, 0, 0, 0, 0, 1])

In [None]:
# input_ids = batch['input_ids'].to(device)
# attention_mask = batch['attention_mask'].to(device)
# targets = batch['targets'].to(device)


In [None]:
# targets

tensor([1, 1, 0, 0, 0, 0, 0, 1], device='cuda:0')

In [None]:
# label_index = (targets >= 0).nonzero()

In [None]:
# label_index.view(-1)

tensor([0, 1, 2, 3, 4, 5, 6, 7], device='cuda:0')

In [None]:
# outputs = model(input_ids, attention_mask, labels=targets)
# outputs

SequenceClassifierOutput(loss=tensor(0.6935, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[0.0909, 0.1603],
        [0.1180, 0.1891],
        [0.0980, 0.1369],
        [0.0952, 0.1546],
        [0.1376, 0.2004],
        [0.1262, 0.1829],
        [0.1115, 0.1544],
        [0.0994, 0.1861]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# outputs = model(input_ids, attention_mask=attention_mask)
# logits = outputs.logits
# #print(logits)
# # 確率の大きい方を取る
# _pred= logits.argmax(-1)

# print(_pred)



tensor([1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')
