# BERT Base line−コメント付き (LB 0.7387 ) by Oregin
研究論文の国際学会採択予測で、件名と概要を利用した簡単なBERTのサンプルコードです。まだ、BERTについて勉強し始めたばかりなので、精度や見栄えはイマイチですが、なんとか動くものを作れました。
ご参考までご活用ください。

※Google Colab（GPU利用）で実行可能です。

LB= 0.7387 でした。

ディレクトリ構成
- base_path : このファイルを入れておくディレクトリ（各種パスの設定にて、保存した絶対パスを指定してください）
- base_path/data : test_data.csv,train_data.csv,submission.csvを入れておくディレクトリ
- base_path/model : 学習済みのモデルを保存するディレクトリ
- base_path/output : 提出用ファイルを保存するディレクトリ




# ライブラリのインポート


In [15]:
from google.colab import drive
drive.mount('./drive')

Drive already mounted at ./drive; to attempt to forcibly remount, call drive.mount("./drive", force_remount=True).


In [16]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
# 各種ライブラリのインポート
import os
import pandas as pd
import itertools
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel, BertTokenizer

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# 各種パスの設定

In [18]:
# 各種パスの設定
######################################################################
base_path = '/content/drive/MyDrive/kaggle/probspace' # ベースとなるパスを指定してください。#######
######################################################################
os.makedirs(os.path.join(base_path,'model'), exist_ok=True)  # 学習済みモデルの保存するディレクトリを作成
os.makedirs(os.path.join(base_path,'output'), exist_ok=True)  # 提出用ファイルを出力するディレクトリを作成
train_data_path = os.path.join(base_path,'data/train_data.csv') # 訓練データのパスを指定
test_data_path = os.path.join(base_path,'data/test_data.csv') # テストデータのパスを指定
submit_data_path = os.path.join(base_path,'data/submission.csv') # 提出用サンプルfileのパスを指定
model_file_path = os.path.join(base_path,'model/best_model.pt') # 学習済みモデルのパスを指定
output_file_path = os.path.join(base_path,'output/001_submission.csv') # 提出用ファイルのパスを指定

# 各種データの読み込み



In [19]:
# 各種データの読込
df = pd.read_csv(train_data_path) # 訓練データの読込
test_df = pd.read_csv(test_data_path) # テストデータの読込
test_df['y'] = 0 # テストデータのＹの値を初期化
submit_df = pd.read_csv(submit_data_path) # 提出用散布リファイルの読込

# 関数の定義

In [27]:
# BERTを使用してトークナイズするためのクラスを定義します。
class BERTTokenize(nn.Module):
    def __init__(self):
        super(BERTTokenize, self).__init__()
        self.tokenizer =RobertaTokenizer.from_pretrained('roberta-base')

    def forward(self, text):
        return self.tokenizer.encode_plus(
            text,
            max_length=512,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )


In [28]:
#　データセットを作成するクラスを定義します。
class PaperDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        title = self.data.iloc[index]['title']
        abstract = self.data.iloc[index]['abstract']
        text = title + ' ' + abstract
        label = self.data.iloc[index]['y']
        inputs = self.tokenizer(text)
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.float)
        }


In [29]:
# datasetで定義されたバッチ形式の__getitem__を処理して、オリジナルのtorch.Tensorにする関数
def collate_fn(batch):
    input_ids = pad_sequence([torch.tensor(item["input_ids"]) for item in batch], batch_first=True)
    attention_mask = pad_sequence([torch.tensor(item["attention_mask"]) for item in batch], batch_first=True)
    labels = torch.tensor([item['targets'] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, 'targets': labels}


In [30]:
#　BERTのモデルのクラスを定義します。
class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, 1)
        self.sig = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask,token_type_ids=None)
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        output = self.fc(dropout_output)
        return self.sig(output)


# データローダの作成

In [33]:
#トレーニングとバリデーションのためのデータローダーを作成します。
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = BERTTokenize()

train_dataset = PaperDataset(train_df, tokenizer)
val_dataset = PaperDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=collate_fn)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

# モデルの作成

In [34]:
# 損失関数とオプティマイザーを定義して、モデルを作成します。
model = BERTModel()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing BertModel: ['roberta.encoder.layer.8.intermediate.dense.weight', 'roberta.encoder.layer.5.intermediate.dense.weight', 'roberta.encoder.layer.6.attention.self.value.weight', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.5.attention.output.dense.bias', 'roberta.encoder.layer.2.attention.self.value.weight', 'roberta.encoder.layer.8.output.dense.bias', 'roberta.encoder.layer.8.attention.self.value.weight', 'roberta.encoder.layer.5.attention.self.query.bias', 'roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'roberta.encoder.layer.4.attention.self.value.bias', 'lm_head.dense.weight', 'roberta.encoder.layer.11.intermediate.dense.bias', 'roberta.encoder.layer.11.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.self.query.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.5.attention.output.LayerNorm.bias', 'roberta.encoder.layer.4

BERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  

# 学習の実行

In [35]:
# 学習の関数の定義
def train(model, dataloader, optimizer, criterion):
    model.train()
    train_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].view(-1, 1).to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    return train_loss / len(dataloader)
# 検証の関数の定義
def validate(model, dataloader, criterion):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].view(-1, 1).to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)

            val_loss += loss.item()

    return val_loss / len(dataloader)


In [36]:
#　学習・検証の実行

best_val_loss = float('inf')

for epoch in tqdm(range(10)):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = validate(model, val_loader, criterion)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), model_file_path) # ベストなモデルを保存する

    print(f'Epoch {epoch+1} - train loss: {train_loss:.3f}, val loss: {val_loss:.3f}')


 10%|█         | 1/10 [06:50<1:01:31, 410.18s/it]

Epoch 1 - train loss: 0.628, val loss: 0.618


 20%|██        | 2/10 [13:37<54:29, 408.63s/it]  

Epoch 2 - train loss: 0.627, val loss: 0.614


 30%|███       | 3/10 [20:23<47:32, 407.45s/it]

Epoch 3 - train loss: 0.625, val loss: 0.617


 40%|████      | 4/10 [27:09<40:40, 406.69s/it]

Epoch 4 - train loss: 0.626, val loss: 0.615


 50%|█████     | 5/10 [33:55<33:52, 406.59s/it]

Epoch 5 - train loss: 0.629, val loss: 0.615


 60%|██████    | 6/10 [40:41<27:04, 406.23s/it]

Epoch 6 - train loss: 0.625, val loss: 0.615


 70%|███████   | 7/10 [47:26<20:17, 405.85s/it]

Epoch 7 - train loss: 0.626, val loss: 0.615


 80%|████████  | 8/10 [54:11<13:31, 405.66s/it]

Epoch 8 - train loss: 0.627, val loss: 0.614


 90%|█████████ | 9/10 [1:00:56<06:45, 405.55s/it]

Epoch 9 - train loss: 0.624, val loss: 0.615


100%|██████████| 10/10 [1:07:45<00:00, 406.53s/it]

Epoch 10 - train loss: 0.625, val loss: 0.614





# テストデータで予測する

In [37]:
# モデルの読み込み
model.load_state_dict(torch.load(model_file_path))

<All keys matched successfully>

In [38]:
#　テストデータのデータローダ―の作成
tokenizer = BERTTokenize()

test_dataset = PaperDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False,collate_fn=collate_fn)

In [39]:
#　予測する関数の定義
def predict(model, dataloader):
  outputs = []
  with torch.no_grad():
    for batch in dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      # モデルによる予測
      output = model(input_ids, attention_mask=attention_mask)
      outputs.extend(list(itertools.chain.from_iterable(output.tolist())))
            
  return outputs

#予測の実行
outputs = predict(model, test_loader)

In [40]:
#予測結果を提出形式に合わせて出力
submit_df['y'] = pd.Series(outputs)
submit_df['y'].mask(submit_df['y'] > 0.5,1,inplace=True)
submit_df['y'].mask(submit_df['y'] < 1,0,inplace=True)
submit_df['y'] = submit_df['y'].astype('int')
submit_df.to_csv(output_file_path,index = False)

In [46]:
outputs

AttributeError: ignored

In [47]:
sorted(outputs, reverse=True)

[0.3075016736984253,
 0.30750054121017456,
 0.3074963390827179,
 0.30749472975730896,
 0.30749019980430603,
 0.30748817324638367,
 0.30748632550239563,
 0.3074854016304016,
 0.30748313665390015,
 0.30748289823532104,
 0.3074825704097748,
 0.3074825704097748,
 0.3074823021888733,
 0.3074807822704315,
 0.3074793815612793,
 0.30747926235198975,
 0.307478666305542,
 0.30747827887535095,
 0.3074781894683838,
 0.3074771761894226,
 0.3074769079685211,
 0.3074764907360077,
 0.3074764609336853,
 0.3074756860733032,
 0.3074747622013092,
 0.3074740469455719,
 0.30747273564338684,
 0.3074725568294525,
 0.30747246742248535,
 0.3074723482131958,
 0.30747196078300476,
 0.3074715733528137,
 0.307471364736557,
 0.3074711859226227,
 0.30747100710868835,
 0.30747079849243164,
 0.30747032165527344,
 0.30747032165527344,
 0.3074702322483063,
 0.30747002363204956,
 0.30746990442276,
 0.3074689507484436,
 0.30746886134147644,
 0.3074687123298645,
 0.30746856331825256,
 0.3074684739112854,
 0.3074683845043182