ここでは半教師あり学習を用いた分類を行っている

In [1]:
cd ..

/home/is/akiyoshi-n/my-project


In [2]:
import os
# 使用するGPUを指定. この環境変数の場所は，pytorchをimportする前に入れる
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from pathlib import Path
from datetime import datetime
from src.my_project.dataset import load_dataset_2class_classification, split_test_data_stratify, load_text_dataset, preprocess_for_Trainer
from src.my_project.train_v2 import ActClassifier
from sklearn.model_selection import train_test_split
import wandb
import os
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [3]:
DATASET_PATH = Path('/home/is/akiyoshi-n/my-project/data')
# 本日の日付
timestamp = datetime.now().strftime("%Y-%m-%d")
# 出力先ディレクトリ
output_dir = Path('/home/is/akiyoshi-n/my-project/outputs/{}'.format(timestamp))

In [4]:
# 最大トークン数
MAX_LEN = 128
# バッチサイズ
BATCH_SIZE = 16
# エポック数
NUM_EPOCHS = 100
# 学習率
LEARNING_RATE = 2e-5
# Cross Validation時のFold数
NUM_FOLDS = 3
# 早期停止のための忍耐値
PATIENCE = 2
# 乱数シード
SEED = 2023
# クラス数
NUM_LABELS = 2

In [5]:
# 辞書型でデータ取得
data = load_dataset_2class_classification(f"{DATASET_PATH}/act_classification_final.xlsx")

In [6]:
# 東北大BERT-v3
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v3'
Classifier_model1 = ActClassifier(model_name = MODEL_NAME, num_labels=NUM_LABELS, seed=SEED)

In [7]:
# testデータと訓練に使用するデータに分割
train_dataset, test_dataset, train_indices, test_indices = split_test_data_stratify(data=data, test_size=0.1, SEED=SEED)

In [8]:
# 訓練データと検証データに分割
train_dataset, eval_dataset, train_indices, eval_indices = split_test_data_stratify(data=train_dataset, test_size=0.2, SEED=SEED)

In [9]:
len(train_dataset['labels']), len(eval_dataset['labels']), len(test_dataset['labels'])

(792, 198, 110)

In [11]:
model1 = Classifier_model1.train_model(train_dataset, eval_dataset, MAX_LEN, NUM_EPOCHS, LEARNING_RATE, BATCH_SIZE, PATIENCE, output_dir, project_name='ActClassification_2class', run_name='no_semi_supervised_learning')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Parameter 'fn_kwargs'={'tokenizer': BertJapaneseTokenizer(name_or_path='cl-tohoku/bert-base-japanese-v3', vocab_size=32768, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False,

Map:   0%|          | 0/792 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7503,0.698593,0.540404,0.48
2,0.6517,0.656092,0.580808,0.634361
3,0.5492,0.659394,0.621212,0.619289
4,0.3971,0.822173,0.606061,0.645455


In [12]:
# model1を用いてtestデータの予測を行う
# tokenizerの定義
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_dataset = preprocess_for_Trainer(test_dataset, tokenizer, MAX_LEN)
pred_logits_test = model1.predict(test_dataset).predictions
pred_test = np.argmax(pred_logits_test, axis=1)

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [13]:
# テストデータで評価
true_test = test_dataset['labels']
accuracy = accuracy_score(true_test, pred_test)
f1 = f1_score(true_test, pred_test)
print(f'Accuracy: {accuracy:.4f}')
print(f'F1: {f1:.4f}')

Accuracy: 0.5909
F1: 0.6400


### addデータ作成

In [43]:
add_data = load_text_dataset(f"{DATASET_PATH}/add_data_sub.xlsx")

In [61]:
add_data['texts'] = set(add_data['texts'])

In [62]:
len(add_data['texts'])

6768

### 半教師あり学習に使用する既存のモデルを作成

In [63]:
# testデータと訓練に使用するデータに分割
train_dataset, eval_dataset, train_indices, eval_indices = split_test_data_stratify(data=data, test_size=0.1, SEED=SEED)

In [64]:
# 東北大BERT-v3
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v3'
Classifier_model2 = ActClassifier(model_name = MODEL_NAME, num_labels=NUM_LABELS, seed=SEED)

In [65]:
model2 = Classifier_model2.train_model(train_dataset, eval_dataset, MAX_LEN, NUM_EPOCHS, LEARNING_RATE, BATCH_SIZE, PATIENCE, output_dir, project_name='ActClassification_2class', run_name='base_for_semi_supervised_learning')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111216853476233, max=1.0)…

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7089,0.680416,0.572727,0.45977
2,0.6162,0.662545,0.609091,0.556701
3,0.5104,0.652863,0.636364,0.583333
4,0.3436,0.720981,0.690909,0.595238
5,0.1877,1.141681,0.672727,0.513514


In [66]:
# add_dataをTrainerに入力できる形式に変更
# tokenizerの定義
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
add_data = preprocess_for_Trainer(add_data, tokenizer, MAX_LEN)

Map:   0%|          | 0/6768 [00:00<?, ? examples/s]

In [67]:
# logits値を予測
logits = model2.predict(add_data)
# sigmoid関数を適応できるようにTensorに変換
logits = torch.from_numpy(logits.predictions)
# シグモイド関数を適用し，確率に変換
predictions_proba = torch.sigmoid(logits)

In [77]:
# 最大値が0.7以上の行をindexと共に取得
indices = torch.where(predictions_proba.max(dim=1).values > 0.8)

In [78]:
# 複数の値が入ったリストのindices[0]の行を取得
use_data = {'texts':[add_data['texts'][i] for i in indices[0]],
            'labels': np.argmax(predictions_proba[indices[0]], axis=-1).tolist()}

In [79]:
len(use_data['labels'])

153

### use_dataをtrain_datasetに加えて半教師あり学習を行う．

In [80]:
# 東北大BERT-v3
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v3'
Classifier_model3 = ActClassifier(model_name = MODEL_NAME, num_labels=NUM_LABELS, seed=SEED)

In [81]:
# testデータと訓練に使用するデータに分割
train_dataset, test_dataset, train_indices, test_indices = split_test_data_stratify(data=data, test_size=0.1, SEED=SEED)

In [82]:
# 訓練データと検証データに分割
train_dataset, eval_dataset, train_indices, eval_indices = split_test_data_stratify(data=train_dataset, test_size=0.2, SEED=SEED)

In [83]:
model3 = Classifier_model3.train_model_adding_data(train_dataset, eval_dataset, use_data, MAX_LEN, NUM_EPOCHS, LEARNING_RATE, BATCH_SIZE, PATIENCE, output_dir, project_name='ActClassification_2class', run_name='semi_supervised_learning')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/945 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112267130778895, max=1.0…

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6793,0.696834,0.59596,0.587629
2,0.5446,0.715347,0.60101,0.558659
3,0.4242,0.786132,0.636364,0.560976


In [75]:
# model3を用いてtestデータの予測を行う
# tokenizerの定義
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_dataset = preprocess_for_Trainer(test_dataset, tokenizer, MAX_LEN)
pred_logits_test = model3.predict(test_dataset).predictions
pred_test = np.argmax(pred_logits_test, axis=1)

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [76]:
# テストデータで評価
true_test = test_dataset['labels']
accuracy = accuracy_score(true_test, pred_test)
f1 = f1_score(true_test, pred_test)
print(f'Accuracy: {accuracy:.4f}')
print(f'F1: {f1:.4f}')

Accuracy: 0.5091
F1: 0.4600
