In [1]:
import sys
import os
import glob
from pathlib import Path
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_curve, roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.checkpoint import checkpoint
from transformers import AutoConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding





In [7]:
# Configurations
class CFG:
    """Central place for every tunable constant used by this notebook."""

    # --- Reproducibility -------------------------------------------------
    SEED = 0

    # --- Data locations --------------------------------------------------
    # Both splits live under the same sub-task folder of the dataset release.
    _DATA_ROOT = Path('../../radnlp_2024_train_val_20240731/ja/sub_task')
    TRAIN_DATA_PATH = _DATA_ROOT / 'train'
    VALID_DATA_PATH = _DATA_ROOT / 'val'

    # --- Model -----------------------------------------------------------
    # Root folder that holds one sub-folder of checkpoints per classifier.
    MODEL_SAVE_PATH = Path('./model')
    # NOTE(review): spelled 'debarta' (not 'deberta'); kept verbatim because the
    # same spelling appears in the output file names written by this notebook.
    MODEL_NAME = 'debarta'
    # Hugging Face model identifier used to load the tokenizer.
    MODEL_PATH = 'microsoft/deberta-v3-large'
    # Maximum number of tokens per sentence; longer inputs are truncated.
    MAX_LENGTH = 512

    # --- Columns ---------------------------------------------------------
    INPUT_COL = 'text'
    # One binary label column per classifier.
    TARGET_COL1 = 'omittable'
    TARGET_COL2 = 'measure'
    TARGET_COL3 = 'extension'
    TARGET_COL4 = 'atelectasis'
    TARGET_COL5 = 'satellite'
    TARGET_COL6 = 'lymphadenopathy'
    TARGET_COL7 = 'pleural'
    TARGET_COL8 = 'distant'
    # Presumably the number of classes per label (binary) - not referenced in
    # the cells visible here; TODO confirm against the training notebook.
    TARGET_CLASS_NUM = 2

In [4]:
def get_device() -> str:
    """
    Return the name of the best available device for PyTorch computations.

    Preference order is Apple-Silicon MPS, then CUDA, then CPU.

    Returns:
        str: One of ``"mps"``, ``"cuda"`` or ``"cpu"``.
    """
    # `torch.backends.mps` does not exist on every PyTorch build; look it up
    # defensively so a missing backend falls through instead of raising
    # AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        # macOS with Apple Silicon (MPS backend)
        return "mps"
    if torch.cuda.is_available():
        # NVIDIA GPU
        return "cuda"
    # Fallback to CPU
    return "cpu"

# Resolve the compute device once for the whole notebook.
try:
    device = torch.device(get_device())
except RuntimeError as err:
    print(f"Failed to initialize the device: {err}")
    # Fall back to CPU if the preferred device cannot be created.
    device = torch.device("cpu")
else:
    print(f"Using device: {device}")

Using device: cuda


In [5]:
def seed_everything(seed):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed (int): Seed value applied to all generators.
    """
    # Python reads the hash seed at interpreter start-up, so this only affects
    # interpreters launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # benchmark=True favours speed, benchmark=False favours reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Seed all random number generators with the configured seed.
seed_everything(CFG.SEED)
# Expose only GPU 0 to CUDA.
# NOTE(review): this is set after torch was imported and after the device cell
# above already queried CUDA availability - confirm it still takes effect, or
# move it before the first CUDA call.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Limit PyTorch's intra-op CPU parallelism to a single thread.
torch.set_num_threads(1)

In [8]:
def create_dataframe(folder_path):
    """
    Build one DataFrame from the sentence and label CSV files in a folder.

    Reads ``sentences.csv`` and ``label.csv`` from ``folder_path``, inner-joins
    them on a composite ``"<id>_<sentence_index>"`` key, and returns the result
    with ``index``, ``id`` and ``sentence_index`` as the leading columns.

    Args:
        folder_path (str or path-like): Folder containing both CSV files.

    Returns:
        pandas.DataFrame: The merged sentence/label table.
    """
    # Load both files and give each the composite join key.
    loaded = []
    for file_name in ("sentences.csv", "label.csv"):
        frame = pd.read_csv(os.path.join(folder_path, file_name))
        frame['index'] = frame['id'].astype(str) + '_' + frame['sentence_index'].astype(str)
        loaded.append(frame)
    sentences, labels = loaded

    merged = pd.merge(sentences, labels, on='index', how='inner')

    # The merge suffixes the duplicated key columns with _x/_y; collapse each
    # pair back into a single column.
    for key in ('id', 'sentence_index'):
        left_name, right_name = f'{key}_x', f'{key}_y'
        merged[key] = merged[left_name].combine_first(merged[right_name])
        merged = merged.drop(columns=[left_name, right_name])

    # Put the identifying columns first, keep the rest in their existing order.
    leading = ['index', 'id', 'sentence_index']
    remaining = [name for name in merged.columns if name not in leading]
    return merged[leading + remaining]

# Load the training and validation splits from their configured folders.
train_df, val_df = (
    create_dataframe(split_dir)
    for split_dir in (CFG.TRAIN_DATA_PATH, CFG.VALID_DATA_PATH)
)

In [9]:
# Preview the first validation rows to sanity-check the sentence/label merge.
val_df.head(10)

Unnamed: 0,index,id,sentence_index,text,omittable,measure,extension,atelectasis,satellite,lymphadenopathy,pleural,distant
0,147290_0,147290,0,左肺門部に 37mm 大の腫瘤影を認め、ご指摘の肺癌が疑われます。,0,1,0,0,0,0,0,0
1,147290_1,147290,1,縦隔に有意なリンパ節腫大は認めません。,1,0,0,0,0,0,0,0
2,147290_2,147290,2,胸水はありません。,1,0,0,0,0,0,0,0
3,147290_3,147290,3,背部皮下に腫瘤を認め、粉瘤などと思われます。,1,0,0,0,0,0,0,0
4,241752_0,241752,0,右下葉に 14×15mm の限局性すりガラス影があります。,0,1,0,0,0,0,0,0
5,241752_1,241752,1,粗大な充実部分は認めません。,0,1,0,0,0,0,0,0
6,241752_2,241752,2,内部を血管が通過しています。,1,0,0,0,0,0,0,0
7,241752_3,241752,3,既知の肺癌と考えます。,0,1,0,0,0,0,0,0
8,241752_4,241752,4,縦隔リンパ節腫大は認めません。,1,0,0,0,0,0,0,0
9,241752_5,241752,5,胸水貯留は指摘できません。,1,0,0,0,0,0,0,0


In [10]:
# Load the tokenizer from the base model identifier in the config.
# NOTE(review): the classifiers below are loaded from fine-tuned checkpoints;
# this assumes they were trained with this same tokenizer - confirm.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_PATH)



# 分類モデルの読み込み

In [11]:
def find_checkpoint_dir(task_dir):
    """
    Pick the checkpoint directory to load for one classifier.

    Looks for sub-directories of ``task_dir`` whose name contains
    ``"checkpoint-"``. When several exist, the one with the highest numeric
    step suffix is chosen (ties and non-numeric suffixes are ordered by name),
    so the choice no longer depends on filesystem listing order.

    Args:
        task_dir (pathlib.Path): Folder holding the checkpoints of one task.

    Returns:
        pathlib.Path: The selected checkpoint directory.

    Raises:
        FileNotFoundError: If no checkpoint directory is present.
    """
    candidates = [p for p in Path(task_dir).iterdir() if p.is_dir() and "checkpoint-" in p.name]
    if not candidates:
        raise FileNotFoundError(f"No 'checkpoint-*' directory found under {task_dir}")

    def step_of(path):
        # "checkpoint-500" -> 500; anything non-numeric sorts first.
        suffix = path.name.rsplit("checkpoint-", 1)[-1]
        return int(suffix) if suffix.isdecimal() else -1

    candidates.sort(key=lambda p: (step_of(p), p.name))
    chosen = candidates[-1]
    if len(candidates) > 1:
        # Make an ambiguous choice visible so it can be verified by the reader.
        print(f"Multiple checkpoints under {task_dir}: {[p.name for p in candidates]}; using {chosen.name}")
    return chosen


def load_task_model(task_name):
    """
    Load the fine-tuned sequence classifier stored under ``CFG.MODEL_SAVE_PATH / task_name``.

    Args:
        task_name (str): Sub-folder name of the task (e.g. ``'omit'``).

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    checkpoint_dir = find_checkpoint_dir(CFG.MODEL_SAVE_PATH / task_name)
    return AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)


# One classifier per label; the numbering matches CFG.TARGET_COL1..8.
model_1 = load_task_model('omit')  # omittable
model_2 = load_task_model('meas')  # measure
model_3 = load_task_model('exte')  # extension
model_4 = load_task_model('atel')  # atelectasis
model_5 = load_task_model('sate')  # satellite
model_6 = load_task_model('lymp')  # lymphadenopathy
model_7 = load_task_model('pleu')  # pleural
model_8 = load_task_model('dist')  # distant

# 推論

In [12]:
# The "omittable" classifier gates all the others: when it predicts 1 the
# sentence is treated as omittable and every finding label is forced to 0.
omit_model = model_1
# Prediction column -> classifier, in the order the columns are written.
finding_models = {
    'meas_pred': model_2,
    'exte_pred': model_3,
    'atel_pred': model_4,
    'sate_pred': model_5,
    'lymp_pred': model_6,
    'pleu_pred': model_7,
    'dist_pred': model_8,
}

# Switch every classifier to evaluation mode before predicting.
for clf in (omit_model, *finding_models.values()):
    clf.eval()


def predict_class(model, tokens):
    """
    Return the predicted class index for one tokenized sentence.

    The inputs are moved to whatever device the model lives on, so the loop
    works unchanged whether the models are on CPU or on an accelerator.
    Arg-max is taken directly on the logits; applying softmax first would not
    change which class is largest.

    Args:
        model: A sequence-classification model whose output exposes ``.logits``.
        tokens: The tokenizer output for a single sentence (batch size 1).

    Returns:
        int: Index of the highest-scoring class.
    """
    logits = model(**tokens.to(model.device)).logits
    return logits.argmax(dim=1).item()


# NOTE(review): `device` is computed earlier in the notebook but the models are
# never moved to it, so inference runs wherever `from_pretrained` placed them
# (CPU by default). Moving eight large models to one GPU may exceed its memory;
# decide deliberately before adding `.to(device)`.

# Create the output folder up front so a missing directory fails fast instead
# of after the whole inference loop has run.
output_csv = Path('../model_outputs/debarta_subtask_results.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)

predictions = {'omit_pred': [], **{col: [] for col in finding_models}}

with torch.no_grad():
    for text in tqdm(val_df[CFG.INPUT_COL], total=len(val_df)):
        tokens = tokenizer(text, padding=False, truncation=True, max_length=CFG.MAX_LENGTH, return_tensors="pt")

        omit_pred = predict_class(omit_model, tokens)
        predictions['omit_pred'].append(omit_pred)

        for col, clf in finding_models.items():
            # Skip the finding classifiers entirely for omittable sentences.
            predictions[col].append(0 if omit_pred == 1 else predict_class(clf, tokens))

for col, values in predictions.items():
    val_df[col] = values

val_df.to_csv(output_csv)

  0%|          | 0/451 [00:00<?, ?it/s]

# Submission CSVの出力

In [13]:
# Re-load the saved predictions and report per-label accuracy against the gold labels.
sub_df = pd.read_csv('../model_outputs/debarta_subtask_results.csv')

# (short task prefix, gold-label column); predictions live in '<prefix>_pred'.
eval_targets = [
    ('omit', 'omittable'),
    ('meas', 'measure'),
    ('exte', 'extension'),
    ('atel', 'atelectasis'),
    ('sate', 'satellite'),
    ('lymp', 'lymphadenopathy'),
    ('pleu', 'pleural'),
    ('dist', 'distant'),
]

for prefix, label_col in eval_targets:
    task_acc = accuracy_score(sub_df[label_col].tolist(), sub_df[f'{prefix}_pred'].tolist())
    print(f'{prefix}_acc: {task_acc}')

omit_acc: 0.9445676274944568
meas_acc: 0.9667405764966741
exte_acc: 0.975609756097561
atel_acc: 0.991130820399113
sate_acc: 0.9490022172949002
lymp_acc: 0.9844789356984479
pleu_acc: 0.9866962305986696
dist_acc: 0.9379157427937915


In [14]:
# Build the submission file: keep the sentence identifiers and publish each
# prediction column under the name of the label it predicts.
sub_df = pd.read_csv('../model_outputs/debarta_subtask_results.csv')

pred_to_label = {
    'omit_pred': 'omittable',
    'meas_pred': 'measure',
    'exte_pred': 'extension',
    'atel_pred': 'atelectasis',
    'sate_pred': 'satellite',
    'lymp_pred': 'lymphadenopathy',
    'pleu_pred': 'pleural',
    'dist_pred': 'distant',
}

select_df = sub_df[['id', 'sentence_index', *pred_to_label]]
rename_df = select_df.rename(columns=pred_to_label)
rename_df.to_csv('../model_outputs/sentence_classification_debarta.csv', index=False)

In [15]:
# Preview the first rows of the submission table.
rename_df.head(10)

Unnamed: 0,id,sentence_index,omittable,measure,extension,atelectasis,satellite,lymphadenopathy,pleural,distant
0,147290,0,0,1,0,0,0,0,0,0
1,147290,1,1,0,0,0,0,0,0,0
2,147290,2,1,0,0,0,0,0,0,0
3,147290,3,1,0,0,0,0,0,0,0
4,241752,0,0,1,0,0,1,0,0,0
5,241752,1,1,0,0,0,0,0,0,0
6,241752,2,0,0,1,0,0,0,0,0
7,241752,3,0,1,0,0,0,0,0,0
8,241752,4,1,0,0,0,0,0,0,0
9,241752,5,1,0,0,0,0,0,0,0


In [16]:
# Show how many sentences were predicted 0 / 1 for each label.
for label_col in (
    'omittable',
    'measure',
    'extension',
    'atelectasis',
    'satellite',
    'lymphadenopathy',
    'pleural',
    'distant',
):
    print(rename_df[label_col].value_counts())

omittable
0    281
1    170
Name: count, dtype: int64
measure
0    363
1     88
Name: count, dtype: int64
extension
0    405
1     46
Name: count, dtype: int64
atelectasis
0    433
1     18
Name: count, dtype: int64
satellite
0    434
1     17
Name: count, dtype: int64
lymphadenopathy
0    406
1     45
Name: count, dtype: int64
pleural
0    413
1     38
Name: count, dtype: int64
distant
0    428
1     23
Name: count, dtype: int64
