In [20]:
import sys
import os
import glob
from pathlib import Path
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_curve, roc_auc_score
import re
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from pkg.UTHBERT.preprocess_text import preprocess as my_preprocess
from pkg.UTHBERT.tokenization_mod import MecabTokenizer, FullTokenizerForMecab
from transformers import BertModel,BertConfig

In [3]:
def get_device() -> str:
    """
    Returns the best available device for PyTorch computations.
    """
    if torch.backends.mps.is_available():
        # macOS with Apple Silicon (MPS backend)
        return "mps"
    elif torch.cuda.is_available():
        # NVIDIA GPU
        return "cuda"
    else:
        # Fallback to CPU
        return "cpu"

try:
    device = torch.device(get_device())
    print(f"Using device: {device}")
except RuntimeError as e:
    print(f"Failed to initialize the device: {e}")
    device = torch.device("cpu")  # Fallback to CPU in case of an error

Using device: mps


In [4]:
seed=0
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False #Trueで高速化重視, Falseで再現性重視
seed_everything(seed)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.use_deterministic_algorithms(True)
torch.set_num_threads(1)

# Tokenizer

In [5]:
dic_path = './dic'

In [6]:
# special token for a Person's name (Do not change)
name_token = "＠＠Ｎ"

# path to the mecab-ipadic-neologd
mecab_ipadic_neologd = dic_path + '/mecab-ipadic-neologd'

# path to the J-Medic (We used MANBYO_201907_Dic-utf8.dic)
mecab_J_medic = dic_path + '/MANBYO/MANBYO_201907_Dic-utf8.dic'

# path to the uth-bert vocabulary
vocab_file = dic_path + '/UTH-BERT/bert_vocab_mc_v1_25000.txt'

In [7]:
# MecabTokenizer
sub_tokenizer = MecabTokenizer(mecab_ipadic_neologd=mecab_ipadic_neologd,
                               mecab_J_medic=mecab_J_medic,
                               name_token=name_token)

# FullTokenizerForMecab
tokenizer = FullTokenizerForMecab(sub_tokenizer=sub_tokenizer,
                                  vocab_file=vocab_file,
                                  do_lower_case=False)

In [8]:
def tokenize(sentence_list):
    token_list = []
    for s in tqdm(sentence_list):
        tokens = tokenizer.tokenize(my_preprocess(str(s)))
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids = [2] + ids + [3]
        token_list.append(ids)
    return token_list

# Data

In [9]:
def create_dataframe(folder_path):
    """
    指定されたフォルダ内のテキストファイルからDataFrameを作成する関数

    Args:
        folder_path (str): テキストファイルが入っているフォルダのパス

    Returns:
        pandas.DataFrame: 作成されたDataFrame
    """

    # フォルダ内のすべてのテキストファイルのパスを取得
    file_list = glob.glob(folder_path + "/*.txt")

    # 空のDataFrameを作成
    df_text = pd.DataFrame(columns=['id', 'text'])
    
    # label.csvを読み込む
    df_label = pd.read_csv(os.path.join(folder_path, "label.csv"))

    for file in file_list:
        # ファイル名からIDを抽出
        file_id = file.split("/")[-1].split(".")[0]
        # file_id = file.split("\\")[-1].split(".")[0] # Windows
        
        # テキストファイルを読み込む
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()

        # DataFrameに追記
        rows = []
        rows.append({'id': int(file_id), 'text': text})
        df_text = pd.concat([df_text, pd.DataFrame(rows)], ignore_index=True)
        
    # textとlabelのDataFrameを結合
    df = pd.merge(df_text, df_label, on='id', how='left')

    return df

val_folder_path = "../../radnlp_2024_train_val_20240731/ja/main_task/val"
val_df = create_dataframe(val_folder_path)

In [10]:
val_df.head(10)

Unnamed: 0,id,text,t,n,m
0,5764772,右下葉 S10 に⻑径 15mm の pure GGN を認めます。既知肺癌に相当の病変と思...,Tis,N0,M0
1,4644984,右下葉に腫瘤を認め、既知肺癌を疑います。\n右縦隔、肺門部に軟部影を認め、リンパ節転移を疑い...,T0,N2,M1c
2,16066820,左肺上葉に最大径 22mm の分葉状腫瘤があります。\n一部胸膜陥入像を伴っています。\n...,T1c,N0,M0
3,147290,左肺門部に 37mm 大の腫瘤影を認め、ご指摘の肺癌が疑われます。\n縦隔に有意なリンパ節腫...,T2a,N0,M0
4,14063477,右肺野に長径 13cm を超え、胸壁へと進展する病変を認め、肺癌が疑われます。\n両肺野には...,T4,N0,M1c
5,10376521,右肺に長径 13.7cm 大、胸壁を超える腫瘤を認めます。T4 と考えます。\n縦隔リンパ節...,T4,N2,M1c
6,16191878,比較可能な画像検査はありません。\n右肺下葉 S10 に 1.5cm×1.4cm 程度の...,Tis,N0,M0
7,15532322,左上葉には⻑径 22mm 大の不整形結節あり、既知の肺癌部分と思われます。\n病的リンパ節腫...,T1c,N0,M0
8,3462779,左肺には縦隔浸潤が疑われる腫瘤があり、左肺全体に及ぶ無気肺を伴っています。\n肺門-気管分岐...,T4,N2,M1a
9,4724041,左肺門部、左上葉 S2 に主座を置く⻑径 37mm の不整形腫瘤を認め、内部に空洞形成\nを...,T2a,N0,M0


In [11]:
def convert_list(text_list, label_dic):
    """
    リストのテキストを、指定された辞書に基づいて数値に変換します。

    Args:
        text_list (list): 変換したいテキストのリスト
        label_dic (dict): テキストと対応する数値の辞書

    Returns:
        list: 数値に変換されたリスト
    """

    num_list = []
    for text in text_list:
        # 辞書から対応する数値を取得
        num = label_dic.get(text, None)
        if num is not None:
            num_list.append(num)
        else:
            print(f"対応する数値が見つかりません: {text}")
    return num_list

label_dic_t = {'T0':0, 'Tis':1, 'T1mi':2, 'T1a':3, 'T1b':4, 'T1c':5, 'T2a':6, 'T2b':7, 'T3':8, 'T4':9}
inverse_label_dic_t = {v: k for k, v in label_dic_t.items()}
label_dic_n = {'N0':0, 'N1':1, 'N2':2, 'N3':3}
inverse_label_dic_n = {v: k for k, v in label_dic_n.items()}
label_dic_m = {'M0':0, 'M1a':1, 'M1b':2, 'M1c':3}
inverse_label_dic_m = {v: k for k, v in label_dic_m.items()}

# 分類モデルの読み込み

In [12]:
model_path = Path('./pkg/UTHBERT')
save_path = Path('./model')

In [13]:
num_classes_t=10
num_classes_n=4
num_classes_m=4

In [17]:
config = BertConfig.from_json_file(model_path / 'bert_config.json')
bert = BertModel.from_pretrained(model_path / 'UTH_BERT.bin', config=config)

class Net(nn.Module):
    def __init__(self, bert, num_classes):
        super(Net, self).__init__()
        self.bert = bert
        self.cls = nn.Linear(768, num_classes)

  # @torch.cuda.amp.autocast()
    def forward(self, inputs_ids, pad_masks, Attention=False):
        bout = self.bert(inputs_ids, pad_masks, output_attentions=Attention)
        bs = len(bout[0])
        h0 = [bout[0][i][0] for i in range(bs)]
        h0 = torch.stack(h0,dim=0)
        if Attention:
            return self.cls(h0),bout
        else:
            return self.cls(h0)

In [18]:
model_t = Net(bert, num_classes_t).to(device)
path_t = save_path / 't' / 'best_acc_1.pth'
model_t.load_state_dict(torch.load(path_t, weights_only=True))

config = BertConfig.from_json_file(model_path / 'bert_config.json')
bert = BertModel.from_pretrained(model_path / 'UTH_BERT.bin', config=config)
model_n = Net(bert, num_classes_n).to(device)
path_n = save_path / 'n' / 'best_acc_1.pth'
model_n.load_state_dict(torch.load(path_n, weights_only=True))

config = BertConfig.from_json_file(model_path / 'bert_config.json')
bert = BertModel.from_pretrained(model_path / 'UTH_BERT.bin', config=config)
model_m = Net(bert, num_classes_m).to(device)
path_m = save_path / 'm' / 'best_acc_2.pth'
model_m.load_state_dict(torch.load(path_m, weights_only=True))

<All keys matched successfully>

# 推論

In [21]:
val_data = tokenize(val_df['text'].values.tolist())

model_t.eval()
model_n.eval()
model_m.eval()

t_pred_li, n_pred_li, m_pred_li = [], [], []

for s in tqdm(val_data):
    data = torch.tensor(s,dtype=torch.long).unsqueeze(0).to(device)
    mask = (data == 0).long().to(device)

    with torch.no_grad(): 
        t_pred = inverse_label_dic_t.get(F.softmax(model_t(data, mask), dim=1).argmax(dim=1).item(), "Unknown")
        n_pred = inverse_label_dic_n.get(F.softmax(model_n(data, mask), dim=1).argmax(dim=1).item(), "Unknown")
        m_pred = inverse_label_dic_m.get(F.softmax(model_m(data, mask), dim=1).argmax(dim=1).item(), "Unknown")

        t_pred_li.append(t_pred)
        n_pred_li.append(n_pred)
        m_pred_li.append(m_pred)

val_df['t_pred'] = t_pred_li
val_df['n_pred'] = n_pred_li
val_df['m_pred'] = m_pred_li

val_df.to_csv('../model_outputs/uth_results.csv')

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

# Submission CSVの出力

In [22]:
# sub_df = pd.read_csv('../model_outputs/uth_results.csv')
# t_pred = sub_df['t_pred'].values.tolist()
# n_pred = sub_df['n_pred'].values.tolist()
# m_pred = sub_df['m_pred'].values.tolist()
# t_label = sub_df['t'].values.tolist()
# n_label = sub_df['n'].values.tolist()
# m_label = sub_df['m'].values.tolist()

# print(f't_acc: {accuracy_score(t_label, t_pred)}')
# print(f'n_acc: {accuracy_score(n_label, n_pred)}')
# print(f'm_acc: {accuracy_score(m_label, m_pred)}')

t_acc: 0.42592592592592593
n_acc: 0.8148148148148148
m_acc: 0.7592592592592593


In [23]:
sub_df = pd.read_csv('../model_outputs/uth_results.csv')
select_df = sub_df[['id', 't_pred', 'n_pred', 'm_pred']]
rename_df = select_df.rename(columns={'t_pred': 't', 'n_pred': 'n', 'm_pred': 'm'})
rename_df.to_csv('../model_outputs/submission_uth.csv', index=False)

In [24]:
rename_df.head(10)

Unnamed: 0,id,t,n,m
0,5764772,T2a,N0,M0
1,4644984,T2b,N2,M1c
2,16066820,T2b,N0,M0
3,147290,T2a,N0,M0
4,14063477,T2a,N2,M1c
5,10376521,T3,N2,M1c
6,16191878,T2a,N0,M0
7,15532322,T2a,N0,M0
8,3462779,T4,N2,M0
9,4724041,T4,N0,M0
