In [1]:
import sys
import os
import glob
from pathlib import Path
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_curve, roc_auc_score
import re
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from pkg.UTHBERT.preprocess_text import preprocess as my_preprocess
from pkg.UTHBERT.tokenization_mod import MecabTokenizer, FullTokenizerForMecab
from transformers import BertModel,BertConfig

In [3]:
def get_device() -> str:
    """
    Name the PyTorch backend to run on.

    Backends are probed in priority order and the first available one wins:
    "mps" (Apple Silicon), then "cuda" (NVIDIA GPU), otherwise "cpu".
    """
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    # Nothing accelerated is available.
    return "cpu"

# Resolve the device once for the whole notebook; a RuntimeError during
# selection falls back to the CPU instead of aborting the run.
try:
    device = torch.device(get_device())
    print(f"Using device: {device}")
except RuntimeError as err:
    print(f"Failed to initialize the device: {err}")
    device = torch.device("cpu")

Using device: mps


In [4]:
seed=0
def seed_everything(seed):
    """
    Seed every random number generator this notebook touches.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and puts cuDNN into deterministic, non-benchmark
    mode.

    Args:
        seed (int): Seed value applied to all generators.
    """
    random.seed(seed)
    # NOTE: hash randomisation of the already-running kernel is fixed at
    # interpreter start-up; this export only reaches processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # True favours speed, False favours reproducibility
seed_everything(seed)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Deterministic cuBLAS kernels on CUDA >= 10.2 require this variable; without
# it torch.use_deterministic_algorithms(True) makes CUDA matmuls raise.
# setdefault keeps any value the user already exported.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
torch.use_deterministic_algorithms(True)
torch.set_num_threads(1)

# Tokenizer

In [5]:
# Root folder that the dictionary and vocabulary paths below are built from.
dic_path = './dic'

In [6]:
# Placeholder token standing in for a person's name (do not change).
name_token = "＠＠Ｎ"

# mecab-ipadic-neologd dictionary
mecab_ipadic_neologd = f'{dic_path}/mecab-ipadic-neologd'

# J-Medic dictionary (MANBYO_201907_Dic-utf8.dic was used)
mecab_J_medic = f'{dic_path}/MANBYO/MANBYO_201907_Dic-utf8.dic'

# UTH-BERT vocabulary file
vocab_file = f'{dic_path}/UTH-BERT/bert_vocab_mc_v1_25000.txt'

In [7]:
# Inner tokenizer: MeCab configured with the two dictionaries and the name
# placeholder defined above.
sub_tokenizer = MecabTokenizer(
    mecab_ipadic_neologd=mecab_ipadic_neologd,
    mecab_J_medic=mecab_J_medic,
    name_token=name_token,
)

# Outer tokenizer: wraps the MeCab tokenizer together with the UTH-BERT
# vocabulary; lower-casing is disabled.
tokenizer = FullTokenizerForMecab(
    sub_tokenizer=sub_tokenizer,
    vocab_file=vocab_file,
    do_lower_case=False,
)

In [8]:
def tokenize(sentence_list):
    """
    Convert raw sentences into lists of vocabulary ids.

    Each item is cast to ``str``, run through ``my_preprocess``, tokenized
    with the module-level ``tokenizer`` and mapped to ids. The id sequence is
    then wrapped as ``[2, ..., 3]``.

    Args:
        sentence_list: Iterable of sentences (any value accepted by ``str``).

    Returns:
        list[list[int]]: One id list per input sentence, in input order.
    """
    encoded = []
    for sentence in tqdm(sentence_list):
        cleaned = my_preprocess(str(sentence))
        piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cleaned))
        # NOTE(review): 2 and 3 are presumably the [CLS] / [SEP] ids of the
        # UTH-BERT vocabulary - confirm against the vocab file.
        encoded.append([2, *piece_ids, 3])
    return encoded

# Data

In [9]:
def create_dataframe(folder_path):
    """
    Build one DataFrame from the sentence and label CSV files in a folder.

    Reads ``sentences.csv`` and ``label.csv`` from ``folder_path``, gives both
    a string key ``index`` of the form ``"<id>_<sentence_index>"`` and
    inner-joins them on that key.

    Args:
        folder_path (str): Folder that contains the two CSV files.

    Returns:
        pandas.DataFrame: Joined frame whose first columns are ``index``,
        ``id`` and ``sentence_index``, followed by every remaining column.
    """

    def _load_with_key(csv_name):
        # Read one CSV and attach the "<id>_<sentence_index>" join key.
        frame = pd.read_csv(os.path.join(folder_path, csv_name))
        frame['index'] = frame['id'].astype(str) + '_' + frame['sentence_index'].astype(str)
        return frame

    sentences = _load_with_key("sentences.csv")
    labels = _load_with_key("label.csv")

    merged = pd.merge(sentences, labels, on='index', how='inner')

    # Both inputs carry id / sentence_index, so the merge produced _x / _y
    # variants; fold each pair back into a single column.
    for key in ('id', 'sentence_index'):
        left, right = f'{key}_x', f'{key}_y'
        merged[key] = merged[left].combine_first(merged[right])
        merged = merged.drop(columns=[left, right])

    leading = ['index', 'id', 'sentence_index']
    trailing = [name for name in merged.columns if name not in leading]
    return merged[leading + trailing]

# Folder holding sentences.csv / label.csv for the validation split.
val_folder_path = "../../radnlp_2024_train_val_20240731/ja/sub_task/val"
# Joined sentence + label frame that the inference cells below add columns to.
val_df = create_dataframe(val_folder_path)

In [10]:
# Preview the first ten joined rows (text plus gold labels).
val_df.head(10)

Unnamed: 0,index,id,sentence_index,text,omittable,measure,extension,atelectasis,satellite,lymphadenopathy,pleural,distant
0,147290_0,147290,0,左肺門部に 37mm 大の腫瘤影を認め、ご指摘の肺癌が疑われます。,0,1,0,0,0,0,0,0
1,147290_1,147290,1,縦隔に有意なリンパ節腫大は認めません。,1,0,0,0,0,0,0,0
2,147290_2,147290,2,胸水はありません。,1,0,0,0,0,0,0,0
3,147290_3,147290,3,背部皮下に腫瘤を認め、粉瘤などと思われます。,1,0,0,0,0,0,0,0
4,241752_0,241752,0,右下葉に 14×15mm の限局性すりガラス影があります。,0,1,0,0,0,0,0,0
5,241752_1,241752,1,粗大な充実部分は認めません。,0,1,0,0,0,0,0,0
6,241752_2,241752,2,内部を血管が通過しています。,1,0,0,0,0,0,0,0
7,241752_3,241752,3,既知の肺癌と考えます。,0,1,0,0,0,0,0,0
8,241752_4,241752,4,縦隔リンパ節腫大は認めません。,1,0,0,0,0,0,0,0
9,241752_5,241752,5,胸水貯留は指摘できません。,1,0,0,0,0,0,0,0


# 分類モデルの読み込み

In [11]:
# Folder with the base UTH-BERT files (bert_config.json, UTH_BERT.bin).
model_path = Path('./pkg/UTHBERT')
# Folder with one sub-folder of fine-tuned checkpoints per classifier.
save_path = Path('./model')

In [12]:
# Number of logits produced by each classifier head (passed to Net).
num_classes=2

In [13]:
# Base encoder: configuration from bert_config.json, weights from UTH_BERT.bin.
config = BertConfig.from_json_file(model_path / 'bert_config.json')
bert = BertModel.from_pretrained(model_path / 'UTH_BERT.bin', config=config)

class Net(nn.Module):
    """
    Encoder plus a single linear classification head.

    The head is applied to the hidden state at sequence position 0 of the
    encoder's first output.

    Args:
        bert: Encoder module called as
            ``bert(inputs_ids, pad_masks, output_attentions=...)``. Its first
            output is indexed as ``[batch, position, feature]``.
        num_classes (int): Number of output logits.
    """
    def __init__(self, bert, num_classes):
        super().__init__()
        self.bert = bert
        # Size the head from the encoder's own config when it exposes one;
        # 768 (the previously hard-coded width) stays as the fallback so
        # existing checkpoints keep loading unchanged.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.cls = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs_ids, pad_masks, Attention=False):
        """
        Compute class logits for a batch.

        Args:
            inputs_ids: Token id tensor forwarded to the encoder.
            pad_masks: Mask tensor forwarded to the encoder as its second
                positional argument.
            Attention (bool): Forwarded as ``output_attentions``; when true
                the raw encoder output is returned alongside the logits.

        Returns:
            The logits, or ``(logits, encoder_output)`` when ``Attention`` is
            true.
        """
        bout = self.bert(inputs_ids, pad_masks, output_attentions=Attention)
        # First-position hidden state of every sequence in the batch, taken
        # with one slice instead of a per-row Python loop and torch.stack.
        h0 = bout[0][:, 0]
        logits = self.cls(h0)
        if Attention:
            return logits, bout
        return logits

In [14]:
def _load_classifier(task_dir, checkpoint_name):
    """
    Build a fresh Net on its own UTH-BERT encoder and load one checkpoint.

    Args:
        task_dir (str): Sub-folder of ``save_path`` holding the checkpoint.
        checkpoint_name (str): Checkpoint file name inside that folder.

    Returns:
        Net: Classifier on ``device`` with the checkpoint weights loaded.
    """
    # Every classifier gets its own encoder instance so that no weights are
    # shared between tasks and re-running this cell never reuses stale state.
    task_config = BertConfig.from_json_file(model_path / 'bert_config.json')
    task_bert = BertModel.from_pretrained(model_path / 'UTH_BERT.bin', config=task_config)
    net = Net(task_bert, num_classes).to(device)

    checkpoint_path = save_path / task_dir / checkpoint_name
    # Deserialize onto the CPU so a checkpoint saved on another accelerator
    # still loads; load_state_dict then copies the values into the parameters
    # that already live on `device`. Key mismatches raise (strict loading).
    state_dict = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
    net.load_state_dict(state_dict)
    return net

# Checkpoint file per task folder. Note that 'meas' uses best_acc_2.pth while
# every other task uses best_acc_1.pth.
CHECKPOINTS = {
    'omit': 'best_acc_1.pth',
    'meas': 'best_acc_2.pth',
    'exte': 'best_acc_1.pth',
    'atel': 'best_acc_1.pth',
    'sate': 'best_acc_1.pth',
    'lymp': 'best_acc_1.pth',
    'pleu': 'best_acc_1.pth',
    'dist': 'best_acc_1.pth',
}

task_models = {
    task_dir: _load_classifier(task_dir, checkpoint_name)
    for task_dir, checkpoint_name in CHECKPOINTS.items()
}

# Individual names used by the inference cell.
model_omit = task_models['omit']
model_meas = task_models['meas']
model_exte = task_models['exte']
model_atel = task_models['atel']
model_sate = task_models['sate']
model_lymp = task_models['lymp']
model_pleu = task_models['pleu']
model_dist = task_models['dist']

<All keys matched successfully>

# 推論

In [15]:
val_data = tokenize(val_df['text'].values.tolist())

# Classifiers that only run for sentences not predicted omittable, keyed by
# the prefix of the prediction column they fill.
finding_models = {
    'meas': model_meas,
    'exte': model_exte,
    'atel': model_atel,
    'sate': model_sate,
    'lymp': model_lymp,
    'pleu': model_pleu,
    'dist': model_dist,
}

model_omit.eval()
for finding_model in finding_models.values():
    finding_model.eval()

def _predict_label(model, data, mask):
    """Return the arg-max class index for a single-sentence batch."""
    # argmax over the raw logits: softmax is monotonic, so applying it first
    # does not change which class wins.
    return model(data, mask).argmax(dim=1).item()

omit_li = []
finding_preds = {prefix: [] for prefix in finding_models}

with torch.no_grad():
    for s in tqdm(val_data):
        # One sentence per forward pass: shape (1, sequence_length).
        data = torch.tensor(s, dtype=torch.long).unsqueeze(0).to(device)
        # NOTE(review): this mask is 1 where the id equals 0 and 0 elsewhere.
        # Net.forward hands it to the encoder as the second positional
        # argument, which for Hugging Face BertModel is attention_mask
        # (1 = attend). That looks inverted, but it is left unchanged because
        # the fine-tuned checkpoints may have been trained with the same
        # convention - confirm against the training code before switching to
        # (data != 0).
        mask = (data == 0).long().to(device)

        omit_pred = _predict_label(model_omit, data, mask)
        omit_li.append(omit_pred)

        # A sentence predicted omittable gets 0 for every finding without
        # running the finding classifiers.
        for prefix, finding_model in finding_models.items():
            if omit_pred == 1:
                finding_preds[prefix].append(0)
            else:
                finding_preds[prefix].append(_predict_label(finding_model, data, mask))

val_df['omit_pred'] = omit_li
for prefix, preds in finding_preds.items():
    val_df[f'{prefix}_pred'] = preds

results_path = '../model_outputs/uth_subtask_results.csv'
# Create the output folder up front so the write cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
val_df.to_csv(results_path)

  0%|          | 0/451 [00:00<?, ?it/s]

  0%|          | 0/451 [00:00<?, ?it/s]

# 精度評価と Submission CSV の出力

In [16]:
sub_df = pd.read_csv('../model_outputs/uth_subtask_results.csv')

# Prediction-column prefix -> gold-label column, in reporting order.
EVAL_COLUMNS = {
    'omit': 'omittable',
    'meas': 'measure',
    'exte': 'extension',
    'atel': 'atelectasis',
    'sate': 'satellite',
    'lymp': 'lymphadenopathy',
    'pleu': 'pleural',
    'dist': 'distant',
}

# Sentence-level accuracy of each classifier against its gold label.
for prefix, label_column in EVAL_COLUMNS.items():
    predictions = sub_df[f'{prefix}_pred'].values.tolist()
    gold = sub_df[label_column].values.tolist()
    print(f'{prefix}_acc: {accuracy_score(gold, predictions)}')

omit_acc: 0.9312638580931264
meas_acc: 0.9623059866962306
exte_acc: 0.975609756097561
atel_acc: 0.9889135254988913
sate_acc: 0.9445676274944568
lymp_acc: 0.9866962305986696
pleu_acc: 0.9645232815964523
dist_acc: 0.9401330376940134


In [17]:
sub_df = pd.read_csv('../model_outputs/uth_subtask_results.csv')

# Prediction column -> column name used in the submission file.
submission_names = {
    'omit_pred': 'omittable',
    'meas_pred': 'measure',
    'exte_pred': 'extension',
    'atel_pred': 'atelectasis',
    'sate_pred': 'satellite',
    'lymp_pred': 'lymphadenopathy',
    'pleu_pred': 'pleural',
    'dist_pred': 'distant',
}

# Keep only the sentence keys and the predictions, then publish the
# predictions under the label names.
select_df = sub_df[['id', 'sentence_index', *submission_names]]
rename_df = select_df.rename(columns=submission_names)
rename_df.to_csv('../model_outputs/sentence_classification_uth.csv', index=False)

In [18]:
# Preview the first ten rows of the submission frame.
rename_df.head(10)

Unnamed: 0,id,sentence_index,omittable,measure,extension,atelectasis,satellite,lymphadenopathy,pleural,distant
0,147290,0,0,1,0,0,0,0,0,0
1,147290,1,1,0,0,0,0,0,0,0
2,147290,2,1,0,0,0,0,0,0,0
3,147290,3,1,0,0,0,0,0,0,0
4,241752,0,0,1,0,0,0,0,0,0
5,241752,1,1,0,0,0,0,0,0,0
6,241752,2,0,0,0,0,0,0,0,0
7,241752,3,0,1,0,0,0,0,0,0
8,241752,4,1,0,0,0,0,0,0,0
9,241752,5,1,0,0,0,0,0,0,0


In [19]:
# Distribution of predicted 0/1 values for every submitted label column.
for label_column in (
    'omittable',
    'measure',
    'extension',
    'atelectasis',
    'satellite',
    'lymphadenopathy',
    'pleural',
    'distant',
):
    print(rename_df[label_column].value_counts())

omittable
0    291
1    160
Name: count, dtype: int64
measure
0    363
1     88
Name: count, dtype: int64
extension
0    411
1     40
Name: count, dtype: int64
atelectasis
0    436
1     15
Name: count, dtype: int64
satellite
0    432
1     19
Name: count, dtype: int64
lymphadenopathy
0    411
1     40
Name: count, dtype: int64
pleural
0    423
1     28
Name: count, dtype: int64
distant
0    429
1     22
Name: count, dtype: int64
