In [1]:
import pandas as pd

# Load the uploaded CSV files to check their structure and contents
file_path_projections = "C:/Users/user/Desktop/학부 연구생/프로젝트/archive/indiana_projections.csv"
file_path_reports ="C:/Users/user/Desktop/학부 연구생/프로젝트/archive/indiana_reports.csv"


# Read the CSV files
data_projections = pd.read_csv(file_path_projections)
data_reports = pd.read_csv(file_path_reports)
# Filter for "Frontal" images in the projections dataset
frontal_data = data_projections[data_projections['projection'] == 'Frontal']

# Merge the frontal data with the reports data using 'uid' as the key
merged_data = pd.merge(frontal_data, data_reports, on='uid', how='inner')

# Check the resulting merged data
merged_data.head()


Unnamed: 0,uid,filename,projection,MeSH,Problems,image,indication,comparison,findings,impression
0,1,1_IM-0001-4001.dcm.png,Frontal,normal,normal,Xray Chest PA and Lateral,Positive TB test,None.,The cardiac silhouette and mediastinum size ar...,Normal chest x-XXXX.
1,2,2_IM-0652-1001.dcm.png,Frontal,Cardiomegaly/borderline;Pulmonary Artery/enlarged,Cardiomegaly;Pulmonary Artery,"Chest, 2 views, frontal and lateral",Preop bariatric surgery.,None.,Borderline cardiomegaly. Midline sternotomy XX...,No acute pulmonary findings.
2,3,3_IM-1384-1001.dcm.png,Frontal,normal,normal,Xray Chest PA and Lateral,"rib pain after a XXXX, XXXX XXXX steps this XX...",,,"No displaced rib fractures, pneumothorax, or p..."
3,4,4_IM-2050-1001.dcm.png,Frontal,"Pulmonary Disease, Chronic Obstructive;Bullous...","Pulmonary Disease, Chronic Obstructive;Bullous...","PA and lateral views of the chest XXXX, XXXX a...",XXXX-year-old XXXX with XXXX.,None available,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
4,5,5_IM-2117-1003002.dcm.png,Frontal,Osteophyte/thoracic vertebrae/multiple/small;T...,Osteophyte;Thickening;Lung,Xray Chest PA and Lateral,Chest and nasal congestion.,,The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary abnormality.


In [2]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from torch.utils.data import DataLoader, Dataset
# findings와 impression 결합
merged_data["report"] = merged_data["findings"].fillna("") + " " + merged_data["impression"].fillna("")
merged_data["report"] = merged_data["report"].str.strip()

# 필요한 컬럼 선택 및 NaN 제거
merged_data = merged_data[["uid", "filename", "projection", "indication", "report"]].dropna()

# BERT Tokenizer 로드
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# MLM 데이터셋 클래스 정의
class MLM_Dataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # BERT 토크나이징
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


# 데이터셋 및 DataLoader 생성
mlm_dataset = MLM_Dataset(merged_data["report"].tolist() + merged_data["indication"].tolist(), tokenizer)
mlm_dataloader = DataLoader(mlm_dataset, batch_size=8, shuffle=True)

# 모델 로드
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not

In [4]:
import random

# 'report' 컬럼에서 'XXXX'가 포함된 문장만 필터링
filtered_texts = [text for text in merged_data["report"].dropna() if "XXXX" in text]

# 랜덤 샘플 선택 (filtered_texts가 비어 있으면 기본값 제공)
sample_text = random.choice(filtered_texts) if filtered_texts else "No valid text found."

# 마스킹 수행
masked_text = sample_text.replace("XXXX", tokenizer.mask_token)

# 토크나이징 및 텐서 변환
inputs = tokenizer(masked_text, return_tensors="pt").to(device)
mask_idx = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

# MLM 예측 수행
with torch.no_grad():
    predictions = model(**inputs).logits

# 마스크된 토큰 예측 및 복원
predicted_text = masked_text
for idx in mask_idx.tolist():
    predicted_token = tokenizer.decode(torch.argmax(predictions[0, idx]))
    predicted_text = predicted_text.replace(tokenizer.mask_token, predicted_token, 1)

# 결과 출력
print("=" * 50)
print("📝 원본 텍스트:\n", sample_text, "\n", "=" * 50)
print("\n🔍 마스킹된 텍스트:\n", masked_text, "\n", "=" * 50)
print("\n🎯 예측된 텍스트:\n", predicted_text, "\n", "=" * 50)

📝 원본 텍스트:
 There is widening of the mediastinum. There is moderate cardiomegaly identified. The central pulmonary XXXX appear enlarged. Correlate for pulmonary vascular congestion. No focal infiltrate. No large effusion or pneumothorax. 1. Moderate increase in size of the cardiac silhouette. Unclear whether this is secondary to cardiomegaly or pericardial effusion. 2. Pulmonary vascular congestion. 3. Widened mediastinum. Maybe secondary to prominent mediastinal fat or tortuous XXXX. However, adenopathy, or mass is not excluded. CT of the chest with contrast is recommended for further evaluation of these findings. 

🔍 마스킹된 텍스트:
 There is widening of the mediastinum. There is moderate cardiomegaly identified. The central pulmonary [MASK] appear enlarged. Correlate for pulmonary vascular congestion. No focal infiltrate. No large effusion or pneumothorax. 1. Moderate increase in size of the cardiac silhouette. Unclear whether this is secondary to cardiomegaly or pericardial effusion. 2. P