In [37]:
import pandas as pd

# Load the two CSV files that describe the dataset.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
projections_path = "C:/Users/user/Desktop/학부 연구생/프로젝트/archive/indiana_projections.csv"
reports_path = "C:/Users/user/Desktop/학부 연구생/프로젝트/archive/indiana_reports.csv"

projections_df = pd.read_csv(projections_path)
reports_df = pd.read_csv(reports_path)

# Inspect the structure of both frames.
# DataFrame.info() prints its summary and returns None, so it is called for
# its printed output only; storing the result would just keep a None around.
projections_df.info()
reports_df.info()

# First rows of each frame, kept under the same names as before.
projections_head = projections_df.head()
reports_head = reports_df.head()

# Last expression of the cell: show both previews.
(projections_head, reports_head)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7466 entries, 0 to 7465
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         7466 non-null   int64 
 1   filename    7466 non-null   object
 2   projection  7466 non-null   object
dtypes: int64(1), object(2)
memory usage: 175.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3851 entries, 0 to 3850
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         3851 non-null   int64 
 1   MeSH        3851 non-null   object
 2   Problems    3851 non-null   object
 3   image       3851 non-null   object
 4   indication  3765 non-null   object
 5   comparison  2685 non-null   object
 6   findings    3337 non-null   object
 7   impression  3820 non-null   object
dtypes: int64(1), object(7)
memory usage: 240.8+ KB


(None,
 None,
    uid                filename projection
 0    1  1_IM-0001-4001.dcm.png    Frontal
 1    1  1_IM-0001-3001.dcm.png    Lateral
 2    2  2_IM-0652-1001.dcm.png    Frontal
 3    2  2_IM-0652-2001.dcm.png    Lateral
 4    3  3_IM-1384-1001.dcm.png    Frontal,
    uid                                               MeSH  \
 0    1                                             normal   
 1    2  Cardiomegaly/borderline;Pulmonary Artery/enlarged   
 2    3                                             normal   
 3    4  Pulmonary Disease, Chronic Obstructive;Bullous...   
 4    5  Osteophyte/thoracic vertebrae/multiple/small;T...   
 
                                             Problems  \
 0                                             normal   
 1                      Cardiomegaly;Pulmonary Artery   
 2                                             normal   
 3  Pulmonary Disease, Chronic Obstructive;Bullous...   
 4                         Osteophyte;Thickening;Lung   
 
         

In [38]:
# Keep only the rows whose projection is 'Frontal'
frontal_projections = projections_df.loc[projections_df['projection'].eq('Frontal')]

# Inner-join the frontal rows with the reports on the shared uid column
merged_data = pd.merge(frontal_projections, reports_df, how='inner', on='uid')

# Reduce to the two columns used downstream: image filename and MeSH string
final_data = merged_data.loc[:, ['filename', 'MeSH']]

# Show the resulting size together with the first rows
(final_data.shape, final_data.head())


((3818, 2),
                     filename  \
 0     1_IM-0001-4001.dcm.png   
 1     2_IM-0652-1001.dcm.png   
 2     3_IM-1384-1001.dcm.png   
 3     4_IM-2050-1001.dcm.png   
 4  5_IM-2117-1003002.dcm.png   
 
                                                 MeSH  
 0                                             normal  
 1  Cardiomegaly/borderline;Pulmonary Artery/enlarged  
 2                                             normal  
 3  Pulmonary Disease, Chronic Obstructive;Bullous...  
 4  Osteophyte/thoracic vertebrae/multiple/small;T...  )

In [39]:
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

# Image size used when loading (the ResNet standard input size)
target_size = 224, 224

# Directory the image files are read from
image_dir = r'C:/Users/user/Desktop/images_normalized'

# Prepare the image array and the per-image label lists
def load_images_and_labels(data, image_dir, target_size):
    """Load every image listed in ``data`` and split its MeSH string into labels.

    Args:
        data: DataFrame with a ``filename`` column and a ``MeSH`` column whose
            values are ';'-separated label strings.
        image_dir: Directory the filenames are joined onto.
        target_size: ``(height, width)`` forwarded to ``load_img``.

    Returns:
        ``(images, labels)`` where ``images`` is an array of shape
        ``(n_loaded, height, width, 3)`` holding pixel values divided by 255,
        and ``labels`` is a list with one list of label strings per loaded
        image, in the same order. When nothing could be loaded the array is
        empty but still 4-D, so downstream shape handling keeps working.

    Warns:
        UserWarning: if one or more listed files do not exist. Those rows are
            skipped, as before, but no longer silently.
    """
    import warnings

    images = []
    labels = []
    missing = []

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for filename, mesh in zip(data['filename'], data['MeSH']):
        image_path = os.path.join(image_dir, filename)
        if not os.path.exists(image_path):
            missing.append(filename)
            continue

        # Load, resize and scale the pixel values by 1/255.
        # NOTE(review): ImageNet-pretrained Keras ResNet50 is usually fed via
        # resnet50.preprocess_input rather than a plain /255 scaling — confirm
        # that this scaling is intended for the model trained below.
        img = load_img(image_path, target_size=target_size)
        images.append(img_to_array(img) / 255.0)

        # One image can carry several labels, separated by ';'
        labels.append(mesh.split(';'))

    if missing:
        warnings.warn(
            f"{len(missing)} of {len(data)} image file(s) not found under "
            f"{image_dir!r} and skipped (first: {missing[0]!r})",
            stacklevel=2,
        )

    if not images:
        # np.array([]) would have shape (0,); keep the image rank instead.
        return np.empty((0, *target_size, 3), dtype='float32'), labels

    return np.array(images), labels

# Load the images and label lists for the selected rows
images, labels = load_images_and_labels(
    data=final_data,
    image_dir=image_dir,
    target_size=target_size,
)

# Check how much data was loaded
(images.shape, len(labels))


((3818, 224, 224, 3), 3818)

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Encode the label lists with MultiLabelBinarizer
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(labels)

# Split into training and validation data (80:20), with a fixed random_state
X_train, X_val, y_train, y_val = train_test_split(
    images,
    binary_labels,
    test_size=0.2,
    random_state=42,
)

# Check the shapes of the four resulting arrays
tuple(part.shape for part in (X_train, X_val, y_train, y_val))


((3054, 224, 224, 3), (764, 224, 224, 3), (3054, 1629), (764, 1629))

In [41]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [42]:
# Number of output units = number of classes found by the MultiLabelBinarizer
num_classes = len(mlb.classes_)

# ResNet50 backbone with ImageNet weights and without its top classifier
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Classification head on top of the backbone
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
output_layer = Dense(num_classes, activation='sigmoid')(x)  # sigmoid: multi-label output

# Full model: backbone input -> new head
model = Model(inputs=base_model.input, outputs=output_layer)

# Freeze the pretrained layers so only the new head is trained
for layer in base_model.layers:
    layer.trainable = False

# Compile for multi-label classification
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Training callbacks.
# With a sigmoid output over many sparse labels, 'accuracy' under
# binary_crossentropy is (per Keras' metric resolution) an element-wise
# accuracy dominated by the zeros, so it barely moves between epochs.
# The checkpoint therefore tracks val_loss, the same quantity EarlyStopping
# uses, so "best saved model" and "restored best weights" agree.
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Print the model summary
model.summary()

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[checkpoint, early_stopping]
)

# Save the trained model
model.save('final_resnet_model.h5')

# Evaluate on the validation split
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Visualize training/validation loss and accuracy
def plot_history(history):
    """Plot loss and accuracy per epoch, train vs. validation, side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the keys
            'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    curves = history.history
    panels = (
        ('loss', 'Loss'),          # left panel
        ('accuracy', 'Accuracy'),  # right panel
    )

    plt.figure(figsize=(12, 5))
    for position, (key, name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[key], label=f'Train {name}')
        plt.plot(curves[f'val_{key}'], label=f'Validation {name}')
        plt.title(f'{name} Over Epochs')
        plt.xlabel('Epochs')
        plt.ylabel(name)
        plt.legend()

    plt.tight_layout()
    plt.show()

plot_history(history)

# Predict on the validation data and compute summary metrics
def calculate_metrics(model, X_val, y_val, mlb, threshold=0.5):
    """Threshold the model's predictions and report micro-averaged metrics.

    Args:
        model: Object with a ``predict`` method returning per-label scores.
        X_val: Inputs passed to ``model.predict``.
        y_val: Binary indicator targets compared against the predictions.
        mlb: Not used by this function; kept so existing calls keep working.
        threshold: Scores strictly greater than this count as positive.
            Defaults to 0.5, the previously hard-coded value.

    Returns:
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 use
        ``average='micro'``. For multi-label indicator targets scikit-learn's
        ``accuracy_score`` is the exact-match (subset) accuracy.
    """
    predictions = model.predict(X_val)
    binary_predictions = (predictions > threshold).astype(int)  # binarize at the threshold

    # zero_division=0 makes the "no positives" case an explicit 0.0 instead of
    # an UndefinedMetricWarning.
    accuracy = accuracy_score(y_val, binary_predictions)
    precision = precision_score(y_val, binary_predictions, average='micro', zero_division=0)
    recall = recall_score(y_val, binary_predictions, average='micro', zero_division=0)
    f1 = f1_score(y_val, binary_predictions, average='micro', zero_division=0)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    return accuracy, precision, recall, f1

# Compute and print the metrics
calculate_metrics(model, X_val, y_val, mlb)

# Inspect predictions for a few samples
def display_sample_predictions(model, X_val, y_val, mlb, num_samples=5):
    """Print true and predicted label tuples for the first few samples.

    Args:
        model: Object with a ``predict`` method returning per-label scores.
        X_val: Inputs; only the first ``num_samples`` are predicted.
        y_val: Binary indicator targets aligned with ``X_val``.
        mlb: Fitted binarizer whose ``inverse_transform`` maps indicator rows
            back to label tuples.
        num_samples: Maximum number of samples to show. Fewer are shown when
            fewer are available.
    """
    batch = X_val[:num_samples]
    if len(batch) == 0:
        return  # nothing to predict or print

    predictions = model.predict(batch)
    shown = len(predictions)  # may be smaller than num_samples

    # inverse_transform reads the ``shape`` of its argument, so it is handed
    # 2-D arrays rather than Python lists of rows. Scores are binarized at 0.5.
    true_sets = mlb.inverse_transform(np.asarray(y_val[:shown]))
    predicted_sets = mlb.inverse_transform((np.asarray(predictions) > 0.5).astype(int))

    for i, (true_labels, predicted_labels) in enumerate(zip(true_sets, predicted_sets), start=1):
        print(f"Sample {i}:")
        print(f"True Labels: {true_labels}")
        print(f"Predicted Labels: {predicted_labels}\n")

# Print predictions for the default number of samples
display_sample_predictions(model=model, X_val=X_val, y_val=y_val, mlb=mlb)


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 230, 230, 3)  0           ['input_2[0][0]']                
                                                                                                  
 conv1_conv (Conv2D)            (None, 112, 112, 64  9472        ['conv1_pad[0][0]']              
                                )                                                                 
                                                                                            

In [None]:
# Train the model (up to 10 epochs, reusing the existing callbacks)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint, early_stopping],
)

# Save the trained model
model.save('final_resnet_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
# Evaluate on the validation split; evaluate() yields the loss and the metric
evaluation = model.evaluate(x=X_val, y=y_val, verbose=0)
val_loss, val_accuracy = evaluation

print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Visualize training/validation loss and accuracy
# NOTE(review): plot_history is already defined in the earlier training cell
# with the same behavior; one definition would be enough.
def plot_history(history):
    """Plot loss and accuracy per epoch, train vs. validation, side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the keys
            'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    curves = history.history
    panels = (
        ('loss', 'Loss'),          # left panel
        ('accuracy', 'Accuracy'),  # right panel
    )

    plt.figure(figsize=(12, 5))
    for position, (key, name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[key], label=f'Train {name}')
        plt.plot(curves[f'val_{key}'], label=f'Validation {name}')
        plt.title(f'{name} Over Epochs')
        plt.xlabel('Epochs')
        plt.ylabel(name)
        plt.legend()

    plt.tight_layout()
    plt.show()

plot_history(history)

In [None]:
# Raw model outputs for the validation inputs
y_pred = model.predict(X_val)

# Column index of the largest value in each row of the targets / predictions.
# NOTE(review): y_val comes from MultiLabelBinarizer, so a row can contain
# several 1s; argmax keeps only one index per row — confirm that is intended.
y_test_class = y_val.argmax(axis=1)
y_pred_class = y_pred.argmax(axis=1)

In [None]:
# Per-label quality of the predicted values
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix

# The targets are multi-hot rows produced by MultiLabelBinarizer, so each label
# is scored independently: threshold the scores per label instead of reducing
# every sample to a single argmax class.
y_pred_binary = (y_pred > 0.5).astype(int)

# Report only labels that actually occur in the validation targets; labels
# with no support add rows without information.
present_labels = np.flatnonzero(np.asarray(y_val).sum(axis=0) > 0)
present_names = [str(mlb.classes_[idx]) for idx in present_labels]

print(classification_report(
    y_val,
    y_pred_binary,
    labels=present_labels,
    target_names=present_names,
    zero_division=0,
))

# One 2x2 matrix per reported label, laid out as [[TN, FP], [FN, TP]]
print(multilabel_confusion_matrix(y_val, y_pred_binary, labels=present_labels))
