In [1]:
import os
import time

import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from torchvision import transforms
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Choose the compute device once; later cells move the model and batches onto it.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

Using device: cuda


In [3]:
def build_image_path_map(image_root):
    """Index the PNG files stored under ``<image_root>/<folder>/images``.

    Args:
        image_root: directory whose direct children are scanned; only children
            that contain an ``images`` sub-directory contribute entries.

    Returns:
        dict mapping each ``.png`` file name to its full path. If the same file
        name occurs in more than one folder, the one listed last wins.
    """
    lookup = {}
    for entry in os.listdir(image_root):
        images_dir = os.path.join(image_root, entry, "images")
        # Plain files and folders without an "images" child are ignored.
        if os.path.isdir(images_dir):
            lookup.update({
                file_name: os.path.join(images_dir, file_name)
                for file_name in os.listdir(images_dir)
                if file_name.endswith(".png")
            })
    return lookup

# Build the file-name -> path lookup once for the Kaggle dataset root;
# the dataset objects created later receive this mapping.
image_root = "/kaggle/input/data/"
image_path_map = build_image_path_map(image_root)

In [4]:
# Load the metadata table and turn the '|'-separated findings into lists.
df = pd.read_csv("/kaggle/input/data/Data_Entry_2017.csv")
df['Finding Labels'] = df['Finding Labels'].str.split('|')

# Multi-hot encode the findings; each row gets a list with one 0/1 entry per class.
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df['Finding Labels'])
df['labels'] = label_matrix.tolist()

In [5]:
# Encode the two categorical metadata columns as integers.
# NOTE: values missing from these dictionaries become NaN, so re-running this
# cell on already-encoded columns wipes them out; re-run the loading cell first.
gender_codes = {'M': 0, 'F': 1}
view_codes = {'PA': 0, 'AP': 1}
df['Patient Gender'] = df['Patient Gender'].map(gender_codes)
df['View Position'] = df['View Position'].map(view_codes)

# Pack the three metadata values of each row into one list.
meta_columns = ['Patient Age', 'Patient Gender', 'View Position']
df['patient_data'] = df[meta_columns].values.tolist()

In [6]:
# The same mean/std is used for all three channels
# (presumably statistics of the grayscale X-rays; TODO confirm where they come from).
channel_mean = [0.5172546] * 3
channel_std = [0.23124999] * 3

# Resize to 224x224, convert to a tensor, then standardise each channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

In [7]:
class ChestXrayWithMetaDataset(Dataset):
    """Dataset that yields ``(image, metadata, labels)`` for one dataframe row.

    Args:
        dataframe: table providing the 'Image Index', 'labels' and
            'patient_data' columns read in ``__getitem__``.
        image_path_map: mapping from image file name to its path on disk.
        transform: optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, dataframe, image_path_map, transform=None):
        # Positional access via iloc below needs a clean 0..n-1 index.
        self.df = dataframe.reset_index(drop=True)
        self.image_path_map = image_path_map
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            tuple ``(image, metadata, labels)``: the (optionally transformed)
            image, a float32 tensor built from 'patient_data' with its first
            entry divided by 100, and a float32 tensor built from 'labels'.

        Raises:
            FileNotFoundError: if the image name is missing from the path map
                or the mapped path does not exist.
        """
        row = self.df.iloc[idx]

        # Image
        img_name = row['Image Index']
        img_path = self.image_path_map.get(img_name)
        # .get() yields None for unknown names; check that first so the caller
        # sees FileNotFoundError rather than a TypeError from os.path.exists(None).
        if img_path is None or not os.path.exists(img_path):
            raise FileNotFoundError(f"Image {img_name} not found.")

        # convert() returns a fully loaded copy, so the file can be closed right away.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Labels
        labels = torch.tensor(row['labels'], dtype=torch.float32)

        # Metadata
        metadata = torch.tensor(row['patient_data'], dtype=torch.float32)
        metadata[0] = metadata[0] / 100.0  # Scale age (first entry) by 1/100

        return image, metadata, labels

In [8]:
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable).
# NOTE(review): the split is per row; if one patient has several rows they can
# end up on both sides. Confirm whether that matters for this dataset.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = ChestXrayWithMetaDataset(train_df, image_path_map, transform=transform)
test_dataset = ChestXrayWithMetaDataset(test_df, image_path_map, transform=transform)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, pin_memory=True, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [9]:
# Train on a small random subset of the training set to keep epochs short.
# Clamp to the dataset size so `rest` can never become negative.
subset_size = min(5000, len(train_dataset))
rest = len(train_dataset) - subset_size

# Seed the split (same seed as the train/test split) so the same samples are
# picked on every run; without a generator the subset changes each time.
split_generator = torch.Generator().manual_seed(42)
subset_dataset, _ = random_split(train_dataset, [subset_size, rest], generator=split_generator)

subset_loader = DataLoader(subset_dataset, batch_size=128, shuffle=True, pin_memory=True, num_workers=2)

In [10]:
class ChestXrayMultiInputCNN(nn.Module):
    """Two-branch network: a small CNN for the image plus an MLP for patient data.

    The image branch expects inputs of shape (B, 3, 224, 224), since the first
    linear layer is sized for a 64 x 28 x 28 feature map. The patient branch
    expects (B, 3). Both embeddings are concatenated and mapped to
    ``num_labels`` sigmoid outputs.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the layers of one 3x3 conv -> ReLU -> 2x2 max-pool stage."""
        return (
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

    def __init__(self, num_labels=15):
        super().__init__()

        # Image branch: each stage halves the spatial size, 224 -> 112 -> 56 -> 28.
        self.image_conv = nn.Sequential(
            *self._conv_stage(3, 16),    # (B, 16, 112, 112)
            *self._conv_stage(16, 32),   # (B, 32, 56, 56)
            *self._conv_stage(32, 64),   # (B, 64, 28, 28)
        )

        # Flattened feature map -> 128-dimensional image embedding.
        self.image_fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 28 * 28, 128),
            nn.ReLU(),
        )

        # Patient branch: 3 metadata features -> 16-dimensional embedding.
        self.patient_fc = nn.Sequential(
            nn.Linear(3, 16),
            nn.ReLU(),
        )

        # Fusion head over the concatenated embeddings; the sigmoid gives
        # an independent probability per label (multi-label setup).
        self.classifier = nn.Sequential(
            nn.Linear(128 + 16, 64),
            nn.ReLU(),
            nn.Linear(64, num_labels),
            nn.Sigmoid(),
        )

    def forward(self, image, patient_data):
        """Return per-label probabilities of shape (B, num_labels)."""
        image_embedding = self.image_fc(self.image_conv(image))
        patient_embedding = self.patient_fc(patient_data)
        fused = torch.cat((image_embedding, patient_embedding), dim=1)
        return self.classifier(fused)

In [11]:
learning_rate = 1e-3
epochs = 10

# Size the output layer from the fitted label binarizer rather than a
# hard-coded 15: the label vectors come from `mlb`, so their width always
# equals len(mlb.classes_) and the model output must match it for the loss.
num_labels = len(mlb.classes_)

model = ChestXrayMultiInputCNN(num_labels=num_labels)
model = model.to(device)

# The model already ends in a Sigmoid, so plain BCELoss on probabilities is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
# Print the layer tree with per-layer parameter counts. No input size is
# passed here, so the table lists parameters only.
summary(model)

Layer (type:depth-idx)                   Param #
ChestXrayMultiInputCNN                   --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       448
│    └─ReLU: 2-2                         --
│    └─MaxPool2d: 2-3                    --
│    └─Conv2d: 2-4                       4,640
│    └─ReLU: 2-5                         --
│    └─MaxPool2d: 2-6                    --
│    └─Conv2d: 2-7                       18,496
│    └─ReLU: 2-8                         --
│    └─MaxPool2d: 2-9                    --
├─Sequential: 1-2                        --
│    └─Flatten: 2-10                     --
│    └─Linear: 2-11                      6,422,656
│    └─ReLU: 2-12                        --
├─Sequential: 1-3                        --
│    └─Linear: 2-13                      64
│    └─ReLU: 2-14                        --
├─Sequential: 1-4                        --
│    └─Linear: 2-15                      9,280
│    └─ReLU: 2-16                        --
│    └─Li

In [13]:
# Train on the subset loader, reporting mean batch loss and wall time per epoch.
for epoch in range(epochs):
    epoch_start = time.time()
    model.train()
    running_loss = 0.0

    for images, patient_data, labels in subset_loader:
        # Move the whole batch to the training device.
        images, patient_data, labels = (
            tensor.to(device, non_blocking=True)
            for tensor in (images, patient_data, labels)
        )

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images, patient_data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_time_min = (time.time() - epoch_start) / 60
    print(f"Epoch [{epoch+1}] Loss: {running_loss / len(subset_loader):.4f}  Time: {epoch_time_min:.2f} minutes")

Epoch [1] Loss: 0.2622  Time: 1.49 minutes
Epoch [2] Loss: 0.2116  Time: 0.98 minutes
Epoch [3] Loss: 0.2091  Time: 0.98 minutes
Epoch [4] Loss: 0.2056  Time: 0.99 minutes
Epoch [5] Loss: 0.2050  Time: 0.98 minutes
Epoch [6] Loss: 0.2041  Time: 0.97 minutes
Epoch [7] Loss: 0.1974  Time: 0.97 minutes
Epoch [8] Loss: 0.1934  Time: 0.96 minutes
Epoch [9] Loss: 0.1857  Time: 0.95 minutes
Epoch [10] Loss: 0.1762  Time: 0.97 minutes
