# **Step 1: Import Libraries**

In [48]:
import os
import zipfile
import torch
from torchvision.models import (
    swin_v2_t,
    Swin_V2_T_Weights,
)
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from glob import glob

**Ensure Reproducibility**

In [26]:
SEED = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# **Step 2: Load Data**

In [12]:
# Zip path
zip_path = '/cluster/home/bjorneme/projects/Data/chestX-ray14.zip'

# Path to save extracted files
extracted_path = '/cluster/home/bjorneme/projects/Data/chestX-ray14-extracted'

# Create extraction directory if it doesn't exist
os.makedirs(extracted_path, exist_ok=True)

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# **Step 3: Data Preprocessing**

In [49]:
# Labels all possible diseases
disease_labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']

# NIH Dataset Labels CSV File 
labels_df = pd.read_csv('/cluster/home/bjorneme/projects/Data/chestX-ray14-extracted/Data_Entry_2017.csv')

# One hot encoding
for diseases in tqdm(disease_labels): 
    labels_df[diseases] = labels_df['Finding Labels'].map(lambda result: 1 if diseases in result else 0)

100%|██████████| 14/14 [00:00<00:00, 25.37it/s]


In [67]:
labels_df['Finding Labels'] = labels_df['Finding Labels'].apply(lambda s: [l for l in str(s).split('|')])

num_glob = glob('/cluster/home/bjorneme/projects/Data/chestX-ray14-extracted/*/images/*.png')
img_path = {os.path.basename(x): x for x in num_glob}

labels_df['Paths'] = labels_df['Image Index'].map(img_path.get)
labels_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia,Paths
0,00000001_000.png,[['Cardiomegaly']],0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
1,00000001_001.png,"[['Cardiomegaly', 'Emphysema']]",1,1,58,M,PA,2894,2729,0.143,...,1,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
2,00000001_002.png,"[['Cardiomegaly', 'Effusion']]",2,1,58,M,PA,2500,2048,0.168,...,0,0,1,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
3,00000002_000.png,[['No Finding']],0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
4,00000003_000.png,[['Hernia']],0,3,81,F,PA,2582,2991,0.143,...,0,0,0,0,0,0,0,0,1,/cluster/home/bjorneme/projects/Data/chestX-ra...


# **Step 4: Exploratory Data Analysis (EDA)**

**Number of Patients**

In [68]:
unique_patients = np.unique(labels_df['Patient ID'])
len(unique_patients)

30805

# **Step 5: Split Dataset**

In [69]:
from sklearn.model_selection import train_test_split

# train-70
# val-10
# test-20
train_val_df_patients, test_df_patients = train_test_split(unique_patients, 
                                   test_size = 0.2,
                                   random_state = SEED,
                                    shuffle= True
                                   )
len(train_val_df_patients)

24644

In [70]:
train_df = labels_df[labels_df['Patient ID'].isin(train_val_df_patients)]

test_df = labels_df[labels_df['Patient ID'].isin(test_df_patients)]

# **Step 5: Build the Model**

In [71]:
# Modify the model's head for 14 classes
from torch import nn
num_classes = len(disease_labels)
model.head = nn.Linear(model.head.in_features, num_classes)

# Custom dataset
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

class ChestXray14Dataset(Dataset):
    def __init__(self, df, transform=None, disease_labels=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.disease_labels = disease_labels
        self.labels = self.df[self.disease_labels].values
        self.image_paths = self.df['Paths'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx].astype(np.float32)
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        label = torch.from_numpy(label)
        return image, label

from torchvision.models import Swin_V2_T_Weights

# Get the weights
weights = Swin_V2_T_Weights.IMAGENET1K_V1

# Get the preprocessing transforms
preprocess = weights.transforms()

# Use preprocess in your DataLoader
train_dataset = ChestXray14Dataset(train_df, transform=preprocess, disease_labels=disease_labels)
test_dataset = ChestXray14Dataset(test_df, transform=preprocess, disease_labels=disease_labels)

# DataLoaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)




# **Step 6: Train the Model**

In [72]:
# Loss function and optimizer
import torch
from torch import nn, optim

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import numpy as np

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in tqdm(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * images.size(0)
        
    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}')

  0%|          | 11/2808 [03:01<12:08:15, 15.62s/it]

# **Step 7: Evaluate the Model**

In [17]:
# TODO