# **Step 1: Import Libraries**

In [89]:
pip install fastai 

Collecting fastai
  Downloading fastai-2.7.18-py3-none-any.whl.metadata (9.1 kB)
Collecting fastdownload<2,>=0.0.5 (from fastai)
  Downloading fastdownload-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Collecting fastcore<1.8,>=1.5.29 (from fastai)
  Downloading fastcore-1.7.20-py3-none-any.whl.metadata (3.5 kB)
Collecting fastprogress>=0.2.4 (from fastai)
  Downloading fastprogress-1.0.3-py3-none-any.whl.metadata (5.6 kB)
Collecting spacy<4 (from fastai)
  Downloading spacy-3.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy<4->fastai)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy<4->fastai)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy<4->fastai)
  Downloading murmurhash-1.0.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014

In [48]:
import os
import zipfile
import torch
from torchvision.models import (
    swin_v2_t,
    Swin_V2_T_Weights,
)
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from glob import glob

**Ensure Reproducibility**

In [26]:
SEED = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# **Step 2: Load Data**

In [12]:
# Zip path
zip_path = '/cluster/home/bjorneme/projects/Data/chestX-ray14.zip'

# Path to save extracted files
extracted_path = '/cluster/home/bjorneme/projects/Data/chestX-ray14-extracted'

# Create extraction directory if it doesn't exist
os.makedirs(extracted_path, exist_ok=True)

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# **Step 3: Data Preprocessing**

In [49]:
# Labels all possible diseases
disease_labels = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening',
'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']

# NIH Dataset Labels CSV File 
labels_df = pd.read_csv('/cluster/home/bjorneme/projects/Data/chestX-ray14-extracted/Data_Entry_2017.csv')

# One hot encoding
for diseases in tqdm(disease_labels): 
    labels_df[diseases] = labels_df['Finding Labels'].map(lambda result: 1 if diseases in result else 0)

100%|██████████| 14/14 [00:00<00:00, 25.37it/s]


In [67]:
labels_df['Finding Labels'] = labels_df['Finding Labels'].apply(lambda s: [l for l in str(s).split('|')])

num_glob = glob('/cluster/home/bjorneme/projects/Data/chestX-ray14-extracted/*/images/*.png')
img_path = {os.path.basename(x): x for x in num_glob}

labels_df['Paths'] = labels_df['Image Index'].map(img_path.get)
labels_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia,Paths
0,00000001_000.png,[['Cardiomegaly']],0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
1,00000001_001.png,"[['Cardiomegaly', 'Emphysema']]",1,1,58,M,PA,2894,2729,0.143,...,1,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
2,00000001_002.png,"[['Cardiomegaly', 'Effusion']]",2,1,58,M,PA,2500,2048,0.168,...,0,0,1,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
3,00000002_000.png,[['No Finding']],0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
4,00000003_000.png,[['Hernia']],0,3,81,F,PA,2582,2991,0.143,...,0,0,0,0,0,0,0,0,1,/cluster/home/bjorneme/projects/Data/chestX-ra...


# **Step 4: Exploratory Data Analysis (EDA)**

**Number of Patients**

In [68]:
unique_patients = np.unique(labels_df['Patient ID'])
len(unique_patients)

30805

# **Step 5: Split Dataset**

In [69]:
from sklearn.model_selection import train_test_split

# train-70
# val-10
# test-20
train_val_df_patients, test_df_patients = train_test_split(unique_patients, 
                                   test_size = 0.2,
                                   random_state = SEED,
                                    shuffle= True
                                   )
len(train_val_df_patients)

24644

In [85]:
train_val_df = labels_df[labels_df['Patient ID'].isin(train_val_df_patients)]

train_val_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia,Paths
0,00000001_000.png,[['Cardiomegaly']],0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
1,00000001_001.png,"[['Cardiomegaly', 'Emphysema']]",1,1,58,M,PA,2894,2729,0.143,...,1,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
2,00000001_002.png,"[['Cardiomegaly', 'Effusion']]",2,1,58,M,PA,2500,2048,0.168,...,0,0,1,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
3,00000002_000.png,[['No Finding']],0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/chestX-ra...
4,00000003_000.png,[['Hernia']],0,3,81,F,PA,2582,2991,0.143,...,0,0,0,0,0,0,0,0,1,/cluster/home/bjorneme/projects/Data/chestX-ra...


In [86]:
labels_df.shape
print('train_val size', train_val_df.shape[0])
print('test size', labels_df.shape[0] - train_val_df.shape[0])

train_val size 89826
test size 22294


In [90]:
import fastai
from fastai.vision.all import *

item_transforms = [
    Resize((224, 224)),
]

batch_transforms = [
    Flip(),
    Rotate(),
    Normalize.from_stats(*imagenet_stats),
]


def get_x(row):
    return row['Paths']

def get_y(row):
    labels = row[disease_labels].tolist()
    return labels

dblock = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock(encoded=True,vocab=disease_labels)),
                   splitter=RandomSplitter(valid_pct=0.125, seed=SEED),
                   get_x=get_x,
                   get_y=get_y,
                   item_tfms=item_transforms,
                   batch_tfms=batch_transforms
                  )
dls = dblock.dataloaders(train_val_df, bs=32)
# print(dblock.datasets(train_val_merge).train)

In [93]:
from fastai.vision.all import *
import torch
from torchvision.models import swin_v2_t, Swin_V2_T_Weights

# Step 1: Instantiate the Swin Transformer Tiny model with pretrained weights
model = swin_v2_t(weights=Swin_V2_T_Weights.IMAGENET1K_V1)

# Step 2: Modify the final layer to match the number of classes in your dataset
# Assume 'dls' is your DataLoaders object
num_classes = dls.c  # Number of classes
model.head = torch.nn.Linear(model.head.in_features, num_classes)

# Step 3: Define callbacks
cbs = [
    SaveModelCallback(monitor='valid_loss', min_delta=0.0001, with_opt=True),
    EarlyStoppingCallback(monitor='valid_loss', min_delta=0.001, patience=5),
    ShowGraphCallback()
]

# Step 4: Create the Fastai Learner with the custom model
learn = Learner(
    dls,
    model,
    metrics=[accuracy_multi, F1ScoreMulti(), RocAucMulti()],
    cbs=cbs,
    wd=0.001
)

# Optional: Wrap the model with DataParallel for multi-GPU training
learn.model = torch.nn.DataParallel(learn.model)

In [96]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!

In [97]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2024-11-21 20:30:57.080812: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732217457.106551 1272616 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732217457.114619 1272616 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-21 20:30:57.143187: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  0


2024-11-21 20:31:03.222085: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


# **Step 5: Build the Model**

# **Step 6: Train the Model**

# **Step 7: Evaluate the Model**

In [17]:
# TODO