In [None]:
### Load and Filter Dataset


import os
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Input
from tensorflow.keras.optimizers import Adam
# If you have installed TensorFlow 2.x, Keras is included as tf.keras.
# If your IDE shows "grayed out" or "unresolved" for keras imports, try restarting the Jupyter kernel.
# You can also check your TensorFlow version:
import tensorflow as tf
print("TF Version:", tf.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

### CONFIG
# dataset file directory
DATA_DIR = r'D:\NIH Xray Dataset'


# CSV file containing labels
CSV_PATH = os.path.join(DATA_DIR, 'Data_Entry_2017.csv')

# Load the CSV file (metadata)
df = pd.read_csv(CSV_PATH)

# Classes considered
target_classes = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Pneumonia', 'No Finding']

# Keep only samples with single label within target classes
df['Finding Labels'] = df['Finding Labels'].str.strip()
df_single = df[df['Finding Labels'].isin(target_classes)]

# Balance dataset (equal number of samples for each class)
min_count = df_single['Finding Labels'].value_counts().min()
df_balanced = pd.concat([df_single[df_single['Finding Labels'] == cls].sample(min_count, random_state=43) for cls in target_classes])

# Shuffle dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

### DEBUG: Display the first few rows of the balanced dataset
print(df_balanced.head())

Note: you may need to restart the kernel to use updated packages.
TF Version: 2.20.0-rc0
GPU Available: []



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


        Image Index Finding Labels  Follow-up #  Patient ID  Patient Age  \
0  00015468_000.png    Atelectasis            0       15468           38   
1  00008051_006.png    Atelectasis            6        8051           49   
2  00028628_006.png       Effusion            6       28628           20   
3  00000618_000.png     No Finding            0         618           54   
4  00020773_003.png    Atelectasis            3       20773           20   

  Patient Gender View Position  OriginalImage[Width  Height]  \
0              M            PA                 2992     2991   
1              M            AP                 2500     2048   
2              M            PA                 1767     2021   
3              F            PA                 2596     2364   
4              M            PA                 2578     2991   

   OriginalImagePixelSpacing[x        y]  Unnamed: 11  
0                     0.143000  0.143000          NaN  
1                     0.168000  0.168000      

In [12]:
### Train/Validation/Test Split
from sklearn.model_selection import train_test_split

# 80% train, 10% val, 10% test
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42, stratify=df_balanced['Finding Labels'])

train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['Finding Labels'], random_state=42)

### DEBUG: Display the splits and percentages
print("\nDataset splits:")
print(f"Train samples: {len(train_df)}, {len(train_df)/len(df_balanced)*100:.2f}%")
print(f"Validation samples: {len(val_df)}, {len(val_df)/len(df_balanced)*100:.2f}%")
print(f"Test samples: {len(test_df)}, {len(test_df)/len(df_balanced)*100:.2f}%")
# Display pixel dimensions (width x height) for a few images in the dataset
print("\nSample image dimensions (pixels):")
for idx, row in df_balanced.head(5).iterrows():
    width = row['OriginalImage[Width']
    height = row['Height]']
    print(f"{row['Image Index']}: {width}x{height}")

### DEBUG: Display class distribution in each split
print("\nClass distribution in splits:")
print("Train:\n", train_df['Finding Labels'].value_counts())
print("Validation:\n", val_df['Finding Labels'].value_counts())
print("Test:\n", test_df['Finding Labels'].value_counts())



Dataset splits:
Train samples: 1159, 71.99%
Validation samples: 129, 8.01%
Test samples: 322, 20.00%

Sample image dimensions (pixels):
00015468_000.png: 2992x2991
00008051_006.png: 2500x2048
00028628_006.png: 1767x2021
00000618_000.png: 2596x2364
00020773_003.png: 2578x2991

Class distribution in splits:
Train:
 Finding Labels
Pneumonia       232
No Finding      232
Effusion        232
Cardiomegaly    232
Atelectasis     231
Name: count, dtype: int64
Validation:
 Finding Labels
Cardiomegaly    26
Pneumonia       26
Effusion        26
Atelectasis     26
No Finding      25
Name: count, dtype: int64
Test:
 Finding Labels
Atelectasis     65
No Finding      65
Cardiomegaly    64
Effusion        64
Pneumonia       64
Name: count, dtype: int64
