In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os
from dotenv import load_dotenv
import torch.optim as optim

load_dotenv()
src_path = Path(os.getenv("SRC_PATH"))

sys.path.append(str(Path(src_path).resolve().parent))

from src.data.preprocessing import XRayPreprocessor
from src.data.dataset import ChestXrayDataset
from src.models.cnn import ChestXrayModel
from src.training.trainer import XRayTrainer


ModuleNotFoundError: No module named 'tqdm'

In [3]:
# Load metadata
load_dotenv()
data_path = Path(os.getenv("DATA_PATH"))
metadata_data_path = data_path / 'metadata' / 'Data_Entry_2017_v2020.csv'
metadata_df = pd.read_csv(metadata_data_path)
raw_data_path = data_path / 'raw'
images_001_data_path = raw_data_path / 'images_001'
last_file = sorted(list(images_001_data_path.iterdir()))[-1]
last_file_name = last_file.stem

# Create a copy of the filtered DataFrame
metadata_df_001 = metadata_df[metadata_df['Image Index'] <= last_file_name].copy()

# Create binary labels for pneumonia
metadata_df_001['pneumonia'] = metadata_df_001['Finding Labels'].str.contains('Pneumonia', case=False).astype(int)

# Display class distribution
print("\nClass Distribution:")
print(metadata_df_001['pneumonia'].value_counts())

# Display sample of the data
print("\nPercentage distribution:")
print(metadata_df_001['pneumonia'].value_counts(normalize=True) * 100)


Class Distribution:
pneumonia
0    4933
1      65
Name: count, dtype: int64

Percentage distribution:
pneumonia
0    98.69948
1     1.30052
Name: proportion, dtype: float64


In [4]:
preprocessor = XRayPreprocessor(
    target_size=(224, 224),
    normalize_method='standard',
    train_split=0.70,
    val_split=0.15,
    test_split=0.15,
    random_seed=42
)

image_splits, label_splits = preprocessor.create_splits(
    metadata_df = metadata_df_001
)



INFO:src.data.preprocessing:Train set size: 3498
INFO:src.data.preprocessing:Validation set size: 750
INFO:src.data.preprocessing:Test set size: 750


In [5]:
# Create datasets
train_dataset = ChestXrayDataset(
    folder_path=images_001_data_path,
    image_paths=image_splits['train'],
    labels=label_splits['train'],
    preprocessor=preprocessor
)

val_dataset = ChestXrayDataset(
    folder_path=images_001_data_path,
    image_paths=image_splits['val'],
    labels=label_splits['val'],
    preprocessor=preprocessor
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Verify data loading
for images, labels in train_loader:
    print(f"First image of batch: {images[0].shape}")
    print(f"Labels shape: {labels[0]}")
    break

First image of batch: torch.Size([3, 224, 224])
Labels shape: 0


In [6]:
# After your data loading and before model initialization
total_samples = len(metadata_df_001)
n_pneumonia = metadata_df_001['pneumonia'].sum()
n_normal = total_samples - n_pneumonia

weight_for_1 = total_samples / (2 * n_pneumonia)  # pneumonia class
weight_for_0 = total_samples / (2 * n_normal)     # normal class

class_weights = torch.FloatTensor([weight_for_0, weight_for_1]).to('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# TRAIN SCRATCH

# Initialize model
model = ChestXrayModel(num_classes=1, pretrained=True)

# Initialize trainer
trainer = XRayTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_classes=1,
    lr=1e-4,
    checkpoint_dir= (os.getenv("SRC_PATH") + '/checkpoints')
)

# Train the model
best_metrics = trainer.train(
    num_epochs=10,  # Start with a small number for testing
    early_stopping_patience=3
)

# Display final metrics
print("\nBest Validation Metrics:")
for metric, value in best_metrics.items():
    print(f"{metric}: {value:.4f}")

2024-12-03 00:13:01,270 - INFO - Trainer initialized with ChestXrayModel
INFO:src.training.trainer:Trainer initialized with ChestXrayModel
2024-12-03 00:13:01,270 - INFO - Training on device: cpu
INFO:src.training.trainer:Training on device: cpu
2024-12-03 00:13:01,271 - INFO - Number of classes: 1
INFO:src.training.trainer:Number of classes: 1
2024-12-03 00:13:01,271 - INFO - Starting training for 10 epochs...
INFO:src.training.trainer:Starting training for 10 epochs...
Training: 100%|██████████| 110/110 [11:43<00:00,  6.40s/it, loss=0.0164, avg_loss=0.0991]
Validation: 100%|██████████| 24/24 [01:34<00:00,  3.95s/it, loss=0.0123, avg_loss=0.0694]
2024-12-03 00:26:20,072 - INFO - Epoch 1/10 - Train Loss: 0.0991 - Val Loss: 0.0694 - Val AUC-ROC: 0.5938
INFO:src.training.trainer:Epoch 1/10 - Train Loss: 0.0991 - Val Loss: 0.0694 - Val AUC-ROC: 0.5938
2024-12-03 00:26:20,073 - INFO - Validation loss improved from inf to 0.0694
INFO:src.training.trainer:Validation loss improved from inf to

KeyboardInterrupt: 

In [8]:
# LOAD MODEL
model = ChestXrayModel()
optimizer = optim.Adam(model.parameters())

checkpoint = torch.load(src_path / 'checkpoints' / 'checkpoint_epoch0_f1_0.000_auc_0.594.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
