### Combining Datasets: bonefractureyolo and FracAtlas
We combine images and labels from both `bonefractureyolo` and `FracAtlas` folders into a new unified dataset. This includes copying images and merging label files into a single format (YOLO or CSV as needed).

In [26]:
import os
import shutil
from glob import glob

# Paths for both datasets
yolo_img_dirs = [
    'data/bonefractureyolo/train/images',
    'data/bonefractureyolo/valid/images',
    'data/bonefractureyolo/test/images',
    'data/FracAtlas/images/Fractured',
    'data/FracAtlas/images/Non_fractured'
# Add more if needed
 ]
yolo_label_dirs = [
    'data/bonefractureyolo/train/labels',
    'data/bonefractureyolo/valid/labels',
    'data/bonefractureyolo/test/labels',
    # FracAtlas YOLO labels (if available)
    'data/FracAtlas/Annotations/YOLO'
# Add more if needed
 ]

combined_img_dir = 'data/combined/images'
combined_label_dir = 'data/combined/labels'

os.makedirs(combined_img_dir, exist_ok=True)
os.makedirs(combined_label_dir, exist_ok=True)

# Copy images
for img_dir in yolo_img_dirs:
    if os.path.exists(img_dir):
        for img_path in glob(os.path.join(img_dir, '*')):
            shutil.copy(img_path, combined_img_dir)

# Copy YOLO label files
for label_dir in yolo_label_dirs:
    if os.path.exists(label_dir):
        for label_path in glob(os.path.join(label_dir, '*.txt')):
            shutil.copy(label_path, combined_label_dir)

print(f"Combined images: {len(os.listdir(combined_img_dir))}")
print(f"Combined labels: {len(os.listdir(combined_label_dir))}")

Combined images: 8231
Combined labels: 8232


In [27]:
import random
from sklearn.model_selection import train_test_split

# Create train/val structure for combined dataset
combined_train_dir = 'data/combined_organized/train'
combined_val_dir = 'data/combined_organized/val'

# Create class directories
for split in ['train', 'val']:
    for class_name in ['fractured', 'non_fractured']:
        os.makedirs(f'data/combined_organized/{split}/{class_name}', exist_ok=True)

# Get all images from combined directory
all_images = [f for f in os.listdir(combined_img_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

# Classify images based on source or naming patterns
fractured_images = []
non_fractured_images = []

for img in all_images:
    # Classify based on source directory or filename patterns
    if 'fracture' in img.lower() or 'fractured' in img.lower():
        fractured_images.append(img)
    else:
        non_fractured_images.append(img)

# If we can't classify by name, check original source directories
for img in all_images:
    if img not in fractured_images and img not in non_fractured_images:
        # Default classification - you may need to adjust this logic
        non_fractured_images.append(img)

# Split each class into train/val (80/20 split)
frac_train, frac_val = train_test_split(fractured_images, test_size=0.2, random_state=42)
non_frac_train, non_frac_val = train_test_split(non_fractured_images, test_size=0.2, random_state=42)

# Copy images to organized structure
def copy_images_to_split(image_list, source_dir, dest_dir):
    for img in image_list:
        src_path = os.path.join(source_dir, img)
        if os.path.exists(src_path):
            shutil.copy(src_path, dest_dir)

# Copy fractured images
copy_images_to_split(frac_train, combined_img_dir, f'{combined_train_dir}/fractured')
copy_images_to_split(frac_val, combined_img_dir, f'{combined_val_dir}/fractured')

# Copy non-fractured images
copy_images_to_split(non_frac_train, combined_img_dir, f'{combined_train_dir}/non_fractured')
copy_images_to_split(non_frac_val, combined_img_dir, f'{combined_val_dir}/non_fractured')

print(f"Training fractured: {len(os.listdir(f'{combined_train_dir}/fractured'))}")
print(f"Training non-fractured: {len(os.listdir(f'{combined_train_dir}/non_fractured'))}")
print(f"Validation fractured: {len(os.listdir(f'{combined_val_dir}/fractured'))}")
print(f"Validation non-fractured: {len(os.listdir(f'{combined_val_dir}/non_fractured'))}")

Training fractured: 8
Training non-fractured: 6576
Validation fractured: 3
Validation non-fractured: 1644


### Updated Data Pipeline for Combined Dataset
The code above combines images from both datasets and organizes them into a standard ImageFolder structure:
- `data/combined_organized/train/fractured/` - Training images with fractures
- `data/combined_organized/train/non_fractured/` - Training images without fractures
- `data/combined_organized/val/fractured/` - Validation images with fractures
- `data/combined_organized/val/non_fractured/` - Validation images without fractures

This unified structure allows us to use PyTorch's ImageFolder dataset loader efficiently.

## Week 2 & 3: Data Preprocessing, Cleaning, and Baseline Model
- **Week 2:** Data normalization, augmentation, and cleaning are performed above using torchvision transforms and DataLoader setup.
- **Week 3:** Develop a simple baseline CNN for bone fracture detection. The following cells define, train, and validate a basic convolutional neural network using the preprocessed data.

In [28]:
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# Define preprocessing and augmentation transforms
train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),               # resize to match model input
    transforms.RandomHorizontalFlip(p=0.5),      # flip horizontally
    transforms.RandomRotation(degrees=10),       # small random rotations
    transforms.ColorJitter(brightness=0.2,       # random brightness changes
                           contrast=0.2),        
    transforms.ToTensor(),                       # convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406],   # normalize (ImageNet mean/std)
                         std=[0.229, 0.224, 0.225])
])

# Validation/Test transforms (no augmentation, only normalization)
val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


# Load datasets using balanced loader for training
train_dataset = BalancedImageFolder(root="data/combined_organized/train", transform=train_transforms, oversample_ratio=0.3)
val_dataset = LenientImageFolder(root="data/combined_organized/val", transform=val_transforms)

# Data loaders (reduced num_workers to avoid OS errors on macOS)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)

# Inspect one batch
images, labels = next(iter(train_loader))
print("Batch image tensor shape:", images.shape)
print("Batch label tensor shape:", labels.shape)


Original fractured: 8
Original non-fractured: 6576
Final dataset size: 8548
Final fractured: 1972
Final non-fractured: 6576
Found 1647 image files in data/combined_organized/val
Batch image tensor shape: torch.Size([32, 3, 224, 224])
Batch label tensor shape: torch.Size([32])


In [29]:
from PIL import Image, ImageFile
import torch.utils.data as data
from torchvision.datasets.folder import default_loader

# Enable loading of truncated images - this allows PIL to load partially corrupted images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Custom dataset class that is very lenient with image loading
class LenientImageFolder(data.Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        self.samples = []
        self.classes = ['fractured', 'non_fractured']
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
        
        # Collect all image paths (don't pre-validate to be more lenient)
        for class_name in self.classes:
            class_dir = os.path.join(root, class_name)
            if os.path.exists(class_dir):
                for img_name in os.listdir(class_dir):
                    if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                        img_path = os.path.join(class_dir, img_name)
                        self.samples.append((img_path, self.class_to_idx[class_name]))
                            
        print(f"Found {len(self.samples)} image files in {root}")
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        path, target = self.samples[index]
        try:
            # Try to open and convert the image
            with Image.open(path) as img:
                sample = img.convert('RGB')
                if self.transform is not None:
                    sample = self.transform(sample)
                return sample, target
        except Exception as e:
            # If loading fails, create a black placeholder image
            sample = Image.new('RGB', (224, 224), (0, 0, 0))
            if self.transform is not None:
                sample = self.transform(sample)
            return sample, target

print("Lenient dataset loader created!")

Lenient dataset loader created!


In [30]:
# Create a more balanced dataset by oversampling fractured images
import random

class BalancedImageFolder(data.Dataset):
    def __init__(self, root, transform=None, oversample_ratio=0.1):
        self.root = root
        self.transform = transform
        self.samples = []
        self.classes = ['fractured', 'non_fractured']
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
        
        # Collect all image paths by class
        fractured_samples = []
        non_fractured_samples = []
        
        for class_name in self.classes:
            class_dir = os.path.join(root, class_name)
            if os.path.exists(class_dir):
                for img_name in os.listdir(class_dir):
                    if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                        img_path = os.path.join(class_dir, img_name)
                        sample = (img_path, self.class_to_idx[class_name])
                        if class_name == 'fractured':
                            fractured_samples.append(sample)
                        else:
                            non_fractured_samples.append(sample)
        
        # Calculate how many times to repeat fractured images
        target_fractured_count = int(len(non_fractured_samples) * oversample_ratio)
        oversample_factor = max(1, target_fractured_count // len(fractured_samples)) if fractured_samples else 1
        
        # Add samples to dataset
        self.samples = non_fractured_samples.copy()
        
        # Oversample fractured images
        for _ in range(oversample_factor):
            self.samples.extend(fractured_samples)
        
        # Add some random additional fractured samples if needed
        remaining = target_fractured_count - (len(fractured_samples) * oversample_factor)
        if remaining > 0 and fractured_samples:
            additional_samples = random.sample(fractured_samples, min(remaining, len(fractured_samples)))
            self.samples.extend(additional_samples)
        
        # Shuffle the dataset
        random.shuffle(self.samples)
        
        print(f"Original fractured: {len(fractured_samples)}")
        print(f"Original non-fractured: {len(non_fractured_samples)}")
        print(f"Final dataset size: {len(self.samples)}")
        
        # Count final class distribution
        final_counts = {0: 0, 1: 0}
        for _, label in self.samples:
            final_counts[label] += 1
        print(f"Final fractured: {final_counts[0]}")
        print(f"Final non-fractured: {final_counts[1]}")
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        path, target = self.samples[index]
        try:
            with Image.open(path) as img:
                sample = img.convert('RGB')
                if self.transform is not None:
                    sample = self.transform(sample)
                return sample, target
        except Exception as e:
            # If loading fails, create a black placeholder image
            sample = Image.new('RGB', (224, 224), (0, 0, 0))
            if self.transform is not None:
                sample = self.transform(sample)
            return sample, target

print("Balanced dataset loader created!")

Balanced dataset loader created!


In [31]:
import torch.nn as nn
import torch.nn.functional as F

# Simple CNN baseline model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(32 * 56 * 56, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Determine number of classes from dataset
num_classes = len(train_dataset.classes)
model = SimpleCNN(num_classes=num_classes)
print(model)

SimpleCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=100352, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)


In [32]:
import torch.optim as optim

# Training settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5  # For demonstration, increase as needed

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * images.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}')

    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_acc = 100 * correct / total
    print(f'Validation Accuracy: {val_acc:.2f}%')

Epoch 1/5, Training Loss: 0.0895
Validation Accuracy: 99.21%
Validation Accuracy: 99.21%
Epoch 2/5, Training Loss: 0.0043
Epoch 2/5, Training Loss: 0.0043
Validation Accuracy: 99.88%
Validation Accuracy: 99.88%
Epoch 3/5, Training Loss: 0.0057
Epoch 3/5, Training Loss: 0.0057
Validation Accuracy: 99.39%
Validation Accuracy: 99.39%
Epoch 4/5, Training Loss: 0.0052
Epoch 4/5, Training Loss: 0.0052
Validation Accuracy: 99.88%
Validation Accuracy: 99.88%
Epoch 5/5, Training Loss: 0.0001
Epoch 5/5, Training Loss: 0.0001
Validation Accuracy: 99.82%
Validation Accuracy: 99.82%


In [33]:
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

# Calculate class weights to handle imbalance
train_labels = [train_dataset.samples[i][1] for i in range(len(train_dataset))]
class_counts = Counter(train_labels)
total_samples = len(train_labels)

# Calculate weights inversely proportional to class frequency
class_weights = []
for i in range(len(train_dataset.classes)):
    weight = total_samples / (len(train_dataset.classes) * class_counts[i])
    class_weights.append(weight)

class_weights = torch.FloatTensor(class_weights)
print(f"Class distribution: {class_counts}")
print(f"Class weights: {class_weights}")

# Training settings with weighted loss
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
class_weights = class_weights.to(device)

# Use weighted CrossEntropyLoss to handle class imbalance
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10  # Increased epochs for better training

# Training loop with detailed metrics
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
    
    epoch_loss = running_loss / len(train_loader.dataset)
    train_acc = 100 * train_correct / train_total
    
    # Validation with detailed metrics
    model.eval()
    val_correct = 0
    val_total = 0
    all_predicted = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
            
            all_predicted.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    val_acc = 100 * val_correct / val_total
    
    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'  Training Loss: {epoch_loss:.4f}, Training Acc: {train_acc:.2f}%')
    print(f'  Validation Acc: {val_acc:.2f}%')
    
    # Print detailed metrics every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f'\nDetailed metrics for epoch {epoch+1}:')
        print('Confusion Matrix:')
        cm = confusion_matrix(all_labels, all_predicted)
        print(cm)
        print('\nClassification Report:')
        print(classification_report(all_labels, all_predicted, 
                                  target_names=train_dataset.classes))
        print('-' * 50)

Class distribution: Counter({1: 6576, 0: 1972})
Class weights: tensor([2.1673, 0.6499])
Epoch 1/10:
  Training Loss: 0.0164, Training Acc: 99.66%
  Validation Acc: 99.70%
Epoch 1/10:
  Training Loss: 0.0164, Training Acc: 99.66%
  Validation Acc: 99.70%
Epoch 2/10:
  Training Loss: 0.0073, Training Acc: 99.80%
  Validation Acc: 99.57%
Epoch 2/10:
  Training Loss: 0.0073, Training Acc: 99.80%
  Validation Acc: 99.57%
Epoch 3/10:
  Training Loss: 0.0001, Training Acc: 100.00%
  Validation Acc: 99.76%
Epoch 3/10:
  Training Loss: 0.0001, Training Acc: 100.00%
  Validation Acc: 99.76%
Epoch 4/10:
  Training Loss: 0.0000, Training Acc: 100.00%
  Validation Acc: 99.76%
Epoch 4/10:
  Training Loss: 0.0000, Training Acc: 100.00%
  Validation Acc: 99.76%
Epoch 5/10:
  Training Loss: 0.0029, Training Acc: 99.88%
  Validation Acc: 99.09%

Detailed metrics for epoch 5:
Confusion Matrix:
[[   2    1]
 [  14 1630]]

Classification Report:
               precision    recall  f1-score   support

    f