## Fine-tuning a ResNet50 model for Chest X-Ray Classification

In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import torchvision.models as models
import torch.nn as nn

import torch.optim as optim

from datetime import datetime

from tqdm import tqdm

import os
import csv

In [2]:
# Define your transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)), # Resize images to fit the model
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # mean and std are set to the standard of the training data for resnet50
])

# Load your dataset
train_dataset = datasets.ImageFolder(root='/rds/general/user/sz2823/home/ML_project/Chest_XRay_Classification/Dataset/images/train', transform=transform)
val_dataset = datasets.ImageFolder(root='/rds/general/user/sz2823/home/ML_project/Chest_XRay_Classification/Dataset/images/validation', transform=transform)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [3]:
val_loader 

<torch.utils.data.dataloader.DataLoader at 0x1539a51ca190>

In [4]:
import torch

# Check if CUDA (GPU support) is available
is_cuda_available = torch.cuda.is_available()
print("Is CUDA (GPU) available:", is_cuda_available)

# If CUDA is available, print the GPU name(s)
if is_cuda_available:
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPU(s) available: {gpu_count}")
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. Using CPU.")

Is CUDA (GPU) available: True
Number of GPU(s) available: 1
GPU 0: Quadro RTX 6000


In [5]:
# Load a pre-trained ResNet model
model = models.resnet50(pretrained=True)

# Modify the classifier to fit your dataset
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, len(train_dataset.classes)) # Adjust this based on your number of classes

# Move the model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

model = model.to(device)

print("Model device:", next(model.parameters()).device)



Device: cuda
Model device: cuda:0


In [6]:
criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [16]:
def output_log_writer(s, end='\r'):
    with open(f'model_result/log/output_{model_name}_{date_time}.txt', 'a') as output_file:
        output_file.write(s + '\n')
        print(s, end=end, flush=True)
        
def eval_fn(model, eval_data):
    model.eval()

    total_correct = 0
    total_samples = 0
    
    preds = []
    with torch.no_grad():
        for images, labels in eval_data:
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()
            
    accuracy = total_correct / total_samples
    return accuracy

In [8]:
current_datetime = datetime.now()
date_time = str(current_datetime)[:-7].replace('-','').replace(':','').replace(' ','_')

In [9]:
model_name = "ResNet50"

In [26]:
# num_epochs = 10  # Set the number of epochs to 10

# for epoch in range(num_epochs):
#     print(f'Starting epoch {epoch+1}/{num_epochs}')  # Add this line to print the current epoch
#     model.train()  # Set the model to training mode
#     running_loss = 0.0  # Initialize the running loss which will accumulate losses over an epoch
    
#     # Use tqdm to show the progress bar for the training batches
#     for batch_idx, (images, labels) in enumerate(tqdm(train_loader, desc=f'Epoch {epoch+1}')):
#         images, labels = images.to(device), labels.to(device)
#         optimizer.zero_grad()  # Clear the gradients
#         outputs = model(images)  # Forward pass
#         loss = criterion(outputs, labels)  # Calculate loss
#         loss.backward()  # Backpropagate the loss
#         optimizer.step()  # Update the parameters
#         running_loss += loss.item()  # Add the batch loss

#     avg_loss = running_loss / len(train_loader)  # Compute the average loss for the epoch
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')  # Print the average loss for the epoch

#     model.eval()  # Set the model to evaluation mode
#     correct = 0  # Initialize the count for correct predictions
#     total = 0  # Initialize the count for total predictions

#     # Validation step with no progress bar (since it's usually much faster)
#     with torch.no_grad():
#         for images, labels in val_loader:
#             images, labels = images.to(device), labels.to(device)
#             outputs = model(images)
#             _, predicted = torch.max(outputs.data, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()

#     val_accuracy = 100 * correct / total  # Calculate the accuracy
#     print(f'Accuracy on validation set: {val_accuracy:.2f}%')  # Print the validation accuracy

Starting epoch 1/10


Epoch 1: 100%|██████████| 2853/2853 [1:13:46<00:00,  1.55s/it]


Epoch [1/10], Loss: 2.8336
Starting epoch 2/10


Epoch 2:  16%|█▋        | 467/2853 [11:54<1:00:49,  1.53s/it]


KeyboardInterrupt: 

In [13]:
os.makedirs(f'model_result/log', exist_ok=True)
os.makedirs(f'model_result/measurement', exist_ok=True)
os.makedirs(f'model_result/model_checkpoint', exist_ok=True)
os.makedirs(f'model_result/model_pth', exist_ok=True)  # Ensure this directory exists as well

In [11]:
losses, accus_train, accus_val = [], [[]], [[]]
train_len = len(train_loader.dataset)
model_save_path = f'model_result/model_pth/MODEL_FINETUNE_{model_name}_{date_time}.pth'
model_save_path

'model_result/model_pth/MODEL_FINETUNE_ResNet50_20240402_152443.pth'

In [14]:
num_epochs = 10

for epoch in range(num_epochs):
    output_log_writer(f'-------------------------------[Epoch {epoch+1}]---------------------------------')
    model.train()
    running_loss = 0.0
    
    # Ensure there is an empty list for the current epoch's losses and accuracies
    losses.append([])
    accus_train.append([])
    
    for batch_idx, (images, labels) in enumerate(tqdm(train_loader, desc=f'Epoch {epoch+1}')):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        losses[-1].append(loss.item())
        with torch.no_grad():
            accus_train[-1].append((torch.max(outputs, dim=1)[1] == labels).sum().item())

    # Aggregating and logging training metrics
    total_loss = sum(losses[-1])
    total_correct = sum(accus_train[-1])
    avg_accuracy = total_correct / train_len

    # Saving training metrics to CSV files
    with open(f'model_result/measurement/batch_loss_{model_name}_{date_time}.csv', 'a', newline='') as b_loss_file, \
         open(f'model_result/measurement/epoch_loss_{model_name}_{date_time}.csv', 'a', newline='') as e_loss_file, \
         open(f'model_result/measurement/acc_train_{model_name}_{date_time}.csv', 'a', newline='') as acc_train_file:
        csv.writer(b_loss_file).writerow([epoch+1] + losses[-1])
        csv.writer(e_loss_file).writerow([epoch+1, total_loss])
        csv.writer(acc_train_file).writerow([epoch+1, avg_accuracy])

    losses.append([])
    accus_train.append([])

    # Validation measurement
    validation_accuracy = eval_fn(model, val_loader)
    accus_val.append(validation_accuracy)
    with open(f'model_result/measurement/acc_val_{model_name}_{date_time}.csv', 'a', newline='') as acc_val_file:
        csv.writer(acc_val_file).writerow([epoch+1, validation_accuracy])

    # Logging evaluation metrics
    output_log_writer(f'[Epoch {epoch+1}] loss={total_loss:.2e}, train accu={avg_accuracy:.2%}, validation accu={validation_accuracy:.2%}')

    # Saving model checkpoint
    ckpt_save_path = f'model_result/model_checkpoint/MODEL_CKPT_{epoch+1}_{model_name}_{date_time}.pt'
    torch.save({
        'epoch': epoch+1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss,
    }, ckpt_save_path)
    output_log_writer(f'[Epoch {epoch+1}] Model checkpoint is saved.')

# Saving final model
model_save_path = f'model_result/{model_name}_{date_time}.pt'
torch.save(model, model_save_path)
output_log_writer('\n\nFinal model saved. Training Finished.')



Epoch 1: 100%|██████████| 2853/2853 [1:18:05<00:00,  1.64s/it]


NameError: name 'eval_fn' is not defined