## Imports

In [None]:
!pip install pillow
!pip install torch
!pip install torchvision
!pip install numpy

In [3]:
# Imports
import io
import boto3
import random
from PIL import Image
import numpy as np

import torch
from torchvision import datasets, transforms
import torchvision
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Constants
BUCKET_NAME = "pnuemonia-chest-xrays-ids721"
MODEL_PATH = "models/pnuemonia_model.pt"
NUM_EPOCHS = 10

## Functions

### Functions to load the data

In [4]:
def load_data(train=True, test=True, val=True):
    # Create the client
    s3 = boto3.client('s3')

    # Generate the prefixes for each image set
    train_prefix = "chest_xrays/train"
    test_prefix = "chest_xrays/test"
    val_prefix = "chest_xrays/val"
    
    # Load the data
    if train:
        print("Loading the training data...")
        normal_train = get_s3_objects(s3, train_prefix, "NORMAL")
        pnue_train = get_s3_objects(s3, train_prefix, "PNEUMONIA")
        
        Xtrain, ytrain = combine_data(normal_train, pnue_train)
    
    if test:
        print("Loading the testing data...")
        normal_test = get_s3_objects(s3, test_prefix, "NORMAL")
        pnue_test = get_s3_objects(s3, test_prefix, "PNEUMONIA")
        
        Xtest, ytest = combine_data(normal_test, pnue_test)
    
    if val:
        print("Loading the validation data...")
        normal_val = get_s3_objects(s3, val_prefix, "NORMAL")
        pnue_val = get_s3_objects(s3, val_prefix, "PNEUMONIA")
        
        Xval, yval = combine_data(normal_val, pnue_val)
    
    if train and test and val:
        return Xtrain, ytrain, Xtest, ytest, Xval, yval
    if train and test:
        return Xtrain, ytrain, Xtest, ytest
    if train and val:
        return Xtrain, ytrain, Xval, yval
    if test and val:
        return Xtest, ytest, Xval, yval
    if train:
        return Xtrain, ytrain
    if test:
        return Xtest, ytest
    if val:
        return Xval, yval
    
    return None
    
def get_s3_objects(s3_client, prefix, class_name):
    # Get the list of objects in the bucket
    files = s3_client.list_objects(Bucket=BUCKET_NAME, Prefix=f"{prefix}/{class_name}")["Contents"]
    
    # Get the filenames
    filenames = [file["Key"] for file in files]
    
    # Loop thruogh the filenames
    images = []
    for file in filenames:
        try: 
            # Get the object
            file_obj = io.BytesIO()
            s3_client.download_fileobj(Bucket=BUCKET_NAME, Key=file, Fileobj=file_obj)
            # Load the image
            img = Image.frombytes(mode="L", size=(112, 112), data=file_obj.getvalue())
            img = np.array(img.resize((224, 224)))
            # Mimic RGB to use pretrained model
            rgb_img = np.repeat(img[:, :, np.newaxis], 3, -1)
            images.append(rgb_img)
        except:
            continue # If there is an error, skip the file
    
    return images
    
def combine_data(normal, pnue):
    # Generate the labels
    labels = [0] * len(normal) + [1] * len(pnue)

    # Combine the data
    images = normal + pnue

    # Shuffle the data
    random.seed(42)
    random.shuffle(images)
    random.shuffle(labels)
    
    # Return the results
    return images, labels

### Functions to setup modeling

In [5]:
def create_dataloaders(xval, yval, batch_size=8):
    # Create the datasets
    val_data = TensorDataset(torch.tensor(np.array(xval)).float() , torch.tensor(np.array(yval)))
    
    # Create the dataloaders
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    
    # Return the dataloaders
    return val_loader

In [6]:
def define_model():
    # Get resnet50 model
    model = torchvision.models.resnet50(pretrained=True)
    
    # Freeze the parameters
    for param in model.parameters():
        param.requires_grad = False
        
    # Define the classifier
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 2)
    
    # Return the model
    return model

### Functions for training and saving the model

In [7]:
def train_model(model, criterion, optimizer, data_loader, num_epochs=1):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1} of {num_epochs}")

        # Set model to training mode
        model.train()

        running_loss = 0.0
        running_corrects = 0

        # Get the input images and labels, and send to GPU if available
        for inputs, labels in data_loader:
            inputs = inputs.reshape(-1, 3, 224, 224)
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Zero the weight gradients
            optimizer.zero_grad()

            # Forward pass to get outputs and calculate loss
            # Track gradient only for training data
            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels.long())

                # Backpropagation to get the gradients with respect to each weight
                loss.backward()
                # Update the weights
                optimizer.step()

            # Convert loss into a scalar and add it to running_loss
            running_loss += loss.item() * inputs.size(0)
            # Track number of correct predictions
            running_corrects += torch.sum(preds == labels.data)
    
    # Return the model
    return model

In [8]:
def test_model(model, data_loader):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Set the model to evaluation mode
    model.eval()
    y_true = []
    y_pred = []
    
    # For each batch
    for inputs, labels in data_loader:
        inputs = inputs.reshape(-1, 3, 224, 224)
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # Feed inputs through model to get raw scores
        logits = model.forward(inputs)
        # Convert raw scores to probabilities (not necessary since we just care about discrete probs in this case)
        probs = F.softmax(logits,dim=1)
        # Get discrete predictions using argmax
        preds = np.argmax(probs.cpu().detach().numpy(),axis=1)
        # Add predictions and actuals to lists
        y_pred.extend(preds)
        y_true.extend(labels.cpu())
    
    # Calculate the accuracy
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    accuracy = np.sum(y_pred == y_true)/y_true.shape[0]
    
    return accuracy

In [9]:
def save_model(model):
    # Save the model
    torch.save(model.state_dict(), MODEL_PATH)

## Training

In [10]:
# Get the data
Xtrain, ytrain, Xval, yval = load_data(train=True, test=False, val=True)

# Convert the data to tensors
train_data = create_dataloaders(Xtrain, ytrain)
val_data = create_dataloaders(Xval, yval)

# Setup the mdoel
model = define_model()
critereon = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
model = train_model(model, critereon, optimizer, train_data, num_epochs=NUM_EPOCHS)

# Get training and validation accuracy
train_acc = test_model(model, train_data)
val_acc = test_model(model, val_data)

print(f"Training Accuracy: {train_acc}")
print(f"Validation Accuracy: {val_acc}")

# Save the model
save_model(model)

Loading the training data...
Loading the testing data...
Loading the validation data...




Epoch 1 of 10
Epoch 2 of 10
Epoch 3 of 10
Epoch 4 of 10
Epoch 5 of 10
Epoch 6 of 10
Epoch 7 of 10
Epoch 8 of 10
Epoch 9 of 10
Epoch 10 of 10
Training Accuracy: 0.6421421421421422
Validation Accuracy: 0.5


RuntimeError: Parent directory models does not exist.