In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
import importlib
import sys

# Determine the environment and import preprocessing module accordingly
def is_kaggle():
    return 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

if is_kaggle():
    print("Running on Kaggle")
    # Assuming 'preprocessing.py' and other scripts are in '/kaggle/input'
    kaggle_input_path = '/kaggle/usr/lib'
    sys.path.append(kaggle_input_path)
    
    import preprocessing_py.preprocessing_py as preprocessing
    import models_py.models_py as models
    import utils_py.utils_py as utils
   
    
    # Install missing libraries on kaggle
    ! pip install torchsummary
    ! pip install mlflow
else:
    print("Running locally")
    import scripts.preprocessing as preprocessing
    import scripts.models as models
    import scripts.utils as utils
    
    
    
# Reload the module (if necessary)
importlib.reload(preprocessing)
importlib.reload(models)
importlib.reload(utils)

# Other imports
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import torchsummary
import torch.optim as optim

import tqdm
import mlflow
import mlflow.pytorch

Running locally


In [2]:
device = utils.use_GPU()

CUDA is available!  Training on GPU ...
cuda:0


In [3]:
if is_kaggle():
    path_SSL = "/kaggle/input/..."   # insert paths to the model in kaggle
    path_Storm = "/kaggle/input/... " # insert paths to the model in kaggle
else:
    path_SSL = "."   # insert paths to the model for local runs
    path_Storm = "." # insert paths to the model for local runs
    


model = models.StormModel()    # first version of the model
#model = models.StormModel2()  # second version of the model

# use this string to load a state dict from a file
# model=torch.load("path_Storm")

# use this string to load a dict from a file
# model.load_state_dict(torch.load("path_Storm")))


StormModel = model.to(device)

In [4]:
# Print the summary of the model
torchsummary.summary(StormModel, (3, 224, 224));

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 111, 111]           1,792
         LeakyReLU-2         [-1, 64, 111, 111]               0
         MaxPool2d-3           [-1, 64, 55, 55]               0
            Conv2d-4           [-1, 16, 55, 55]           1,040
         LeakyReLU-5           [-1, 16, 55, 55]               0
       BatchNorm2d-6           [-1, 16, 55, 55]              32
            Conv2d-7           [-1, 64, 55, 55]           1,088
         LeakyReLU-8           [-1, 64, 55, 55]               0
       BatchNorm2d-9           [-1, 64, 55, 55]             128
           Conv2d-10           [-1, 64, 55, 55]           9,280
        LeakyReLU-11           [-1, 64, 55, 55]               0
      BatchNorm2d-12           [-1, 64, 55, 55]             128
        FireStorm-13          [-1, 128, 55, 55]               0
           Conv2d-14           [-1, 16,

In [5]:
# Load data
if not is_kaggle():  # on kaggle the data must be already loaded with the correct structure

    # check if the train folder is  already created
    if not os.path.exists('data/train'):

        folder_structure = preprocessing.create_dataset()

        # transform it in a dataframe and list the number of images per class in the folders

        a = pd.DataFrame([(k, len(v)) for k,v in folder_structure[0].items()], 
                            columns=['class', 'count'])
        b = pd.DataFrame([(k, len(v)) for k,v in folder_structure[1].items()], 
                            columns=['class', 'count'])
        image_counts = pd.merge(a, 
                                b, 
                                on='class', 
                                how='outer', 
                                suffixes=('_train', '_test'))
        
        
        image_counts.loc[np.argmin(image_counts['count_train']),:]
        
        # create a validation set
        preprocessing.create_validation(42);

In [6]:
# Choose the correct data directory
if is_kaggle():
    im_dir = '/kaggle/input/food-dataset-sl/'
else:
    im_dir ='.'  

im_dir

'.'

In [7]:
#Apply transformations to the images 
transform = transforms.Compose([
    # resize 
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    #transforms.RandomCrop(224),
    transforms.ToTensor(),
    # Normalize pixel values
    transforms.Normalize(mean= [0.6388, 0.5445, 0.4448],  std =  [0.2713, 0.2864, 0.3131]),
])

# set the num_worker for the dataloader
num_workers = 4

# Load the training dataset
trainset = torchvision.datasets.ImageFolder(root=os.path.join(im_dir,'data/train'), transform=transform)

# Create data loader for training data
trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=num_workers)

valset = torchvision.datasets.ImageFolder(root=os.path.join(im_dir,'data/val'), transform=transform)

#create data loader for validation data
valloader = torch.utils.data.DataLoader(valset, batch_size=16, shuffle=True, num_workers=num_workers)

testset = torchvision.datasets.ImageFolder(root=os.path.join(im_dir,'data/test'), transform=transform)

#create data loader for test data
testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=num_workers)


In [8]:
# Define criterion, optimizer and scheduler and other parameters for training

#  PARAMETERS HERE ARE JUST EXAMPLES, THE PARAMETERS FOR THE USER ARE DEFINED IN THE FOLLOWING CELLS

opt = "Adam"                        # optimizer to be used: ["Adam" or "SGD"]
momentum = 0.9                      # momentum ONLY for SGD optimizer
weight_decay = 1e-4                 # weight decay ONLY on Adam optimizer
step_size = 7                       # step size for the scheduler
gamma = 0.1                         # gamma for the scheduler

batch_size = 8                      # batch size
num_epochs=10                       # number of epochs
patience = 3                        # patience for early stopping
criterion ="CrossEntropyLoss"       # loss function to be used: ["CrossEntropyLoss", "MSELoss"]
lr = 5e-5                           # learning rate

model_name = "StormModel"           # model name
model = model                       # model



#set the optimizer
if opt == "Adam":
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
elif opt == "SGD":
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
else:
    print("Invalid optimizer")

#set the criterion
if criterion == "CrossEntropyLoss":
    criterion = nn.CrossEntropyLoss()
elif criterion == "MSELoss":
    criterion = nn.MSELoss()
else:
    print("Invalid criterion")

#set the scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# Upload model to correct device
model = model.to(device)

In [9]:
def set_training_parameters(model, model_name, opt, lr, weight_decay, momentum, criterion, step_size, gamma, num_epochs, patience, device):

    #set the optimizer
    if opt == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif opt == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    else:
        print("Invalid optimizer")

    #set the criterion
    if criterion == "CrossEntropyLoss":
        criterion = nn.CrossEntropyLoss()
    elif criterion == "MSELoss":
        criterion = nn.MSELoss()
    elif criterion == "L1Loss":
        criterion = nn.L1Loss()
    elif criterion == "NLLLoss":
        criterion = nn.NLLLoss()
    else:
        print("Invalid criterion")

    #set the scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Upload model to correct device
    model = model.to(device)
    
    training_parameters = {}
    training_parameters['model'] = model
    training_parameters['model_name'] = model_name
    training_parameters['criterion'] = criterion
    training_parameters['optimizer'] = optimizer
    training_parameters['scheduler'] = scheduler
    training_parameters['num_epochs'] = num_epochs
    training_parameters['patience'] = patience
    
    return training_parameters

In [10]:
# Training function based on above parameters
def train_model(model, model_name, trainloader, valloader, criterion, optimizer, scheduler, num_epochs=10, patience=3 ):

    mlflow.start_run(run_name=model_name)

    # Log model parameters
    mlflow.log_param("optimizer", opt)
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("momentum", momentum)
    mlflow.log_param("weight_decay", weight_decay)
    mlflow.log_param("step_size", step_size)
    mlflow.log_param("gamma", gamma)
    mlflow.log_param("patience", patience)

    starting_step=0

    patience_counter = 0
    best_model = None
    best_loss = np.inf
        
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0
        train_loader_tqdm = tqdm.tqdm(trainloader, desc=f"Epoch {epoch+1}/{num_epochs}", 
                                unit="batch")
        ind_rloss=1
        for inputs, labels in train_loader_tqdm:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Print statistics
            running_loss += loss.item()
            train_loader_tqdm.set_postfix(loss=running_loss / ind_rloss)
            ind_rloss +=1

        epoch_loss = running_loss / len(trainloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

        scheduler.step()
        mlflow.log_metric("train_loss", epoch_loss, step=starting_step+epoch)
        
        # Validation loop (optional)
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(valloader)
        val_accuracy = 100 * correct / total
        print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')
        
        # Log validation loss and accuracy
        mlflow.log_metric("val_loss", val_loss, step=starting_step+epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=starting_step+epoch)
        
        # Early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            best_model = model
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter > patience:
                print("Early stopping")
                break
                    
    # Log the model
    mlflow.pytorch.log_model(best_model, model_name)

    # End the MLflow run
    mlflow.end_run()

    print('Finished Training')


In [11]:
#Training section: MODIFY THE PARAMETERS BELOW

# Define criterion, optimizer and scheduler and other parameters for training
opt = "Adam"                        # optimizer to be used: ["Adam" or "SGD"]
momentum = 0.9                      # momentum ONLY for SGD optimizer
weight_decay = 1e-4                 # weight decay ONLY on Adam optimizer
step_size = 20                     # step size for the scheduler
gamma = 0.5                         # gamma for the scheduler

batch_size = 64                      # batch size
num_epochs= 10                     # number of epochs
patience = 5                        # patience for early stopping
criterion ="CrossEntropyLoss"       # loss function to be used: ["CrossEntropyLoss", "MSELoss", "L1Loss", "NLLLoss"]
lr = 1e-5                           # learning rate

model_name = "SSL_Storm_Model_samedata"           # model name
model =  StormModel               # model

tr_param = set_training_parameters(model=model,model_name = model_name, opt=opt, lr=lr, weight_decay=weight_decay, 
                                   momentum=momentum, criterion=criterion, step_size=step_size, gamma=gamma, num_epochs=num_epochs, 
                                   patience=patience, device=device)

#stop eventual mlflow runs
mlflow.end_run()
train_model(**tr_param, trainloader=trainloader, valloader=valloader)

Epoch 1/10:   3%|▎         | 342/11860 [00:07<04:15, 45.04batch/s, loss=5.52]


KeyboardInterrupt: 