In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
import importlib
import sys

# Determine the environment and import preprocessing module accordingly
def is_kaggle():
    return 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

if is_kaggle():
    print("Running on Kaggle")
    # Assuming 'preprocessing.py' and other scripts are in '/kaggle/input'
    kaggle_input_path = '/kaggle/usr/lib'
    sys.path.append(kaggle_input_path)
    
    import preprocessing_py.preprocessing_py as preprocessing
    import models_py.models_py as models
    import utils_py.utils_py as utils
   
    
    # Install missing libraries on kaggle
    ! pip install torchsummary
    ! pip install mlflow
else:
    print("Running locally")
    import scripts.preprocessing as preprocessing
    import scripts.models as models
    import scripts.utils as utils
    
    
    
# Reload the module (if necessary)
importlib.reload(preprocessing)
importlib.reload(models)
importlib.reload(utils)

# Other imports
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import torchsummary
import torch.optim as optim

import tqdm
import mlflow
import mlflow.pytorch

Running on Kaggle
Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
Collecting mlflow
  Downloading mlflow-2.13.2-py3-none-any.whl.metadata (29 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow)
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.3-py3-none-any.whl.metadata (10 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading g

In [2]:
device = utils.use_GPU()

CUDA is available!  Training on GPU ...
cuda:0


In [3]:
if is_kaggle():
    path = "/kaggle/input/ssl/pytorch/uploaded/1"
else:
    path="."
    

model = models.StormSqueezeNet()
StormModel = model.to(device)

model = models.SqueezeNet()  # untill we pass to new version
model.load_state_dict(torch.load(os.path.join(path,"netFromSSL_10e.pth")))
SSLmodel = model.to(device)

In [4]:
torchsummary.summary(model, (3, 224, 224));

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 111, 111]           1,792
              ReLU-2         [-1, 64, 111, 111]               0
         MaxPool2d-3           [-1, 64, 55, 55]               0
            Conv2d-4           [-1, 16, 55, 55]           1,040
              ReLU-5           [-1, 16, 55, 55]               0
            Conv2d-6           [-1, 64, 55, 55]           1,088
              ReLU-7           [-1, 64, 55, 55]               0
            Conv2d-8           [-1, 64, 55, 55]           9,280
              ReLU-9           [-1, 64, 55, 55]               0
             Fire-10          [-1, 128, 55, 55]               0
           Conv2d-11           [-1, 16, 55, 55]           2,064
             ReLU-12           [-1, 16, 55, 55]               0
           Conv2d-13           [-1, 64, 55, 55]           1,088
             ReLU-14           [-1, 64,

In [5]:
# Load data
if not is_kaggle():

    # check if the train folder is  already created
    if not os.path.exists('data/train'):
        folder_structure = preprocessing.create_dataset()
        # transform it in a dataframe and list the number of images per class in the folders
        a = pd.DataFrame([(k, len(v)) for k,v in folder_structure[0].items()], 
                            columns=['class', 'count'])
        b = pd.DataFrame([(k, len(v)) for k,v in folder_structure[1].items()], 
                            columns=['class', 'count'])
        image_counts = pd.merge(a, 
                                b, 
                                on='class', 
                                how='outer', 
                                suffixes=('_train', '_test'))
        
        
        image_counts.loc[np.argmin(image_counts['count_train']),:]
        # create a validation set
        preprocessing.create_validation(42);

In [6]:
ls

__notebook__.ipynb


In [7]:
if is_kaggle():
    im_dir = '/kaggle/input/food-dataset-sl/'
else:
    im_dir ='.'  

In [8]:
im_dir

'/kaggle/input/food-dataset-sl/'

In [9]:
transform = transforms.Compose([
    # resize 
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    # Normalize pixel values
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


# Load the training dataset
trainset = torchvision.datasets.ImageFolder(root=os.path.join(im_dir,'data/train'), transform=transform)

# Create data loader for training data with batch size 4 and shuffling
trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4)

valset = torchvision.datasets.ImageFolder(root=os.path.join(im_dir,'data/val'), transform=transform)

valloader = torch.utils.data.DataLoader(valset, batch_size=16, shuffle=True, num_workers=4)

testset = torchvision.datasets.ImageFolder(root=os.path.join(im_dir,'data/test'), transform=transform)

testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=4)


In [10]:
# Define criterion, optimizer and scheduler and other parameters for training
opt = "Adam"                        # optimizer to be used: ["Adam" or "SGD"]
momentum = 0.9                      # momentum ONLY for SGD optimizer
weight_decay = 1e-4                 # weight decay ONLY on Adam optimizer
step_size = 7                       # step size for the scheduler
gamma = 0.1                         # gamma for the scheduler

batch_size = 8                      # batch size
num_epochs=10                       # number of epochs
patience = 3                        # patience for early stopping
criterion ="CrossEntropyLoss"       # loss function to be used: ["CrossEntropyLoss", "MSELoss", "L1Loss", "NLLLoss"]
lr = 5e-5                           # learning rate

model_name = "squeezenet"           # model name
model = models.SqueezeNet()         # model



#set the optimizer
if opt == "Adam":
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
elif opt == "SGD":
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
else:
    print("Invalid optimizer")

#set the criterion
if criterion == "CrossEntropyLoss":
    criterion = nn.CrossEntropyLoss()
elif criterion == "MSELoss":
    criterion = nn.MSELoss()
elif criterion == "L1Loss":
    criterion = nn.L1Loss()
elif criterion == "NLLLoss":
    criterion = nn.NLLLoss()
else:
    print("Invalid criterion")

#set the scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# Upload model to correct device
model = model.to(device)

In [11]:
def set_training_parameters(model, model_name, opt, lr, weight_decay, momentum, criterion, step_size, gamma, num_epochs, patience, device):

    #set the optimizer
    if opt == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif opt == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    else:
        print("Invalid optimizer")

    #set the criterion
    if criterion == "CrossEntropyLoss":
        criterion = nn.CrossEntropyLoss()
    elif criterion == "MSELoss":
        criterion = nn.MSELoss()
    elif criterion == "L1Loss":
        criterion = nn.L1Loss()
    elif criterion == "NLLLoss":
        criterion = nn.NLLLoss()
    else:
        print("Invalid criterion")

    #set the scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Upload model to correct device
    model = model.to(device)
    
    training_parameters = {}
    training_parameters['model'] = model
    training_parameters['model_name'] = model_name
    training_parameters['criterion'] = criterion
    training_parameters['optimizer'] = optimizer
    training_parameters['scheduler'] = scheduler
    training_parameters['num_epochs'] = num_epochs
    training_parameters['patience'] = patience
    
    return training_parameters

In [12]:
# Training function based on above parameters
def train_model(model, model_name, trainloader, valloader, criterion, optimizer, scheduler, num_epochs=10, patience=3 ):

    mlflow.start_run(run_name=model_name)

    # Log model parameters
    mlflow.log_param("optimizer", opt)
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("momentum", momentum)
    mlflow.log_param("weight_decay", weight_decay)
    mlflow.log_param("step_size", step_size)
    mlflow.log_param("gamma", gamma)
    mlflow.log_param("patience", patience)


    patience_counter = 0
    best_model = None
    best_loss = np.inf
        
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0
        train_loader_tqdm = tqdm.tqdm(trainloader, desc=f"Epoch {epoch+1}/{num_epochs}", 
                                unit="batch")
        ind_rloss=1
        for inputs, labels in train_loader_tqdm:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Print statistics
            running_loss += loss.item()
            train_loader_tqdm.set_postfix(loss=running_loss / ind_rloss)
            ind_rloss +=1

        epoch_loss = running_loss / len(trainloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

        scheduler.step()
        mlflow.log_metric("train_loss", epoch_loss, step=epoch)
        
        # Validation loop (optional)
        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(valloader)
        val_accuracy = 100 * correct / total
        print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')
        
        # Log validation loss and accuracy
        mlflow.log_metric("val_loss", val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
        
        # Early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            best_model = model
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter > patience:
                print("Early stopping")
                break
                    
    # Log the model
    mlflow.pytorch.log_model(best_model, model_name)

    # End the MLflow run
    mlflow.end_run()

    print('Finished Training')


In [13]:
def Save_mlruns():
    print("moving to working directory...")
    ! cd /kaggle/working/
    print("zipping directory...")
    ! zip -r mlruns.zip mlruns
    print("!!REMEMBER TO DOWNLOAD IT FROM THE OUTPUT SECTION!!")
    print("back to home directory...")
    ! cd 

In [14]:
# example of training 

# Define criterion, optimizer and scheduler and other parameters for training
opt = "Adam"                        # optimizer to be used: ["Adam" or "SGD"]
momentum = 0.9                      # momentum ONLY for SGD optimizer
weight_decay = 1e-4                 # weight decay ONLY on Adam optimizer
step_size = 50                      # step size for the scheduler
gamma = 0.5                         # gamma for the scheduler

batch_size = 64                      # batch size
num_epochs=30                       # number of epochs
patience = 5                        # patience for early stopping
criterion ="CrossEntropyLoss"       # loss function to be used: ["CrossEntropyLoss", "MSELoss", "L1Loss", "NLLLoss"]
lr = 5e-5                           # learning rate

model_name = "StormModel_1.0-30_epochs"           # model name
model =  StormModel      # model, StormModel, SSLmodel

tr_param = set_training_parameters(model=model,model_name = model_name, opt=opt, lr=lr, weight_decay=weight_decay, 
                                   momentum=momentum, criterion=criterion, step_size=step_size, gamma=gamma, num_epochs=num_epochs, 
                                   patience=patience, device=device)

#stop eventual mlflow runs
mlflow.end_run()
train_model(**tr_param, trainloader=trainloader, valloader=valloader)

Epoch 1/30: 100%|██████████| 11860/11860 [05:02<00:00, 39.26batch/s, loss=5.05]

Epoch [1/30], Loss: 5.0515





Validation Loss: 4.6567, Validation Accuracy: 6.12%


Epoch 2/30: 100%|██████████| 11860/11860 [04:39<00:00, 42.38batch/s, loss=4.74]


Epoch [2/30], Loss: 4.7351
Validation Loss: 4.4352, Validation Accuracy: 8.66%


Epoch 3/30: 100%|██████████| 11860/11860 [04:36<00:00, 42.82batch/s, loss=4.56]

Epoch [3/30], Loss: 4.5614





Validation Loss: 4.2900, Validation Accuracy: 10.82%


Epoch 4/30: 100%|██████████| 11860/11860 [04:40<00:00, 42.35batch/s, loss=4.41]

Epoch [4/30], Loss: 4.4148





Validation Loss: 4.1440, Validation Accuracy: 12.96%


Epoch 5/30: 100%|██████████| 11860/11860 [04:40<00:00, 42.25batch/s, loss=4.3]

Epoch [5/30], Loss: 4.3011





Validation Loss: 4.0257, Validation Accuracy: 14.94%


Epoch 6/30: 100%|██████████| 11860/11860 [04:56<00:00, 39.98batch/s, loss=4.2]

Epoch [6/30], Loss: 4.2033





Validation Loss: 3.9377, Validation Accuracy: 16.27%


Epoch 7/30: 100%|██████████| 11860/11860 [04:50<00:00, 40.85batch/s, loss=4.12]

Epoch [7/30], Loss: 4.1196





Validation Loss: 3.8309, Validation Accuracy: 17.65%


Epoch 8/30: 100%|██████████| 11860/11860 [04:45<00:00, 41.59batch/s, loss=4.04]

Epoch [8/30], Loss: 4.0404





Validation Loss: 3.7783, Validation Accuracy: 18.92%


Epoch 9/30: 100%|██████████| 11860/11860 [04:52<00:00, 40.52batch/s, loss=3.97]

Epoch [9/30], Loss: 3.9679





Validation Loss: 3.7109, Validation Accuracy: 20.06%


Epoch 10/30: 100%|██████████| 11860/11860 [04:53<00:00, 40.45batch/s, loss=3.9]

Epoch [10/30], Loss: 3.9028





Validation Loss: 3.6627, Validation Accuracy: 20.48%


Epoch 11/30: 100%|██████████| 11860/11860 [04:44<00:00, 41.72batch/s, loss=3.85]

Epoch [11/30], Loss: 3.8477





Validation Loss: 3.6321, Validation Accuracy: 21.91%


Epoch 12/30: 100%|██████████| 11860/11860 [04:52<00:00, 40.49batch/s, loss=3.79]

Epoch [12/30], Loss: 3.7912





Validation Loss: 3.5562, Validation Accuracy: 23.08%


Epoch 13/30: 100%|██████████| 11860/11860 [04:53<00:00, 40.43batch/s, loss=3.74]

Epoch [13/30], Loss: 3.7365





Validation Loss: 3.5079, Validation Accuracy: 23.21%


Epoch 14/30: 100%|██████████| 11860/11860 [04:50<00:00, 40.81batch/s, loss=3.69]

Epoch [14/30], Loss: 3.6909





Validation Loss: 3.4607, Validation Accuracy: 24.48%


Epoch 15/30: 100%|██████████| 11860/11860 [04:49<00:00, 40.96batch/s, loss=3.65]

Epoch [15/30], Loss: 3.6474





Validation Loss: 3.4138, Validation Accuracy: 25.13%


Epoch 16/30: 100%|██████████| 11860/11860 [04:51<00:00, 40.62batch/s, loss=3.6]

Epoch [16/30], Loss: 3.5982





Validation Loss: 3.4110, Validation Accuracy: 25.47%


Epoch 17/30: 100%|██████████| 11860/11860 [04:50<00:00, 40.82batch/s, loss=3.56]


Epoch [17/30], Loss: 3.5569
Validation Loss: 3.3581, Validation Accuracy: 26.08%


Epoch 18/30: 100%|██████████| 11860/11860 [04:52<00:00, 40.51batch/s, loss=3.51]

Epoch [18/30], Loss: 3.5064





Validation Loss: 3.3182, Validation Accuracy: 27.35%


Epoch 19/30: 100%|██████████| 11860/11860 [04:45<00:00, 41.52batch/s, loss=3.47]

Epoch [19/30], Loss: 3.4733





Validation Loss: 3.2809, Validation Accuracy: 27.75%


Epoch 20/30: 100%|██████████| 11860/11860 [04:43<00:00, 41.82batch/s, loss=3.44]

Epoch [20/30], Loss: 3.4375





Validation Loss: 3.2479, Validation Accuracy: 28.63%


Epoch 21/30: 100%|██████████| 11860/11860 [04:45<00:00, 41.49batch/s, loss=3.4]

Epoch [21/30], Loss: 3.4021





Validation Loss: 3.2155, Validation Accuracy: 28.88%


Epoch 22/30: 100%|██████████| 11860/11860 [04:42<00:00, 41.97batch/s, loss=3.36]


Epoch [22/30], Loss: 3.3648
Validation Loss: 3.2331, Validation Accuracy: 28.89%


Epoch 23/30: 100%|██████████| 11860/11860 [04:37<00:00, 42.76batch/s, loss=3.34]

Epoch [23/30], Loss: 3.3383





Validation Loss: 3.1778, Validation Accuracy: 30.15%


Epoch 24/30: 100%|██████████| 11860/11860 [04:35<00:00, 43.13batch/s, loss=3.29]

Epoch [24/30], Loss: 3.2934





Validation Loss: 3.1845, Validation Accuracy: 30.29%


Epoch 25/30: 100%|██████████| 11860/11860 [04:37<00:00, 42.67batch/s, loss=3.27]

Epoch [25/30], Loss: 3.2690





Validation Loss: 3.1304, Validation Accuracy: 30.76%


Epoch 26/30: 100%|██████████| 11860/11860 [04:49<00:00, 40.92batch/s, loss=3.24]


Epoch [26/30], Loss: 3.2380
Validation Loss: 3.1192, Validation Accuracy: 31.15%


Epoch 27/30: 100%|██████████| 11860/11860 [04:41<00:00, 42.11batch/s, loss=3.21]

Epoch [27/30], Loss: 3.2130





Validation Loss: 3.1072, Validation Accuracy: 31.59%


Epoch 28/30: 100%|██████████| 11860/11860 [04:42<00:00, 41.96batch/s, loss=3.19]

Epoch [28/30], Loss: 3.1850





Validation Loss: 3.0459, Validation Accuracy: 32.58%


Epoch 29/30: 100%|██████████| 11860/11860 [04:41<00:00, 42.14batch/s, loss=3.16]

Epoch [29/30], Loss: 3.1626





Validation Loss: 3.0429, Validation Accuracy: 32.69%


Epoch 30/30: 100%|██████████| 11860/11860 [04:41<00:00, 42.07batch/s, loss=3.13]

Epoch [30/30], Loss: 3.1340





Validation Loss: 3.0103, Validation Accuracy: 32.87%
Finished Training


In [15]:
if is_kaggle():
    Save_mlruns()

moving to working directory...
zipping directory...
  adding: mlruns/ (stored 0%)
  adding: mlruns/.trash/ (stored 0%)
  adding: mlruns/0/ (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/ (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/tags/ (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/tags/mlflow.source.name (deflated 3%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/tags/mlflow.source.type (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/tags/mlflow.user (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/tags/mlflow.log-model.history (deflated 44%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/tags/mlflow.runName (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/params/ (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/params/patience (stored 0%)
  adding: mlruns/0/03e4b39c75e340b88605e5ad0c46fbe8/params/step_size (stored 0%)
  adding: mlruns/0/03e4b39c7