<a href="https://colab.research.google.com/github/DanielleRaine/Bird-Species-Distribution-Modeling-with-Location-Information/blob/main/CLEAN_UP_ACTUAL_MLP_Loss%2BTrainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nbimporter
!pip install import-ipynb
!pip install torch
!pip install torcheval
!pip install torchmetrics
!pip install comet-ml

Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl.metadata (252 bytes)
Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
Collecting import-ipynb
  Downloading import_ipynb-0.2-py3-none-any.whl.metadata (2.3 kB)
Collecting jedi>=0.16 (from IPython->import-ipynb)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading import_ipynb-0.2-py3-none-any.whl (4.0 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, import-ipynb
Successfully installed import-ipynb-0.2 jedi-0.19.2
Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m5.7 MB/

In [None]:
import sys
import os
import json
import pickle
from typing import Any
from ctypes import sizeof

from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

import torch
import torch.utils.data
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

from torcheval.metrics import MeanSquaredError
from torchmetrics.regression import MeanAbsoluteError
from torchmetrics.classification import Accuracy

from google.colab import userdata
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd

Mounted at /content/drive


In [None]:
class BirdHotspotsDataset(Dataset):
  """
  In-memory dataset pairing each hotspot's feature vector with its target vector.

  Every item is a two-element list [features, targets] built once in __init__.
  """

  def __init__(self, features_df, targets_df):
    """
    Builds one [features, targets] pair per row of features_df.
    Args:
      features_df (pd.DataFrame): One row per hotspot; must contain a "hotspot_id"
        column, every other column is used as a feature (non-numeric values become NaN).
      targets_df (pd.DataFrame): Must contain "hotspot_id" and "num_complete_checklists";
        every other column is used as a target value.
    """

    non_target_columns = ["hotspot_id", "num_complete_checklists"]
    records = []

    for _, feature_row in features_df.iterrows():
      # Features: everything in the row except the identifier, coerced to numbers.
      raw_features = feature_row.drop(labels = ["hotspot_id"]).values
      feature_tensor = torch.tensor(pd.to_numeric(raw_features, errors = "coerce")).squeeze()

      # Targets: the row(s) of targets_df that share this hotspot's identifier.
      same_hotspot = targets_df["hotspot_id"] == feature_row["hotspot_id"]
      target_values = targets_df.loc[same_hotspot].drop(columns = non_target_columns).values
      target_tensor = torch.tensor(target_values).squeeze()

      records.append([feature_tensor, target_tensor])

    self.data = records

  def __len__(self):
    """
    Returns:
      length (int): Number of [features, targets] pairs held by the dataset.
    """

    return len(self.data)

  def __getitem__(self, idx):
    """
    Looks up one stored pair.
    Args:
      idx (int): Position of the pair.
    Returns:
      pair (list): [features (torch.Tensor), targets (torch.Tensor)].
    """

    return self.data[idx]

In [None]:
# The next two cells are for turning the data into datasets that the model can use.
# The third cell is for pickling the dataset for future use, as preprocessing can take a while.
# The fourth is for loading the pickled datasets. Skip to the fourth if the pickles already exist.

In [None]:
# The data used for the model using un-augmented data
# training_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/train_split.csv")
# evaluation_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/valid_split.csv")
# testing_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/test_split.csv")
# targets_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv")

# The data used for the model using augmented data
# training_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_train.csv")
# evaluation_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_val.csv")
# testing_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_test.csv")
# targets_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv")

# Columns that will be used for the data
# df_columns = [f"bio_{i}" for i in range(1, 20)] + ["hotspot_id", "orcdrc", "phihox", "cecsol", "bdticm", "clyppt", "sltppt", "sndppt", "bldfie"]

# training_df = training_df[df_columns]
# evaluation_df = evaluation_df[df_columns]
# testing_df = testing_df[df_columns]

In [None]:
# Turn the data into pytorch datasets
# training_set = BirdHotspotsDataset(training_df, targets_df)
# evaluation_set = BirdHotspotsDataset(evaluation_df, targets_df)
# testing_set = BirdHotspotsDataset(testing_df, targets_df)

Index(['hotspot_id', 'lon', 'lat', 'county', 'county_code', 'state',
       'state_code', 'num_complete_checklists', 'num_different_species',
       'bio_1', 'bio_2', 'bio_3', 'bio_4', 'bio_5', 'bio_6', 'bio_7', 'bio_8',
       'bio_9', 'bio_10', 'bio_11', 'bio_12', 'bio_13', 'bio_14', 'bio_15',
       'bio_16', 'bio_17', 'bio_18', 'bio_19', 'bdticm', 'bldfie', 'cecsol',
       'clyppt', 'orcdrc', 'phihox', 'sltppt', 'sndppt', 'split'],
      dtype='object')

In [None]:
# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_train.p", "wb") as f:
#     pickle.dump(training_set, f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_val.p", "wb") as f:
#     pickle.dump(evaluation_set, f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_test.p", "wb") as f:
#     pickle.dump(testing_set, f)

In [None]:
# Unpickling the augmented datasets.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only load
# pickles that were produced by this project.
def load_pickled_dataset(file_name, dataset_dir="/content/drive/MyDrive/TeamMila/ProjectDataset"):
  """
  Loads one pickled object from the project dataset directory.
  Args:
    file_name (str): Name of the pickle file inside dataset_dir.
    dataset_dir (str): Directory that holds the pickled datasets.
  Returns:
    dataset: The unpickled object.
  """

  with open(os.path.join(dataset_dir, file_name), "rb") as f:
    return pickle.load(f)

training_set = load_pickled_dataset("normalized_train.p")
evaluation_set = load_pickled_dataset("normalized_val.p")
testing_set = load_pickled_dataset("normalized_test.p")

# Unpickling the un-augmented datasets.
# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/train_split.p", "rb") as f:
#     training_set = pickle.load(f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/valid_split.p", "rb") as f:
#   evaluation_set = pickle.load(f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/test_split.p", "rb") as f:
#   testing_set = pickle.load(f)

In [None]:
# DataLoaders for the three splits of the data.
# Only the training split is shuffled: the accuracy metrics are averaged per batch,
# so shuffling the validation/test splits would change the batch composition (and
# therefore the reported numbers) from run to run.
train_dataloader = DataLoader(training_set, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(evaluation_set, batch_size=64, shuffle=False)
test_dataloader = DataLoader(testing_set, batch_size=64, shuffle=False)

In [None]:
# Pull a single batch from the training loader and print the batch shapes together
# with the per-sample feature and target lengths as a sanity check.
X_train, y_train = next(iter(train_dataloader))
print(X_train.shape, y_train.shape, X_train.shape[1], y_train.shape[1])
# x, y = next(iter(valid_dataloader))
# print(x.shape, y.shape, len(x[0]), len(y[0]))
# x, y = next(iter(test_dataloader))
# print(x.shape, y.shape, len(x[0]), len(y[0]))

torch.Size([64, 27]) torch.Size([64, 671]) 27 671


In [None]:
class EncounterRateMLP(torch.nn.Module):
  """
  Two-layer MLP that maps a feature vector to one sigmoid output per class.

  Layers live in `self.feats`: Linear -> LeakyReLU -> Linear -> LeakyReLU.
  """

  def __init__(self,num_inputs, num_classes, hidden_dimensions=128,ebd=False):
      """
      Args:
        num_inputs (int): Number of input features.
        num_classes (int): Number of outputs (one per class).
        hidden_dimensions (int): Width of the hidden layer.
        ebd (bool): Not used by this class; kept so existing calls keep working.
      """
      super(EncounterRateMLP, self).__init__()
      self.feats = nn.Sequential(
          nn.Linear(num_inputs, hidden_dimensions),
          nn.LeakyReLU(inplace=True),
          nn.Linear(hidden_dimensions, num_classes),
          # NOTE(review): this activation sits between the output layer and the
          # sigmoid applied in forward(), so negative pre-activations are shrunk
          # towards zero before the sigmoid. Confirm that this is intended.
          nn.LeakyReLU(inplace=True)
      )
      # Whether eval_single_class adds the output layer's bias term. Derived from
      # the layer itself so it always matches what forward() computes.
      self.inc_bias = self.feats[2].bias is not None

  def forward(self, x, class_of_interest=None, return_feats=False):
      """
      Runs the full network and squashes the result with a sigmoid.
      Args:
        x (torch.Tensor): Input features; last dimension must equal num_inputs.
        class_of_interest: Accepted but currently ignored.
        return_feats: Accepted but currently ignored.
      Returns:
        torch.Tensor: Sigmoid outputs, one value per class.
      """
      return torch.sigmoid(self.feats(x))

  # evaluates a single class
  def eval_single_class(self, x, class_of_interest):
      """
      Computes the output layer's raw (pre-activation) value for one class.

      The original implementation read `self.class_emb`, which is never defined
      on this class, so every call raised AttributeError. The class weights live
      in the second Linear layer of `self.feats`, which is used here instead.
      Args:
        x (torch.Tensor): Hidden-layer activations, i.e. the input of the output
          Linear layer; last dimension must equal hidden_dimensions.
        class_of_interest (int): Index of the class to evaluate.
      Returns:
        torch.Tensor: Dot product of x with that class's weight row, plus the
          class bias when `self.inc_bias` is true.
      """
      output_layer = self.feats[2]
      # dot product with the weight row of the requested class
      class_logit = torch.matmul(x, output_layer.weight[class_of_interest, :])
      if self.inc_bias:
          class_logit = class_logit + output_layer.bias[class_of_interest]
      return class_logit

In [None]:
def TopKAccuracy(outputs, targets, k = None):
    """
    Fraction of samples for which at least one of the top-K predicted classes is
    also one of the top-K target classes.

    The previous implementation compared the two index lists position by position
    (`top_k_preds == true_labels`), so a class only counted when it had the same
    rank in both lists. Membership is now tested regardless of rank.

    Args:
        outputs (torch.Tensor): Predictions of shape (batch, num_classes).
        targets (torch.Tensor): Targets of shape (batch, num_classes).
        k (int | None): Number of top entries to compare. When None, K is chosen
            per sample as the number of nonzero target values (a sample with no
            nonzero target counts as a miss). Values larger than the number of
            classes are clamped.
    Returns:
        float: Mean hit rate over the batch (NaN for an empty batch).
    """
    batch_size = outputs.shape[0]
    if batch_size == 0:
        return float('nan')

    if k is None:
        # Let K be the number of nonzero values for each set of targets
        hits = 0
        for output, target in zip(outputs, targets):
            row_k = torch.count_nonzero(target).item()
            top_k_preds = torch.topk(output, row_k, dim=0).indices
            true_labels = torch.topk(target, row_k, dim=0).indices
            # (row_k, row_k) table: does predicted index i equal true index j?
            overlap = top_k_preds.unsqueeze(1) == true_labels.unsqueeze(0)
            hits += int(overlap.any().item())
        return hits / batch_size

    # topk raises when k exceeds the size of the dimension it ranks
    k = min(k, outputs.shape[1])

    # Get the top K predictions' and targets' indices
    top_k_preds = torch.topk(outputs, k, dim=1).indices
    true_labels = torch.topk(targets, k, dim=1).indices

    # (batch, k, k) table: does predicted index i equal true index j?
    overlap = top_k_preds.unsqueeze(2) == true_labels.unsqueeze(1)
    correct = overlap.any(dim=2).any(dim=1)

    # Return the average success of the batch of predictions
    return correct.float().mean().item()

In [None]:
def training_step(model, dataloader, eval_dataloader, criterion, optimizer, device, checkpoint_dir, num_epochs, experiment = None):
    """
    Trains `model` for `num_epochs` epochs and evaluates it after every epoch.

    Per epoch this function:
      * runs one optimisation pass over `dataloader`,
      * computes MSE, MAE and top-10 / top-30 / top-K accuracy on both loaders,
      * logs those metrics to `experiment` when one is given,
      * writes a full checkpoint to `checkpoints_epoch_<n>.pth` in the working directory,
      * saves the model weights to `<checkpoint_dir>/best_checkpoint.pth` whenever the
        mean training loss of the epoch is the lowest seen so far.

    The best-model check used to sit after the epoch loop, so it ran only once and
    always saved the final model; it now runs every epoch.

    Args:
        model (torch.nn.Module): Model to train; moved to `device`.
        dataloader: Iterable of (inputs, targets) training batches; must support len().
        eval_dataloader: Iterable of (inputs, targets) evaluation batches.
        criterion: Callable loss, called as criterion(outputs, targets).
        optimizer (torch.optim.Optimizer): Optimizer over the model's parameters.
        device: Device the model and batches are moved to.
        checkpoint_dir (str): Directory for best_checkpoint.pth (created if missing).
        num_epochs (int): Number of epochs to run.
        experiment: Optional object with a log_metrics(dict, step=...) method.
    Returns:
        None
    """
    model.to(device)
    # Make sure the target directory exists before the first best-model save.
    os.makedirs(checkpoint_dir, exist_ok=True)
    best_loss = float('inf')
    best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint.pth')
    mse_metric = MeanSquaredError().to(device)
    mae_metric = MeanAbsoluteError().to(device)
    mse_metric_eval = MeanSquaredError().to(device)
    mae_metric_eval = MeanAbsoluteError().to(device)

    for epoch in range(num_epochs):
        model.train()
        mse_metric.reset()
        mae_metric.reset()
        mse_metric_eval.reset()
        mae_metric_eval.reset()

        print("A NEW EPOCH HAS STARTED")

        running_loss = 0.0

        top_10_correct = 0
        top_30_correct = 0
        top_k_correct = 0
        num_batches = 0

        top_10_correct_eval = 0
        top_30_correct_eval = 0
        top_k_correct_eval = 0
        num_batches_eval = 0

        for inputs, targets in dataloader:
            # The model and the loss both need float tensors.
            inputs, targets = inputs.float().to(device), targets.float().to(device)

            optimizer.zero_grad()

            outputs = model(inputs)

            # The targets are clamped into [0, 1] before the loss is computed.
            targets = torch.clamp(targets,0,1)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Metrics only need the values, not the autograd graph.
            predictions = outputs.detach()

            mse_metric.update(predictions, targets)
            mae_metric.update(predictions, targets)

            top_10_correct += TopKAccuracy(predictions, targets, k = 10)
            top_30_correct += TopKAccuracy(predictions, targets, k = 30)
            top_k_correct += TopKAccuracy(predictions, targets)
            num_batches += 1

        model.eval()

        with torch.no_grad():
            for inputs, targets in eval_dataloader:
                inputs, targets = inputs.float().to(device), targets.float().to(device)
                outputs = model(inputs)
                targets = torch.clamp(targets,0,1)

                mse_metric_eval.update(outputs, targets)
                mae_metric_eval.update(outputs, targets)

                top_10_correct_eval += TopKAccuracy(outputs, targets, k = 10)
                top_30_correct_eval += TopKAccuracy(outputs, targets, k = 30)
                top_k_correct_eval += TopKAccuracy(outputs, targets)
                num_batches_eval += 1

        # Convert everything to plain floats so printing and logging never depend
        # on tensor types or devices.
        mse = mse_metric.compute().item()
        mae = mae_metric.compute().item()

        top_10 = float(top_10_correct / num_batches)
        top_30 = float(top_30_correct / num_batches)
        top_k = float(top_k_correct / num_batches)

        mse_eval = mse_metric_eval.compute().item()
        mae_eval = mae_metric_eval.compute().item()

        top_10_eval = float(top_10_correct_eval / num_batches_eval)
        top_30_eval = float(top_30_correct_eval / num_batches_eval)
        top_k_eval = float(top_k_correct_eval / num_batches_eval)

        if experiment is not None:
            experiment.log_metrics({
                "mse": mse,
                "mae": mae,
                "top_10_accuracy": top_10,
                "top_30_accuracy": top_30,
                "top_k_accuracy": top_k,
                "mse_eval": mse_eval,
                "mae_eval": mae_eval,
                "top_10_accuracy_eval": top_10_eval,
                "top_30_accuracy_eval": top_30_eval,
                "top_k_accuracy_eval": top_k_eval
                }, step=epoch
            )

        print(f"Epoch [{epoch + 1}/{num_epochs}], " f"Mean Squared Error: {mse:.5f}, " f"Mean Absolute Error: {mae:.5f}, " f"Top 10 Accuracy: {top_10:.5f}, " f"Top 30 Accuracy: {top_30:.5f}," f" Top K Accuracy: {top_k:.5f}")

        checkpoint = {
            'epoch': epoch + 1,  # Save the epoch number (1-based index)
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.item()  # loss of the last training batch of this epoch
        }

        # NOTE(review): per-epoch checkpoints are written to the working directory,
        # not to checkpoint_dir. Kept as-is so existing files keep their location.
        torch.save(checkpoint, f"checkpoints_epoch_{epoch+1}.pth")

        # Track the best model by mean training loss, once per epoch.
        epoch_loss = running_loss / len(dataloader)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), best_checkpoint_path)
            print(f'CURRENT BEST MODEL: {epoch + 1} LOSS: {best_loss:.5f}')

        print(f'CURRENT EPOCH: [{epoch + 1}/{num_epochs}], LOSS: {epoch_loss:.5f}')


In [None]:
def testing_step(model, dataloader, device, experiment = None):
    """
    Evaluates `model` on `dataloader` and reports MSE, MAE and top-10 / top-30 /
    top-K accuracy (accuracies are averaged over batches).

    Inference runs under torch.no_grad() so no autograd graph is built for the
    forward passes.

    Args:
        model (torch.nn.Module): Model to evaluate; moved to `device` and put in eval mode.
        dataloader: Iterable of (inputs, targets) batches.
        device: Device the model and batches are moved to.
        experiment: Optional object with a log_metrics(dict) method.
    Returns:
        None
    """
    model.to(device)

    mse_metric = MeanSquaredError().to(device)
    mae_metric = MeanAbsoluteError().to(device)

    top_10_correct = 0
    top_30_correct = 0
    top_k_correct = 0
    num_batches = 0

    model.eval()

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.float().to(device), targets.float().to(device)
            outputs = model(inputs)
            # The targets are clamped into [0, 1] before being compared with the outputs.
            targets = torch.clamp(targets,0,1)

            mse_metric.update(outputs, targets)
            mae_metric.update(outputs, targets)

            top_10_correct += TopKAccuracy(outputs, targets, k = 10)
            top_30_correct += TopKAccuracy(outputs, targets, k = 30)
            top_k_correct += TopKAccuracy(outputs, targets)
            num_batches += 1

    # Plain floats keep printing and logging independent of tensor types/devices.
    mse = mse_metric.compute().item()
    mae = mae_metric.compute().item()

    top_10 = float(top_10_correct / num_batches)
    top_30 = float(top_30_correct / num_batches)
    top_k = float(top_k_correct / num_batches)

    print(f"Mean Squared Error: {mse:.5f}, Mean Absolute Error: {mae:.5f}")

    print(f"Top 10 Accuracy: {top_10:.5f}, Top 30 Accuracy: {top_30:.5f}, Top K Accuracy: {top_k:.5f}")

    if experiment is not None:
        experiment.log_metrics({
            "mse_test": mse,
            "mae_test": mae,
            "top_10_accuracy_test": top_10,
            "top_30_accuracy_test": top_30,
            "top_k_accuracy_test": top_k
        })

In [None]:
def CrossEntropyLoss(model,predictions,targets):
  """
  Binary cross-entropy between predictions and targets, averaged over all elements.
  Args:
    model: Accepted but not used.
    predictions (torch.Tensor): Predicted probabilities in [0, 1].
    targets (torch.Tensor): Target values in [0, 1], same shape as predictions.
  Returns:
    loss (torch.Tensor): Scalar loss tensor.
  """
  return nn.functional.binary_cross_entropy(predictions, targets, reduction="mean")


In [None]:
# Create the Comet experiment that the training and testing cells log to.
# The API key is fetched through google.colab's `userdata` under the name
# 'comet_api_key', so it never appears in the notebook itself.
experiment = Experiment(
  api_key=userdata.get('comet_api_key'),
  project_name="bird-species-distribution-modeling-with-location-information",
  workspace="danielleraine"
)

# Name under which this run is shown in Comet.
experiment.set_name("baseline-augnorm")

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/danielleraine/bird-species-distribution-modeling-with-location-information/6f9305b33a3a42af825afcb36ac599b1



In [None]:
# Hyperparameters of this run, recorded with the Comet experiment.
hyper_params = dict(
    batch_size=64,
    learning_rate=0.001,
    num_epochs=10,
)

experiment.log_parameters(hyper_params)

In [None]:
# Baseline network: 27 input features -> 128 hidden units -> 671 outputs,
# trained with binary cross-entropy on the model's sigmoid outputs.
model = EncounterRateMLP(num_inputs=27, num_classes=671, hidden_dimensions=128, ebd=False)
model = model.float()
criterion = nn.BCELoss()

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The learning rate and epoch count are read from `hyper_params` so the values
# logged to Comet are the values actually used for training.
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=hyper_params['learning_rate'])
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

last_checkpoint_path = os.path.join(checkpoint_dir, 'last_checkpoint.pth')
best_checkpoint_path = os.path.join(checkpoint_dir, 'best_checkpoint.pth')

# Snapshot of the untrained model and optimizer (epoch 0, placeholder loss).
# NOTE(review): this writes "best_checkpoint.pth" in the working directory, which
# is a different file from best_checkpoint_path inside checkpoint_dir.
loss = 0.0
epoch = 0
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, "best_checkpoint.pth")

training_step(
    model=model,
    dataloader=train_dataloader,
    eval_dataloader=valid_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    checkpoint_dir=checkpoint_dir,
    num_epochs=hyper_params['num_epochs'],  # Number of epochs for training
    experiment=experiment
)

A NEW EPOCH HAS STARTED
Epoch [1/10], Mean Squared Error: 0.04850, Mean Absolute Error: 0.13728, Top 10 Accuracy: 0.72028, Top 30 Accuracy: 0.77307, Top K Accuracy: 0.77812
A NEW EPOCH HAS STARTED
Epoch [2/10], Mean Squared Error: 0.00935, Mean Absolute Error: 0.03181, Top 10 Accuracy: 0.74429, Top 30 Accuracy: 0.80304, Top K Accuracy: 0.81358
A NEW EPOCH HAS STARTED
Epoch [3/10], Mean Squared Error: 0.00934, Mean Absolute Error: 0.03085, Top 10 Accuracy: 0.74298, Top 30 Accuracy: 0.80334, Top K Accuracy: 0.81269
A NEW EPOCH HAS STARTED
Epoch [4/10], Mean Squared Error: 0.00934, Mean Absolute Error: 0.03070, Top 10 Accuracy: 0.74314, Top 30 Accuracy: 0.80199, Top K Accuracy: 0.81158
A NEW EPOCH HAS STARTED
Epoch [5/10], Mean Squared Error: 0.00931, Mean Absolute Error: 0.03061, Top 10 Accuracy: 0.74419, Top 30 Accuracy: 0.80390, Top K Accuracy: 0.81163
A NEW EPOCH HAS STARTED
Epoch [6/10], Mean Squared Error: 0.00914, Mean Absolute Error: 0.03022, Top 10 Accuracy: 0.73985, Top 30 Accur

In [None]:
# Inspect the weights saved as the best checkpoint. The file holds only a
# state_dict (parameter tensors), so weights_only=True is enough and avoids
# unpickling arbitrary objects.
checkpoint = torch.load("checkpoints/best_checkpoint.pth", weights_only=True)
print(checkpoint.keys())
print(checkpoint['feats.0.weight'])

odict_keys(['feats.0.weight', 'feats.0.bias', 'feats.2.weight', 'feats.2.bias'])
tensor([[-0.6991, -0.1071,  0.4424,  ...,  2.3730,  2.6345,  0.3510],
        [-0.4678, -0.1232, -0.8607,  ...,  2.9040,  1.8779,  0.9207],
        [ 1.3438, -0.9566, -0.9068,  ...,  1.3304,  0.7855,  1.0264],
        ...,
        [ 0.0111, -1.1264,  0.4330,  ...,  1.7392,  2.5955,  0.2371],
        [ 0.4334, -0.4779,  0.0314,  ..., -1.8369, -1.5135, -0.5403],
        [ 0.3201,  0.4476,  0.8273,  ...,  1.0665,  1.9521,  1.0294]])


  checkpoint = torch.load("checkpoints/best_checkpoint.pth")


In [None]:
# torch.save(model, "/content/drive/MyDrive/TeamMila/Models/baseline-augnorm.pth")

In [None]:
# model = torch.load("/content/drive/MyDrive/TeamMila/Models/baseline_model.pth", weights_only=False)

In [None]:
# Evaluate the model on the test dataloader; metrics are printed and, because an
# experiment is passed, also logged to it.
testing_step(model, test_dataloader, device, experiment)

Mean Squared Error: 0.00863, Mean Absolute Error: 0.02875
Top 10 Accuracy: 0.71888, Top 30 Accuracy: 0.80734, Top K Accuracy: 0.81478


In [None]:
# Attach the model to the Comet experiment under the name "baseline-augnorm".
log_model(experiment, model, "baseline-augnorm")

In [None]:
# Finish the Comet experiment; nothing should be logged to it after this call.
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline-augnorm
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/danielleraine/bird-species-distribution-modeling-with-location-information/6f9305b33a3a42af825afcb36ac599b1
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1337]               : (0.05714833736419678, 0.7597743272781372)
[1;38;5;39mCOMET INFO:[0m     mae [10]                  : (0.02687237039208412, 0.1361050009727478)
[1;38;5;39mCOMET INFO:[0m     mae_eval [10]             : (0.028570998460054398, 0.035003166645