In [4]:
!pip install nbimporter
!pip install import-ipynb
!pip install torch
!pip install torcheval
!pip install torchmetrics
!pip install comet-ml



In [5]:
import sys
import os
import json
import pickle
from typing import Any
from ctypes import sizeof

from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

import torch
import torch.utils.data
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

from torcheval.metrics import MeanSquaredError
from torchmetrics.regression import MeanAbsoluteError
from torchmetrics.classification import Accuracy

from google.colab import userdata
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd

Mounted at /content/drive


In [6]:
class BirdHotspotsDataset(Dataset):
  """
  Dataset for Bird Species Distribution Modeling with Location Information (Bird Hotspots).
  """

  def __init__(self, features_df, targets_df):
    """
    Args:
      features_df (pd.DataFrame): DataFrame containing the features.
      targets_df (pd.DataFrame): DataFrame containing the targets.
    """

    self.data = [
        [torch.tensor(pd.to_numeric(row.drop(labels = ["hotspot_id"]).values, errors = "coerce")).squeeze(),
         torch.tensor(targets_df.loc[targets_df["hotspot_id"] == row["hotspot_id"]]
                                    .drop(columns = ["hotspot_id", "num_complete_checklists"]).values).squeeze()]
                 for i, row in features_df.iterrows()]

  def __len__(self):
    """
    Returns the length of the dataset.
    Returns:
      length (int): Length of the dataset.
    """

    return len(self.data)

  def __getitem__(self, idx):
    """
    Returns the record and its target.
    Args:
      idx (int): Index of the record.
    Returns:
      record (torch.Tensor): Record.
      target (torch.Tensor): Target.
    """

    return self.data[idx]

In [None]:
# The next two cells are for turning the data into datasets that the model can use.
# The third cell is for pickling the dataset for future use, as preprocessing can take a while.
# The the fourth is for loading the pickled data sets. Skip to the fourth if already done so.

In [31]:
# The data used for the model using un-augmented data
# training_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/train_split.csv")
# evaluation_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/valid_split.csv")
# testing_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/test_split.csv")
# targets_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv")

# The data used for the model using augmented data
# training_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_train.csv")
# evaluation_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_val.csv")
# testing_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_test.csv")
# targets_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv")

training_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/augmented_train.csv")
evaluation_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/augmented_val.csv")
testing_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/augmented_test.csv")
targets_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv")

# Columns that will be used for the data
df_columns = [f"bio_{i}" for i in range(1, 20)] + ["hotspot_id", "orcdrc", "phihox", "cecsol", "bdticm", "clyppt", "sltppt", "sndppt", "bldfie"]

training_df = training_df[df_columns]
evaluation_df = evaluation_df[df_columns]
testing_df = testing_df[df_columns]

In [36]:
# Turn the data into pytorch datasets
training_set = BirdHotspotsDataset(training_df, targets_df)
evaluation_set = BirdHotspotsDataset(evaluation_df, targets_df)
testing_set = BirdHotspotsDataset(testing_df, targets_df)

In [None]:
# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_train.p", "wb") as f:
#     pickle.dump(training_set, f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_val.p", "wb") as f:
#     pickle.dump(evaluation_set, f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_test.p", "wb") as f:
#     pickle.dump(testing_set, f)

In [22]:
# Unpickling the augmented datasets.
# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_train.p", "rb") as f:
#   training_set = pickle.load(f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_val.p", "rb") as f:
#   evaluation_set = pickle.load(f)

# with open("/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_test.p", "rb") as f:
#   testing_set = pickle.load(f)

# Unpickling the un-augmented datasets.
with open("/content/drive/MyDrive/TeamMila/ProjectDataset/train_split.p", "rb") as f:
    training_set = pickle.load(f)

with open("/content/drive/MyDrive/TeamMila/ProjectDataset/valid_split.p", "rb") as f:
  evaluation_set = pickle.load(f)

with open("/content/drive/MyDrive/TeamMila/ProjectDataset/test_split.p", "rb") as f:
  testing_set = pickle.load(f)

In [37]:
# different datloaders for the splits of the data
train_dataloader = DataLoader(training_set, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(evaluation_set, batch_size=64, shuffle=True)
test_dataloader = DataLoader(testing_set, batch_size=64, shuffle=True)

In [9]:
class EncounterRateMLP(torch.nn.Module):
  def __init__(self,num_inputs, num_classes, hidden_dimensions=128,ebd=False):
      super(EncounterRateMLP, self).__init__()
      self.inc_bias = False
      self.feats = nn.Sequential(
          nn.Linear(num_inputs, hidden_dimensions),
          nn.LeakyReLU(inplace=True),
          nn.Linear(hidden_dimensions, num_classes),
          nn.LeakyReLU(inplace=True)
      )
  def forward(self, x, class_of_interest=None, return_feats=False):
      return torch.sigmoid(self.feats(x))

# evaluates a single class
  def eval_single_class(self, x, class_of_interest):
      if self.inc_bias:
        #dot product
          return torch.matmul(x, self.class_emb.weight[class_of_interest, :]) + self.class_emb.bias[class_of_interest]
      else:
          return torch.matmul(x, self.class_emb.weight[class_of_interest, :])

In [10]:
def TopKAccuracy(outputs, targets, k = None):
    sum_topk = 0
    for output, target in zip(outputs, targets):
        if k is None:
            # Let K be the number of nonzero values for a set of predictions
            k = torch.count_nonzero(target).item()
        top_k_preds = torch.topk(output, k).indices
        true_labels = torch.topk(target, k).indices
        sum_topk += torch.count_nonzero(torch.isin(top_k_preds, true_labels)) / k
    batch_size = outputs.shape[0]
    return sum_topk / batch_size

In [11]:
def training_step(model, dataloader, eval_dataloader, criterion, optimizer, device, num_epochs, experiment = None):
    model.to(device)
    mse_metric = MeanSquaredError().to(device)
    mae_metric = MeanAbsoluteError().to(device)
    mse_metric_eval = MeanSquaredError().to(device)
    mae_metric_eval = MeanAbsoluteError().to(device)

    for epoch in range(num_epochs):
        model.train()
        mse_metric.reset()
        mae_metric.reset()
        mse_metric_eval.reset()
        mae_metric_eval.reset()

        print("A NEW EPOCH HAS STARTED")

        running_loss = 0.0

        top_10_correct = 0
        top_30_correct = 0
        top_k_correct = 0
        num_batches = 0

        top_10_correct_eval = 0
        top_30_correct_eval = 0
        top_k_correct_eval = 0
        num_batches_eval = 0

        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2)

        for inputs, targets in dataloader:
          ### runtime error of having inputs and targets as non-floats
          inputs, targets = inputs.float().to(device), targets.float().to(device)

          optimizer.zero_grad()

          outputs = model(inputs)

          #runtime error address: targets not in between 0 to 1
          targets = torch.clamp(targets,0,1)
          loss = criterion(outputs, targets)

          loss.backward()
          optimizer.step()

          running_loss += loss.item()

          mse_metric.update(outputs, targets)
          mae_metric.update(outputs, targets)

          top_10_correct += TopKAccuracy(outputs, targets, k = 10)
          top_30_correct += TopKAccuracy(outputs, targets, k = 30)
          top_k_correct += TopKAccuracy(outputs, targets)
          num_batches += 1

        scheduler.step()
        model.eval()

        for inputs, targets in eval_dataloader:
          with torch.no_grad():
            inputs, targets = inputs.float().to(device), targets.float().to(device)
            outputs = model(inputs)
            targets = torch.clamp(targets,0,1)

            mse_metric_eval.update(outputs, targets)
            mae_metric_eval.update(outputs, targets)

            top_10_correct_eval += TopKAccuracy(outputs, targets, k = 10)
            top_30_correct_eval += TopKAccuracy(outputs, targets, k = 30)
            top_k_correct_eval += TopKAccuracy(outputs, targets)
            num_batches_eval += 1

        mse = mse_metric.compute()
        mae = mae_metric.compute()

        top_10 = top_10_correct / num_batches
        top_30 = top_30_correct / num_batches
        top_k = top_k_correct / num_batches

        mse_eval = mse_metric_eval.compute()
        mae_eval = mae_metric_eval.compute()

        top_10_correct_eval = top_10_correct_eval / num_batches_eval
        top_30_correct_eval = top_30_correct_eval / num_batches_eval
        top_k_correct_eval = top_k_correct_eval / num_batches_eval

        if experiment is not None:
          experiment.log_metrics({
              "mse": mse,
              "mae": mae,
              "top_10_accuracy": top_10,
              "top_30_accuracy": top_30,
              "top_k_accuracy": top_k,
              "mse_eval": mse_eval,
              "mae_eval": mae_eval,
              "top_10_accuracy_eval": top_10_correct_eval,
              "top_30_accuracy_eval": top_30_correct_eval,
              "top_k_accuracy_eval": top_k_correct_eval
              }, step=epoch
          )

        print(f"Epoch [{epoch + 1}/{num_epochs}], " f"Mean Squared Error: {mse.item():.5f}, " f"Mean Absolute Error: {mae.item():.5f}, " f"Top 10 Accuracy: {top_10:.5f}, " f"Top 30 Accuracy: {top_30:.5f}," f" Top K Accuracy: {top_k:.5f}")

In [12]:
def testing_step(model, dataloader, device, experiment = None):
    model.to(device)

    mse_metric = MeanSquaredError().to(device)
    mae_metric = MeanAbsoluteError().to(device)

    mse_metric.reset()
    mae_metric.reset()

    top_10_correct = 0
    top_30_correct = 0
    top_k_correct = 0
    num_batches = 0

    model.eval()

    for inputs, targets in dataloader:
      inputs, targets = inputs.float().to(device), targets.float().to(device)
      outputs = model(inputs)
      targets = torch.clamp(targets,0,1)

      mse_metric.update(outputs, targets)
      mae_metric.update(outputs, targets)

      top_10_correct += TopKAccuracy(outputs, targets, k = 10)
      top_30_correct += TopKAccuracy(outputs, targets, k = 30)
      top_k_correct += TopKAccuracy(outputs, targets)
      num_batches += 1

    mse = mse_metric.compute()
    mae = mae_metric.compute()

    top_10 = top_10_correct / num_batches
    top_30 = top_30_correct / num_batches
    top_k = top_k_correct / num_batches

    print(f"Mean Squared Error: {mse.item():.5f}, Mean Absolute Error: {mae.item():.5f}")

    print(f"Top 10 Accuracy: {top_10:.5f}, Top 30 Accuracy: {top_30:.5f}, Top K Accuracy: {top_k:.5f}")

    if experiment is not None:
      experiment.log_metrics({
          "mse_test": mse,
          "mae_test": mae,
          "top_10_accuracy_test": top_10,
          "top_30_accuracy_test": top_30,
          "top_k_accuracy_test": top_k
      })

In [38]:
experiment = Experiment(
  api_key=userdata.get('comet_api_key'),
  project_name="bird-species-distribution-modeling-with-location-information",
  workspace="danielleraine"
)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/danielleraine/bird-species-distribution-modeling-with-location-information/8e6feba80c8d4e97a9017780d6d9c3db

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


In [25]:
experiment.set_name("baseline1")

In [39]:
experiment.set_name("baseline2")

In [40]:
hyper_params = {
    'batch_size': 64,
    'initial_learning_rate': 0.005,
    'scheduler_gamma': 0.1,
    'scheduler_step_size:': 2,
    'num_epochs': 10,
}

experiment.log_parameters(hyper_params)

In [41]:
model = EncounterRateMLP(27, 671, hidden_dimensions=128,ebd=False).float()

In [42]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.BCELoss()

training_step(
    model=model,
    dataloader=train_dataloader,
    eval_dataloader=valid_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=10,  # Number of epochs for training
    experiment=experiment
)

A NEW EPOCH HAS STARTED
Epoch [1/10], Mean Squared Error: 0.05555, Mean Absolute Error: 0.07445, Top 10 Accuracy: 0.11363, Top 30 Accuracy: 0.20975, Top K Accuracy: 0.26171
A NEW EPOCH HAS STARTED
Epoch [2/10], Mean Squared Error: 0.01232, Mean Absolute Error: 0.03001, Top 10 Accuracy: 0.32311, Top 30 Accuracy: 0.47550, Top K Accuracy: 0.47107
A NEW EPOCH HAS STARTED
Epoch [3/10], Mean Squared Error: 0.00779, Mean Absolute Error: 0.02504, Top 10 Accuracy: 0.40747, Top 30 Accuracy: 0.52082, Top K Accuracy: 0.50692
A NEW EPOCH HAS STARTED
Epoch [4/10], Mean Squared Error: 0.00766, Mean Absolute Error: 0.02461, Top 10 Accuracy: 0.41663, Top 30 Accuracy: 0.52896, Top K Accuracy: 0.51459
A NEW EPOCH HAS STARTED
Epoch [5/10], Mean Squared Error: 0.00755, Mean Absolute Error: 0.02429, Top 10 Accuracy: 0.42402, Top 30 Accuracy: 0.53580, Top K Accuracy: 0.52054
A NEW EPOCH HAS STARTED
Epoch [6/10], Mean Squared Error: 0.00751, Mean Absolute Error: 0.02412, Top 10 Accuracy: 0.42485, Top 30 Accur

In [None]:
torch.save(model, "/content/drive/MyDrive/TeamMila/Models/baseline1.pth")

In [None]:
# model = torch.load("/content/drive/MyDrive/TeamMila/Models/baseline_model.pth", weights_only=False)

In [43]:
testing_step(model, test_dataloader, device, experiment)

Mean Squared Error: 0.00756, Mean Absolute Error: 0.02426
Top 10 Accuracy: 0.41766, Top 30 Accuracy: 0.53281, Top K Accuracy: 0.51782


In [44]:
log_model(experiment, model, "baseline1")

In [45]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline2
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/danielleraine/bird-species-distribution-modeling-with-location-information/8e6feba80c8d4e97a9017780d6d9c3db
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1337]               : (0.04870668426156044, 44.98221969604492)
[1;38;5;39mCOMET INFO:[0m     mae [10]                  : (0.023667776957154274, 0.0744464322924614)
[1;38;5;39mCOMET INFO:[0m     mae_eval [10]             : (0.02337699569761753, 0.036707960069179535)
