In [None]:
!pip install nbimporter
!pip install import-ipynb
!pip install torch
!pip install torcheval
!pip install torchmetrics
!pip install comet-ml

In [2]:
import sys
import os
import json
import pickle
from typing import Any
from ctypes import sizeof

from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

import torch
import torch.utils.data
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

from torcheval.metrics import MeanSquaredError
from torchmetrics.regression import MeanAbsoluteError
from torchmetrics.classification import Accuracy

from google.colab import userdata
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd

Mounted at /content/drive


In [4]:
class BirdHotspotsDataset(Dataset):
  """
  Dataset for Bird Species Distribution Modeling with Location Information (Bird Hotspots).
  """

  def __init__(self, features_df, targets_df):
    """
    Args:
      features_df (pd.DataFrame): DataFrame containing the features.
      targets_df (pd.DataFrame): DataFrame containing the targets.
    """

    self.data = [
        [torch.tensor(pd.to_numeric(row.drop(labels = ["hotspot_id"]).values, errors = "coerce")).squeeze(),
         torch.tensor(targets_df.loc[targets_df["hotspot_id"] == row["hotspot_id"]]
                                    .drop(columns = ["hotspot_id", "num_complete_checklists"]).values).squeeze()]
                 for i, row in features_df.iterrows()]

  def __len__(self):
    """
    Returns the length of the dataset.
    Returns:
      length (int): Length of the dataset.
    """

    return len(self.data)

  def __getitem__(self, idx):
    """
    Returns the record and its target.
    Args:
      idx (int): Index of the record.
    Returns:
      record (torch.Tensor): Record.
      target (torch.Tensor): Target.
    """

    return self.data[idx][0], self.data[idx][1]

In [7]:
# Data that includes location data
# df_train = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_train_loc.csv')
# df_val = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_val_loc.csv')
# df_test = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/normalized_test_loc.csv')
df_targets = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv')

df_train = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/train_split.csv')
df_val = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/valid_split.csv')
df_test = pd.read_csv('/content/drive/MyDrive/TeamMila/ProjectDataset/test_split.csv')

In [8]:
# Columns that will be used for the data
df_columns = ["hotspot_id", "lon", "lat"] + [f"bio_{i}" for i in range(1, 20)] + ["orcdrc", "phihox", "cecsol", "bdticm", "clyppt", "sltppt", "sndppt", "bldfie"]

In [None]:
# Pickle each dataset
# with open('/content/drive/MyDrive/TeamMila/ProjectDataset/train_dataset_loc.p', 'wb') as f:
#     pickle.dump(train_dataset, f)

# with open('/content/drive/MyDrive/TeamMila/ProjectDataset/valid_dataset_loc.p', 'wb') as f:
#     pickle.dump(valid_dataset, f)

# with open('/content/drive/MyDrive/TeamMila/ProjectDataset/test_dataset_loc.p', 'wb') as f:
#     pickle.dump(test_dataset, f)

In [None]:
with open('/content/drive/MyDrive/TeamMila/ProjectDataset/train_dataset_loc.p', "rb") as f:
  train_dataset = pickle.load(f)

with open('/content/drive/MyDrive/TeamMila/ProjectDataset/valid_dataset_loc.p', "rb") as f:
  valid_dataset = pickle.load(f)

with open('/content/drive/MyDrive/TeamMila/ProjectDataset/test_dataset_loc.p', "rb") as f:
   test_dataset = pickle.load(f)


In [24]:
training_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/augmented_train.csv")
evaluation_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/augmented_val.csv")
testing_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/augmented_test.csv")
targets_df = pd.read_csv("/content/drive/MyDrive/TeamMila/ProjectDataset/targets.csv")

# Columns that will be used for the data
df_columns = ["hotspot_id", "lon", "lat"] + [f"bio_{i}" for i in range(1, 20)] + ["orcdrc", "phihox", "cecsol", "bdticm", "clyppt", "sltppt", "sndppt", "bldfie"]

training_df = training_df[df_columns]
evaluation_df = evaluation_df[df_columns]
testing_df = testing_df[df_columns]

In [26]:
train_dataset = BirdHotspotsDataset(training_df, df_targets)
valid_dataset = BirdHotspotsDataset(evaluation_df, df_targets)
test_dataset = BirdHotspotsDataset(testing_df, df_targets)

In [27]:
# different datloaders for different splits of the data
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [12]:
class EncounterRateMLP(nn.Module):
    def __init__(self, env_input_size=27, loc_input_size=2, num_classes=1, hidden_dimensions=128):
        super(EncounterRateMLP, self).__init__()
        self.inc_bias = False
        # Encoder for environmental features
        self.env_encoder = nn.Sequential(
            nn.Linear(env_input_size, hidden_dimensions),
            nn.LeakyReLU(inplace=True),
            nn.Linear(hidden_dimensions, hidden_dimensions),
            nn.LeakyReLU(inplace=True)
        )
        # Encoder for location features
        self.loc_encoder = nn.Sequential(
            nn.Linear(loc_input_size, hidden_dimensions),
            nn.LeakyReLU(inplace=True),
            nn.Linear(hidden_dimensions, hidden_dimensions),
            nn.LeakyReLU(inplace=True)
        )
        # Final output layer after concatenating encoded features
        self.output_layer = nn.Linear(hidden_dimensions * 2, num_classes)
    def forward(self, env_features, loc_features):
        # Pass through environmental encoder
        env_encoded = self.env_encoder(env_features)
        # Pass through location encoder
        loc_encoded = self.loc_encoder(loc_features)
        # Concatenate encoded features
        combined_features = torch.cat((env_encoded, loc_encoded), dim=1)
        # Pass through final output layer and apply sigmoid
        output = torch.sigmoid(self.output_layer(combined_features))
        return output




In [13]:
def TopKAccuracy(outputs, targets, k = None):
    sum_topk = 0
    for output, target in zip(outputs, targets):
        if k is None:
            # Let K be the number of nonzero values for a set of predictions
            k = torch.count_nonzero(target).item()
        top_k_preds = torch.topk(output, k).indices
        true_labels = torch.topk(target, k).indices
        sum_topk += torch.count_nonzero(torch.isin(top_k_preds, true_labels)) / k
    batch_size = outputs.shape[0]
    return sum_topk / batch_size

In [14]:
def training_step(model, dataloader, eval_dataloader, criterion, optimizer, device, num_epochs, experiment = None):
    model.to(device)
    mse_metric = MeanSquaredError().to(device)
    mae_metric = MeanAbsoluteError().to(device)
    mse_metric_eval = MeanSquaredError().to(device)
    mae_metric_eval = MeanAbsoluteError().to(device)

    for epoch in range(num_epochs):
        model.train()
        mse_metric.reset()
        mae_metric.reset()
        mse_metric_eval.reset()
        mae_metric_eval.reset()

        print("A NEW EPOCH HAS STARTED")

        running_loss = 0.0

        top_10_correct = 0
        top_30_correct = 0
        top_k_correct = 0
        num_batches = 0

        top_10_correct_eval = 0
        top_30_correct_eval = 0
        top_k_correct_eval = 0
        num_batches_eval = 0

        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2)

        for inputs, targets in dataloader:
          ### runtime error of having inputs and targets as non-floats
          inputs, targets = inputs.float().to(device), targets.float().to(device)

          optimizer.zero_grad()

          outputs = model(inputs[:, 2:], inputs[:, :2])

          #runtime error address: targets not in between 0 to 1
          targets = torch.clamp(targets,0,1)
          loss = criterion(outputs, targets)

          loss.backward()
          optimizer.step()

          running_loss += loss.item()

          mse_metric.update(outputs, targets)
          mae_metric.update(outputs, targets)

          top_10_correct += TopKAccuracy(outputs, targets, k = 10)
          top_30_correct += TopKAccuracy(outputs, targets, k = 30)
          top_k_correct += TopKAccuracy(outputs, targets)
          num_batches += 1

        scheduler.step()
        model.eval()

        for inputs, targets in eval_dataloader:
          with torch.no_grad():
            inputs, targets = inputs.float().to(device), targets.float().to(device)
            outputs = model(inputs[:, 2:], inputs[:, :2])
            targets = torch.clamp(targets,0,1)

            mse_metric_eval.update(outputs, targets)
            mae_metric_eval.update(outputs, targets)

            top_10_correct_eval += TopKAccuracy(outputs, targets, k = 10)
            top_30_correct_eval += TopKAccuracy(outputs, targets, k = 30)
            top_k_correct_eval += TopKAccuracy(outputs, targets)
            num_batches_eval += 1

        mse = mse_metric.compute()
        mae = mae_metric.compute()

        top_10 = top_10_correct / num_batches
        top_30 = top_30_correct / num_batches
        top_k = top_k_correct / num_batches

        mse_eval = mse_metric_eval.compute()
        mae_eval = mae_metric_eval.compute()

        top_10_correct_eval = top_10_correct_eval / num_batches_eval
        top_30_correct_eval = top_30_correct_eval / num_batches_eval
        top_k_correct_eval = top_k_correct_eval / num_batches_eval

        if experiment is not None:
          experiment.log_metrics({
              "mse": mse,
              "mae": mae,
              "top_10_accuracy": top_10,
              "top_30_accuracy": top_30,
              "top_k_accuracy": top_k,
              "mse_eval": mse_eval,
              "mae_eval": mae_eval,
              "top_10_accuracy_eval": top_10_correct_eval,
              "top_30_accuracy_eval": top_30_correct_eval,
              "top_k_accuracy_eval": top_k_correct_eval
              }, step=epoch
          )

        print(f"Epoch [{epoch + 1}/{num_epochs}], " f"Mean Squared Error: {mse.item():.5f}, " f"Mean Absolute Error: {mae.item():.5f}, " f"Top 10 Accuracy: {top_10:.5f}, " f"Top 30 Accuracy: {top_30:.5f}," f" Top K Accuracy: {top_k:.5f}")




In [15]:
def testing_step(model, dataloader, device, experiment = None):
    model.to(device)

    mse_metric = MeanSquaredError().to(device)
    mae_metric = MeanAbsoluteError().to(device)

    mse_metric.reset()
    mae_metric.reset()

    top_10_correct = 0
    top_30_correct = 0
    top_k_correct = 0
    num_batches = 0

    model.eval()

    for inputs, targets in dataloader:
      inputs, targets = inputs.float().to(device), targets.float().to(device)
      outputs = model(inputs[:, 2:], inputs[:, :2])
      targets = torch.clamp(targets,0,1)

      mse_metric.update(outputs, targets)
      mae_metric.update(outputs, targets)

      top_10_correct += TopKAccuracy(outputs, targets, k = 10)
      top_30_correct += TopKAccuracy(outputs, targets, k = 30)
      top_k_correct += TopKAccuracy(outputs, targets)
      num_batches += 1

    mse = mse_metric.compute()
    mae = mae_metric.compute()

    top_10 = top_10_correct / num_batches
    top_30 = top_30_correct / num_batches
    top_k = top_k_correct / num_batches

    print(f"Mean Squared Error: {mse.item():.5f}, Mean Absolute Error: {mae.item():.5f}")

    print(f"Top 10 Accuracy: {top_10:.5f}, Top 30 Accuracy: {top_30:.5f}, Top K Accuracy: {top_k:.5f}")

    if experiment is not None:
      experiment.log_metrics({
          "mse_test": mse,
          "mae_test": mae,
          "top_10_accuracy_test": top_10,
          "top_30_accuracy_test": top_30,
          "top_k_accuracy_test": top_k
      })

In [28]:
# Use 'Secrets' tab to store real api_key value

experiment = Experiment(
    api_key=userdata.get('comet_api_key'),
    project_name="bird-species-distribution-modeling-with-location-information",
    workspace="danielleraine"
)

experiment.set_name("baseline3")

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/danielleraine/bird-species-distribution-modeling-with-location-information/fdb4476e91804de8a894b40d3a69ba3a



In [29]:
hyper_params = {
    'batch_size': 64,
    'initial_learning_rate': 0.005,
    'scheduler_gamma': 0.1,
    'scheduler_step_size:': 2,
    'num_epochs': 10,
}

experiment.log_parameters(hyper_params)

In [30]:
model = EncounterRateMLP(27, 2, 671, hidden_dimensions=128).float()
criterion = nn.BCELoss()

In [31]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.BCELoss()

training_step(
    model=model,
    dataloader=train_dataloader,
    eval_dataloader=valid_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=10,  # Number of epochs for training
    experiment=experiment
)

A NEW EPOCH HAS STARTED
Epoch [1/10], Mean Squared Error: 0.00972, Mean Absolute Error: 0.02743, Top 10 Accuracy: 0.38704, Top 30 Accuracy: 0.48218, Top K Accuracy: 0.47328
A NEW EPOCH HAS STARTED
Epoch [2/10], Mean Squared Error: 0.00782, Mean Absolute Error: 0.02503, Top 10 Accuracy: 0.42137, Top 30 Accuracy: 0.51938, Top K Accuracy: 0.50698
A NEW EPOCH HAS STARTED
Epoch [3/10], Mean Squared Error: 0.00791, Mean Absolute Error: 0.02505, Top 10 Accuracy: 0.41862, Top 30 Accuracy: 0.51795, Top K Accuracy: 0.50680
A NEW EPOCH HAS STARTED
Epoch [4/10], Mean Squared Error: 0.00766, Mean Absolute Error: 0.02462, Top 10 Accuracy: 0.42957, Top 30 Accuracy: 0.53030, Top K Accuracy: 0.51746
A NEW EPOCH HAS STARTED
Epoch [5/10], Mean Squared Error: 0.00790, Mean Absolute Error: 0.02492, Top 10 Accuracy: 0.42007, Top 30 Accuracy: 0.51948, Top K Accuracy: 0.50669
A NEW EPOCH HAS STARTED
Epoch [6/10], Mean Squared Error: 0.00791, Mean Absolute Error: 0.02489, Top 10 Accuracy: 0.41771, Top 30 Accur

In [None]:
# checkpoint = torch.load("checkpoints/best_checkpoint.pth")
# print(checkpoint.keys())
# print(checkpoint['feats.0.weight'])

NameError: name 'torch' is not defined

In [None]:
torch.save(model, "/content/drive/MyDrive/TeamMila/Models/baseline-loc-augnorm.pth")

In [None]:
# model = torch.load("/content/drive/MyDrive/TeamMila/Models/baseline_model.pth", weights_only=False)

In [32]:
testing_step(model, test_dataloader, device, experiment)

Mean Squared Error: 0.00775, Mean Absolute Error: 0.02472
Top 10 Accuracy: 0.41887, Top 30 Accuracy: 0.52251, Top K Accuracy: 0.51101


In [33]:
log_model(experiment, model, "baseline3")

In [34]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline3
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/danielleraine/bird-species-distribution-modeling-with-location-information/fdb4476e91804de8a894b40d3a69ba3a
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1337]               : (0.05103765428066254, 27.46247100830078)
[1;38;5;39mCOMET INFO:[0m     mae [10]                  : (0.024059725925326347, 0.027428925037384033)
[1;38;5;39mCOMET INFO:[0m     mae_eval [10]             : (0.02433507889509201, 0.02692839875817299)