In [25]:
import numpy as np
import pandas as pd
import itertools
import geopandas as gpd
import rasterio
import fiona
import sklearn.model_selection as model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset
import torcheval.metrics as metrics

import matplotlib.pyplot as plt

In [3]:
import bird_data as data

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class JuncoDataset(Dataset):
    def __init__(self, tensor_data, labels):
        self.tensor_data = tensor_data
        self.labels = labels
        self.C, self.H, self.W = tensor_data.shape

        # self.features = tensor_data.permute(1, 2, 0).reshape(-1, self.C)
        # self.targets = labels.flatten()

    def __len__(self):
        # return self.H * self.W
        return 141

    def __getitem__(self, idx):
        # return self.features[idx], self.targets[idx]
        return self.tensor_data[idx], self.labels[idx]

    def get_input_dim(self):
        return self.C

    def get_true_labels(self):
        return self.labels[0]


class JuncoDatasetBuilder:
    def __init__(self, dataframe, feature_cols=None, label_col='species_observed',
                 num_lat_bins=100, num_lon_bins=100, fill_missing=True, fill_method='zeros'):
        self.df = dataframe.copy()
        self.label_col = label_col
        self.num_lat_bins = num_lat_bins
        self.num_lon_bins = num_lon_bins
        self.fill_missing = fill_missing
        self.fill_method = fill_method

        if feature_cols is None:
            exclude = ['latitude', 'longitude', 'lat_bin', 'lon_bin', label_col]
            self.feature_cols = [col for col in self.df.columns if col not in exclude]
        else:
            self.feature_cols = feature_cols

        self.grid = None
        self.labels = None

    def bin_coordinates(self):
        lat_min, lat_max = self.df['latitude'].min(), self.df['latitude'].max()
        lon_min, lon_max = self.df['longitude'].min(), self.df['longitude'].max()

        lat_bins = np.linspace(lat_min, lat_max, self.num_lat_bins + 1)
        lon_bins = np.linspace(lon_min, lon_max, self.num_lon_bins + 1)

        self.df['lat_bin'] = np.digitize(self.df['latitude'], lat_bins) - 1
        self.df['lon_bin'] = np.digitize(self.df['longitude'], lon_bins) - 1

        self.df['lat_bin'] = self.df['lat_bin'].clip(0, self.num_lat_bins - 1)
        self.df['lon_bin'] = self.df['lon_bin'].clip(0, self.num_lon_bins - 1)

    def build_grid(self):
        self.bin_coordinates()
        grid = np.zeros((self.num_lat_bins, self.num_lon_bins, len(self.feature_cols)), dtype=np.float32)
        counts = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int32)

        for _, row in self.df.iterrows():
            i, j = row['lat_bin'], row['lon_bin']
            grid[i, j] += row[self.feature_cols].values.astype(np.float32)
            counts[i, j] += 1

        nonzero_mask = counts > 0
        grid[nonzero_mask] /= counts[nonzero_mask, None]

        if self.fill_missing:
            if self.fill_method == 'mean':
                global_mean = np.nanmean(grid[nonzero_mask], axis=0)
                grid[~nonzero_mask] = global_mean
            if self.fill_method == 'zeros':
                grid[~nonzero_mask] = 0

        self.grid = grid

    def extract_labels(self):
        labels = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int64)
        for _, row in self.df.iterrows():
            i, j = row['lat_bin'], row['lon_bin']
            labels[i, j] = row[self.label_col]
        self.labels = labels

    def get_dataset(self):
        if self.grid is None:
            self.build_grid()
        if self.labels is None:
            self.extract_labels()

        tensor_data = torch.from_numpy(self.grid).permute(2, 0, 1)
        labels = torch.tensor(self.labels)
        return TensorDataset(tensor_data, labels)

In [13]:
class birdNN(nn.Module):
    def __init__(self, num_layers, dropout_rate, 
                 input_dim, hidden_dim, output_dim):
        super(birdNN, self).__init__()
        self.activation = nn.LeakyReLU(0.01)
        self.dropout = nn.Dropout(dropout_rate) if dropout_rate else nn.Identity()
        ## Dense Layers
        self.input_fc = nn.Linear(input_dim, hidden_dim)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append(nn.LeakyReLU(0.01))
            self.layers.append(nn.Dropout(dropout_rate) if dropout_rate else nn.Identity())
        self.output_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.activation(self.input_fc(x))
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x)
        x = self.output_fc(x)
        return x

In [28]:
builder = JuncoDatasetBuilder(data.junco_data, label_col='species_observed',
                              num_lat_bins = 100, num_lon_bins = 100)
# dataset = builder.get_dataset()
# length = len(dataset)
# print(length)
# input_dim = dataset.get_input_dim()
# print(input_dim)
# truth = dataset.get_true_labels()
# truth = pd.DataFrame(truth.numpy())

loader = DataLoader(dataset, batch_size = 32, shuffle = True)
for x, y in loader:
    print(x.shape)
    print(y.shape)
    break

torch.Size([32, 100, 100])
torch.Size([32, 100])


In [24]:
model = birdNN(input_dim = 41, output_dim = 2,
               num_layers = 4, dropout_rate = 0.2,
               hidden_dim = 64).to(device)
criterion = nn.CrossEntropyLoss()
accuracy = metrics.MulticlassAccuracy(num_classes = 2)
auroc = metrics.MulticlassAUROC(num_classes = 2)
# confusion_matrix = metrics.functional.multiclass_confusion_matrix
optimizer = optim.Adam(model.parameters(), lr = 0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma  = 0.75)
print("Starting Model Training")
model.train()
train_loss = []
train_accuracy = []
train_auroc = []
test_accuracy = []
test_auroc = []
for epoch in range(100):
    ## Model Training
    running_loss = 0
    optimizer.zero_grad()
    for inputs, presence in loader:
        inputs, presence = inputs.to(device), presence.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, presence)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        accuracy.update(outputs, presence)
        auroc.update(outputs, presence)
    train_accuracy.append(accuracy.compute().numpy())
    train_auroc.append(auroc.compute().numpy())
    train_loss.append(running_loss)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch: {epoch + 1}\n')
        print(f'Loss: {train_loss[epoch]}\n')
        print(f'Accuracy: {train_accuracy[epoch]}\n')
        print(f'AUROC: {train_auroc[epoch]}\n')
    ## Model Validation
    # accuracy.reset()
    # auroc.reset()
    # model.eval()
    # with torch.no_grad():
    #     for inputs, presence in testloader:
    #         outputs = model(inputs)
    #         accuracy.update(outputs, presence)
    #         auroc.update(outputs, presence)
    
    # test_accuracy.append(accuracy.compute().numpy())
    # test_auroc.append(auroc.compute().numpy())
    # if (epoch + 1) % 10 == 0:
    #     print(f'Test Accuracy: {test_accuracy[epoch]}\n')
    #     print(f'Test AUROC: {test_auroc[epoch]}\n')
    
    # # scores.append(accuracy)
    accuracy.reset()
    auroc.reset()
    model.train()
    ## Saves model state
    # torch.save(model.state_dict(), f'./model_states/model_{folds}.pth')

Starting Model Training


RuntimeError: mat1 and mat2 shapes cannot be multiplied (3200x100 and 41x64)