In [1]:
import numpy as np
import pandas as pd
import itertools
import geopandas as gpd
import rasterio
import fiona
import sklearn.model_selection as model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset
import torcheval.metrics as metrics

import matplotlib.pyplot as plt

In [None]:
import bird_data as data
from Archive.bird_model import birdNN

In [3]:
class ZeroDataset(Dataset):
    """Placeholder dataset whose every sample is an all-zero tensor.

    Args:
        a: Number of samples the dataset reports via ``len()``.
        b: First dimension of each sample tensor.
        c: Second dimension of each sample tensor.
    """

    def __init__(self, a, b, c):
        self.shape = (b, c)
        self.length = a  # this will be the number of samples

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, idx):
        """Return a zero tensor of shape ``(b, c)``.

        Raises:
            IndexError: If ``idx`` is an integer outside ``[-len, len)``.
        """
        # Dataset defines no __iter__, so `for x in ds` / `list(ds)` fall back to
        # calling __getitem__ with 0, 1, 2, ... until IndexError is raised.
        # Without this check such iteration never terminates.
        if isinstance(idx, int) and not -self.length <= idx < self.length:
            raise IndexError(
                f"index {idx} is out of range for ZeroDataset of length {self.length}"
            )
        return torch.zeros(self.shape)

In [35]:
class JuncoDataset(Dataset):
    """Per-cell view of a gridded feature tensor and its label grid.

    Each sample is one grid cell: a length-``C`` feature vector paired with
    that cell's label. Cells are enumerated in row-major order over ``(H, W)``.

    Args:
        tensor_data: Tensor of shape ``(C, H, W)``.
        labels: Label grid holding exactly ``H * W`` entries.

    Raises:
        ValueError: If ``tensor_data`` is not 3-dimensional, or if ``labels``
            does not contain one entry per grid cell.
    """

    def __init__(self, tensor_data, labels):
        if tensor_data.ndim != 3:
            raise ValueError(
                f"tensor_data must have shape (C, H, W); got {tuple(tensor_data.shape)}"
            )
        self.tensor_data = tensor_data
        self.labels = labels
        self.C, self.H, self.W = tensor_data.shape

        # Move channels last, then collapse (H, W) so row k is the feature
        # vector of cell (k // W, k % W).
        self.features = tensor_data.permute(1, 2, 0).reshape(-1, self.C)
        self.targets = labels.flatten()

        # Fail here rather than on some later __getitem__ call when the label
        # grid does not line up with the feature grid.
        if len(self.targets) != self.H * self.W:
            raise ValueError(
                f"labels has {len(self.targets)} entries but the grid has "
                f"{self.H} x {self.W} = {self.H * self.W} cells"
            )

    def __len__(self):
        """Return the number of grid cells (``H * W``)."""
        return self.H * self.W

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the cell at flat index ``idx``."""
        return self.features[idx], self.targets[idx]

    def get_input_dim(self):
        """Return the number of feature channels ``C``."""
        return self.C

    def get_true_labels(self):
        """Return the label grid exactly as it was passed to the constructor."""
        return self.labels


class JuncoDatasetBuilder:
    """Rasterise point observations onto a lat/lon grid and wrap them as a JuncoDataset.

    Rows of the input frame are assigned to equal-width latitude/longitude
    bins spanning the observed coordinate range. Feature columns are averaged
    per grid cell; the label of a cell is taken from the last row that falls
    into it.

    Args:
        dataframe: Observations with ``latitude`` and ``longitude`` columns,
            the label column and the feature columns. A copy is stored, so the
            caller's frame is not modified.
        feature_cols: Columns to rasterise. ``None`` selects every column
            except the coordinates, the bin indices and the label.
        label_col: Name of the label column.
        num_lat_bins: Number of grid rows.
        num_lon_bins: Number of grid columns.
        fill_missing: Whether cells without observations are filled.
        fill_method: ``'mean'`` fills empty cells with the per-feature mean
            over occupied cells; ``'zeros'`` fills them with 0. Any other
            value leaves empty cells at their initial value of 0.
    """

    def __init__(self, dataframe, feature_cols=None, label_col='species_observed',
                 num_lat_bins=100, num_lon_bins=100, fill_missing=True, fill_method='zeros'):
        self.df = dataframe.copy()
        self.label_col = label_col
        self.num_lat_bins = num_lat_bins
        self.num_lon_bins = num_lon_bins
        self.fill_missing = fill_missing
        self.fill_method = fill_method

        if feature_cols is None:
            exclude = {'latitude', 'longitude', 'lat_bin', 'lon_bin', label_col}
            self.feature_cols = [col for col in self.df.columns if col not in exclude]
        else:
            self.feature_cols = feature_cols

        self.grid = None
        self.labels = None

    def bin_coordinates(self):
        """Add integer ``lat_bin`` / ``lon_bin`` columns to the stored frame."""
        lat_min, lat_max = self.df['latitude'].min(), self.df['latitude'].max()
        lon_min, lon_max = self.df['longitude'].min(), self.df['longitude'].max()

        lat_bins = np.linspace(lat_min, lat_max, self.num_lat_bins + 1)
        lon_bins = np.linspace(lon_min, lon_max, self.num_lon_bins + 1)

        # digitize is 1-based, and the maximum coordinate lands one past the
        # last bin; shift to 0-based and clip so it joins the final bin.
        self.df['lat_bin'] = (np.digitize(self.df['latitude'], lat_bins) - 1).clip(
            0, self.num_lat_bins - 1)
        self.df['lon_bin'] = (np.digitize(self.df['longitude'], lon_bins) - 1).clip(
            0, self.num_lon_bins - 1)

    def _ensure_binned(self):
        """Compute the bin columns if they are not present yet."""
        if 'lat_bin' not in self.df.columns or 'lon_bin' not in self.df.columns:
            self.bin_coordinates()

    def build_grid(self):
        """Average the feature columns per grid cell into ``self.grid``.

        The result has shape ``(num_lat_bins, num_lon_bins, n_features)`` and
        dtype float32.
        """
        self.bin_coordinates()
        grid = np.zeros((self.num_lat_bins, self.num_lon_bins, len(self.feature_cols)),
                        dtype=np.float32)
        counts = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int32)

        # Read whole columns instead of looping over iterrows(): iterrows
        # upcasts each row to one dtype, so on an all-numeric frame the bin
        # indices come back as floats and cannot index the grid. Column access
        # keeps them integral and avoids a Python-level loop per observation.
        lat_idx = self.df['lat_bin'].to_numpy(dtype=np.intp)
        lon_idx = self.df['lon_bin'].to_numpy(dtype=np.intp)
        values = self.df[self.feature_cols].to_numpy(dtype=np.float32)

        # ufunc.at accumulates correctly when several rows share a cell
        # (plain `grid[lat_idx, lon_idx] += values` would count each cell once).
        np.add.at(grid, (lat_idx, lon_idx), values)
        np.add.at(counts, (lat_idx, lon_idx), 1)

        nonzero_mask = counts > 0
        grid[nonzero_mask] /= counts[nonzero_mask, None]

        if self.fill_missing:
            if self.fill_method == 'mean':
                global_mean = np.nanmean(grid[nonzero_mask], axis=0)
                grid[~nonzero_mask] = global_mean
            elif self.fill_method == 'zeros':
                grid[~nonzero_mask] = 0

        self.grid = grid

    def extract_labels(self):
        """Fill ``self.labels`` with one int64 label per grid cell.

        Cells without observations are labelled 0. When several rows fall into
        the same cell, the label of the last such row (in frame order) is used.
        """
        # NOTE(review): last-row-wins makes the label depend on row order; for a
        # presence/absence target an "any observed" rule may be what is wanted.
        # Confirm before changing.
        self._ensure_binned()
        labels = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int64)

        # Keep only the final row per cell so the assignment below never writes
        # the same element twice (NumPy gives no ordering guarantee for that).
        last_per_cell = self.df.drop_duplicates(subset=['lat_bin', 'lon_bin'], keep='last')
        lat_idx = last_per_cell['lat_bin'].to_numpy(dtype=np.intp)
        lon_idx = last_per_cell['lon_bin'].to_numpy(dtype=np.intp)
        labels[lat_idx, lon_idx] = last_per_cell[self.label_col].to_numpy()

        self.labels = labels

    def get_dataset(self):
        """Build the grid and labels if needed and return them as a JuncoDataset.

        The feature grid is handed over channel-first, i.e. with shape
        ``(n_features, num_lat_bins, num_lon_bins)``.
        """
        if self.grid is None:
            self.build_grid()
        if self.labels is None:
            self.extract_labels()

        tensor_data = torch.from_numpy(self.grid).permute(2, 0, 1)
        labels = torch.tensor(self.labels)
        return JuncoDataset(tensor_data, labels)

In [36]:
# Rasterise the junco observations onto a 100 x 100 grid and wrap them as a dataset.
builder = JuncoDatasetBuilder(
    data.junco_data,
    label_col='species_observed',
    num_lat_bins=100,
    num_lon_bins=100,
)
dataset = builder.get_dataset()

# Report the number of samples (grid cells) and the number of features per sample.
length = len(dataset)
input_dim = dataset.get_input_dim()
print(length, input_dim, sep='\n')

# Keep the label grid as a DataFrame for later inspection.
truth = pd.DataFrame(dataset.get_true_labels().numpy())

10000
41


In [41]:
# Sanity check: pull one shuffled batch and show the feature / label batch shapes.
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for x, y in itertools.islice(loader, 1):
    print(x.shape, y.shape, sep='\n')

torch.Size([32, 41])
torch.Size([32])


In [28]:
# Spot-check a single sample. Uncomment the next line to use the all-zero
# placeholder dataset instead.
# dataset = ZeroDataset(30, 100, 100)
dataset[10]

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor(0))

In [23]:
# Training hyperparameters.
epochs = 100
num_classes = 2
# Take the model input width from the dataset rather than hard-coding it: the
# previous literal (30) did not match the 41 features per sample that the
# dataset reports, so the model and the data could not agree.
num_features = dataset.get_input_dim()
batchsize = 32

# 80/20 split sizes, used only by the random_split call that is disabled below.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Debug-sized training set: the first 100 samples (fewer if the dataset is smaller).
train_data = Subset(dataset, range(min(100, len(dataset))))
# train_data, test_data = torch.utils.data.random_split(dataset, [train_size, test_size])
trainloader = DataLoader(train_data, batch_size = batchsize, shuffle = True)
# testloader = DataLoader(test_data, batch_size = batchsize)


In [24]:
# Model, loss, metrics, optimiser and learning-rate schedule.
# NOTE(review): the recorded run of this cell stopped at the forward pass with
# "Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of
# size: [32, 41]". trainloader yields (batch, features) rows, while birdNN
# evidently applies conv2d. The model and the dataset layout must be reconciled
# before this loop can run -- TODO.
model = birdNN(input_dim = num_features, output_dim = num_classes,
               num_layers = 4, dropout_rate = 0.2,
               kernelsize = 3, pad = 1, stride_len = 1,
               hidden_dim = 64
        )
criterion = nn.CrossEntropyLoss()
accuracy = metrics.MulticlassAccuracy(num_classes = num_classes)
auroc = metrics.MulticlassAUROC(num_classes = num_classes)
# confusion_matrix = metrics.functional.multiclass_confusion_matrix
optimizer = optim.Adam(model.parameters(), lr = 0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma  = 0.75)
print("Starting Model Training")
model.train()
# Per-epoch history. train_loss holds the summed (not averaged) batch losses.
train_loss = []
train_accuracy = []
train_auroc = []
test_accuracy = []
test_auroc = []
for epoch in range(epochs):
    ## Model Training
    running_loss = 0
    for inputs, presence in trainloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, presence)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        accuracy.update(outputs, presence)
        auroc.update(outputs, presence)
    # Advance the schedule once per epoch, after the optimiser updates.
    # Without this call StepLR never takes effect and the learning rate
    # stays at its initial value for the whole run.
    scheduler.step()
    train_accuracy.append(accuracy.compute().numpy())
    train_auroc.append(auroc.compute().numpy())
    train_loss.append(running_loss)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch: {epoch + 1}\n')
        print(f'Loss: {train_loss[epoch]}\n')
        print(f'Accuracy: {train_accuracy[epoch]}\n')
        print(f'AUROC: {train_auroc[epoch]}\n')
    ## Model Validation (disabled until a test loader is defined)
    # accuracy.reset()
    # auroc.reset()
    # model.eval()
    # with torch.no_grad():
    #     for inputs, presence in testloader:
    #         outputs = model(inputs)
    #         accuracy.update(outputs, presence)
    #         auroc.update(outputs, presence)

    # test_accuracy.append(accuracy.compute().numpy())
    # test_auroc.append(auroc.compute().numpy())
    # if (epoch + 1) % 10 == 0:
    #     print(f'Test Accuracy: {test_accuracy[epoch]}\n')
    #     print(f'Test AUROC: {test_auroc[epoch]}\n')

    # # scores.append(accuracy)
    # Start the next epoch with fresh metric state and the model in training mode.
    accuracy.reset()
    auroc.reset()
    model.train()
    ## Saves model state
    # torch.save(model.state_dict(), f'./model_states/model_{folds}.pth')

Starting Model Training


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [32, 41]