In [None]:
import numpy as np
import pandas as pd
import itertools
import geopandas as gpd
import rasterio
import fiona
import sklearn.model_selection as model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
import torcheval.metrics as metrics

import matplotlib.pyplot as plt

In [21]:
torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device(type='cuda')

In [None]:
# parameter_grid = {
#     'num_layers': [2, 3, 4, 5, 6], 
#     'dropout_rate': [0, 0.1, 0.2], 
#     'learning_rate': [0.01, 0.005, 0.001],
#     'num_conv': [2, 4, 6], 
#     'kernelsize': [2, 3, 4, 5], 
#     'pad': [0, 1, 2], 
#     'stride_len': [1, 2],
#     'hidden_dim': [64, 128, 256, 512, 1024],
# }

In [None]:
# junco_data = pd.merge(checklist_zf, env_checklist, how = 'left', left_index = True, right_index = True)
# junco_data.reset_index(inplace=True)
# junco_data.drop(labels = ['checklist_id', 'observer_id', 'type', 'state_code', 'locality_id', 'protocol_type', 'observation_date'], axis = 1, inplace = True)

In [None]:
class JuncoDataset(Dataset):
    def __init__(self, tensor_data, labels):
        self.tensor_data = tensor_data
        self.labels = labels

    def __len__(self):
        return len(self.tensor_data)

    def __getitem__(self, idx):
        return self.tensor_data[idx], self.labels[idx]


class JuncoDatasetBuilder:
    def __init__(self, dataframe, feature_cols=None, label_col='species_observed',
                 num_lat_bins=100, num_lon_bins=100, fill_missing=True, fill_method:'zeros'):
        self.df = dataframe.copy()
        self.label_col = label_col
        self.num_lat_bins = num_lat_bins
        self.num_lon_bins = num_lon_bins
        self.fill_missing = fill_missing
        self.fill_method = fill_method

        if feature_cols is None:
            exclude = ['latitude', 'longitude', 'lat_bin', 'lon_bin', label_col]
            self.feature_cols = [col for col in self.df.columns if col not in exclude]
        else:
            self.feature_cols = feature_cols

        self.grid = None
        self.labels = None

    def bin_coordinates(self):
        lat_min, lat_max = self.df['latitude'].min(), self.df['latitude'].max()
        lon_min, lon_max = self.df['longitude'].min(), self.df['longitude'].max()

        lat_bins = np.linspace(lat_min, lat_max, self.num_lat_bins + 1)
        lon_bins = np.linspace(lon_min, lon_max, self.num_lon_bins + 1)

        self.df['lat_bin'] = np.digitize(self.df['latitude'], lat_bins) - 1
        self.df['lon_bin'] = np.digitize(self.df['longitude'], lon_bins) - 1

        self.df['lat_bin'] = self.df['lat_bin'].clip(0, self.num_lat_bins - 1)
        self.df['lon_bin'] = self.df['lon_bin'].clip(0, self.num_lon_bins - 1)

    def build_grid(self):
        self.bin_coordinates()
        grid = np.zeros((self.num_lat_bins, self.num_lon_bins, len(self.feature_cols)), dtype=np.float32)
        counts = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int32)

        for _, row in self.df.iterrows():
            i, j = row['lat_bin'], row['lon_bin']
            grid[i, j] += row[self.feature_cols].values.astype(np.float32)
            counts[i, j] += 1

        nonzero_mask = counts > 0
        grid[nonzero_mask] /= counts[nonzero_mask, None]

        if self.fill_missing:
            if self.fill_method == 'mean':
                global_mean = np.nanmean(grid[nonzero_mask], axis=0)
                grid[~nonzero_mask] = global_mean
            if self.fill_method == 'zeros':
                grid[~nonzero_mask] = 0

        self.grid = grid

    def extract_labels(self):
        labels = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int64)
        # Assumes 0 or 1 labels. If multiple values, majority voting or other logic needed
        for _, row in self.df.iterrows():
            i, j = row['lat_bin'], row['lon_bin']
            labels[i, j] = row[self.label_col]
        self.labels = labels

    def get_dataset(self):
        if self.grid is None:
            self.build_grid()
        if self.labels is None:
            self.extract_labels()

        tensor_data = torch.from_numpy(self.grid).permute(2, 0, 1).unsqueeze(0)  # (1, C, H, W)
        labels = torch.tensor(self.labels).unsqueeze(0)  # (1, H, W) or flatten if needed
        return JuncoDataset(tensor_data, labels)


In [None]:
class JuncoDataset(Dataset):
    def __init__(self, tensor_data, labels):
        self.tensor_data = tensor_data
        self.labels = labels
        self.C, self.H, self.W = tensor_data.shape

    def __len__(self):
        return self.H * self.W

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

    def get_input_dim(self):
        return self.C

    def get_true_labels(self):
        return self.labels


class JuncoDatasetBuilder:
    def __init__(self, dataframe, feature_cols=None, label_col='species_observed',
                 num_lat_bins=100, num_lon_bins=100, fill_missing=True, fill_method='zeros'):
        self.df = dataframe.copy()
        self.label_col = label_col
        self.num_lat_bins = num_lat_bins
        self.num_lon_bins = num_lon_bins
        self.fill_missing = fill_missing
        self.fill_method = fill_method

        if feature_cols is None:
            exclude = ['latitude', 'longitude', 'lat_bin', 'lon_bin', label_col]
            self.feature_cols = [col for col in self.df.columns if col not in exclude]
        else:
            self.feature_cols = feature_cols

        self.grid = None
        self.labels = None

    def bin_coordinates(self):
        lat_min, lat_max = self.df['latitude'].min(), self.df['latitude'].max()
        lon_min, lon_max = self.df['longitude'].min(), self.df['longitude'].max()

        lat_bins = np.linspace(lat_min, lat_max, self.num_lat_bins + 1)
        lon_bins = np.linspace(lon_min, lon_max, self.num_lon_bins + 1)

        self.df['lat_bin'] = np.digitize(self.df['latitude'], lat_bins) - 1
        self.df['lon_bin'] = np.digitize(self.df['longitude'], lon_bins) - 1

        self.df['lat_bin'] = self.df['lat_bin'].clip(0, self.num_lat_bins - 1)
        self.df['lon_bin'] = self.df['lon_bin'].clip(0, self.num_lon_bins - 1)

    def build_grid(self):
        self.bin_coordinates()
        grid = np.zeros((self.num_lat_bins, self.num_lon_bins, len(self.feature_cols)), dtype=np.float32)
        counts = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int32)

        for _, row in self.df.iterrows():
            i, j = row['lat_bin'], row['lon_bin']
            grid[i, j] += row[self.feature_cols].values.astype(np.float32)
            counts[i, j] += 1

        nonzero_mask = counts > 0
        grid[nonzero_mask] /= counts[nonzero_mask, None]

        if self.fill_missing:
            if self.fill_method == 'mean':
                global_mean = np.nanmean(grid[nonzero_mask], axis=0)
                grid[~nonzero_mask] = global_mean
            if self.fill_method == 'zeros':
                grid[~nonzero_mask] = 0

        self.grid = grid

    def extract_labels(self):
        labels = np.zeros((self.num_lat_bins, self.num_lon_bins), dtype=np.int64)
        for _, row in self.df.iterrows():
            i, j = row['lat_bin'], row['lon_bin']
            labels[i, j] = row[self.label_col]
        self.labels = labels

    def get_dataset(self):
        if self.grid is None:
            self.build_grid()
        if self.labels is None:
            self.extract_labels()

        tensor_data = torch.from_numpy(self.grid).permute(2, 0, 1)
        labels = torch.tensor(self.labels)
        return JuncoDataset(tensor_data, labels)


In [None]:
def infer_flattened_size(model, input_shape):
    with torch.no_grad():
        dummy = torch.zeros(1, *input_shape)
        out = model(dummy)
        return out.view(1, -1).shape[1]

class birdNN(nn.Module):
    def __init__(self, num_layers, dropout_rate, kernelsize, pad, stride_len, 
                 input_dim, hidden_dim, output_dim):
        super(birdNN, self).__init__()
        ## Convolution Layers
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.conv_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv2d(input_dim, 16,
                                        kernel_size=kernelsize, stride=stride_len,
                                        padding=pad))
        self.conv_layers.append(nn.Conv2d(16, 32,
                                        kernel_size=kernelsize, stride=stride_len,
                                        padding=pad))
        self.conv_layers.append(nn.Conv2d(32, 32,
                                        kernel_size=kernelsize, stride=stride_len,
                                        padding=pad))
        self.conv_layers.append(nn.Conv2d(32, 64,
                                        kernel_size=kernelsize, stride=stride_len,
                                        padding=pad))
        self.pool = nn.MaxPool2d(kernel_size = kernelsize, stride = stride_len)
        self.flatten = nn.Flatten()
        self.activation = nn.LeakyReLU(0.01)
        self.dropout = nn.Dropout(dropout_rate) if dropout_rate else nn.Identity()
        ## Dense Layers
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append(nn.LeakyReLU(0.01))
            self.layers.append(nn.Dropout(dropout_rate) if dropout_rate else nn.Identity())
        self.output_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        for conv_layer in self.conv_layers:
            x = self.activation(conv_layer(x))
            x = self.pool(x)
            x = self.dropout(x)
        x = self.flatten(x)
        flatten_dim = x.shape[1]
        self.input_fc = nn.Linear(flatten_dim, self.hidden_dim)
        x = self.activation(self.input_fc(x))
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x)
        x = self.output_fc(x)
        return x

In [None]:
def conv_dim(input_dim, pad, kernelsize, stride_len):
    return int(((input_dim + (2*pad) - kernelsize) / stride_len) + 1)

def pool_dim(input_dim, kernelsize, stride_len):
    return int(((input_dim - kernelsize) / stride_len) + 1)

class birdNN(nn.Module):
    def __init__(self, num_layers, dropout_rate, 
                 num_conv, kernelsize, pad, stride_len, 
                 input_dim, hidden_dim, output_dim):
        super(birdNN, self).__init__()
        ## Convolution Layers
        self.conv_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv2d(input_dim, 16, 
                                          kernel_size = kernelsize, stride = stride_len, 
                                          padding = pad))
        conv_size = conv_dim(input_dim, pad, kernelsize, stride_len)
        for _ in range(num_conv):
            self.conv_layers.append(nn.Conv2d(16, 16, 
                                          kernel_size = kernelsize, stride = stride_len, 
                                          padding = pad))
            conv_size = conv_dim(conv_size, pad, kernelsize, stride_len)
        self.conv_layers.append(nn.Conv2d(16, 32, 
                                          kernel_size = kernelsize, stride = stride_len, 
                                          padding = pad))
        conv_size = conv_dim(conv_size, pad, kernelsize, stride_len)
        self.pool = nn.MaxPool2d(kernel_size = kernelsize, stride = stride_len)
        pool_size = pool_dim(conv_size, kernelsize, stride_len)
        self.flatten = nn.Flatten()
        self.activation = nn.LeakyReLU(0.01)

        self.dropout = nn.Dropout(dropout_rate) if dropout_rate else nn.Identity()
        ## Dense Layers
        self.input_fc = nn.Linear(int(32 * pool_size * pool_size), hidden_dim)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append(nn.LeakyReLU(0.01))
            self.layers.append(nn.Dropout(dropout_rate) if dropout_rate else nn.Identity())
        self.output_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        for conv_layer in self.conv_layers:
            x = self.activation(conv_layer(x))
            x = self.pool(x)
            x = self.dropout(x)
        # x = self.pool(x)
        # x = self.dropout(x)
        x = self.flatten(x)
        x = self.activation(self.input_fc(x))
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x)
        x = self.output_fc(x)
        return x

In [93]:
builder = JuncoDatasetBuilder(junco_data, label_col='species_observed',
                              num_lat_bins = 1000, num_lon_bins = 1000)
dataset = builder.get_dataset()
print(len(dataset))
print(dataset.get_input_dim())
print(dataset.get_true_labels())


1000000
41
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


In [95]:
epochs = 100
num_classes = 2
num_features = dataset.get_input_dim()
batchsize = 32
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = torch.utils.data.random_split(dataset, [train_size, test_size])
trainloader = DataLoader(train_data, batch_size = batchsize, shuffle = True)
testloader = DataLoader(test_data, batch_size = batchsize)
model = birdNN(input_dim = num_features, output_dim = num_classes,
               num_layers = 4, dropout_rate = 0.2,
               kernelsize = 3, pad = 1, stride_len = 1, 
               hidden_dim = 64
        )
criterion = nn.CrossEntropyLoss()
accuracy = metrics.MulticlassAccuracy(num_classes = num_classes)
auroc = metrics.MulticlassAUROC(num_classes = num_classes)
# confusion_matrix = metrics.functional.multiclass_confusion_matrix
optimizer = optim.Adam(model.parameters(), lr = 0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma  = 0.75)
print("Starting Model Training")
model.train()
train_loss = []
train_accuracy = []
train_auroc = []
test_accuracy = []
test_auroc = []
for epoch in range(epochs):
    ## Model Training
    running_loss = 0
    inputs = train_data.dataset.tensor_data
    presence = train_data.dataset.labels
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, presence)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    accuracy.update(outputs, presence)
    auroc.update(outputs, presence)
    train_accuracy.append(accuracy.compute().numpy())
    train_auroc.append(auroc.compute().numpy())
    train_loss.append(running_loss)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch: {epoch + 1}\n')
        print(f'Loss: {train_loss[epoch]}\n')
        print(f'Accuracy: {train_accuracy[epoch]}\n')
        print(f'AUROC: {train_auroc[epoch]}\n')
    ## Model Validation
    # accuracy.reset()
    # auroc.reset()
    # model.eval()
    # with torch.no_grad():
    #     for inputs, presence in testloader:
    #         outputs = model(inputs)
    #         accuracy.update(outputs, presence)
    #         auroc.update(outputs, presence)
    
    # test_accuracy.append(accuracy.compute().numpy())
    # test_auroc.append(auroc.compute().numpy())
    # if (epoch + 1) % 10 == 0:
    #     print(f'Test Accuracy: {test_accuracy[epoch]}\n')
    #     print(f'Test AUROC: {test_auroc[epoch]}\n')
    
    # # scores.append(accuracy)
    accuracy.reset()
    auroc.reset()
    model.train()
    ## Saves model state
    # torch.save(model.state_dict(), f'./model_states/model_{folds}.pth')

Starting Model Training


ValueError: Expected input batch_size (64) to match target batch_size (1000).

In [59]:
for x, y in trainloader:
    print(x.shape())

IndexError: index 359314 is out of bounds for dimension 0 with size 41

In [None]:
epochs = 10
num_classes = 2
num_features = len(junco_data.columns) - 3
folds = 5
results = {}
batchsize = 10
kfold = model_selection.KFold(n_splits = folds, shuffle = True, random_state = 6260)
for params in itertools.product(*parameter_grid.values()):
    num_layers, dropout_rate, learning_rate, num_conv, kernelsize, pad, stride_len, hidden_dim = params
    scores = []
    for folds, (train_index, test_index) in enumerate(kfold.split(dataset)):
        train_subset = Subset(dataset, train_index)
        test_subset = Subset(dataset, test_index)
        trainloader = DataLoader(train_subset, batch_size = batchsize, shuffle = True)
        testloader = DataLoader(test_subset, batch_size = batchsize)

        model = birdNN(input_dim = num_features, output_dim = num_classes,
                       num_layers = num_layers, dropout_rate = dropout_rate, 
                       num_conv = num_conv, kernelsize = kernelsize, pad = pad, stride_len = stride_len, 
                       hidden_dim = hidden_dim
                )
        criterion = nn.CrossEntropyLoss()
        accuracy = metrics.MulticlassAccuracy(num_classes = num_classes)
        auroc = metrics.MulticlassAUROC(num_classes = num_classes)
        # confusion_matrix = metrics.functional.multiclass_confusion_matrix
        optimizer = optim.Adam(model.parameters(), lr = learning_rate)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma  = 0.75)

        print("Starting Model Training")
        model.train()
        train_loss = []
        train_accuracy = []
        train_auroc = []

        test_accuracy = []
        test_auroc = []

        for epoch in range(epochs):
            ## Model Training
            running_loss = 0
            for inputs, presence in trainloader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, presence)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                accuracy.update(outputs, presence)
                auroc.update(outputs, presence)
            train_accuracy.append(accuracy.compute().numpy())
            train_auroc.append(auroc.compute().numpy())
            train_loss.append(running_loss)

            if (epoch + 1) % 10 == 0:
                print(f'Epoch: {epoch + 1}\n')
                print(f'Loss: {train_loss[epoch]}\n')
                print(f'Accuracy: {train_accuracy[epoch]}\n')
                print(f'AUROC: {train_auroc[epoch]}\n')

            ## Model Validation
            accuracy.reset()
            auroc.reset()
            model.eval()
            with torch.no_grad():
                for inputs, presence in testloader:
                    outputs = model(inputs)
                    accuracy.update(outputs, presence)
                    auroc.update(outputs, presence)
            
            test_accuracy.append(accuracy.compute().numpy())
            test_auroc.append(auroc.compute().numpy())

            if (epoch + 1) % 10 == 0:
                print(f'Test Accuracy: {test_accuracy[epoch]}\n')
                print(f'Test AUROC: {test_auroc[epoch]}\n')
            

            scores.append(accuracy)
            accuracy.reset()
            auroc.reset()
            model.train()

            ## Saves model state
            torch.save(model.state_dict(), f'./model_states/model_{folds}.pth')
    avg_score = np.mean(scores)
    results[params] = avg_score

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.