In [39]:
import itertools
import os

import fiona
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import sklearn.model_selection as model_selection

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torcheval.metrics as metrics
from torch.utils.data import DataLoader

In [38]:
# All inputs are read from this folder; the trailing slash is part of the prefix
# because file names are appended by plain string concatenation.
data_folder = "Pre-Processed Dark-Eyed Junco Data/"

# Three CSV tables: checklists, per-checklist environmental variables, and
# environmental variables for the prediction grid.
checklist_zf, env_checklist, env_prediction_grid = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in (
        'checklists_zf_md_deju_jan.csv',
        'environmental_vars_checklists_md_jan.csv',
        'environmental_vars_prediction_grid_md.csv',
    )
)

# Every layer stored in the GeoPackage is loaded into a dict keyed by layer name.
geopackage_path = data_folder + 'gis-data.gpkg'
layer_names = fiona.listlayers(geopackage_path)
gis_layers = {}
for layer_name in layer_names:
    gis_layers[layer_name] = gpd.read_file(geopackage_path, layer = layer_name)

# Band 1 of the prediction-grid raster, wrapped in a DataFrame for inspection.
with rasterio.open(data_folder + 'prediction_grid_md.tif') as src:
    grid_array = src.read(1)
prediction_grid = pd.DataFrame(grid_array)

In [None]:
# if src.transform:
#     cols, rows = np.meshgrid(np.arange(prediction_grid.shape[1]), np.arange(prediction_grid.shape[0]))
#     x, y = src.transform * (cols, rows)
#     prediction_grid['x'] = x.flatten()
#     prediction_grid['y'] = y.flatten()
#     prediction_grid = prediction_grid[['x', 'y', 0]]

Unnamed: 0,geometry
0,"MULTIPOLYGON (((-132.89307 54.14077, -132.9914..."


In [26]:
# First ten checklist rows, then the table's (rows, columns) shape on its own line.
print(checklist_zf.head(10), checklist_zf.shape, sep = '\n')

  checklist_id observer_id   type  observation_count  species_observed  \
0    S21144361  obsr145749   test               12.0              True   
1    S21223704  obsr358359  train                8.0              True   
2    S21568350   obsr36330  train                0.0             False   
3    S21379010   obsr36330   test                0.0             False   
4    S21396728   obsr36330  train                0.0             False   
5    S21319445   obsr36330  train                3.0              True   
6    S21534954  obsr349140  train                0.0             False   
7    S21598685  obsr349140   test                0.0             False   
8    S21386465  obsr414797  train                0.0             False   
9    S21330992  obsr293603  train                0.0             False   

  state_code locality_id   latitude  longitude protocol_type  \
0      US-MD    L1925233  39.291564 -76.818772     Traveling   
1      US-MD    L1925870  39.071241 -76.549682    Station

In [27]:
# First ten rows of the per-checklist environmental variables, then the (rows, columns) shape.
print(env_checklist.head(10), env_checklist.shape, sep = '\n')

  checklist_id  elevation_mean  elevation_sd  ed_c00_water  pland_c00_water  \
0    S21144361      130.517303      8.637477      0.000000         0.000000   
1    S21223704       12.751621      6.732613      2.569487         9.523810   
2    S21568350       76.296135     22.626635      6.166770        14.285714   
3    S21379010       76.296135     22.626635      6.166770        14.285714   
4    S21396728       76.296135     22.626635      6.166770        14.285714   
5    S21319445       76.296135     22.626635      6.166770        14.285714   
6    S21534954       76.296135     22.626635      6.166770        14.285714   
7    S21598685       76.296135     22.626635      6.166770        14.285714   
8    S21386465       76.296135     22.626635      6.166770        14.285714   
9    S21330992       76.296135     22.626635      6.166770        14.285714   

   ed_c01_evergreen_needleleaf  pland_c01_evergreen_needleleaf  \
0                          0.0                             0.0  

In [28]:
# First ten rows of the prediction-grid environmental variables, then the (rows, columns) shape.
print(env_prediction_grid.head(10), env_prediction_grid.shape, sep = '\n')

   cell_id              x             y  elevation_mean  elevation_sd  \
0        2 -227623.701670  71016.284889      633.591003     48.028278   
1        3 -224632.022042  71016.284889      556.442810     51.946083   
2        4 -221640.342413  71016.284889      528.164001     43.113594   
3        5 -218648.662784  71016.284889      609.500244     79.011230   
4        6 -215656.983155  71016.284889      720.016785     76.634804   
5        7 -212665.303527  71016.284889      777.005615     44.189632   
6        8 -209673.623898  71016.284889      786.669800     60.027111   
7        9 -206681.944269  71016.284889      816.645874     54.481831   
8       10 -203690.264640  71016.284889      782.974854     60.780556   
9       11 -200698.585012  71016.284889      716.187378     38.633827   

   ed_c00_water  pland_c00_water  ed_c01_evergreen_needleleaf  \
0           0.0              0.0                          0.0   
1           0.0              0.0                          0.0   
2

In [29]:
# First ten rows of the raster band as a DataFrame, then its (rows, columns) shape.
print(prediction_grid.head(10), prediction_grid.shape, sep = '\n')

   0    1    2    3    4    5    6    7    8    9    ...  119  120  121  122  \
0  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
1  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
2  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
3  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
4  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
5  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
6  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
7  NaN  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
8  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   
9  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  NaN  NaN  NaN  NaN   

   123  124  125  126  127  128  
0  NaN  NaN  NaN  NaN  NaN  NaN  
1  NaN  NaN  NaN  NaN  NaN  NaN  
2  NaN  NaN  NaN 

In [30]:
# Hyper-parameter search space for birdNN.
# The key ORDER matters: the grid-search cell unpacks
# itertools.product(*parameter_grid.values()) positionally, so new keys must be
# added to that unpacking as well.
parameter_grid = {
    'num_layers': [2, 3, 4, 5, 6], 
    'dropout_rate': [0, 0.1, 0.2], 
    'learning_rate': [0.01, 0.005, 0.001],
    'num_conv': [2, 4, 6], 
    'kernelsize': [2, 3, 4, 5], 
    'pad': [0, 1, 2], 
    # Stride must be strictly positive: a stride of 0 is rejected by
    # nn.Conv2d / nn.MaxPool2d, so it is not a searchable value.
    'stride_len': [1, 2], 
    # 'input_dim': 2, 
    'hidden_dim': [64, 128, 256, 512, 1024], 
    # 'output_dim': 1, 
}

In [None]:
junco_data = 

In [40]:
class birdNN(nn.Module):
    """Convolutional feature extractor followed by a fully connected classifier.

    The network is ``num_conv + 2`` convolutions (``input_dim -> 16``, then
    ``num_conv`` times ``16 -> 16``, then ``16 -> 32``), each followed by
    LeakyReLU, max-pooling and dropout. The result is flattened and passed
    through ``input_fc``, ``num_layers`` hidden blocks and ``output_fc``.

    Args:
        num_layers: number of hidden ``Linear(hidden_dim, hidden_dim)`` blocks.
        dropout_rate: dropout probability; ``0`` disables dropout.
        num_conv: number of intermediate ``16 -> 16`` convolutions.
        kernelsize: kernel size shared by every convolution and the pooling layer.
        pad: padding applied by every convolution.
        stride_len: stride shared by every convolution and the pooling layer.
        input_dim: number of channels of the input images.
        hidden_dim: width of the dense layers.
        output_dim: number of output classes.
    """

    def __init__(self, num_layers, dropout_rate, 
                 num_conv, kernelsize, pad, stride_len, 
                 input_dim, hidden_dim, output_dim):
        super().__init__()
        ## Convolution Layers
        conv_kwargs = dict(kernel_size = kernelsize, stride = stride_len, padding = pad)
        self.conv_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv2d(input_dim, 16, **conv_kwargs))
        for _ in range(num_conv):
            self.conv_layers.append(nn.Conv2d(16, 16, **conv_kwargs))
        self.conv_layers.append(nn.Conv2d(16, 32, **conv_kwargs))
        self.pool = nn.MaxPool2d(kernel_size = kernelsize, stride = stride_len)
        self.flatten = nn.Flatten()
        self.activation = nn.LeakyReLU(0.01)
        # Always create the module: forward() calls it unconditionally, and
        # nn.Dropout(0) is simply the identity.
        self.dropout = nn.Dropout(dropout_rate)
        ## Dense Layers
        # The flattened feature count depends on the input's spatial size and on
        # every conv/pool setting, so it is inferred on the first forward pass.
        self.input_fc = nn.LazyLinear(hidden_dim)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.layers.append(nn.LeakyReLU(0.01))
            if dropout_rate:
                self.layers.append(nn.Dropout(dropout_rate))
        self.output_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, output_dim)``.

        ``x`` is passed straight to ``nn.Conv2d``, so it must be a
        ``(batch, input_dim, height, width)`` tensor. The output has already
        been through softmax; a loss that expects raw logits (for example
        ``nn.CrossEntropyLoss``) should not be applied to it directly.
        """
        for conv_layer in self.conv_layers:
            x = self.activation(conv_layer(x))
            x = self.pool(x)
            x = self.dropout(x)
        x = self.flatten(x)
        x = self.activation(self.input_fc(x))
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x)
        x = self.output_fc(x)
        # Explicit class axis; relying on the implicit dim is deprecated.
        return F.softmax(x, dim = 1)

In [None]:
# Grid search with k-fold cross-validation: every hyper-parameter combination
# is trained once per fold and scored by the mean, over folds, of the test
# accuracy reached after the last epoch.

# NOTE(review): with epochs = 0 the epoch loop never runs, no model is trained
# and every configuration scores NaN. Set a positive value for a real search.
epochs = 0
num_classes = 2
folds = 5
results = {}
batchsize = 10
checkpoint_dir = './model_states'
os.makedirs(checkpoint_dir, exist_ok = True)  # torch.save does not create directories

kfold = model_selection.KFold(n_splits = folds, shuffle = True)
# Split plain positions instead of the dataset object itself; the indices are
# only used to build torch Subsets below.
sample_indices = np.arange(len(junco_data))
# Items are unpacked as (inputs, presence) in the loops below and inputs are fed
# to Conv2d, so the leading axis of one sample is taken as the channel count.
# Assumes channel-first (C, H, W) samples — TODO confirm once junco_data is defined.
input_dim = junco_data[0][0].shape[0]

for params in itertools.product(*parameter_grid.values()):
    (num_layers, dropout_rate, learning_rate, num_conv,
     kernelsize, pad, stride_len, hidden_dim) = params
    scores = []
    # No enumerate here: the original loop variable was named `folds` and
    # silently overwrote the fold count defined above.
    for train_index, test_index in kfold.split(sample_indices):
        train_subset = torch.utils.data.Subset(junco_data, train_index.tolist())
        test_subset = torch.utils.data.Subset(junco_data, test_index.tolist())

        trainloader = DataLoader(train_subset, batch_size = batchsize, shuffle = True)
        # Evaluation order does not affect the metrics, so no shuffling is needed.
        testloader = DataLoader(test_subset, batch_size = batchsize, shuffle = False)

        model = birdNN(num_layers = num_layers, dropout_rate = dropout_rate, 
                       num_conv = num_conv, kernelsize = kernelsize, pad = pad, stride_len = stride_len, 
                       input_dim = input_dim, hidden_dim = hidden_dim, output_dim = num_classes
                )
        # One throw-away forward pass so that any lazily shaped layer is
        # materialised before the optimizer collects the parameters.
        with torch.no_grad():
            model(next(iter(trainloader))[0])

        # The model returns softmax probabilities, so the loss is the negative
        # log-likelihood of their log; CrossEntropyLoss would apply softmax twice.
        criterion = nn.NLLLoss()
        accuracy = metrics.MulticlassAccuracy()
        auroc = metrics.MulticlassAUROC(num_classes = num_classes)
        # confusion_matrix = metrics.functional.multiclass_confusion_matrix
        optimizer = optim.Adam(model.parameters(), lr = learning_rate)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma  = 0.75)

        print("Starting Model Training")
        train_loss = []
        train_accuracy = []
        train_auroc = []

        test_accuracy = []
        test_auroc = []

        for epoch in range(epochs):
            ## Model Training
            model.train()
            accuracy.reset()
            auroc.reset()
            running_loss = 0
            for inputs, presence in trainloader:
                optimizer.zero_grad()
                outputs = model(inputs)
                # Clamp before the log so an underflowed probability cannot give -inf.
                loss = criterion(torch.log(outputs.clamp_min(1e-12)), presence)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                accuracy.update(outputs.detach(), presence)
                auroc.update(outputs.detach(), presence)
            train_accuracy.append(accuracy.compute().item())
            train_auroc.append(auroc.compute().item())
            train_loss.append(running_loss)
            # Advance the learning-rate schedule once per epoch.
            scheduler.step()

            if (epoch + 1) % 10 == 0:
                print(f'Epoch: {epoch + 1}\n')
                print(f'Loss: {train_loss[epoch]}\n')
                print(f'Accuracy: {train_accuracy[epoch]}\n')
                print(f'AUROC: {train_auroc[epoch]}\n')

            ## Model Validation
            accuracy.reset()
            auroc.reset()
            model.eval()
            with torch.no_grad():
                for inputs, presence in testloader:
                    outputs = model(inputs)
                    accuracy.update(outputs, presence)
                    auroc.update(outputs, presence)
            
            test_accuracy.append(accuracy.compute().item())
            test_auroc.append(auroc.compute().item())

            if (epoch + 1) % 10 == 0:
                print(f'Test Accuracy: {test_accuracy[epoch]}\n')
                print(f'Test AUROC: {test_auroc[epoch]}\n')

            ## Saves model state
            # state_dict must be CALLED; saving the bound method pickles the
            # whole module object instead of its weights.
            torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.pth'))

        # Score the fold by its final-epoch test accuracy (a plain float, so
        # np.mean below works and later metric resets cannot change it).
        if test_accuracy:
            scores.append(test_accuracy[-1])
    avg_score = np.mean(scores) if scores else float('nan')
    results[params] = avg_score