In [16]:
import pandas as pd
import os
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import numpy as np
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Reading in the data

In [17]:
# Directory that holds the model-input parquet file. This is a machine-specific
# absolute path, kept with a trailing slash so it can be used directly as a path prefix.
inputs = "C:/Users/Aidan/OneDrive - Simon Fraser University (1sfu)/Forest Fire Data Sets/Model/Input/"

In [18]:
for file in os.listdir(inputs):
    # Only read the parquet file - the directory is expected to contain exactly one.
    if file.endswith('.parquet'):
        print(file)
        # os.path.join works whether or not `inputs` ends with a path separator.
        data = pd.read_parquet(os.path.join(inputs, file))

part-00000-3cf16e0f-ca38-45f7-8eaa-c6026341b7ec-c000.snappy.parquet


In [19]:
# Display the loaded DataFrame using the notebook's rich repr.
data

Unnamed: 0,squareID,date,inputs,hasFire
0,0,1942-07-15,,0
1,0,1943-01-11,,0
2,0,1943-03-03,,0
3,0,1943-06-18,,0
4,0,1943-10-30,,0
...,...,...,...,...
1054525,77309411328,2019-10-17,,0
1054526,77309411328,2020-03-21,,0
1054527,77309411328,2020-04-01,,0
1054528,77309411328,2020-05-14,,0


# Modelling

## Functions and Modelling

In [None]:
def train_val_dataset(dataset, val_split=0.25, random_state=None):
    """Randomly split ``dataset`` into a training and a validation subset.

    Parameters
    ----------
    dataset : object supporting ``len()`` and integer indexing
        The dataset to split.
    val_split : float or int, default 0.25
        Passed to ``train_test_split`` as ``test_size``; the share (or count)
        of samples placed in the validation subset.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. Pass an integer to get the
        same split on every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (Subset, Subset)
        The training subset followed by the validation subset.
    """
    # CITATION: https://discuss.pytorch.org/t/how-to-split-dataset-into-test-and-validation-sets/33987/5
    indices = list(range(len(dataset)))
    train_idx, val_idx = train_test_split(indices, test_size=val_split, random_state=random_state)
    return Subset(dataset, train_idx), Subset(dataset, val_idx)

In [None]:
# determine the supported device
def get_device():
    """Return the first CUDA device when CUDA is available, otherwise the CPU device."""
    # Without a usable GPU we fall back to the CPU.
    return torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
class ModelInput(Dataset):
    """Dataset backed by the model-input parquet file.

    Each item is a ``(sequence, label)`` pair:

    * ``sequence`` is the row's ``inputs`` value converted to a tensor.
    * ``label`` is a tensor holding ``[squareID, date as a YYYYMMDD integer, hasFire]``.
    """
    # CITATION: https://www.youtube.com/watch?v=PXOzkkB5eH0
    def __init__(self, input_path="C:/Users/Aidan/OneDrive - Simon Fraser University (1sfu)/Forest Fire Data Sets/Model/Input/"):
        """Load the parquet file found in ``input_path``.

        Parameters
        ----------
        input_path : str
            Directory expected to contain a single ``.parquet`` file. A
            trailing separator is optional.

        Raises
        ------
        FileNotFoundError
            If ``input_path`` contains no ``.parquet`` file.
        """
        parquet_files = [name for name in os.listdir(input_path) if name.endswith('.parquet')]
        if not parquet_files:
            # Previously this fell through to an UnboundLocalError on `data`.
            raise FileNotFoundError("No .parquet file found in %r" % (input_path,))
        # Only a single parquet file is expected; if there are several, the last
        # one in listing order is used (the same file the original loop ended on).
        data = pd.read_parquet(os.path.join(input_path, parquet_files[-1]))
        self.x = data['inputs'].values
        self.y = data[['squareID','date','hasFire']].values
        self.n_samples = len(data)

    def __getitem__(self, index):
        """Return the ``(sequence, label)`` tensors for the row at ``index``."""
        sequence = self.preprocess_x(self.x[index])
        label = self.preprocess_y(self.y[index])
        return sequence, label

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return self.n_samples

    def preprocess_x(self, sequence):
        """Stack the per-day entries of ``sequence`` into a single tensor.

        ``sequence`` is iterated once and its elements are stacked with
        ``np.array``; the resulting dtype and shape follow from the elements.
        """
        # CITATION: https://towardsdatascience.com/dataloader-for-sequential-data-using-pytorch-deep-learning-framework-part-2-ed3ad5f6ad82
        return torch.from_numpy(np.array(list(sequence)))

    def preprocess_y(self, values):
        """Convert a ``[squareID, date, hasFire]`` row into a tensor.

        ``values[1]`` must provide ``strftime``; it is encoded as the integer
        ``YYYYMMDD`` so that all three entries fit in one numeric tensor.
        """
        return torch.tensor(np.array([values[0],
                                      int(values[1].strftime("%Y%m%d")),
                                      values[2]]))

In [None]:
# Creating the dataset (reads the parquet file from ModelInput's default input directory)
modelInput = ModelInput()

# Creating the split: val_split=0.1 holds out 10% of the samples for validation
train_data, validation_data = train_val_dataset(modelInput, val_split=0.1)

In [None]:
# Defining the dataloaders
# batch_size=1: every batch holds exactly one sequence.
# Training batches are shuffled; validation batches keep their order.
# NOTE: there is a bug in pytorch, have to set num_workers=0 for the data loaders to work
# NOTE(review): presumably the worker-process limitation in notebooks on Windows - confirm.
trainLoader = DataLoader(train_data, batch_size=1, shuffle=True, num_workers=0)
validLoader = DataLoader(validation_data, batch_size=1, shuffle=False, num_workers=0)

In [None]:
# Defining the max number of epochs (full passes over the training loader)
NUM_EPOCHS = 20

In [None]:
# Defining the device so I don't keep calling the function
# (cuda:0 when CUDA is available, otherwise the CPU)
device = get_device()

In [None]:
class Model(nn.Module):
    """LSTM followed by a small fully connected head with one sigmoid output.

    Every layer is converted to float64 (``.double()``), so ``forward`` must be
    given a double tensor. The model processes one sequence per call.
    """
    # CITATION: https://github.com/avickars/NHL-Database/blob/master/Analysis/player-valuation-deep_rl.ipynb
    def __init__(self):
        super().__init__()
        self.inputSize = 4          # features per time step fed to the LSTM
        self.numLSTMNodes = 100     # LSTM hidden size
        self.numLSTMLayers = 1

        self.lstmLayer = nn.LSTM(input_size=self.inputSize,
                                 hidden_size=self.numLSTMNodes,
                                 num_layers=self.numLSTMLayers,
                                 bias=True,
                                 dropout=0,
                                 batch_first=True).double()
        self.hidden1 = nn.Linear(in_features=self.numLSTMNodes, out_features=100).double()
        self.hidden2 = nn.Linear(in_features=100, out_features=100).double()
        self.output = nn.Linear(in_features=100, out_features=1).double()

    def forward(self, modelInput):
        """Score a single sequence.

        Parameters
        ----------
        modelInput : torch.Tensor (double)
            One sequence whose first dimension indexes time steps; each step
            must flatten to ``inputSize`` values. Sequences may have any
            non-zero length.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(1, 1, 1)`` holding the sigmoid output computed
            from the LSTM output at the last time step.

        Raises
        ------
        ValueError
            If ``modelInput`` contains no time steps.
        """
        num_steps = modelInput.shape[0]
        if num_steps == 0:
            raise ValueError("modelInput must contain at least one time step")

        # Random-normal initial (h0, c0), created on the input's device so the
        # model runs on CPU as well as on GPU (torch.cuda.FloatTensor is CUDA-only).
        state_shape = (self.numLSTMLayers, 1, self.numLSTMNodes)
        hidden = (
            torch.randn(state_shape, dtype=torch.double, device=modelInput.device),
            torch.randn(state_shape, dtype=torch.double, device=modelInput.device),
        )

        # Feed the whole sequence as one batch of size 1. With batch_first=True
        # the LSTM iterates the time steps itself, so sequences of any length are
        # handled without a Python-level loop.
        steps = modelInput.reshape(1, num_steps, -1)
        out, hidden = self.lstmLayer(steps, hidden)
        # Keep only the last time step, shape (1, 1, numLSTMNodes).
        out = out[:, -1:, :]

        t = F.relu(out)
        t = F.relu(self.hidden1(t))
        t = F.relu(self.hidden2(t))
        t = torch.sigmoid(self.output(t))
        return t

In [None]:
# Defining the model
model = Model()
# Moving Model to GPU (if available) otherwise it just stays on the CPU
# (as the last expression of the cell, this also displays the module summary)
model.to(device)

In [None]:
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.5)
# Mean squared error between the model's sigmoid output and the hasFire label.
# NOTE(review): for a 0/1 target behind a sigmoid, nn.BCELoss is the more usual choice - confirm MSE is intended.
criterion = nn.MSELoss()

In [None]:
# CITATION: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
log_every = 1000  # number of mini-batches between progress prints

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    # Loop variables are named `batch` / `sequences` (not `data` / `inputs`) so the
    # notebook-level DataFrame and input path are not overwritten by training.
    for i, batch in enumerate(trainLoader, 0):
        # get the inputs
        sequences, labels = batch

        sequences = sequences.to(device)
        # Column 2 of the label tensor is hasFire.
        actualLabel = labels[:, 2].to(device)

        # Clear gradients left over from the previous step; backward() accumulates.
        optimizer.zero_grad()

        # batch_size is 1, so sequences[0] is the single sequence in this batch.
        output = model(sequences[0].double())

        # Flatten the (1, 1, 1) model output so it matches the (1,) label shape.
        loss = criterion(output.view(-1), actualLabel.double())
        loss.backward()

        # Apply the update - without this the parameters never change.
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % log_every == log_every - 1:    # print every `log_every` mini-batches
            print('[%d, %5d] loss: %.10f' %
                  (epoch + 1, i + 1, running_loss / log_every))
            running_loss = 0.0

In [None]:
# Persist only the model's parameters (its state_dict) to model.pt in the working directory.
torch.save(model.state_dict(), './model.pt')