# DEEPLY PEEVED: Neural Nets for Volcano Prediction


In [94]:
import numpy as np
from util import load_hypocenters, PuuOo, load_puuoo_eqs
from matplotlib import pyplot as plt
import datetime
from sklearn import ensemble as ml_models
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim

%matplotlib inline

## Prepare dataset/dataloader

In [95]:
from __future__ import absolute_import, print_function

import os
import os.path as osp
from glob import glob

import numpy as np
import random
import scipy.io as sio
import torch
from torch.utils import data
from util import load_hypocenters, PuuOo, load_puuoo_eqs


class BaseEarthquakes(data.Dataset):
    """Earthquake and Eruption Dataset"""

    def __init__(self, root, eruption_csv_path, eq_csv_path, split):
        self.root  = root
        self.split = split
        self.eruption_csv_path = eruption_csv_path
        self.eq_csv_path = eq_csv_path
        self._load_data()
    
    
    def _load_data(self):
        # Create data list via train, val split
        p = PuuOo(eruption_csv_path)
        time, lat, lon, depth, mag = load_puuoo_eqs(eq_csv_path)
        
        if self.split in ["train", "val"]:
            random.seed(0)
            percent_train = 0.8 
            
            # Make additional array for erupting or not
            erupt = np.array([p.was_erupting(t) for t in time])
            
            # Get indices of eruption and non-eruption earthquakes so we can split both
            eruption_idx    = [i for i, e in enumerate(erupt) if e == True]
            no_eruption_idx = [i for i, e in enumerate(erupt) if e == False]

            num_train_eruptions = int(percent_train * len(eruption_idx))
            num_val_eruptions   = len(eruption_idx) - num_train_eruptions

            num_train_no_eruptions = int(percent_train * len(no_eruption_idx))
            num_val_no_eruptions   = len(no_eruption_idx) - num_train_eruptions

            train_idx = sorted(random.sample(eruption_idx, num_train_eruptions))
            val_idx   = sorted(list(set(eruption_idx) - set(train_idx)))
            train_idx += sorted(random.sample(no_eruption_idx, num_train_no_eruptions))
            val_idx   += sorted(list(set(no_eruption_idx) - set(train_idx)))
            
            if self.split == "train":
                idx = train_idx
            elif self.split == "val":
                idx = val_idx
            
            # Shuffle for data loader
            random.shuffle(idx)
            
            self.time = np.array(time)[idx]
            self.lat = np.array(lat)[idx]
            self.lon = np.array(lon)[idx]
            self.depth = np.array(depth)[idx]
            self.mag = np.array(mag)[idx]
            self.erupt = np.array(erupt)[idx]
                   
        else:
            raise ValueError("Invalid split name: {}".format(self.split))

    def _get_label_weights(self):
        # Get weights for a given dataset
        ids = [self._get_disaster_id(sample_name) for sample_name in self.files]
        id_counts = Counter(ids)

        total = len(ids)

        weights = [min(total/id_counts[i], 15) for i in range(self.num_classes)]

        return weights
    
    def __getitem__(self, index):
        raise NotImplementedError
        
    def __len__(self):
        return len(self.erupt)


class NoDerivedFeatures(BaseEarthquakes):
    
    def __init__(self, **kwargs):
        super(NoDerivedFeatures, self).__init__(**kwargs)
    
    def __getitem__(self, index):
        y = self.erupt[index]
        x = np.array([self.lat[index], self.lon[index], self.depth[index], self.mag[index]])
        return x,y
        
    

In [118]:
eruption_csv_path = 'PuuOo.csv'
eq_csv_path       = 'puuoo_earthquakes.csv' 

dataset_train = NoDerivedFeatures(
        root=".",
        eruption_csv_path=eruption_csv_path, 
        eq_csv_path=eq_csv_path,    
        split="train",
    )



dataset_val = NoDerivedFeatures(
        root=".",
        eruption_csv_path=eruption_csv_path, 
        eq_csv_path=eq_csv_path,    
        split="train",
    )

loader_train = DataLoader(dataset_train, batch_size=50)
loader_val = DataLoader(dataset_val, batch_size=50)


## Build Model

In [149]:
def get_two_layer_model(input_features, hidden_layer_sizes=[5000,5000], output_size=2):

    model = nn.Sequential(
        nn.Linear(input_features, hidden_layer_sizes[0]),
        nn.ReLU(),
        nn.Linear(hidden_layer_sizes[0], hidden_layer_sizes[1]),
        nn.ReLU(),
        nn.Linear(hidden_layer_sizes[1],output_size),
        nn.Sigmoid()
    )
    
    return model.double()

def get_four_layer_model(input_features, hidden_layer_sizes=[1000,1000,1000,1000], output_size=2):

    model = nn.Sequential(
        nn.Linear(input_features, hidden_layer_sizes[0]),
        nn.ReLU(),
        nn.Linear(hidden_layer_sizes[0], hidden_layer_sizes[1]),
        nn.ReLU(),
        nn.Linear(hidden_layer_sizes[1], hidden_layer_sizes[2]),
        nn.ReLU(),
        nn.Linear(hidden_layer_sizes[2], hidden_layer_sizes[3]),
        nn.ReLU(),
        nn.Linear(hidden_layer_sizes[3],output_size),
        nn.Sigmoid()
    )
    
    return model.double()


In [151]:
model = get_four_layer_model(4)
print(model.modules)

<bound method Module.modules of Sequential(
  (0): Linear(in_features=4, out_features=1000, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1000, out_features=1000, bias=True)
  (3): ReLU()
  (4): Linear(in_features=1000, out_features=1000, bias=True)
  (5): ReLU()
  (6): Linear(in_features=1000, out_features=1000, bias=True)
  (7): ReLU()
  (8): Linear(in_features=1000, out_features=2, bias=True)
  (9): Sigmoid()
)>


## Train!

In [152]:
def check_accuracy(loader, model):
    if loader.dataset.split == "train":
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')   
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

def train_model(model, optimizer, epochs=1):
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

        if e % print_every == 0:
            print('Epoch %d, loss = %.4f' % (e, loss.item()))
            check_accuracy(loader_val, model)
            print()

In [156]:
learning_rate = 0.000000
print_every = 10
device = torch.device('cpu')
dtype = torch.float64

model = get_two_layer_model(4)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train_model(model, optimizer, epochs=40)

Epoch 0, loss = 0.3860
Checking accuracy on validation set
Got 4512 / 5116 correct (88.19)



KeyboardInterrupt: 