# CNN for time series prediction

In [16]:
# source: https://machinelearningmastery.com/how-to-develop-convolutional-neural-network-models-for-time-series-forecasting/

# multivariate cnn example
from numpy import array
from numpy import hstack
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.preprocessing import StandardScaler
from util import load_hypocenters, PuuOo, load_puuoo_eqs
import numpy as np
import random


In [17]:
# get dataset
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    """Cut a 2-D multivariate series into overlapping supervised samples.

    A window of ``n_steps`` consecutive rows slides over ``sequences`` one
    row at a time. For every window that fits entirely inside the data:

    * the input sample is every column except the last, over the window rows;
    * the target is the last column's value at the final row of the window.

    Args:
        sequences: 2-D array indexed as ``[row, column]``; the last column
            holds the target series.
        n_steps: Number of rows in each window.

    Returns:
        Tuple ``(X, y)`` of arrays built from the collected windows and
        targets. Both are empty when no window fits.
    """
    n_rows = len(sequences)
    windows = []
    targets = []
    start = 0
    # Stop at the first window whose end would run past the data.
    while start < n_rows and start + n_steps <= n_rows:
        stop = start + n_steps
        windows.append(sequences[start:stop, :-1])
        targets.append(sequences[stop - 1, -1])
        start += 1
    return array(windows), array(targets)

In [23]:
class BaseEarthquakes():
    """Earthquake and Eruption Dataset.

    Loads an earthquake catalogue, keeps the events lying on or below a fixed
    line in (lon, lat) space, labels each event with the value returned by
    ``PuuOo.was_erupting`` for its time, and exposes one split of a seeded,
    label-stratified 70/20/10 train/val/test partition.

    Args:
        root: Stored as ``self.root``; not otherwise used by this class.
        eruption_csv_path: Path handed to ``PuuOo``.
        eq_csv_path: Path handed to ``load_puuoo_eqs``.
        split: One of ``"train"``, ``"val"`` or ``"test"``.
        scaler: Optional, already fitted object exposing ``transform``. When
            given, it is applied to the features instead of fitting a new
            ``StandardScaler`` on this split. Pass the training dataset's
            ``scaler`` attribute when building the val/test datasets so that
            all splits share the training statistics. Defaults to ``None``,
            which fits a fresh scaler on this split (the original behaviour).

    Attributes:
        x: Feature matrix with columns (lat, lon, depth, mag), scaled.
        y: Labels aligned with the rows of ``x`` (same object as ``erupt``).
        idx: Shuffled indices (into the cropped catalogue) of this split.
        scaler: The scaler that was applied to ``x``.

    Raises:
        ValueError: If ``split`` is not one of the recognised names.
    """

    def __init__(self, root, eruption_csv_path, eq_csv_path, split, scaler=None):
        self.root  = root
        self.split = split
        self.eruption_csv_path = eruption_csv_path
        self.eq_csv_path = eq_csv_path
        self.scaler = scaler
        self._get_east_rift_zone()
        self._load_data()

        self._normalize()

    def _get_east_rift_zone(self):
        """Load the catalogue and keep only events on or below the crop line."""
        self.p = PuuOo(self.eruption_csv_path)
        time, lat, lon, depth, mag = load_puuoo_eqs(self.eq_csv_path)

        # Straight line lat = line[0] * lon + line[1] through the two points
        # (lon, lat) = (-155.5, 19.3) and (-155, 19.5).
        latpts = np.array([19.3, 19.5])
        lonpts = np.array([-155.5, -155])
        A      = np.array([lonpts, [1, 1]]).T
        line   = np.linalg.solve(A, latpts)

        # Signed vertical offset from the line; <= 0 means the event's
        # latitude does not exceed the line's latitude at that longitude.
        croplocs = lat - line[0]*lon - line[1]
        idx = [j for j in range(len(lon)) if croplocs[j] <= 0]

        self.time = np.array(time)[idx]
        self.lat = np.array(lat)[idx]
        self.lon = np.array(lon)[idx]
        self.depth = np.array(depth)[idx]
        self.mag = np.array(mag)[idx]

    def _normalize(self):
        """Scale ``self.x`` in place of the raw features.

        Uses the scaler supplied at construction when there is one; otherwise
        fits a new ``StandardScaler`` on this split and keeps it on
        ``self.scaler`` so it can be reused for other splits.
        """
        scaler = getattr(self, "scaler", None)
        if scaler is None:
            scaler = StandardScaler()
            scaler.fit(self.x)
            self.scaler = scaler
        self.x = scaler.transform(self.x)

    @staticmethod
    def _split_indices(indices, percent_train, percent_dev):
        """Randomly partition ``indices`` into sorted (train, val, test) lists.

        Draws from the module-level ``random`` generator, so the caller
        controls reproducibility through ``random.seed``.
        """
        num_train = int(percent_train * len(indices))
        num_val   = int(percent_dev * len(indices))

        train     = sorted(random.sample(indices, num_train))
        remaining = sorted(set(indices) - set(train))
        val       = sorted(random.sample(remaining, num_val))
        test      = sorted(set(remaining) - set(val))  # whatever is left
        return train, val, test

    def _load_data(self):
        """Label the events, pick this split's rows and build ``x`` / ``y``.

        Raises:
            ValueError: If ``self.split`` is not "train", "val" or "test".
        """
        if self.split not in ("train", "val", "test"):
            raise ValueError("Invalid split name: {}".format(self.split))

        # Re-seeding the global generator makes every instance compute the
        # same partition, so train/val/test datasets never overlap.
        random.seed(0)
        percent_train = 0.7
        percent_dev   = 0.2 # Percent test is what is left

        # Make additional array for erupting or not
        erupt = np.array([self.p.was_erupting(t) for t in self.time])

        # Get indices of eruption and non-eruption earthquakes so we can split both
        eruption_idx    = [i for i, e in enumerate(erupt) if e == True]
        no_eruption_idx = [i for i, e in enumerate(erupt) if e == False]

        # Split each class separately (eruptions first, to keep the random
        # draw order stable) so every split preserves the class ratio.
        erupt_parts    = self._split_indices(eruption_idx, percent_train, percent_dev)
        no_erupt_parts = self._split_indices(no_eruption_idx, percent_train, percent_dev)

        splits = {
            name: erupt_part + no_erupt_part
            for name, erupt_part, no_erupt_part
            in zip(("train", "val", "test"), erupt_parts, no_erupt_parts)
        }

        assert sum(len(part) for part in splits.values()) == len(erupt), \
            "every event must be labelled True or False and land in exactly one split"

        # Shuffle for data loader
        idx = splits[self.split]
        random.shuffle(idx)
        self.idx = idx

        self.time = self.time[idx]
        self.lat = self.lat[idx]
        self.lon = self.lon[idx]
        self.depth = self.depth[idx]
        self.mag = self.mag[idx]
        self.erupt = erupt[idx]

        self.y = self.erupt
        self.x = np.array([self.lat, self.lon,
                           self.depth, self.mag]).T

    def _get_label_weights(self):
        """Return ``[weight_no_eruption, weight_eruption]`` for this split.

        The eruption weight is ``total / num_erupt``. When the split holds no
        eruption samples the ratio is undefined, so a neutral weight of 1.0
        is returned instead of ``inf``/``nan``.
        """
        num_erupt = np.sum(self.y)
        total = len(self.y)
        if num_erupt == 0:
            return [1, 1.0]

        return [1, total/num_erupt]

    def __getitem__(self, index):
        """Not provided by this base class."""
        raise NotImplementedError

    def __len__(self):
        """Number of events in this split."""
        return len(self.erupt)

In [24]:
# Input files, resolved relative to the notebook's working directory.
eruption_csv_path = 'PuuOo.csv'
eq_csv_path = 'puuoo_earthquakes.csv'

# Build the training split of the earthquake/eruption dataset.
dataset_train = BaseEarthquakes(
    split="train",
    root=".",
    eq_csv_path=eq_csv_path,
    eruption_csv_path=eruption_csv_path,
)

In [25]:
# Scaled feature matrix of the training split; columns are lat, lon, depth, mag.
dataset_train.x

array([[ 1.42968854,  1.44683956,  0.57251574, -1.00951239],
       [ 0.20901947, -0.34014885,  0.21934803, -1.00951239],
       [-0.36819613, -0.70997428, -0.28364841,  0.24119861],
       ...,
       [-1.06211653, -0.30406833,  1.1968021 ,  0.28508321],
       [-0.9138699 , -0.49449334, -0.35856277, -0.13182046],
       [-0.55113879, -0.5205515 ,  0.23718478,  1.42608272]])

In [26]:
# Eruption labels of the training split, aligned row-for-row with dataset_train.x.
dataset_train.y

array([False, False, False, ..., False, False, False])