In [1]:
import numpy as np
from util import load_hypocenters, PuuOo, load_puuoo_eqs, prune_data, GetTimeToEruption, GetTimeSinceEruption, GetEQRates
from matplotlib import pyplot as plt
import datetime
from sklearn import ensemble as ml_models

%matplotlib inline

In [5]:
5.5*365.25*24*60*60


173566800.0

# import data

In [8]:
eruption_csv_path = 'PuuOo.csv'
eq_csv_path       = 'puuoo_earthquakes.csv' 

p = PuuOo(eruption_csv_path)
time, lat, lon, depth, mag = load_puuoo_eqs(eq_csv_path)

In [30]:
SecsToEruption = GetTimeToEruption(time, p)
SecsSinceEruption = GetTimeSinceEruption(time, p)


In [6]:
EQsLastDay, EQsLastWeek, EQsLastMonth = GetEQRates(time)

In [31]:
print(len(time))
print(time[-10],p.was_erupting(time[-10]))

erupt = np.array([p.was_erupting(t) for t in time])

print(SecsSinceEruption[:10])
print(SecsToEruption[-1])

6485
1986-12-28 19:04:48 True
[8468331. 8469813. 8470244. 8472345. 8474310. 8486762. 8489716. 8501841.
 8520735. 8523764.]
0.0


## Prepare data for machine learning

In [28]:
X = np.vstack((lat, lon, depth, mag, EQsLastDay, EQsLastWeek, EQsLastMonth)).T
Y = np.array([1 if val==True else 0 for val in erupt])

print(sum(Y)/len(Y))

0.19059367771781033


## Separate data into train and test set

In [21]:
import random

# Make additional array for erupting or not
erupt = np.array([p.was_erupting(t) for t in time])
random.seed(0)
percent_train = 0.8

# Get indices of eruption and non-eruption earthquakes so we can split both
eruption_idx    = [i for i, e in enumerate(erupt) if e == True]
no_eruption_idx = [i for i, e in enumerate(erupt) if e == False]
num_train_eruptions = int(percent_train * len(eruption_idx))
num_val_eruptions   = len(eruption_idx) - num_train_eruptions
num_train_no_eruptions = int(percent_train * len(no_eruption_idx))
num_val_no_eruptions   = len(no_eruption_idx) - num_train_eruptions
train_idx = sorted(random.sample(eruption_idx, num_train_eruptions))
val_idx   = sorted(list(set(eruption_idx) - set(train_idx)))
train_idx += sorted(random.sample(no_eruption_idx, num_train_no_eruptions))
val_idx   += sorted(list(set(no_eruption_idx) - set(train_idx)))

# Shuffle for random feed into model during training
random.shuffle(train_idx)
random.shuffle(val_idx)
x_train = X[train_idx,:]
x_val   = X[val_idx,:]

y_train = Y[train_idx]
y_val   = Y[val_idx]


In [22]:
print(x_train.shape, x_val.shape)
print(y_train.shape, y_val.shape)

(5116, 7) (1281, 7)
(5116,) (1281,)


## start machine learning!

In [23]:


def get_accuracy(model, X, Y):
    num_train = Y.shape[0]
    pred     = model.predict(X)
    accuracy = np.sum((pred==Y))/num_train
    
    return accuracy

### Run a logistic regression

In [24]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0,solver='lbfgs').fit(x_train, y_train)
yhat = lr.predict(x_val)

print(np.all(yhat==0))


True


### Run the random forest

In [25]:
rf = ml_models.RandomForestClassifier(n_estimators=10, max_depth=20)
rf.fit(x_train, y_train)
yhat = rf.predict(x_val)
print(np.all(yhat==0))

False


In [26]:
naive = np.sum(1-y_train)/y_train.shape[0]
print("Model accuracy:")
print(get_accuracy(rf, x_val, y_val))
print("\nNaive model would get:")
print(naive)
print('\nFeature importances:')
print(rf.feature_importances_)

Model accuracy:
0.8743169398907104

Naive model would get:
0.8068803752931978

Feature importances:
[0.11121384 0.14568364 0.13558118 0.08944436 0.10667255 0.18378104
 0.2276234 ]


### try k means

In [41]:
from sklearn.cluster import KMeans

km = KMeans(init='k-means++', n_clusters=5, n_init=10)

print(x_train.shape)
print(y_train.reshape(x_train.shape[0],1).shape)

xy_train = np.concatenate((x_train, y_train.reshape((x_train.shape[0],1))), axis=1)
km.fit(xy_train)

(4656, 7)
(4656, 1)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)