In [None]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import ensemble
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

import sys
import os
# Put the directory two levels above the working directory on sys.path so the
# local `src` package can be imported from this notebook.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets


In [2]:
# Read the pre-made train and test feature splits from CSV.
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'
train_csv = f'{data_dir}/s.train.csv'
test_csv = f'{data_dir}/s.test.csv'
all_train = pd.read_csv(train_csv)
all_test = pd.read_csv(test_csv)

In [3]:
# Project helper whose get_X_y method is used below to build the feature
# matrix and target vector from the loaded DataFrames.
# NOTE(review): is_p=False presumably selects the S-phase feature set (the
# inputs are the s.*.csv splits) -- confirm against src.processing.
proc = GatherFeatureDatasets(is_p=False)

In [4]:
# Build the training feature matrix, targets and feature names.
# The third return value is not used in this notebook.
X_train, y_train, _, feature_names = proc.get_X_y(
    all_train,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (12738, 45), y shape: (12738,)


In [5]:
# Build the test feature matrix and targets with the same options as the
# training split; the remaining two return values are discarded.
X_test, y_test, _, _ = proc.get_X_y(
    all_test,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (3327, 45), y shape: (3327,)


In [6]:
def encode_stations(X, stats, le=None, names=None):
    """Append an integer-encoded station column to a feature matrix.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix. The encoded station ids are added as a new last
        column (concatenation along axis 1).
    stats : array-like
        Station label for each row of ``X``.
    le : object with a ``transform`` method, optional
        Already-fitted encoder. When omitted, a new ``LabelEncoder`` is fit
        on the unique values of ``stats``.
    names : array-like, optional
        Feature names. When given, ``'station'`` is appended to them.

    Returns
    -------
    tuple
        ``(X_with_station, encoder, names)`` where ``names`` is the extended
        name array, or ``None`` if no names were passed in.

    Raises
    ------
    ValueError
        If ``names`` already contains ``'station'``.
    """
    # Reuse the caller's encoder when given; otherwise fit a fresh one.
    encoder = LabelEncoder().fit(np.unique(stats)) if le is None else le

    if names is not None:
        # Guard against adding the station column's name twice.
        if 'station' in names:
            raise ValueError('station already in feature names')
        names = np.append(names, 'station')

    # Shape the 1-D codes as a single column so they can be joined to X.
    station_column = encoder.transform(stats)[:, None]
    X_with_station = np.concatenate((np.asanyarray(X), station_column), axis=1)

    return X_with_station, encoder, names

In [7]:
# Add the encoded station id as the last training feature and keep the fitted
# encoder for the test split. Re-running this cell raises ValueError because
# 'station' is then already present in feature_names.
X_train, le, feature_names = encode_stations(
    X_train,
    all_train['station'].values,
    names=feature_names,
)

In [8]:
# Encode the test stations with the encoder fitted on the training stations.
# No names are passed, so nothing guards against re-running this cell: each
# extra run appends another station column to X_test.
X_test, _, _ = encode_stations(
    X_test,
    all_test['station'].values,
    le=le,
)

In [9]:
# Sanity check after adding the station column: matrix shape, the feature
# names, and the first training row.
print(X_train.shape)
print(feature_names)
X_train[0]

(12738, 46)
['amp_ratio_1' 'amp_ratio_2' 'amp_ratio_3' 'amp_ratio_4' 'amp_ratio_5'
 'amp_ratio_6' 'amp_ratio_7' 'amp_ratio_8' 'amp_ratio_9' 'amp_ratio_10'
 'amp_ratio_11' 'amp_ratio_12' 'amp_ratio_13' 'amp_ratio_14'
 'amp_ratio_15' 'amp_ratio_16' 'amp_ratio_17' 'amp_ratio_18' 'amp_1'
 'amp_2' 'amp_3' 'amp_4' 'amp_5' 'amp_6' 'amp_7' 'amp_8' 'amp_9' 'amp_10'
 'amp_11' 'amp_12' 'amp_13' 'amp_14' 'amp_15' 'amp_16' 'amp_17' 'amp_18'
 'signal_dominant_frequency' 'signal_dominant_amplitude'
 'noise_max_amplitude' 'signal_max_amplitude' 'signal_variance'
 'noise_variance' 'source_depth_km' 'source_receiver_distance_logkm'
 'source_receiver_back_azimuth_deg' 'station']


array([ 1.21448028e+00,  1.44962219e+00,  1.67456315e+00,  1.47840254e+00,
        1.28761567e+00,  1.24113808e+00,  1.40968286e+00,  1.45675300e+00,
        1.43571169e+00,  1.35353062e+00,  1.18042577e+00,  1.00803660e+00,
        8.93877059e-01,  8.10170993e-01,  7.29164783e-01,  6.47255285e-01,
        5.67123926e-01,  4.85244669e-01, -3.11348738e+00, -2.21589873e+00,
       -1.76753049e+00, -1.33298600e+00, -1.25227789e+00, -1.26106209e+00,
       -1.22805816e+00, -1.24118775e+00, -1.30720159e+00, -1.41598749e+00,
       -1.55052539e+00, -1.66281665e+00, -1.74599295e+00, -1.83493787e+00,
       -1.93726861e+00, -2.05148749e+00, -2.17533634e+00, -2.31115999e+00,
        2.04717228e+00, -2.33259986e+00,  1.30463078e+00,  2.41727420e+00,
        5.02459030e-02, -9.13227169e-01,  7.69000000e+00,  2.16347672e+00,
        7.98852166e+01,  7.00000000e+00])

In [10]:
# Gradient-boosting hyperparameters. random_state is pinned so that a fresh
# "Restart & Run All" reproduces the same fitted model and the same scores;
# without it the estimator draws its seed from the global NumPy RNG.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    "random_state": 42,
}
tree_reg = ensemble.GradientBoostingRegressor(**params)

tree_reg.fit(X_train, y_train)

In [11]:
# R^2 of the gradient-boosting model on the training rows.
r2_score(y_true=y_train, y_pred=tree_reg.predict(X_train))

0.9048544036572275

In [12]:
# R^2 of the gradient-boosting model on the held-out test rows.
r2_score(y_true=y_test, y_pred=tree_reg.predict(X_test))

0.8897518729657583

In [13]:
# Keep the per-row test predictions; the cells below index into this array
# when aggregating by event.
test_preds = tree_reg.predict(X_test)

In [14]:
# Collapse the per-row predictions to one value per event: the mean prediction
# over the event's rows, paired with the 'Event-Mean-YPML-S' value taken from
# the event's first row.
# NOTE(review): assumes test_preds is row-aligned with all_test -- confirm
# that get_X_y preserves row order.
event_ids = all_test['Evid'].unique()
avg_preds = np.zeros(event_ids.shape)
test_act_mags = np.zeros(event_ids.shape)
for i, evid in enumerate(event_ids):
    in_event = (all_test['Evid'] == evid).values
    test_act_mags[i] = all_test.loc[in_event, 'Event-Mean-YPML-S'].values[0]
    avg_preds[i] = np.mean(test_preds[in_event])
    

In [15]:
# R^2 of the event-averaged predictions against the per-event magnitudes.
r2_score(y_true=test_act_mags, y_pred=avg_preds)

0.9174656057623187

In [53]:
# Per-station R^2 of the gradient-boosting predictions.
# Rows of X_test are selected with a positional boolean mask rather than with
# the DataFrame's index labels, so the selection stays correct even when
# all_test does not carry a default 0..n-1 index.
# NOTE(review): assumes X_test is row-aligned with all_test -- confirm that
# get_X_y preserves row order.
# NOTE(review): the reference here is the 'Event-Mean-YPML-S' column, whereas
# the overall test score above uses y_test -- confirm this is intended.
stations = all_test['station'].unique()
stat_r2 = np.zeros(stations.shape)
for i, stat in enumerate(stations):
    stat_mask = (all_test['station'] == stat).values
    stat_df = all_test[stat_mask]
    preds = tree_reg.predict(X_test[stat_mask])
    stat_r2[i] = r2_score(stat_df['Event-Mean-YPML-S'], preds)

In [54]:
# Median of the per-station R^2 values computed in the previous cell.
np.median(stat_r2)

0.8811506788759833

In [55]:
# Show the individual per-station R^2 values (ordered as
# all_test['station'].unique()).
stat_r2

array([0.88342606, 0.93787898, 0.71955389, 0.92807675, 0.91337593,
       0.90344411, 0.57990959, 0.82935026, 0.8788753 , 0.91693153,
       0.81621769, 0.82673026, 0.90548161, 0.74541614, 0.77156753,
       0.91332062, 0.76213484, 0.89857025])

# Random forest

In [46]:
# Random forest with 500 trees, 6 candidate features per split, 10 parallel
# jobs. random_state is pinned because bootstrap sampling and feature
# subsampling are random; without it every run gives a different forest and
# different scores.
rf = ensemble.RandomForestRegressor(
    n_estimators=500,
    max_features=6,
    n_jobs=10,
    random_state=42,
)

In [47]:
# Fit the random forest on the same training matrix (including the encoded
# station column) used for the gradient-boosting model.
rf.fit(X_train, y_train)

In [48]:
# R^2 of the random forest on the training rows.
r2_score(y_true=y_train, y_pred=rf.predict(X_train))

0.9809034761306435

In [49]:
# Per-row random-forest predictions on the test split (reused by the cells
# below), followed by their R^2 against y_test.
rf_test_preds = rf.predict(X_test)
r2_score(y_true=y_test, y_pred=rf_test_preds)

0.854250787763737

In [50]:
# Event-level evaluation of the random forest: average the per-row predictions
# within each event and score them against the 'Event-Mean-YPML-S' value taken
# from the event's first row.
# NOTE(review): assumes rf_test_preds is row-aligned with all_test -- confirm
# that get_X_y preserves row order.
event_ids = all_test['Evid'].unique()
avg_preds = np.zeros(event_ids.shape)
test_act_mags = np.zeros(event_ids.shape)
for i, evid in enumerate(event_ids):
    in_event = (all_test['Evid'] == evid).values
    test_act_mags[i] = all_test.loc[in_event, 'Event-Mean-YPML-S'].values[0]
    avg_preds[i] = np.mean(rf_test_preds[in_event])
r2_score(y_true=test_act_mags, y_pred=avg_preds)

0.8898355459604069

In [51]:
# Per-station R^2 of the random-forest predictions.
# Predictions are selected with a positional boolean mask rather than with the
# DataFrame's index labels, so the selection stays correct even when all_test
# does not carry a default 0..n-1 index.
# NOTE(review): assumes rf_test_preds is row-aligned with all_test -- confirm
# that get_X_y preserves row order.
# NOTE(review): the reference here is the 'Event-Mean-YPML-S' column, whereas
# the overall test score above uses y_test -- confirm this is intended.
stations = all_test['station'].unique()
stat_r2 = np.zeros(stations.shape)
for i, stat in enumerate(stations):
    stat_mask = (all_test['station'] == stat).values
    stat_df = all_test[stat_mask]
    stat_r2[i] = r2_score(stat_df['Event-Mean-YPML-S'],
                          rf_test_preds[stat_mask])

In [52]:
# Median of the per-station R^2 values computed in the previous cell.
np.median(stat_r2)

0.8089616019613715