In [23]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import ensemble
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

import sys
import os
# Make the parent of the current working directory importable so that the
# local `src` package (imported just below) can be resolved.
# NOTE(review): this assumes the kernel's working directory is the notebook
# folder (e.g. 'notebooks/') -- confirm before running from elsewhere.
# +----------------------------------------------------------------------------+
module_path = os.path.abspath(os.path.join('..'))
# Only append when missing so re-running the cell does not add duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets


In [24]:
# Directory holding the pre-split feature CSVs. The cluster path remains the
# default; set the MAGNITUDE_FEATURE_DIR environment variable to run this
# notebook against a copy of the data elsewhere without editing the cell.
data_dir = os.environ.get(
    'MAGNITUDE_FEATURE_DIR',
    '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits',
)
# Train / test splits, loaded with pandas' default integer row index.
all_train = pd.read_csv(os.path.join(data_dir, 'p.train.csv'))
all_test = pd.read_csv(os.path.join(data_dir, 'p.test.csv'))

In [25]:
# Project helper whose get_X_y() is used below to turn the raw frames into
# (X, y) arrays.
# NOTE(review): is_p=True presumably selects the P-arrival feature set (the
# files loaded are p.train.csv / p.test.csv) -- confirm in src.processing.
proc = GatherFeatureDatasets(is_p=True)

In [26]:
# Pull the training feature matrix, targets, and feature names out of the raw
# frame. The options are gathered first so the call itself stays short; the
# third return value is discarded.
train_opts = dict(scaler=False, source_dist_type='dist', linear_model=False)
X_train, y_train, _, feature_names = proc.get_X_y(all_train, **train_opts)

X shape: (65031, 45), y shape: (65031,)


In [27]:
# Pull the test feature matrix and targets using the same options as the
# training extraction; the scaler slot and feature names are discarded.
test_opts = dict(scaler=False, source_dist_type='dist', linear_model=False)
X_test, y_test, _, _ = proc.get_X_y(all_test, **test_opts)

X shape: (16531, 45), y shape: (16531,)


In [28]:
def encode_stations(X, stats, le=None, names=None):
    """Append an encoded station column to a feature matrix.

    Parameters
    ----------
    X : 2-D array
        Feature matrix with one row per entry of `stats`.
    stats : array-like
        Station label for each row of `X`.
    le : encoder, optional
        Object exposing `transform`. When omitted, a new `LabelEncoder` is
        fit on the unique values of `stats`.
    names : array-like of str, optional
        Names of the columns of `X`. When given, 'station' is appended.

    Returns
    -------
    tuple
        `(X_out, le, names_out)`: a new array equal to `X` with the encoded
        station codes as its last column, the encoder that was used, and the
        extended names (or None when `names` was None).

    Raises
    ------
    ValueError
        If `names` already contains 'station'.
    """
    encoder = le
    if encoder is None:
        # No encoder supplied: learn the label -> code mapping from this data.
        encoder = LabelEncoder().fit(np.unique(stats))

    if names is not None:
        # Refuse to add a second 'station' entry to the name list.
        if 'station' in names:
            raise ValueError('station already in feature names')
        names = np.append(names, 'station')

    station_codes = encoder.transform(stats)
    # Turn the codes into a column vector and attach it on the right of X.
    X_with_station = np.concatenate((X, station_codes[:, np.newaxis]), axis=1)

    return X_with_station, encoder, names

In [29]:
# Fit the station encoder on the training stations, append the encoded column
# to X_train, and add 'station' to feature_names to match.
# NOTE: this cell overwrites its own inputs, so running it a second time
# raises ValueError('station already in feature names').
X_train, le, feature_names = encode_stations(X_train, all_train['station'].values, names=feature_names)

In [30]:
# Reuse the encoder fit on the training stations so test rows get the same
# station codes.
# NOTE: `names` is not passed here, so the duplicate-'station' check inside
# encode_stations is skipped; re-running this cell appends another station
# column to X_test without any error.
X_test, _, _ = encode_stations(X_test, all_test['station'].values, le=le)

In [31]:
print(X_train.shape)
print(feature_names)
X_train[0, :]

(65031, 46)
['amp_ratio_1' 'amp_ratio_2' 'amp_ratio_3' 'amp_ratio_4' 'amp_ratio_5'
 'amp_ratio_6' 'amp_ratio_7' 'amp_ratio_8' 'amp_ratio_9' 'amp_ratio_10'
 'amp_ratio_11' 'amp_ratio_12' 'amp_ratio_13' 'amp_ratio_14'
 'amp_ratio_15' 'amp_ratio_16' 'amp_ratio_17' 'amp_ratio_18' 'amp_1'
 'amp_2' 'amp_3' 'amp_4' 'amp_5' 'amp_6' 'amp_7' 'amp_8' 'amp_9' 'amp_10'
 'amp_11' 'amp_12' 'amp_13' 'amp_14' 'amp_15' 'amp_16' 'amp_17' 'amp_18'
 'signal_dominant_frequency' 'signal_dominant_amplitude'
 'noise_max_amplitude' 'signal_max_amplitude' 'signal_variance'
 'noise_variance' 'source_depth_km' 'source_receiver_distance_logkm'
 'source_receiver_back_azimuth_deg' 'station']


array([ 2.47744353,  2.53305999,  2.93512781,  3.52129371,  3.76847963,
        4.23535151,  4.41872084,  4.28590903,  4.26921754,  4.34385778,
        4.43942912,  4.4936378 ,  4.49191638,  4.44878747,  4.37982151,
        4.30826924,  4.25007158,  4.20490098, -4.11949516, -2.8802031 ,
       -2.39762744, -1.8694013 , -1.74099749, -1.61845206, -1.66670288,
       -1.76068281, -1.90440231, -2.09030123, -2.29811339, -2.52891383,
       -2.77455398, -3.02717571, -3.28071982, -3.53126864, -3.7761202 ,
       -4.01327211,  1.79175947, -3.18827381, -2.75211698,  1.34317148,
       -1.10851584, -8.6701376 ,  7.69      ,  2.16347672, 79.88521657,
       13.        ])

In [32]:
# Hyperparameters for the gradient-boosted tree regressor.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    # Fixed seed so that refitting after a kernel restart reproduces the same
    # model; the estimator otherwise draws a fresh random state on every fit.
    "random_state": 0,
}
tree_reg = ensemble.GradientBoostingRegressor(**params)

# Fit on the training matrix (station code included as the last column).
tree_reg.fit(X_train, y_train)

In [33]:
# R^2 of the fitted model on the training rows (in-sample fit).
r2_score(y_train, tree_reg.predict(X_train))

0.8224032041227307

In [34]:
# R^2 of the fitted model on the test-split rows.
r2_score(y_test, tree_reg.predict(X_test))

0.8149072542820834

In [36]:
# Keep the per-row test predictions; the per-event and per-station summaries
# below index into this array by row position in all_test.
test_preds = tree_reg.predict(X_test)

In [49]:
# Average the per-row predictions over each event and pair them with that
# event's magnitude. A single grouped pass replaces the original
# per-event rescans of the whole frame.
# Both arrays are ordered by first appearance of each Evid in all_test.
evid_per_row = all_test['Evid'].to_numpy()

# Group by position (test_preds is row-aligned with all_test); sort=False
# keeps the groups in order of first appearance.
avg_preds = (
    pd.Series(test_preds)
    .groupby(evid_per_row, sort=False)
    .mean()
    .to_numpy()
)

# The magnitude is taken from the first row of each event, as before.
# NOTE(review): presumably 'Event-Mean-YPML-S' is constant within an event --
# confirm against the feature files.
test_act_mags = (
    all_test
    .drop_duplicates(subset='Evid', keep='first')['Event-Mean-YPML-S']
    .to_numpy(dtype=float)
)

assert avg_preds.shape == test_act_mags.shape, 'per-event arrays are misaligned'
    

In [50]:
# R^2 of the event-averaged predictions against the per-event magnitudes.
r2_score(test_act_mags, avg_preds)

0.8815252448692831

In [52]:
# Per-station R^2: score the test predictions separately for the rows of each
# station, in the order of all_test['station'].unique().
# Rows are selected with a positional boolean mask. The earlier version
# indexed test_preds with the frame's index labels, which is only correct
# while all_test keeps the default 0..n-1 index it gets from read_csv.
# NOTE(review): the reference values are 'Event-Mean-YPML-S'; presumably this
# is the same target as y_test -- confirm.
station_per_row = all_test['station'].to_numpy()
true_mags = all_test['Event-Mean-YPML-S'].to_numpy()
test_stations = all_test['station'].unique()

stat_r2 = np.zeros(test_stations.shape)
for i, stat in enumerate(test_stations):
    in_station = station_per_row == stat
    stat_r2[i] = r2_score(true_mags[in_station], test_preds[in_station])

In [54]:
# Median of the per-station R^2 values.
np.median(stat_r2)

0.7865581270734703

In [57]:
# Per-station R^2 values, in the order of all_test['station'].unique().
stat_r2

array([0.78316701, 0.8705684 , 0.81626263, 0.85170743, 0.69067106,
       0.85588159, 0.77736595, 0.84177781, 0.7852004 , 0.80997647,
       0.81378092, 0.79079427, 0.75245422, 0.83752061, 0.70008667,
       0.73067275, 0.78655813, 0.83608944, 0.74370865, 0.62229362,
       0.72821915, 0.79555924, 0.65030704, 0.82873984, 0.7117057 ,
       0.75153537, 0.83038132, 0.8599538 , 0.6767676 , 0.8363835 ,
       0.86136483, 0.78539956, 0.8667546 , 0.7396959 , 0.77851195])