In [None]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import ensemble
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

import sys
import os
# Put the directory two levels above the current working directory on
# sys.path so the local `src` package can be imported below.
# +----------------------------------------------------------------------------+
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets


In [2]:
# Directory holding the pre-split P feature tables (train / test / 2023 holdout).
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'
# Read the three splits in one pass; file names follow the pattern p.<split>.csv.
all_train, all_test, all_2023 = (
    pd.read_csv(f'{data_dir}/p.{split}.csv')
    for split in ('train', 'test', '20230101')
)

In [3]:
# Feature-gathering helper from the local `src.processing` module.
# NOTE(review): is_p=True presumably selects the P-phase feature set (the
# input files are named p.*.csv) -- confirm against GatherFeatureDatasets.
proc = GatherFeatureDatasets(is_p=True)

In [4]:
# Training features and targets. The third return value is not needed here;
# the feature names are kept so the station column can be appended later.
X_train, y_train, _, feature_names = proc.get_X_y(
    all_train,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (65031, 45), y shape: (65031,)


In [5]:
# Test-split features and targets, built with the same options as training.
X_testA, y_testA, _, _ = proc.get_X_y(
    all_test,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (16531, 45), y shape: (16531,)


In [6]:
# 2023 holdout features and targets, built with the same options as training.
X_testB, y_testB, _, _ = proc.get_X_y(
    all_2023,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (5861, 45), y shape: (5861,)


In [7]:
def encode_stations(X, stats, le=None, names=None):
    """Append an integer-encoded station column to a feature matrix.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix; the encoded stations become its last column.
    stats : array-like
        Station label of every row of `X`.
    le : fitted encoder, optional
        Object with a `transform` method. When omitted, a new `LabelEncoder`
        is fitted on the unique values of `stats`.
    names : array-like of str, optional
        Feature names matching the columns of `X`. When given, 'station' is
        appended to it.

    Returns
    -------
    tuple
        `(X_with_station, encoder, names)`; `names` is returned unchanged
        (None) when it was not supplied.

    Raises
    ------
    ValueError
        If `names` already contains 'station'.
    """
    encoder = le
    if encoder is None:
        # No encoder supplied: learn the label -> integer mapping from `stats`.
        encoder = LabelEncoder()
        encoder.fit(np.unique(stats))

    if names is not None:
        if 'station' in names:
            raise ValueError('station already in feature names')
        names = np.append(names, 'station')

    station_codes = encoder.transform(stats)
    # Add the codes as one extra trailing column.
    station_column = station_codes[:, None]
    X_with_station = np.concatenate((X, station_column), axis=1)

    return X_with_station, encoder, names

In [8]:
# Fit the station encoder on the training stations and append the encoded
# column. NOTE: not idempotent -- re-running this cell on the same kernel
# raises ValueError because 'station' is already in feature_names.
X_train, le, feature_names = encode_stations(
    X_train,
    all_train['station'].values,
    names=feature_names,
)

In [9]:
# Reuse the encoder fitted on the training stations. NOTE: re-running this
# cell appends another station column to X_testA each time.
X_testA, _, _ = encode_stations(
    X_testA,
    all_test['station'].values,
    le=le,
)

In [10]:
# Reuse the encoder fitted on the training stations. NOTE: re-running this
# cell appends another station column to X_testB each time.
X_testB, _, _ = encode_stations(
    X_testB,
    all_2023['station'].values,
    le=le,
)

In [11]:
# Sanity check: matrix shape, feature names, and the first training row.
print(X_train.shape, feature_names, sep='\n')
X_train[0]

(65031, 46)
['amp_ratio_1' 'amp_ratio_2' 'amp_ratio_3' 'amp_ratio_4' 'amp_ratio_5'
 'amp_ratio_6' 'amp_ratio_7' 'amp_ratio_8' 'amp_ratio_9' 'amp_ratio_10'
 'amp_ratio_11' 'amp_ratio_12' 'amp_ratio_13' 'amp_ratio_14'
 'amp_ratio_15' 'amp_ratio_16' 'amp_ratio_17' 'amp_ratio_18' 'amp_1'
 'amp_2' 'amp_3' 'amp_4' 'amp_5' 'amp_6' 'amp_7' 'amp_8' 'amp_9' 'amp_10'
 'amp_11' 'amp_12' 'amp_13' 'amp_14' 'amp_15' 'amp_16' 'amp_17' 'amp_18'
 'signal_dominant_frequency' 'signal_dominant_amplitude'
 'noise_max_amplitude' 'signal_max_amplitude' 'signal_variance'
 'noise_variance' 'source_depth_km' 'source_receiver_distance_logkm'
 'source_receiver_back_azimuth_deg' 'station']


array([ 2.47744353,  2.53305999,  2.93512781,  3.52129371,  3.76847963,
        4.23535151,  4.41872084,  4.28590903,  4.26921754,  4.34385778,
        4.43942912,  4.4936378 ,  4.49191638,  4.44878747,  4.37982151,
        4.30826924,  4.25007158,  4.20490098, -4.11949516, -2.8802031 ,
       -2.39762744, -1.8694013 , -1.74099749, -1.61845206, -1.66670288,
       -1.76068281, -1.90440231, -2.09030123, -2.29811339, -2.52891383,
       -2.77455398, -3.02717571, -3.28071982, -3.53126864, -3.7761202 ,
       -4.01327211,  1.79175947, -3.18827381, -2.75211698,  1.34317148,
       -1.10851584, -8.6701376 ,  7.69      ,  2.16347672, 79.88521657,
       13.        ])

In [12]:
# Hyper-parameters of the gradient-boosted tree regressor.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    # Fixed seed so a Restart & Run All reproduces the same model:
    # scikit-learn permutes the candidate features at every split, so ties
    # between equally good splits can otherwise be broken differently per run.
    "random_state": 0,
}
tree_reg = ensemble.GradientBoostingRegressor(**params)

# Train on the station-augmented training matrix.
tree_reg.fit(X_train, y_train)

In [25]:
# Row-level R^2 on the training split.
r2_score(y_true=y_train, y_pred=tree_reg.predict(X_train))

0.8224032041227308

In [26]:
# Row-level R^2 on the test split.
r2_score(y_true=y_testA, y_pred=tree_reg.predict(X_testA))

0.8148830455372293

In [27]:
# Row-level R^2 on the 2023 holdout split.
r2_score(y_true=y_testB, y_pred=tree_reg.predict(X_testB))

0.7923271538800253

In [28]:
# Cache the predictions for every split; they are reused by the
# network-average and per-station summaries below.
testA_preds, testB_preds, train_preds = (
    tree_reg.predict(features)
    for features in (X_testA, X_testB, X_train)
)

In [29]:
def network_avg(testdf, testpreds):
    """Average the per-row predictions over all rows sharing an event ID.

    Parameters
    ----------
    testdf : pandas.DataFrame
        Must contain the columns 'Evid' and 'Event-Mean-YPML-S'. Rows are
        matched to `testpreds` by position, not by index label.
    testpreds : array-like
        One prediction per row of `testdf`, in the same row order.

    Returns
    -------
    test_act_mags : numpy.ndarray
        'Event-Mean-YPML-S' of the first row of each event, ordered by the
        first appearance of the event in `testdf`.
    avg_preds : numpy.ndarray
        Mean prediction of each event, in the same order.

    Raises
    ------
    ValueError
        If `testpreds` does not hold exactly one entry per row of `testdf`.
    """
    # Plain ndarray, so a Series with a non-default index is still indexed
    # by position instead of by label.
    preds = np.asarray(testpreds)
    if preds.shape[:1] != (len(testdf),):
        raise ValueError(
            f'testpreds has shape {preds.shape} but testdf has {len(testdf)} rows'
        )

    event_mags = testdf['Event-Mean-YPML-S'].to_numpy()
    # Single pass over the frame: event id -> positional row indices.
    # This replaces two full boolean scans of the frame per event.
    rows_by_event = testdf.groupby('Evid', sort=False).indices

    evids = testdf['Evid'].unique()
    avg_preds = np.zeros(evids.shape)
    test_act_mags = np.zeros(evids.shape)
    for i, evid in enumerate(evids):
        inds = rows_by_event[evid]
        avg_preds[i] = np.mean(preds[inds])
        # Magnitude is taken from the first row of the event.
        test_act_mags[i] = event_mags[inds.min()]
    return test_act_mags, avg_preds
    

In [30]:
# Event-level R^2 on the test split, using network-averaged predictions.
testA_net_act_mags, testA_avg_preds = network_avg(
    testdf=all_test,
    testpreds=testA_preds,
)
testA_avg_r2 = r2_score(y_true=testA_net_act_mags, y_pred=testA_avg_preds)
testA_avg_r2

0.8815242575025456

In [31]:
# Event-level R^2 on the 2023 holdout, using network-averaged predictions.
testB_net_act_mags, testB_avg_preds = network_avg(
    testdf=all_2023,
    testpreds=testB_preds,
)
testB_avg_r2 = r2_score(y_true=testB_net_act_mags, y_pred=testB_avg_preds)
testB_avg_r2

0.8716868405785804

In [32]:
def stationr2(testdf, testpreds):
    """Compute the R^2 score of the predictions separately for each station.

    Parameters
    ----------
    testdf : pandas.DataFrame
        Must contain the columns 'station' and 'Event-Mean-YPML-S'. Rows are
        matched to `testpreds` by position, so any index is accepted.
    testpreds : array-like
        One prediction per row of `testdf`, in the same row order.

    Returns
    -------
    dict
        Maps each station (in order of first appearance) to the `r2_score`
        of 'Event-Mean-YPML-S' against the predictions of that station's rows.

    Raises
    ------
    ValueError
        If `testpreds` does not hold exactly one entry per row of `testdf`.
    """
    preds = np.asarray(testpreds)
    if preds.shape[:1] != (len(testdf),):
        raise ValueError(
            f'testpreds has shape {preds.shape} but testdf has {len(testdf)} rows'
        )

    stations = testdf['station'].to_numpy()
    targets = testdf['Event-Mean-YPML-S'].to_numpy()

    stat_r2 = {}
    for stat in testdf['station'].unique():
        # Positional boolean mask. Index labels must not be used as array
        # positions: they only coincide for a default RangeIndex.
        rows = stations == stat
        stat_r2[stat] = r2_score(targets[rows], preds[rows])

    return stat_r2

In [33]:
# Per-station R^2 on the training split.
train_station_r2 = stationr2(testdf=all_train, testpreds=train_preds)

In [34]:
# Per-station R^2 on the test split, plus the network-averaged score
# stored under the extra key 'network'.
testA_station_r2 = {
    **stationr2(all_test, testA_preds),
    'network': testA_avg_r2,
}

In [35]:
# Per-station R^2 on the 2023 holdout, plus the network-averaged score
# stored under the extra key 'network'.
testB_station_r2 = {
    **stationr2(all_2023, testB_preds),
    'network': testB_avg_r2,
}

In [36]:
# One row per station (plus 'network' where present), one column per split.
# Keys missing from a split's dict show up as NaN in that column.
r2_df = (
    pd.DataFrame([train_station_r2, testA_station_r2, testB_station_r2])
    .T
    .reset_index()
    .rename(
        columns={
            'index': 'station',
            0: 'train_r2',
            1: 'test_r2',
            2: 'holdout_r2',
        }
    )
)

In [37]:
r2_df

Unnamed: 0,station,train_r2,test_r2,holdout_r2
0,YHB,0.857961,0.855882,0.847069
1,YDC,0.708102,0.690671,-3.904963
2,YWB,0.862648,0.870568,0.860126
3,MCID,0.819436,0.816263,0.821493
4,YHL,0.819549,0.813781,0.828336
5,YMR,0.837914,0.841778,0.83805
6,YHH,0.821671,0.809976,0.828764
7,B207,0.77303,0.777366,
8,YPP,0.799461,0.783103,0.777544
9,YPM,0.855843,0.851707,0.829647


In [38]:
# Output directory for the gradient-boosted-tree results.
outdir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/gbt'
# Write the R^2 summary table without the row index.
r2_df.to_csv(
    path_or_buf=os.path.join(outdir, 'r2.summary.P.csv'),
    index=False,
)

In [39]:
from joblib import dump
# Persist the fitted regressor next to the R^2 summary.
dump(
    value=tree_reg,
    filename=os.path.join(outdir, 'gbt.P.joblib'),
)

['/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/gbt/gbt.P.joblib']