In [None]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import ensemble
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

import sys
import os
# Local imports live outside this notebook's folder: put the directory two
# levels above the current working directory on sys.path (once) so that the
# `src` package can be imported below.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets


In [2]:
# Directory holding the per-split feature CSVs.
# NOTE(review): hard-coded absolute path — consider a configurable data
# directory so the notebook can run on another machine.
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'
# Training split, test split ("A" below) and the 20230101 file ("B" below,
# reported later in the notebook under the 'holdout_r2' column).
all_train = pd.read_csv(f'{data_dir}/s.train.csv')
all_test = pd.read_csv(f'{data_dir}/s.test.csv')
all_2023 = pd.read_csv(f'{data_dir}/s.20230101.csv')

In [3]:
# Project helper used below to turn a split DataFrame into (X, y).
# NOTE(review): is_p=False presumably selects the S-phase feature set (the
# input files are the 's.*' CSVs) — confirm against src.processing.
proc = GatherFeatureDatasets(is_p=False)

In [4]:
# Training split -> feature matrix, target vector and feature names.
# The third return value is not needed here and is discarded.
X_train, y_train, _, feature_names = proc.get_X_y(
    all_train,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (12738, 45), y shape: (12738,)


In [5]:
# Test split ("A") built with the same options as the training split;
# only the feature matrix and targets are kept.
X_testA, y_testA, _, _ = proc.get_X_y(
    all_test,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (3327, 45), y shape: (3327,)


In [6]:
# 20230101 split ("B") built with the same options as the training split;
# only the feature matrix and targets are kept.
X_testB, y_testB, _, _ = proc.get_X_y(
    all_2023,
    scaler=False,
    source_dist_type='dist',
    linear_model=False,
)

X shape: (1185, 45), y shape: (1185,)


In [7]:
def encode_stations(X, stats, le=None, names=None):
    """Append an integer-encoded station column to a feature matrix.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix with one row per entry of `stats`.
    stats : array-like
        Station identifier for each row of `X`.
    le : encoder, optional
        Already-fitted encoder exposing `transform`. When omitted, a new
        `LabelEncoder` is fitted on the unique values of `stats`.
    names : array-like of str, optional
        Feature names for the columns of `X`.

    Returns
    -------
    tuple
        `(X_with_station, le, names)`: the matrix with the encoded station as
        its last column, the encoder that was used, and `names` with
        'station' appended (None when `names` was not supplied).

    Raises
    ------
    ValueError
        If `names` already contains 'station'.
    """
    encoder = le
    if encoder is None:
        # No encoder supplied: learn the station -> integer mapping here.
        encoder = LabelEncoder()
        encoder.fit(np.unique(stats))

    if names is not None and 'station' in names:
        raise ValueError('station already in feature names')
    if names is not None:
        names = np.append(names, 'station')

    # Turn the 1-D codes into a column so they can be joined along axis 1.
    station_col = encoder.transform(stats)[:, None]
    X_out = np.concatenate((X, station_col), axis=1)

    return X_out, encoder, names

In [8]:
# Fit the station encoder on the training stations, append the encoded column
# to X_train and add 'station' to feature_names.
# NOTE(review): X_train and feature_names are overwritten in place, so running
# this cell twice raises 'station already in feature names'.
X_train, le, feature_names = encode_stations(X_train, all_train['station'].values, names=feature_names)

In [9]:
# Reuse the encoder fitted on the training stations so the test split gets the
# same station codes. X_testA is overwritten, so re-running appends another column.
X_testA, _, _ = encode_stations(X_testA, all_test['station'].values, le=le)

In [10]:
# Same training-fitted encoder applied to the 20230101 split.
# X_testB is overwritten, so re-running appends another column.
X_testB, _, _ = encode_stations(X_testB, all_2023['station'].values, le=le)

In [11]:
# Sanity check after encoding: shape of the training matrix, the feature names
# (now ending in 'station'), and the first training row.
print(X_train.shape)
print(feature_names)
X_train[0, :]

(12738, 46)
['amp_ratio_1' 'amp_ratio_2' 'amp_ratio_3' 'amp_ratio_4' 'amp_ratio_5'
 'amp_ratio_6' 'amp_ratio_7' 'amp_ratio_8' 'amp_ratio_9' 'amp_ratio_10'
 'amp_ratio_11' 'amp_ratio_12' 'amp_ratio_13' 'amp_ratio_14'
 'amp_ratio_15' 'amp_ratio_16' 'amp_ratio_17' 'amp_ratio_18' 'amp_1'
 'amp_2' 'amp_3' 'amp_4' 'amp_5' 'amp_6' 'amp_7' 'amp_8' 'amp_9' 'amp_10'
 'amp_11' 'amp_12' 'amp_13' 'amp_14' 'amp_15' 'amp_16' 'amp_17' 'amp_18'
 'signal_dominant_frequency' 'signal_dominant_amplitude'
 'noise_max_amplitude' 'signal_max_amplitude' 'signal_variance'
 'noise_variance' 'source_depth_km' 'source_receiver_distance_logkm'
 'source_receiver_back_azimuth_deg' 'station']


array([ 1.21448028e+00,  1.44962219e+00,  1.67456315e+00,  1.47840254e+00,
        1.28761567e+00,  1.24113808e+00,  1.40968286e+00,  1.45675300e+00,
        1.43571169e+00,  1.35353062e+00,  1.18042577e+00,  1.00803660e+00,
        8.93877059e-01,  8.10170993e-01,  7.29164783e-01,  6.47255285e-01,
        5.67123926e-01,  4.85244669e-01, -3.11348738e+00, -2.21589873e+00,
       -1.76753049e+00, -1.33298600e+00, -1.25227789e+00, -1.26106209e+00,
       -1.22805816e+00, -1.24118775e+00, -1.30720159e+00, -1.41598749e+00,
       -1.55052539e+00, -1.66281665e+00, -1.74599295e+00, -1.83493787e+00,
       -1.93726861e+00, -2.05148749e+00, -2.17533634e+00, -2.31115999e+00,
        2.04717228e+00, -2.33259986e+00,  1.30463078e+00,  2.41727420e+00,
        5.02459030e-02, -9.13227169e-01,  7.69000000e+00,  2.16347672e+00,
        7.98852166e+01,  7.00000000e+00])

In [12]:
# Hyperparameters of the gradient-boosted regression tree model.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    # Fixed seed: scikit-learn permutes the candidate features at every split,
    # so without it equally good splits can be resolved differently from run
    # to run and the fitted/saved model is not guaranteed to be reproducible.
    "random_state": 0,
}
tree_reg = ensemble.GradientBoostingRegressor(**params)

# Fit on the training features (including the encoded station column).
tree_reg.fit(X_train, y_train)

In [13]:
# R^2 of the fitted model on the training split (row-level predictions).
r2_score(y_train, tree_reg.predict(X_train))

0.9048544036572275

In [14]:
# R^2 of the fitted model on test split A (row-level predictions).
r2_score(y_testA, tree_reg.predict(X_testA))

0.8897684408282568

In [15]:
# R^2 of the fitted model on the 20230101 split B (row-level predictions).
r2_score(y_testB, tree_reg.predict(X_testB))

0.8723639137413872

In [16]:
# Keep the row-level predictions for each split; they are reused below for the
# per-event (network) averages and the per-station scores.
testA_preds = tree_reg.predict(X_testA)
testB_preds = tree_reg.predict(X_testB)
train_preds = tree_reg.predict(X_train)

In [17]:
def network_avg(testdf, testpreds):
    """Average the row-level predictions of each event into one value.

    Parameters
    ----------
    testdf : pandas.DataFrame
        One row per prediction, with an 'Evid' column identifying the event
        and an 'Event-Mean-YPML-S' column holding the event's reference
        magnitude.
    testpreds : array-like
        Predictions aligned row-for-row, by position, with `testdf`.

    Returns
    -------
    test_act_mags : numpy.ndarray
        'Event-Mean-YPML-S' taken from the first row of each event, ordered
        by the first appearance of the event in `testdf`.
    avg_preds : numpy.ndarray
        Mean prediction of each event, in the same order.

    Raises
    ------
    ValueError
        If `testpreds` does not hold one entry per row of `testdf`.
    """
    evids = testdf['Evid'].to_numpy()
    mags = testdf['Event-Mean-YPML-S'].to_numpy()
    # Plain array => rows are always matched by position, even when the
    # predictions arrive as a pandas Series carrying its own index labels.
    preds = np.asarray(testpreds)
    if preds.shape[:1] != evids.shape:
        raise ValueError(
            f'testpreds has shape {preds.shape} but testdf has {evids.shape[0]} rows'
        )

    # unique() keeps first-appearance order; compute it once and reuse it.
    event_ids = testdf['Evid'].unique()
    avg_preds = np.zeros(event_ids.shape)
    test_act_mags = np.zeros(event_ids.shape)
    for i, evid in enumerate(event_ids):
        # One boolean mask serves both the magnitude lookup and the average.
        rows = evids == evid
        test_act_mags[i] = mags[rows][0]
        avg_preds[i] = np.mean(preds[rows])
    return test_act_mags, avg_preds
    

In [41]:
# Network-level score for test split A: average the row-level predictions per
# event, then compute R^2 against each event's 'Event-Mean-YPML-S' value.
testA_net_act_mags, testA_avg_preds = network_avg(all_test, testA_preds)
testA_avg_r2 = r2_score(testA_net_act_mags, testA_avg_preds)
testA_avg_r2

0.9174729372247676

In [42]:
# Network-level score for the 20230101 split B: average the row-level
# predictions per event, then compute R^2 against 'Event-Mean-YPML-S'.
testB_net_act_mags, testB_avg_preds = network_avg(all_2023, testB_preds)
testB_avg_r2 = r2_score(testB_net_act_mags, testB_avg_preds)
testB_avg_r2

0.9234632640294296

In [25]:
def stationr2(testdf, testpreds):
    """Compute the R^2 score of the predictions separately for each station.

    Parameters
    ----------
    testdf : pandas.DataFrame
        One row per prediction, with a 'station' column and an
        'Event-Mean-YPML-S' column used as the reference value.
    testpreds : array-like
        Predictions aligned row-for-row, by position, with `testdf`.

    Returns
    -------
    dict
        Maps each station (in order of first appearance) to the R^2 between
        'Event-Mean-YPML-S' and the predictions of that station's rows.

    Raises
    ------
    ValueError
        If `testpreds` does not hold one entry per row of `testdf`.
    """
    stations = testdf['station'].to_numpy()
    mags = testdf['Event-Mean-YPML-S'].to_numpy()
    preds = np.asarray(testpreds)
    if preds.shape[:1] != stations.shape:
        raise ValueError(
            f'testpreds has shape {preds.shape} but testdf has {stations.shape[0]} rows'
        )

    stat_r2 = {}
    for stat in testdf['station'].unique():
        # Select rows by position with a boolean mask. Indexing the predictions
        # with the frame's index labels is only correct for a default
        # RangeIndex and silently picks the wrong rows otherwise.
        rows = stations == stat
        stat_r2[stat] = r2_score(mags[rows], preds[rows])

    return stat_r2

In [28]:
# Per-station R^2 on the training split.
train_station_r2 = stationr2(all_train, train_preds)
train_station_r2

{'YHB': 0.9404002135324934,
 'YHL': 0.9062565549313113,
 'YMR': 0.9134143335619498,
 'YHH': 0.8467698503813517,
 'B207': 0.8837114282367003,
 'FLWY': 0.7919133810622189,
 'YUF': 0.9122446315583168,
 'YPP': 0.9011225568721951,
 'YNR': 0.9381358412603192,
 'YML': 0.7731690029294522,
 'YFT': 0.9351985626683605,
 'LKWY': 0.8049521295436526,
 'YTP': 0.7559064232724151,
 'B206': 0.8674194783252944,
 'YMC': 0.7979367441324865,
 'YNM': 0.9408222035598749,
 'B950': 0.8271930830030825,
 'YDD': 0.873367929683228}

In [43]:
# Per-station R^2 on test split A, plus the event-averaged score stored under
# the extra key 'network' so it appears as its own row in the summary table.
testA_station_r2 = stationr2(all_test, testA_preds)
testA_station_r2['network'] = testA_avg_r2

In [44]:
# Per-station R^2 on the 20230101 split B, plus the event-averaged score stored
# under the extra key 'network'.
testB_station_r2 = stationr2(all_2023, testB_preds)
testB_station_r2['network'] = testB_avg_r2

In [45]:
# Summary table with one row per station (plus 'network') and one R^2 column
# per split. The three dicts become rows 0, 1, 2; transposing turns the
# stations into the index, which reset_index then exposes as a column.
# Keys missing from a dict (e.g. 'network' for the training split) become NaN.
r2_column_names = {'index': 'station', 0: 'train_r2', 1: 'test_r2', 2: 'holdout_r2'}
r2_by_split = pd.DataFrame([train_station_r2, testA_station_r2, testB_station_r2])
r2_df = r2_by_split.T.reset_index().rename(columns=r2_column_names)

In [46]:
# Display the per-station R^2 summary.
# NOTE(review): strongly negative or missing holdout values (e.g. YML, B207 in
# the saved output) deserve a check of how many rows those stations have.
r2_df

Unnamed: 0,station,train_r2,test_r2,holdout_r2
0,YHB,0.9404,0.937879,0.933174
1,YHL,0.906257,0.913347,0.857942
2,YMR,0.913414,0.903444,0.896393
3,YHH,0.84677,0.816218,0.805882
4,B207,0.883711,0.88342,
5,FLWY,0.791913,0.719554,0.632912
6,YUF,0.912245,0.905528,0.896621
7,YPP,0.901123,0.82935,0.830661
8,YNR,0.938136,0.928077,0.915527
9,YML,0.773169,0.746212,-7.122966


In [47]:
# Output directory for the model artifacts.
# NOTE(review): hard-coded absolute path — consider making it configurable.
outdir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/gbt'
# Persist the per-station R^2 summary without the DataFrame index.
r2_df.to_csv(os.path.join(outdir, 'r2.summary.S.csv'), index=False)

In [48]:
# NOTE(review): this import would be easier to find in the notebook's first
# (imports) cell.
from joblib import dump
# Serialize the fitted gradient-boosting model next to the R^2 summary.
dump(tree_reg, os.path.join(outdir, 'gbt.S.joblib'))

['/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/gbt/gbt.S.joblib']