In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import sys
import os
from sklearn.base import clone 
# Make the directory two levels above the current working directory
# importable, so the `src.*` imports that follow resolve. The membership
# check avoids stacking duplicate entries when this cell is re-run.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets
from src.train import TrainStationModel
from src.utils import CrossValidation as cv

In [2]:
# Dataset helper used by the cells below to build and filter the per-station
# feature dictionaries.
# NOTE(review): is_p=False presumably selects S-phase handling (the inputs
# are s.*.csv files and phase = 'S' further down) - confirm in src.processing.
proc = GatherFeatureDatasets(is_p=False)

In [3]:
# Directory holding the train / test / holdout feature CSVs.
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'

# Read the three splits into DataFrames.
train = pd.read_csv(f'{data_dir}/s.train.csv')
test = pd.read_csv(f'{data_dir}/s.test.csv')
holdout = pd.read_csv(f'{data_dir}/s.20230101.csv')

# Build the per-station feature and metadata dictionaries together with the
# array of feature names. The call prints each station code followed by the
# X / y shapes it produced.
# NOTE(review): the exact meaning of scaler / linear_model / source_dist_type
# is defined in src.processing and is not visible here - confirm there.
station_feature_dict, station_meta_dict, feature_names = proc.process_all_stations_datasets(
    train,
    test,
    holdout_df=holdout,
    scaler=False,
    linear_model=True,
    source_dist_type='dist',
)

# Echo the available feature names as the cell output.
feature_names

YHB
X shape: (1248, 45), y shape: (1248,)
X shape: (331, 45), y shape: (331,)
X shape: (225, 45), y shape: (225,)
YHL
X shape: (537, 45), y shape: (537,)
X shape: (133, 45), y shape: (133,)
X shape: (39, 45), y shape: (39,)
YMR
X shape: (1533, 45), y shape: (1533,)
X shape: (392, 45), y shape: (392,)
X shape: (172, 45), y shape: (172,)
YHH
X shape: (1222, 45), y shape: (1222,)
X shape: (328, 45), y shape: (328,)
X shape: (113, 45), y shape: (113,)
B207
X shape: (341, 45), y shape: (341,)
X shape: (83, 45), y shape: (83,)
FLWY
X shape: (319, 45), y shape: (319,)
X shape: (89, 45), y shape: (89,)
X shape: (12, 45), y shape: (12,)
YUF
X shape: (816, 45), y shape: (816,)
X shape: (227, 45), y shape: (227,)
X shape: (118, 45), y shape: (118,)
YPP
X shape: (189, 45), y shape: (189,)
X shape: (58, 45), y shape: (58,)
X shape: (35, 45), y shape: (35,)
YNR
X shape: (2090, 45), y shape: (2090,)
X shape: (541, 45), y shape: (541,)
X shape: (99, 45), y shape: (99,)
YML
X shape: (423, 45), y shape:

array(['amp_ratio_1', 'amp_ratio_2', 'amp_ratio_3', 'amp_ratio_4',
       'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7', 'amp_ratio_8',
       'amp_ratio_9', 'amp_ratio_10', 'amp_ratio_11', 'amp_ratio_12',
       'amp_ratio_13', 'amp_ratio_14', 'amp_ratio_15', 'amp_ratio_16',
       'amp_ratio_17', 'amp_ratio_18', 'amp_1', 'amp_2', 'amp_3', 'amp_4',
       'amp_5', 'amp_6', 'amp_7', 'amp_8', 'amp_9', 'amp_10', 'amp_11',
       'amp_12', 'amp_13', 'amp_14', 'amp_15', 'amp_16', 'amp_17',
       'amp_18', 'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km',
       'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_sine'], dtype='<U33')

In [4]:
# Subset of feature names to keep for the linear models.
# A smaller alternative, kept for reference, is:
#   ['amp_2', 'source_receiver_distance_logkm']
selected_features = [
    'amp_1',
    'amp_2',
    'amp_4',
    'noise_variance',
    'source_depth_km',
    'source_receiver_distance_logkm',
    'source_receiver_back_azimuth_sine',
]

In [5]:
# Restrict every station's feature matrices to the names in
# `selected_features`; the call prints the resulting per-station shapes.
selected_feat_dict, selected_feature_names = proc.filter_station_dict_features(
    station_feature_dict,
    feature_names,
    selected_features,
)

YHB
X_train: (1248, 7), X_test: (331, 7), X_holdout: (225, 7)
YHL
X_train: (537, 7), X_test: (133, 7), X_holdout: (39, 7)
YMR
X_train: (1533, 7), X_test: (392, 7), X_holdout: (172, 7)
YHH
X_train: (1222, 7), X_test: (328, 7), X_holdout: (113, 7)
B207
X_train: (341, 7), X_test: (83, 7), X_holdout: 0
FLWY
X_train: (319, 7), X_test: (89, 7), X_holdout: (12, 7)
YUF
X_train: (816, 7), X_test: (227, 7), X_holdout: (118, 7)
YPP
X_train: (189, 7), X_test: (58, 7), X_holdout: (35, 7)
YNR
X_train: (2090, 7), X_test: (541, 7), X_holdout: (99, 7)
YML
X_train: (423, 7), X_test: (126, 7), X_holdout: (7, 7)
YFT
X_train: (899, 7), X_test: (214, 7), X_holdout: (55, 7)
LKWY
X_train: (209, 7), X_test: (50, 7), X_holdout: (2, 7)
YTP
X_train: (391, 7), X_test: (95, 7), X_holdout: (49, 7)
B206
X_train: (195, 7), X_test: (60, 7), X_holdout: 0
YMC
X_train: (1443, 7), X_test: (363, 7), X_holdout: (171, 7)
YNM
X_train: (487, 7), X_test: (137, 7), X_holdout: (6, 7)
B950
X_train: (152, 7), X_test: (42, 7), X_hold

In [6]:
# Run configuration shared by the cells below.

# Phase label passed to TrainStationModel and recorded in the results table.
phase = 'S'

# Unfitted template estimator; the training loop clones it for each station.
model = LinearRegression()
# Forwarded to cv.make_simple_pipeline alongside the cloned estimator.
model_scaler = False
# NOTE(review): not referenced by any cell visible in this notebook.
scoring_method = 'r2'

# Existing directory that receives the feature-name list, predictions,
# fitted models and the summary CSV.
outdir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/s_models/linearReg_selectedFeats'

In [7]:
# Fail fast when the output directory is missing; nothing below creates it.
if not os.path.exists(outdir):
    raise ValueError('outdir path does not exist')

# Record the feature names used for this run, one name per line.
with open(os.path.join(outdir, 'selected_feature_names.txt'), 'w') as fp:
    fp.writelines(f'{ft}\n' for ft in selected_feature_names)

In [8]:
# Fit one pipeline per station on its training split, evaluate it on the
# station's splits, and save predictions, the fitted model and the metrics.
results_dict_list = []
stations = selected_feat_dict.keys()

for station in stations:
    print(station)

    # Training inputs: features come from the filtered feature dict, targets
    # from the metadata dict.
    X = selected_feat_dict[station]['X_train']
    y = station_meta_dict[station]['y_train']

    trainer = TrainStationModel(
        station,
        phase,
        selected_feat_dict[station],
        station_meta_dict[station],
    )

    # clone() gives each station its own unfitted copy of the template
    # estimator, so no fitted state is shared between stations.
    opt_pipeline = cv.make_simple_pipeline(clone(model), model_scaler)
    opt_pipeline.fit(X, y)

    # Evaluate, then tag the metrics with the station and phase they belong to.
    all_yhat, eval_results_dict = trainer.eval_all_splits(opt_pipeline)
    eval_results_dict.update(station=station, phase=phase)

    # Save per-station outputs before moving on to the next station.
    trainer.save_all_predictions(all_yhat, outdir)
    trainer.save_model(opt_pipeline, outdir)
    results_dict_list.append(eval_results_dict)

# One row per station; written without the index column.
results_df = pd.DataFrame(results_dict_list)
results_df.to_csv(os.path.join(outdir, 'all.stat.results.csv'), index=False)

YHB
YHL
YMR
YHH
B207
FLWY
YUF
YPP
YNR
YML
YFT
LKWY
YTP
B206
YMC
YNM
B950
YDD


In [9]:
# Display the per-station summary table (r2 and rmse for the train, test and
# holdout splits, plus the station and phase labels).
results_df

Unnamed: 0,train_r2,train_rmse,test_r2,test_rmse,holdout_r2,holdout_rmse,station,phase
0,0.938,0.142,0.942,0.143,0.951,0.118,YHB,S
1,0.931,0.151,0.96,0.129,0.911,0.155,YHL,S
2,0.927,0.147,0.915,0.16,0.93,0.152,YMR,S
3,0.856,0.173,0.849,0.18,0.815,0.182,YHH,S
4,0.909,0.167,0.92,0.173,,,B207,S
5,0.8,0.198,0.778,0.18,0.468,0.228,FLWY,S
6,0.908,0.168,0.917,0.164,0.854,0.194,YUF,S
7,0.893,0.185,0.852,0.204,0.871,0.165,YPP,S
8,0.919,0.16,0.938,0.141,0.922,0.153,YNR,S
9,0.679,0.331,0.553,0.374,-9.216,1.466,YML,S
