In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import sys
import os
from sklearn.base import clone 
# The local `src` package is imported below, so the parent of the current
# working directory has to be on the import path.  It is appended only once,
# which keeps re-running this cell from growing sys.path.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets
from src.train import TrainStationModel
from src.utils import CrossValidation as cv

In [2]:
# Shared dataset helper: later cells call its process_all_stations_datasets()
# and filter_station_dict_features() methods.
# NOTE(review): is_p=True presumably selects P-phase handling (this notebook
# reads the p.* split files and uses phase = 'P') - confirm in src.processing.
proc = GatherFeatureDatasets(is_p=True)

In [3]:
# Location of the pre-computed feature tables (train / test / holdout CSVs).
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'

# Read the three splits in order: training, testing, and the 2023-01-01 holdout file.
train, test, holdout = (
    pd.read_csv(f'{data_dir}/{split_file}')
    for split_file in ('p.train.csv', 'p.test.csv', 'p.20230101.csv')
)

# Build the per-station feature and metadata dictionaries plus the array of
# feature column names.  The keyword options are collected in one place so
# the settings used for this run are easy to scan.
processing_options = {
    'holdout_df': holdout,
    'scaler': False,
    'linear_model': True,
    'source_dist_type': 'dist',
}
station_feature_dict, station_meta_dict, feature_names = proc.process_all_stations_datasets(
    train, test, **processing_options
)

# Show the available feature names as the cell output.
feature_names

YHB
X shape: (3580, 45), y shape: (3580,)
X shape: (897, 45), y shape: (897,)
X shape: (363, 45), y shape: (363,)
YDC
X shape: (2928, 45), y shape: (2928,)
X shape: (725, 45), y shape: (725,)
X shape: (29, 45), y shape: (29,)
YWB
X shape: (3780, 45), y shape: (3780,)
X shape: (913, 45), y shape: (913,)
X shape: (357, 45), y shape: (357,)
MCID
X shape: (3389, 45), y shape: (3389,)
X shape: (832, 45), y shape: (832,)
X shape: (311, 45), y shape: (311,)
YHL
X shape: (3213, 45), y shape: (3213,)
X shape: (812, 45), y shape: (812,)
X shape: (374, 45), y shape: (374,)
YMR
X shape: (4154, 45), y shape: (4154,)
X shape: (1071, 45), y shape: (1071,)
X shape: (446, 45), y shape: (446,)
YHH
X shape: (4813, 45), y shape: (4813,)
X shape: (1215, 45), y shape: (1215,)
X shape: (397, 45), y shape: (397,)
B207
X shape: (1718, 45), y shape: (1718,)
X shape: (411, 45), y shape: (411,)
YPP
X shape: (1558, 45), y shape: (1558,)
X shape: (399, 45), y shape: (399,)
X shape: (217, 45), y shape: (217,)
YPM
X 

array(['amp_ratio_1', 'amp_ratio_2', 'amp_ratio_3', 'amp_ratio_4',
       'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7', 'amp_ratio_8',
       'amp_ratio_9', 'amp_ratio_10', 'amp_ratio_11', 'amp_ratio_12',
       'amp_ratio_13', 'amp_ratio_14', 'amp_ratio_15', 'amp_ratio_16',
       'amp_ratio_17', 'amp_ratio_18', 'amp_1', 'amp_2', 'amp_3', 'amp_4',
       'amp_5', 'amp_6', 'amp_7', 'amp_8', 'amp_9', 'amp_10', 'amp_11',
       'amp_12', 'amp_13', 'amp_14', 'amp_15', 'amp_16', 'amp_17',
       'amp_18', 'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km',
       'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_sine'], dtype='<U33')

In [4]:
# Feature columns (a subset of `feature_names`) used to fit the linear models.
# A minimal two-feature alternative is
# ['amp_1', 'source_receiver_distance_logkm'].
selected_features = [
    'amp_1',
    'amp_2',
    'signal_variance',
    'noise_variance',
    'source_depth_km',
    'source_receiver_distance_logkm',
    'source_receiver_back_azimuth_sine',
]

In [5]:
# Reduce every station's feature matrices to the columns listed in
# `selected_features`; also returns the names of the retained features.
# NOTE(review): the order of `selected_feature_names` is decided inside
# filter_station_dict_features - confirm it matches the column order of X.
selected_feat_dict, selected_feature_names = proc.filter_station_dict_features(station_feature_dict,
                                                                              feature_names,
                                                                              selected_features)

YHB
X_train: (3580, 7), X_test: (897, 7), X_holdout: (363, 7)
YDC
X_train: (2928, 7), X_test: (725, 7), X_holdout: (29, 7)
YWB
X_train: (3780, 7), X_test: (913, 7), X_holdout: (357, 7)
MCID
X_train: (3389, 7), X_test: (832, 7), X_holdout: (311, 7)
YHL
X_train: (3213, 7), X_test: (812, 7), X_holdout: (374, 7)
YMR
X_train: (4154, 7), X_test: (1071, 7), X_holdout: (446, 7)
YHH
X_train: (4813, 7), X_test: (1215, 7), X_holdout: (397, 7)
B207
X_train: (1718, 7), X_test: (411, 7), X_holdout: 0
YPP
X_train: (1558, 7), X_test: (399, 7), X_holdout: (217, 7)
YPM
X_train: (4078, 7), X_test: (1018, 7), X_holdout: (438, 7)
YLT
X_train: (1345, 7), X_test: (348, 7), X_holdout: (97, 7)
H17A
X_train: (524, 7), X_test: (149, 7), X_holdout: 0
B208
X_train: (507, 7), X_test: (153, 7), X_holdout: 0
LKWY
X_train: (1062, 7), X_test: (265, 7), X_holdout: (19, 7)
FLWY
X_train: (791, 7), X_test: (203, 7), X_holdout: (19, 7)
YGC
X_train: (2233, 7), X_test: (558, 7), X_holdout: (273, 7)
YMC
X_train: (4322, 7), X_t

In [6]:
# --- Run configuration -------------------------------------------------------

# Phase label handed to TrainStationModel and recorded in the results table.
phase = 'P'

# Name of the scoring metric kept with the run settings (not referenced by the
# cells visible in this notebook section).
scoring_method = 'r2'

# Directory that receives the feature-name list, per-station predictions,
# fitted models, and the summary CSV.
outdir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/p_models/linearReg_selectedFeats'

# Unfitted template estimator; the training loop clones it for each station.
model = LinearRegression()

# Second argument to cv.make_simple_pipeline.
# NOTE(review): presumably toggles a feature-scaling step - confirm in src.utils.
model_scaler = False

In [7]:
# Fail fast when the output location is unusable.  isdir() is used instead of
# exists() so that a regular file sitting at `outdir` is rejected here rather
# than surfacing later as a confusing error from open().
if not os.path.isdir(outdir):
    raise ValueError(f'outdir path does not exist or is not a directory: {outdir}')

# Record which features this run used, one name per line, next to the models.
with open(os.path.join(outdir, 'selected_feature_names.txt'), 'w') as fp:
    fp.writelines(f'{ft}\n' for ft in selected_feature_names)

In [8]:
# Fit one model per station on that station's training split, evaluate the
# fitted pipeline, save its predictions and the model itself, and gather one
# row of metrics per station for the summary table.
stations = selected_feat_dict.keys()
results_dict_list = []

for station in stations:
    print(station)  # progress marker: one line per station

    # Look up this station's feature and metadata entries once.
    station_features = selected_feat_dict[station]
    station_meta = station_meta_dict[station]
    X = station_features['X_train']
    y = station_meta['y_train']

    trainer = TrainStationModel(station, phase, station_features, station_meta)

    # clone() gives each station its own unfitted copy of the template model.
    opt_pipeline = cv.make_simple_pipeline(clone(model), model_scaler)
    opt_pipeline.fit(X, y)

    # Evaluate the fitted pipeline and tag the metrics with station and phase.
    all_yhat, eval_results_dict = trainer.eval_all_splits(opt_pipeline)
    eval_results_dict['station'] = station
    eval_results_dict['phase'] = phase

    # Persist per-station artifacts before recording the summary row.
    trainer.save_all_predictions(all_yhat, outdir)
    trainer.save_model(opt_pipeline, outdir)
    results_dict_list.append(eval_results_dict)

# Combine the per-station metrics into one table and write it to disk.
results_df = pd.DataFrame(results_dict_list)
summary_path = os.path.join(outdir, 'all.stat.results.csv')
results_df.to_csv(summary_path, index=False)

YHB
YDC
YWB
MCID
YHL
YMR
YHH
B207
YPP
YPM
YLT
H17A
B208
LKWY
FLWY
YGC
YMC
YML
YUF
B206
B944
YLA
YTP
YNR
YNM
YFT
YMV
YPC
YSB
YJC
YMS
YNE
YPK
B950
YDD


In [9]:
# Display the evaluation summary assembled by the training loop
# (one row of metrics per station).
results_df

Unnamed: 0,train_r2,train_rmse,test_r2,test_rmse,holdout_r2,holdout_rmse,station,phase
0,0.847,0.221,0.842,0.225,0.817,0.254,YHB,P
1,0.728,0.294,0.721,0.299,-5.198,1.217,YDC,P
2,0.869,0.207,0.873,0.207,0.868,0.215,YWB,P
3,0.848,0.218,0.852,0.22,0.853,0.211,MCID,P
4,0.851,0.216,0.857,0.211,0.85,0.23,YHL,P
5,0.86,0.207,0.863,0.212,0.759,0.296,YMR,P
6,0.819,0.239,0.825,0.233,0.807,0.265,YHH,P
7,0.841,0.212,0.86,0.205,,,B207,P
8,0.829,0.243,0.828,0.249,0.793,0.252,YPP,P
9,0.875,0.204,0.868,0.209,0.855,0.226,YPM,P
