In [1]:
# Standard library
import json
import os
import sys

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Add the parent of the current working directory (expected to be
# 'notebooks/') to sys.path so the local `src` package can be imported.
# This has to run before the `src` imports below.
# +----------------------------------------------------------------------------+
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local
from src.features import FeaturePlots as fp
from src.features import NumpyEncoder, SelectPFeatures

In [2]:
# Helper object whose methods the cells below call to build the per-station
# datasets, filter features, and run the nested feature selection.
sf = SelectPFeatures()

In [3]:
# Directory holding the p.train / p.test feature CSVs.
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'

# Name both file paths first, then read each split into its own DataFrame.
train_csv = f'{data_dir}/p.train.csv'
test_csv = f'{data_dir}/p.test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [4]:
# Build the per-station feature data from the train/test tables, with the
# scaler and linear-model options both switched off.
station_feature_dict, feature_names = sf.process_station_datasets(
    train,
    test,
    scaler=False,
    linear_model=False,
)
# Show the feature names that were returned.
feature_names

YHB
X shape: (2920, 47), y shape: (2920,)
X shape: (734, 47), y shape: (734,)
YDC
X shape: (2509, 47), y shape: (2509,)
X shape: (645, 47), y shape: (645,)
YWB
X shape: (3069, 47), y shape: (3069,)
X shape: (786, 47), y shape: (786,)
MCID
X shape: (2942, 47), y shape: (2942,)
X shape: (771, 47), y shape: (771,)
YHL
X shape: (2739, 47), y shape: (2739,)
X shape: (682, 47), y shape: (682,)
YMR
X shape: (3393, 47), y shape: (3393,)
X shape: (845, 47), y shape: (845,)
YHH
X shape: (4005, 47), y shape: (4005,)
X shape: (1002, 47), y shape: (1002,)
B207
X shape: (1609, 47), y shape: (1609,)
X shape: (380, 47), y shape: (380,)
YPP
X shape: (1334, 47), y shape: (1334,)
X shape: (338, 47), y shape: (338,)
YPM
X shape: (3358, 47), y shape: (3358,)
X shape: (843, 47), y shape: (843,)
YLT
X shape: (1275, 47), y shape: (1275,)
X shape: (291, 47), y shape: (291,)
QLMT
X shape: (792, 47), y shape: (792,)
X shape: (190, 47), y shape: (190,)
H17A
X shape: (527, 47), y shape: (527,)
X shape: (142, 47), 

array(['amp_ratio_1', 'amp_ratio_2', 'amp_ratio_3', 'amp_ratio_4',
       'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7', 'amp_ratio_8',
       'amp_ratio_9', 'amp_ratio_10', 'amp_ratio_11', 'amp_ratio_12',
       'amp_ratio_13', 'amp_ratio_14', 'amp_ratio_15', 'amp_ratio_16',
       'amp_ratio_17', 'amp_ratio_18', 'amp_1', 'amp_2', 'amp_3', 'amp_4',
       'amp_5', 'amp_6', 'amp_7', 'amp_8', 'amp_9', 'amp_10', 'amp_11',
       'amp_12', 'amp_13', 'amp_14', 'amp_15', 'amp_16', 'amp_17',
       'amp_18', 'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km', 'source_latitude',
       'source_longitude', 'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_deg'], dtype='<U32')

In [5]:
# Candidate features handed to the feature-selection step below.
# The order matches the original list exactly; entries are grouped by family.
sr_nonlin_feature_subset = [
    # amplitude ratios
    'amp_ratio_4',
    'amp_ratio_5',
    'amp_ratio_6',
    'amp_ratio_7',
    # amplitudes
    'amp_1',
    'amp_2',
    'amp_3',
    'amp_4',
    'amp_5',
    # signal statistics
    'signal_variance',
    'signal_dominant_amplitude',
    'signal_max_amplitude',
    'signal_dominant_frequency',
    # noise statistics
    'noise_max_amplitude',
    'noise_variance',
    # source / path descriptors
    'source_depth_km',
    'source_receiver_distance_logkm',
    'source_receiver_back_azimuth_deg',
]

In [6]:
# Filter every station's data down to the features named in
# sr_nonlin_feature_subset; also returns the matching feature names.
sr_station_feat_dict, sr_feature_names = sf.filter_station_dict_features(
    station_feature_dict,
    feature_names,
    sr_nonlin_feature_subset,
)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [7]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [9]:
### Set Parameters ###
# Seed for the importance estimator. Decision trees permute the features at
# every split, so an unseeded tree can rank tied features differently from run
# to run; fixing the seed removes that source of variation.
estimator_random_state = 42
# The model used to estimate feature importance
estimator_model = DecisionTreeRegressor(random_state=estimator_random_state)
# Boolean - True if data for estimator_model needs to be scaled
estimator_scaler = False
# The main model to fit
model = SVR(kernel='rbf')
# Boolean - True if data for model needs to be scaled
model_scaler = True
# The scoring method to use in RFECV and GridCV
scoring_method = 'r2'
# The function used to evaluate the performance of the model on the test fold
score_func = r2_score
# The parameter space for GridCV to search.
# NOTE: this grid is for the RBF kernel. For a linear model, lower the upper
# limit of C, because C=100 is already very slow there.
# The 'm__' prefix presumably addresses the model step inside the pipeline that
# sf.nested_rfecv builds - confirm against src.features.
svr_Cspace = [0.1, 1, 10, 100, 1000]
svr_gspace = [1, 0.1, 0.01, 0.001]
param_grid = [
    {'m__C': svr_Cspace, 'm__gamma': svr_gspace},
]
# The number of jobs for RFECV and GridCV to use
n_jobs = 10
# Number of folds for the outer CV
cv_folds_outer = 10
# Number of folds for the inner CV
cv_folds_inner = 5
# Number of times the outer CV is repeated
n_outer_repeats = 1
# Run grid search over all features
run_gridsearchcv_all = True
#####################

In [10]:
# Directory that receives the JSON result files written by the cells below.
outfile_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/p_feature_selection/rfecv'
# Middle part of every output file name: '<prefix>.<outfile_name>.json'.
outfile_name = 'SVR.sr'

In [11]:
# For every station: run the nested RFECV, save the raw results to
# '<station>.<outfile_name>.json', and summarize which features were selected.
stat_important_feat_dict = {}  # station -> combined important-feature table
stat_results = {}              # station -> raw results returned by nested_rfecv

# Create the output directory up front. Each station takes minutes of
# cross-validation, so a missing folder should fail here rather than after
# the first station has finished.
os.makedirs(outfile_dir, exist_ok=True)

for stat in sr_station_feat_dict.keys():
    print(stat)
    X = sr_station_feat_dict[stat]['X_train']
    y = sr_station_feat_dict[stat]['y_train']
    results = sf.nested_rfecv(X, y, estimator_model, model, param_grid,
                                estimator_scaler=estimator_scaler,
                                model_scaler=model_scaler,
                                scoring_method=scoring_method,
                                score_func=score_func,
                                n_jobs=n_jobs,
                                cv_folds_outer=cv_folds_outer,
                                cv_folds_inner=cv_folds_inner,
                                n_outer_repeats=n_outer_repeats,
                                run_gridsearchcv_all_feats=run_gridsearchcv_all)

    filename = os.path.join(outfile_dir, f'{stat}.{outfile_name}.json')
    print('Writing', filename)
    # The handle is deliberately not called `fp`: that name is the notebook-level
    # alias for FeaturePlots, and rebinding it here would break later plotting.
    with open(filename, 'w') as json_file:
        json.dump(results, json_file, indent=4, cls=NumpyEncoder)

    feat_usage_df = sf.count_feature_usage(results['optfts_bool'], sr_feature_names)
    # The threshold is half the total number of outer fits (folds x repeats).
    if_usage = sf.import_feats_by_usage(feat_usage_df, thresh=(cv_folds_outer*n_outer_repeats)//2)
    if_best = sf.important_feats_by_best_model(results, sr_feature_names)
    impfeat_cnts_df = sf.combine_important_features([if_usage, if_best], feat_usage_df)

    stat_important_feat_dict[stat] = impfeat_cnts_df
    stat_results[stat] = results


YHB
Fold 0: test score (12 best feats): 0.910, test score (all feats): 0.910, diff: 0.001, time: 58.32 s, best model params: {'m__C': 1, 'm__gamma': 0.1}
Fold 1: test score (11 best feats): 0.902, test score (all feats): 0.907, diff: -0.005, time: 64.22 s, best model params: {'m__C': 1, 'm__gamma': 0.1}
Fold 2: test score (9 best feats): 0.893, test score (all feats): 0.908, diff: -0.015, time: 63.46 s, best model params: {'m__C': 10, 'm__gamma': 0.1}
Fold 3: test score (8 best feats): 0.901, test score (all feats): 0.918, diff: -0.017, time: 54.45 s, best model params: {'m__C': 10, 'm__gamma': 0.1}
Fold 4: test score (11 best feats): 0.922, test score (all feats): 0.927, diff: -0.005, time: 65.40 s, best model params: {'m__C': 10, 'm__gamma': 0.1}
Fold 5: test score (12 best feats): 0.935, test score (all feats): 0.934, diff: 0.002, time: 71.18 s, best model params: {'m__C': 1000, 'm__gamma': 0.01}
Fold 6: test score (12 best feats): 0.915, test score (all feats): 0.924, diff: -0.010,

In [None]:
# Save the results for every station in a single 'all.<outfile_name>.json' file.
# This dumps `stat_results` (station -> results). `results` on its own holds only
# the last station processed by the loop, which already has its own file.
filename = os.path.join(outfile_dir, f'all.{outfile_name}.json')
print('Writing', filename)
# The handle is not called `fp`, so the FeaturePlots alias of that name stays usable.
with open(filename, 'w') as json_file:
    json.dump(stat_results, json_file, indent=4, cls=NumpyEncoder)