In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import json
import sys
import os
# Put the directory one level above 'notebooks/' on sys.path so that the
# project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    # Only append once, so re-running this cell does not grow sys.path.
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets
from src.feature_selection import RFE
from src import plotting
from src.utils import write_dict_to_json

In [2]:
# Helper object used in later cells to build and filter the per-station datasets.
# NOTE(review): is_p=False presumably selects S-phase handling (the inputs read
# below are the s.*.csv splits) -- confirm against src.processing.
proc = GatherFeatureDatasets(is_p=False)

In [3]:
# --- Locations -------------------------------------------------------------
# Directory holding the train/test feature split CSVs.
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'
# Directory that receives the JSON result files written by the RFECV loop.
outfile_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/s_feature_selection/rfecv'
# Tag embedded in every result file name: '<station>.<outfile_name>.json'.
outfile_name = 'SVR.sr'

# --- Load the splits ---------------------------------------------------------
train = pd.read_csv(f'{data_dir}/s.train.csv')
test = pd.read_csv(f'{data_dir}/s.test.csv')

In [4]:
# Build the per-station datasets from the train/test splits.
# Returns the feature dict, the metadata dict (later indexed as
# station_meta_dict[stat]['y_train']), and the array of feature names.
# The call prints the X/y shapes for every station as it goes.
station_feature_dict, station_meta_dict, feature_names = (
    proc.process_all_stations_datasets(
        train,
        test,
        scaler=False,
        linear_model=False,
        source_dist_type='dist',
    )
)

YHB
X shape: (942, 45), y shape: (942,)
X shape: (227, 45), y shape: (227,)
YHL
X shape: (468, 45), y shape: (468,)
X shape: (114, 45), y shape: (114,)
YMR
X shape: (1293, 45), y shape: (1293,)
X shape: (303, 45), y shape: (303,)
YHH
X shape: (883, 45), y shape: (883,)
X shape: (214, 45), y shape: (214,)
B207
X shape: (314, 45), y shape: (314,)
X shape: (81, 45), y shape: (81,)
FLWY
X shape: (310, 45), y shape: (310,)
X shape: (78, 45), y shape: (78,)
YPP
X shape: (157, 45), y shape: (157,)
X shape: (44, 45), y shape: (44,)
YNR
X shape: (1784, 45), y shape: (1784,)
X shape: (454, 45), y shape: (454,)
YUF
X shape: (725, 45), y shape: (725,)
X shape: (190, 45), y shape: (190,)
YML
X shape: (421, 45), y shape: (421,)
X shape: (107, 45), y shape: (107,)
YFT
X shape: (741, 45), y shape: (741,)
X shape: (188, 45), y shape: (188,)
LKWY
X shape: (203, 45), y shape: (203,)
X shape: (53, 45), y shape: (53,)
YTP
X shape: (382, 45), y shape: (382,)
X shape: (93, 45), y shape: (93,)
B206
X shape: (

In [5]:
# Show every available feature name so the subset below can be chosen from it.
feature_names

array(['amp_ratio_1', 'amp_ratio_2', 'amp_ratio_3', 'amp_ratio_4',
       'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7', 'amp_ratio_8',
       'amp_ratio_9', 'amp_ratio_10', 'amp_ratio_11', 'amp_ratio_12',
       'amp_ratio_13', 'amp_ratio_14', 'amp_ratio_15', 'amp_ratio_16',
       'amp_ratio_17', 'amp_ratio_18', 'amp_1', 'amp_2', 'amp_3', 'amp_4',
       'amp_5', 'amp_6', 'amp_7', 'amp_8', 'amp_9', 'amp_10', 'amp_11',
       'amp_12', 'amp_13', 'amp_14', 'amp_15', 'amp_16', 'amp_17',
       'amp_18', 'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km',
       'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_deg'], dtype='<U32')

In [6]:
# Names of the features kept for feature selection; order is preserved because
# it is passed straight to proc.filter_station_dict_features.
# NOTE(review): amp_ratio_* stops at 4 while amp_* runs through 5 -- confirm
# that the asymmetry is intentional.
feature_subset = [
    # amplitude ratios
    'amp_ratio_1',
    'amp_ratio_2',
    'amp_ratio_3',
    'amp_ratio_4',
    # amplitudes
    'amp_1',
    'amp_2',
    'amp_3',
    'amp_4',
    'amp_5',
    # signal / noise summary features
    'signal_dominant_frequency',
    'signal_dominant_amplitude',
    'noise_max_amplitude',
    'signal_max_amplitude',
    'signal_variance',
    'noise_variance',
    # source features
    'source_depth_km',
    'source_receiver_distance_logkm',
    'source_receiver_back_azimuth_deg',
]

In [7]:
# Reduce every station's feature matrices to the columns named in
# feature_subset; the call prints the resulting X_train/X_test shapes.
selected_feat_dict, selected_feature_names = proc.filter_station_dict_features(
    station_feature_dict,
    feature_names,
    feature_subset,
)

YHB
X_train: (942, 18), X_test: (227, 18)
YHL
X_train: (468, 18), X_test: (114, 18)
YMR
X_train: (1293, 18), X_test: (303, 18)
YHH
X_train: (883, 18), X_test: (214, 18)
B207
X_train: (314, 18), X_test: (81, 18)
FLWY
X_train: (310, 18), X_test: (78, 18)
YPP
X_train: (157, 18), X_test: (44, 18)
YNR
X_train: (1784, 18), X_test: (454, 18)
YUF
X_train: (725, 18), X_test: (190, 18)
YML
X_train: (421, 18), X_test: (107, 18)
YFT
X_train: (741, 18), X_test: (188, 18)
LKWY
X_train: (203, 18), X_test: (53, 18)
YTP
X_train: (382, 18), X_test: (93, 18)
B206
X_train: (196, 18), X_test: (59, 18)
YMC
X_train: (1157, 18), X_test: (306, 18)
YNM
X_train: (326, 18), X_test: (70, 18)
YDD
X_train: (216, 18), X_test: (57, 18)


In [8]:
### Set Parameters ###
# The model used to estimate feature importance.
# random_state is fixed because DecisionTreeRegressor randomly permutes the
# features at each split; without a seed, ties between equally good splits can
# yield different importances (and therefore different selected features) on
# every run.
estimator_model = DecisionTreeRegressor(random_state=0)
# Boolean - True if data for estimator_model needs to be scaled
estimator_scaler = False
# The main model to fit
model = SVR(kernel='rbf')
# Boolean - True if data for model needs to be scaled
model_scaler = True
# The scoring method to use in RFECV and GridCV
scoring_method = 'r2'
# The function used to evaluate performance on the held-out test fold
score_func = r2_score
# The parameter space for GridCV to search (RBF kernel: C and gamma).
# NOTE(review): an earlier comment here advised lowering the upper limit of C
# for a linear model because large C is very slow; that does not apply to the
# RBF kernel configured above, so the full range up to 1000 is searched.
svr_Cspace = [0.1, 1, 10, 100, 1000]
svr_gspace = [1, 0.1, 0.01, 0.001]
# NOTE(review): the 'm__' prefix presumably addresses the model step of the
# pipeline built inside RFE.nested_rfecv -- confirm against src.feature_selection.
param_grid = [
    {'m__C': svr_Cspace, 'm__gamma': svr_gspace},
]
# The number of jobs for RFECV and GridCV to use
n_jobs = 10
# number of folds for outer CV
cv_folds_outer = 5
# number of folds for inner cv
cv_folds_inner = 5
# number of repeats of the outer CV
n_outer_repeats = 2
# Run grid search over all features
run_gridsearchcv_all = True
#####################

In [9]:
# Run nested RFECV for every station. Each station's results are written to
# '<station>.<outfile_name>.json' as soon as they are available, and all
# stations are written together to 'all.<outfile_name>.json' at the end.

# Create the output directory up front: the first write only happens after a
# full nested-CV run, so a missing directory would otherwise waste that work.
os.makedirs(outfile_dir, exist_ok=True)

#stat_important_feat_dict = {}
stat_results = {}
for stat, station_feats in selected_feat_dict.items():
    print(stat)
    X = station_feats['X_train']
    # Targets live in the metadata dict, keyed by the same station name.
    y = station_meta_dict[stat]['y_train']
    # clone(model) hands each station a fresh, unfitted copy of the SVR.
    results = RFE.nested_rfecv(X, y, estimator_model, clone(model), param_grid,
                               estimator_scaler=estimator_scaler,
                               model_scaler=model_scaler,
                               scoring_method=scoring_method,
                               score_func=score_func,
                               n_jobs=n_jobs,
                               cv_folds_outer=cv_folds_outer,
                               cv_folds_inner=cv_folds_inner,
                               n_outer_repeats=n_outer_repeats,
                               run_gridsearchcv_all_feats=run_gridsearchcv_all)

    # Save per-station results immediately so a later failure does not lose them.
    filename = os.path.join(outfile_dir, f'{stat}.{outfile_name}.json')
    write_dict_to_json(filename, results)

    # Disabled post-processing, kept for reference:
    # feat_usage_df = RFE.count_feature_usage(results['optfts_bool'], selected_feature_names)
    # if_usage = RFE.import_feats_by_usage(feat_usage_df, thresh=(cv_folds_outer*n_outer_repeats)//2)
    # if_best = RFE.important_feats_by_best_model(results, selected_feature_names)
    # impfeat_cnts_df = RFE.combine_important_features([if_usage, if_best], feat_usage_df)

    #stat_important_feat_dict[stat] = impfeat_cnts_df
    stat_results[stat] = results

# Combined file holding the results of every station.
filename = os.path.join(outfile_dir, f'all.{outfile_name}.json')
write_dict_to_json(filename, stat_results)

YHB


Fold 0: test score (10 best feats): 0.939, test score (all feats): 0.941, diff: -0.002, time: 7.93 s, best model params: {'m__C': 1000, 'm__gamma': 0.001}
Fold 1: test score (7 best feats): 0.954, test score (all feats): 0.965, diff: -0.011, time: 10.02 s, best model params: {'m__C': 10, 'm__gamma': 0.1}
Fold 2: test score (12 best feats): 0.962, test score (all feats): 0.960, diff: 0.002, time: 6.64 s, best model params: {'m__C': 1000, 'm__gamma': 0.001}
Fold 3: test score (4 best feats): 0.922, test score (all feats): 0.937, diff: -0.015, time: 15.54 s, best model params: {'m__C': 10, 'm__gamma': 0.1}
Fold 4: test score (6 best feats): 0.953, test score (all feats): 0.959, diff: -0.006, time: 8.19 s, best model params: {'m__C': 1000, 'm__gamma': 0.01}
Fold 5: test score (4 best feats): 0.945, test score (all feats): 0.960, diff: -0.015, time: 16.73 s, best model params: {'m__C': 100, 'm__gamma': 0.1}
Fold 6: test score (6 best feats): 0.943, test score (all feats): 0.959, diff: -0.01