In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone 
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import json
import sys
import os
# Expose the directory two levels above the working directory to the import
# system, so the project-local `src` package imported below can be resolved.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    # Guarded so that re-running this cell does not add duplicate entries.
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets
from src.utils import CrossValidation, write_dict_to_json
from src.feature_selection import CustomRFECV
from src.feature_selection import IntrinsicFeatureSelection as ifs


# Load in the data for all stations

In [2]:
# Helper object used below to assemble the per-station feature datasets.
# NOTE(review): is_p=True presumably selects the P-arrival feature set (this
# notebook reads 'p.train.csv') — confirm against src.processing.
proc = GatherFeatureDatasets(is_p=True)

In [3]:
# Output location: the directory the RFECV / RFE JSON files are written to, and
# the tag embedded in every output filename.
outfile_name = 'SVR.DT.rel'
outfile_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_selection_update/rfecvP_diffSeeds'

# Input: the training split (p.train.csv) read into a single DataFrame.
datadir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'
train = pd.read_csv(
    os.path.join(datadir, 'p.train.csv')
)

In [4]:
# Split the training frame into per-station feature and metadata dictionaries,
# and keep the array of feature names.
# NOTE(review): scaler=False presumably returns unscaled features, with scaling
# deferred to the estimator/predictor scaler flags set later — confirm.
(station_feature_dict,
 station_meta_dict,
 feature_names) = proc.process_all_stations_datasets(
    train,
    scaler=False,
    linear_model=False,
    source_dist_type='dist',
)

YHB
X shape: (3580, 45), y shape: (3580,)
YDC
X shape: (2928, 45), y shape: (2928,)
YWB
X shape: (3780, 45), y shape: (3780,)
MCID
X shape: (3389, 45), y shape: (3389,)
YHL
X shape: (3213, 45), y shape: (3213,)
YMR
X shape: (4154, 45), y shape: (4154,)
YHH
X shape: (4813, 45), y shape: (4813,)
B207
X shape: (1718, 45), y shape: (1718,)
YPP
X shape: (1558, 45), y shape: (1558,)
YPM
X shape: (4078, 45), y shape: (4078,)
YLT
X shape: (1345, 45), y shape: (1345,)
H17A
X shape: (524, 45), y shape: (524,)
B208
X shape: (507, 45), y shape: (507,)
LKWY
X shape: (1062, 45), y shape: (1062,)
FLWY
X shape: (791, 45), y shape: (791,)
YGC
X shape: (2233, 45), y shape: (2233,)
YMC
X shape: (4322, 45), y shape: (4322,)
YML
X shape: (3077, 45), y shape: (3077,)
YUF
X shape: (1676, 45), y shape: (1676,)
B206
X shape: (664, 45), y shape: (664,)
B944
X shape: (348, 45), y shape: (348,)
YLA
X shape: (719, 45), y shape: (719,)
YTP
X shape: (926, 45), y shape: (926,)
YNR
X shape: (4649, 45), y shape: (4649,

In [5]:
# Display the feature-name array so column indices can be matched to names.
feature_names

array(['amp_ratio_1', 'amp_ratio_2', 'amp_ratio_3', 'amp_ratio_4',
       'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7', 'amp_ratio_8',
       'amp_ratio_9', 'amp_ratio_10', 'amp_ratio_11', 'amp_ratio_12',
       'amp_ratio_13', 'amp_ratio_14', 'amp_ratio_15', 'amp_ratio_16',
       'amp_ratio_17', 'amp_ratio_18', 'amp_1', 'amp_2', 'amp_3', 'amp_4',
       'amp_5', 'amp_6', 'amp_7', 'amp_8', 'amp_9', 'amp_10', 'amp_11',
       'amp_12', 'amp_13', 'amp_14', 'amp_15', 'amp_16', 'amp_17',
       'amp_18', 'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km',
       'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_deg'], dtype='<U32')

## Set parameters

In [6]:
### Set Parameters ###

# --- Random seeds -----------------------------------------------------------
# Seed for the cross-validation splits.
cv_random_state = 2652124
# Seed for the feature-importance estimator.
estimator_seed = 2652129

# --- Models -----------------------------------------------------------------
# Model used to estimate feature importance during feature elimination.
estimator_model = DecisionTreeRegressor(random_state=estimator_seed)
# Boolean - True if the data for estimator_model needs to be scaled.
estimator_scaler = False
# The main model to fit.
predictor_model = SVR(kernel='rbf')
# Boolean - True if the data for predictor_model needs to be scaled.
model_scaler = True

# --- Scoring ----------------------------------------------------------------
# Scoring method to use in RFECV and GridCV.
scoring_method = 'r2'
# Function used to evaluate the model's performance on the test folds.
score_func = r2_score
# True if a larger score_func value is better.
larger_score_is_better = True

# --- Hyperparameter search space --------------------------------------------
# Candidate C and gamma values for the RBF SVR that GridCV searches.
# NOTE(review): the 'm__' prefix presumably addresses the model step of the
# pipeline built by the CV helpers — confirm against src.utils.
svr_Cspace = [1, 10, 100]
svr_gspace = [0.1, 0.01, 0.001]
param_grid = [dict(m__C=svr_Cspace, m__gamma=svr_gspace)]

# --- Cross-validation layout ------------------------------------------------
# Number of jobs for RFECV and GridCV to use.
n_jobs = 10
# Number of folds for the outer CV.
cv_folds_outer = 10
# Number of times to repeat the outer CV.
n_outer_repeats = 1
# Number of folds for the inner CV (used for hyperparameter tuning).
cv_folds_inner = 5
# Number of folds for the final hyperparameter grid search.
cv_folds_hp = 10

# --- Intrinsic feature selection --------------------------------------------
# The filter function receives X, y, a list of np arrays holding the column
# indices of each feature group to filter, and K, the number of features to select.
# Two groups of 18 consecutive columns: indices 0-17 and 18-35.
if_feat_inds = [np.arange(start, start + 18) for start in (0, 18)]
if_K = 5
#####################

In [7]:
# Sanity check: show the index groups handed to the intrinsic filter, followed
# by the feature names each group selects. Iterating over the groups (rather
# than indexing [0] and [1]) keeps this cell in step with if_feat_inds if the
# number of groups in the parameter cell is ever changed.
print(if_feat_inds)
for group_inds in if_feat_inds:
    print(feature_names[group_inds])

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17]), array([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35])]
['amp_ratio_1' 'amp_ratio_2' 'amp_ratio_3' 'amp_ratio_4' 'amp_ratio_5'
 'amp_ratio_6' 'amp_ratio_7' 'amp_ratio_8' 'amp_ratio_9' 'amp_ratio_10'
 'amp_ratio_11' 'amp_ratio_12' 'amp_ratio_13' 'amp_ratio_14'
 'amp_ratio_15' 'amp_ratio_16' 'amp_ratio_17' 'amp_ratio_18']
['amp_1' 'amp_2' 'amp_3' 'amp_4' 'amp_5' 'amp_6' 'amp_7' 'amp_8' 'amp_9'
 'amp_10' 'amp_11' 'amp_12' 'amp_13' 'amp_14' 'amp_15' 'amp_16' 'amp_17'
 'amp_18']


## Do RFECV and RFE with the selected N on the full training set for all stations

In [8]:
# Create the output directory up front: each station takes a long time to
# process and its results are only written after that computation finishes,
# so a missing directory should fail (or be fixed) before any work is done.
os.makedirs(outfile_dir, exist_ok=True)

# Per-station results, keyed by station name, also written out combined at the end.
all_rfecv_results = {}
all_full_rfe_results = {}
for stat in station_feature_dict.keys():
    print(stat)
    X = station_feature_dict[stat]['X_train']
    y = station_meta_dict[stat]['y_train']

    # Do RFECV. Fresh clones are passed so no fitted state is shared between stations.
    rfecv_results = CustomRFECV.do_rfecv(X,
                                         y,
                                         clone(estimator_model),
                                         clone(predictor_model),
                                         param_grid,
                                         estimator_scaler=estimator_scaler,
                                         predictor_scaler=model_scaler,
                                         estimator_params_grid=None,
                                         scoring_method=scoring_method,
                                         score_func=score_func,
                                         n_jobs=n_jobs,
                                         cv_folds_outer=cv_folds_outer,
                                         cv_folds_inner=cv_folds_inner,
                                         n_outer_repeats=n_outer_repeats,
                                         cv_random_state=cv_random_state,
                                         larger_score_is_better=larger_score_is_better,
                                         intrinsic_filter_func=ifs.MI_filter_func,
                                         feature_inds_to_filter=if_feat_inds,
                                         intrinsic_filter_K=if_K
                                         )
    # Persist this station's RFECV results immediately so a later failure
    # does not discard them.
    filename_rfecv = os.path.join(outfile_dir, f'{stat}.{outfile_name}.rfecv.json')
    write_dict_to_json(filename_rfecv, rfecv_results)
    all_rfecv_results[stat] = rfecv_results

    # Use selected N from RFECV on full training set.
    # First apply the same intrinsic filter to the full training set.
    mi_selected_features, mi_filtered_feature_inds = ifs.MI_filter_func(X,
                                                                        y,
                                                                        if_feat_inds,
                                                                        if_K)
    X_mi = X[:, mi_filtered_feature_inds]
    # Grid search for the final fit; it gets a CV seed offset from the one
    # given to do_rfecv, so the two do not share the same seed.
    hp_grid_search, hp_cv = CrossValidation.setup_cv(clone(predictor_model),
                                                     param_grid,
                                                     model_scaler=model_scaler,
                                                     scoring_method=scoring_method,
                                                     n_jobs=n_jobs,
                                                     cv_folds=cv_folds_hp,
                                                     cv_random_state=cv_random_state+10,
                                                     refit_model=False)

    # The estimator is cloned here as well, matching the do_rfecv call above,
    # so the module-level estimator_model is never passed in directly and
    # reused across stations.
    full_N_results = CustomRFECV.get_final_N_features(X_mi,
                                                      y,
                                                      rfecv_results,
                                                      clone(estimator_model),
                                                      estimator_scaler,
                                                      hp_grid_search,
                                                      filtered_feat_inds=mi_filtered_feature_inds)

    full_N_results["intrinsic_K_feature_selection"] = mi_selected_features
    filename_rfe = os.path.join(outfile_dir, f'{stat}.{outfile_name}.rfe.full.json')
    write_dict_to_json(filename_rfe, full_N_results)
    all_full_rfe_results[stat] = full_N_results


# Combined results for all stations, one file per result type.
filename_rfecv = os.path.join(outfile_dir, f'all.{outfile_name}.rfecv.json')
write_dict_to_json(filename_rfecv, all_rfecv_results)
filename_rfe = os.path.join(outfile_dir, f'all.{outfile_name}.rfe.full.json')
write_dict_to_json(filename_rfe, all_full_rfe_results)


YHB
reducing features to 19
Fold 0: N=14, test_score=0.919
reducing features to 19
Fold 1: N=19, test_score=0.917
reducing features to 19
Fold 2: N=11, test_score=0.918
reducing features to 19
Fold 3: N=16, test_score=0.914
reducing features to 19
Fold 4: N=11, test_score=0.909
reducing features to 19
Fold 5: N=16, test_score=0.914
reducing features to 19
Fold 6: N=19, test_score=0.917
reducing features to 19
Fold 7: N=19, test_score=0.911
reducing features to 19
Fold 8: N=12, test_score=0.913
reducing features to 19
Fold 9: N=19, test_score=0.917
total time: 1736.16 s
Selected number of features: 19 (avg. score of 0.91); 1 STE: N=8 (avg. 0.90)
Writing /uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_selection_update/rfecvP_diffSeeds/YHB.SVR.DT.rel.rfecv.json
{'best': 19, 'oste': 8}
19: CV Mean: 0.91, CV STD: 0.01
8: CV Mean: 0.89, CV STD: 0.01
Writing /uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_selection_update/rfecvP_diffSeeds/YHB.SVR.