In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
import json
import sys
import os
# Put the parent of the current working directory on sys.path so that the
# local `src` package (which lives above 'notebooks/') can be imported.
# +----------------------------------------------------------------------------+
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets
from src.utils import CrossValidation
from src.feature_selection import CustomRFECVUpdate
from src.feature_selection import IntrinsicFeatureSelection as ifs
from src.plotting import plot_pairwise_correlations
from src.utils import NumpyEncoder


# Load in the data

In [2]:
# Project helper (src.processing) used below to build the per-station feature/target sets.
# NOTE(review): is_p=True presumably selects the P-arrival feature set (the
# training file read in this notebook is 'p.train.csv') -- confirm against GatherFeatureDatasets.
proc = GatherFeatureDatasets(is_p=True)

In [3]:
# Input and output locations share one root directory: the pre-split feature
# CSVs are read from data_dir, and this experiment's JSON results go to outpath.
magnitudes_root = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes'
data_dir = f'{magnitudes_root}/feature_splits'
outpath = f'{magnitudes_root}/feature_selection_update/experiment_YUF'

# Training-split table; it is handed to process_station_datasets together with
# a station code further down.
all_train_df = pd.read_csv(f'{data_dir}/p.train.csv')

In [4]:
# Station YUF is the test bed for the model experiments. Rationale:
#   * it has roughly half as many training examples as YNR, so training should
#     go faster while still leaving enough examples to train good models;
#   * it is one of the stations with a UUSS correction, so it is already used
#     for magnitude calculations and its amplitudes should be good quality.
# NOTE(review): the original note quotes 1433 training examples, but the
# recorded output of this cell reports X shape (1676, 45) -- confirm the count.
stat = "YUF"
(station_feature_dict_rel,
 station_meta_dict_rel,
 feature_names_rel) = proc.process_station_datasets(
    stat,
    all_train_df,
    scaler=False,
    linear_model=False,
    source_dist_type='dist',
)

YUF
X shape: (1676, 45), y shape: (1676,)


In [5]:
# Show the feature names returned for this station; later cells index this
# array with column indices of X, so the order here is the column order.
feature_names_rel

array(['amp_ratio_1', 'amp_ratio_2', 'amp_ratio_3', 'amp_ratio_4',
       'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7', 'amp_ratio_8',
       'amp_ratio_9', 'amp_ratio_10', 'amp_ratio_11', 'amp_ratio_12',
       'amp_ratio_13', 'amp_ratio_14', 'amp_ratio_15', 'amp_ratio_16',
       'amp_ratio_17', 'amp_ratio_18', 'amp_1', 'amp_2', 'amp_3', 'amp_4',
       'amp_5', 'amp_6', 'amp_7', 'amp_8', 'amp_9', 'amp_10', 'amp_11',
       'amp_12', 'amp_13', 'amp_14', 'amp_15', 'amp_16', 'amp_17',
       'amp_18', 'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km',
       'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_deg'], dtype='<U32')

In [6]:
# Pull the training features and training targets out of the station dicts.
X_rel, y_rel = (station_feature_dict_rel['X_train'],
                station_meta_dict_rel['y_train'])

# Use RFECV to select the number of features (N) with different estimator models

In [7]:
### Set Parameters ###
# Random state passed to the cross-validation routines
cv_random_state = 2652124
# Scoring method name used in RFECV and GridCV
scoring_method = 'r2'
# Function used to evaluate the model's performance on the test data
score_func = r2_score
# True when a larger score_func value means a better model
larger_score_is_better = True
# Number of jobs RFECV and GridCV may use
n_jobs = 10
# Outer CV: number of folds
cv_folds_outer = 10
# Outer CV: number of times it is repeated
n_outer_repeats = 1
# Inner CV (used for hyperparameter tuning): number of folds
cv_folds_inner = 5
# Final hyperparameter grid search: number of folds
cv_folds_hp = 10
# Run the grid search over all features
run_gridsearchcv_all = True

### Intrinsic feature selection information ###
# The intrinsic filter function takes X, y, a list of np arrays containing the
# indices of the features to filter, and K, the number of features to select.
# In the recorded feature_names_rel output, columns 0-17 are amp_ratio_1..18
# and columns 18-35 are amp_1..18; these are the two groups that get filtered.
if_feat_inds = [np.arange(18), np.arange(18, 36)]
# K: number of features to select
if_K = 5
#####################

In [8]:
# One seed shared by both tree ensembles
estimators_seed = 2652129
# Name of the file (inside outpath) that receives the RFECV results
outfile = 'rfecv.relDist.trees.json'

# Candidate models, keyed by a short label
models = dict(
    RF=RandomForestRegressor(random_state=estimators_seed),
    GBT=GradientBoostingRegressor(random_state=estimators_seed),
)
# Passed as the scaler flag for both the estimator and the predictor
scaler = False

# Hyperparameter grids, keyed by the same labels as `models`.
# NOTE(review): the "m__" prefix presumably addresses a pipeline step named "m"
# that the project CV helpers build -- confirm.
param_grids = {
    "RF": {
        "m__max_features": [1, 4, 6],
        "m__n_estimators": [100, 500, 1000],
    },
    "GBT": {
        "m__n_estimators": [100, 500, 1000],
        "m__max_depth": [1, 3, 5],
    },
}


In [9]:
# Run the custom RFECV once per candidate model and save all results to
# `outfile` inside `outpath`. Results are keyed by the model's class name
# (e.g. "RandomForestRegressor"), which is the key later cells look up.

# Fail fast: make sure the output directory exists *before* the long-running
# feature selection, so a missing directory cannot cost the finished results
# when the file is opened for writing at the end.
os.makedirs(outpath, exist_ok=True)

estimator_rfe_results_tuned = {}
for key, model in models.items():
    params = param_grids[key]
    mname = type(model).__name__
    # Pass two independent, unfitted copies rather than the same instance twice,
    # so the two model arguments can never share fitted state and the template
    # in `models` stays untouched (the final-N cell clones in the same way).
    # NOTE(review): the 3rd/4th positional arguments are presumably the
    # estimator model and the predictor model -- confirm against do_rfecv.
    feat_selection_results = CustomRFECVUpdate.do_rfecv(
        X_rel,
        y_rel,
        clone(model),
        clone(model),
        params,
        estimator_params_grid=params,
        estimator_scaler=scaler,
        predictor_scaler=scaler,
        score_func=score_func,
        scoring_method=scoring_method,
        n_jobs=n_jobs,
        cv_folds_outer=cv_folds_outer,
        cv_folds_inner=cv_folds_inner,
        n_outer_repeats=n_outer_repeats,
        cv_random_state=cv_random_state,
        larger_score_is_better=larger_score_is_better,
        # Mutual-information pre-filter applied to the configured feature groups
        intrinsic_filter_func=ifs.MI_filter_func,
        feature_inds_to_filter=if_feat_inds,
        intrinsic_filter_K=if_K,
    )
    estimator_rfe_results_tuned[mname] = feat_selection_results

# NumpyEncoder is the project encoder that lets json serialise the results
with open(os.path.join(outpath, outfile), 'w') as fp:
    json.dump(estimator_rfe_results_tuned, fp, indent=4, cls=NumpyEncoder)

reducing features to 19
Using {'m__max_features': 6, 'm__n_estimators': 500} for the estimator model
RandomForestRegressor(max_features=6, n_estimators=500, random_state=2652129)
Fold 0: N=11, test_score=0.902
reducing features to 19
Using {'m__max_features': 6, 'm__n_estimators': 500} for the estimator model
RandomForestRegressor(max_features=6, n_estimators=500, random_state=2652129)
Fold 1: N=12, test_score=0.894
reducing features to 19
Using {'m__max_features': 6, 'm__n_estimators': 1000} for the estimator model
RandomForestRegressor(max_features=6, n_estimators=1000, random_state=2652129)
Fold 2: N=10, test_score=0.893
reducing features to 19
Using {'m__max_features': 6, 'm__n_estimators': 1000} for the estimator model
RandomForestRegressor(max_features=6, n_estimators=1000, random_state=2652129)
Fold 3: N=9, test_score=0.899
reducing features to 19
Using {'m__max_features': 6, 'm__n_estimators': 1000} for the estimator model
RandomForestRegressor(max_features=6, n_estimators=1000

## Train a model using the selected number of features on the full training set

In [10]:
# Read the RFECV results back from disk; from here on the notebook works with
# the JSON-decoded copy instead of the in-memory objects.
rfecv_results_path = os.path.join(outpath, outfile)
with open(rfecv_results_path) as fp:
    estimator_rfe_results_tuned = json.load(fp)

In [11]:
# Apply the MI filter to the entire training set, with the same feature groups
# and K that were used inside CV.
mi_selected_features, mi_filtered_feature_inds = ifs.MI_filter_func(
    X_rel, y_rel, if_feat_inds, if_K
)
# Names of the features selected from the first and from the second group
print(*(feature_names_rel[mi_selected_features[group]] for group in (0, 1)))
# Keep only the columns that passed the filter
X_mi = X_rel[:, mi_filtered_feature_inds]
print(X_mi.shape)

['amp_ratio_7' 'amp_ratio_6' 'amp_ratio_4' 'amp_ratio_5' 'amp_ratio_2'] ['amp_1' 'amp_2' 'amp_4' 'amp_3' 'amp_7']
(1676, 19)


In [12]:
# Names of every column kept after the MI filter, in the same order as the
# index array used to build X_mi.
feature_names_rel[mi_filtered_feature_inds]

array(['amp_ratio_2', 'amp_ratio_4', 'amp_ratio_5', 'amp_ratio_6',
       'amp_ratio_7', 'amp_1', 'amp_2', 'amp_3', 'amp_4', 'amp_7',
       'signal_dominant_frequency', 'signal_dominant_amplitude',
       'noise_max_amplitude', 'signal_max_amplitude', 'signal_variance',
       'noise_variance', 'source_depth_km',
       'source_receiver_distance_logkm',
       'source_receiver_back_azimuth_deg'], dtype='<U32')

In [13]:
# Choose the final number of features for each tree model on the full
# (MI-filtered) training set, starting from the saved RFECV results.
# If this is not rerun whenever the feature selection is rerun, the results
# may be slightly different.

full_estimator_rfe_results_rel_tuned = {}
for key, model in models.items():
    params = param_grids[key]
    mname = type(model).__name__
    print(mname, 'Estimator Scaler:', scaler)

    # Independent, unfitted copies for the predictor and estimator roles
    predictor_model = clone(model)
    estimator_model = clone(model)

    # Settings that both grid searches have in common
    shared_cv_kwargs = dict(
        model_scaler=scaler,
        scoring_method=scoring_method,
        n_jobs=n_jobs,
        cv_random_state=cv_random_state,
        refit_model=False,
    )

    # Grid search for the predictor model.
    # NOTE(review): this uses cv_folds_outer, although the parameter cell defines
    # cv_folds_hp for the final hyperparameter grid search (both are currently
    # 10) -- confirm which one is intended.
    hp_grid_search, hp_cv = CrossValidation.setup_cv(
        predictor_model,
        params,
        cv_folds=cv_folds_outer,
        **shared_cv_kwargs,
    )

    # RFECV results stored for this model (keyed by class name)
    rfecv_results_dict = estimator_rfe_results_tuned[mname]

    # Grid search for the estimator model, on the inner-CV fold count
    estimator_grid_search, estimator_cv = CrossValidation.setup_cv(
        estimator_model,
        params,
        cv_folds=cv_folds_inner,
        **shared_cv_kwargs,
    )

    N_results = CustomRFECVUpdate.get_final_N_features_estimator_tuning(
        X_mi,
        y_rel,
        rfecv_results_dict,
        estimator_model,
        scaler,
        estimator_grid_search,
        hp_grid_search,
        filtered_feat_inds=mi_filtered_feature_inds,
    )
    full_estimator_rfe_results_rel_tuned[mname] = N_results

RandomForestRegressor Estimator Scaler: False
Using {'m__max_features': 6, 'm__n_estimators': 1000} for the estimator model
RandomForestRegressor(max_features=6, n_estimators=1000, random_state=2652129)
{'best': 12, 'oste': 5}
[17  5  6  7 16  8 14 18 10  9 13  0]


12: CV Mean: 0.89, CV STD: 0.01
[17  5  6  7 16]
5: CV Mean: 0.86, CV STD: 0.02
GradientBoostingRegressor Estimator Scaler: False
Using {'m__max_depth': 1, 'm__n_estimators': 1000} for the estimator model
GradientBoostingRegressor(max_depth=1, n_estimators=1000, random_state=2652129)
{'best': 18, 'oste': 6}
[17  5  6 16  1  8  4 18  0  9  3  7 14 15 10 11 12  2]
18: CV Mean: 0.91, CV STD: 0.01
[17  5  6 16  1  8]
6: CV Mean: 0.88, CV STD: 0.02


In [14]:
# Save the final-N results for every model into the experiment directory.
full_results_path = os.path.join(outpath, 'rfe.full.relDist.trees.json')
with open(full_results_path, 'w') as fp:
    json.dump(full_estimator_rfe_results_rel_tuned, fp, indent=4, cls=NumpyEncoder)