In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import sys
import os
# make paths above 'notebooks/' visible for local imports.
# +----------------------------------------------------------------------------+
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.features import SelectPFeatures
from src.features import FeaturePlots as fp

In [2]:
# Helper object imported from src.features; its methods are called in the
# cells below to build and then filter the per-station feature arrays.
sf = SelectPFeatures()

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

# Load the data and process each station individually

In [4]:
# Directory holding the train/test feature splits. The original cluster
# location stays the default; set the MAGNITUDE_FEATURE_DIR environment
# variable to run the notebook against a copy of the data stored elsewhere.
data_dir = os.environ.get(
    'MAGNITUDE_FEATURE_DIR',
    '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits')
train = pd.read_csv(f'{data_dir}/p.train.csv')
test = pd.read_csv(f'{data_dir}/p.test.csv')

In [5]:
# Build the per-station feature dictionary and the list of feature names from
# the train/test frames. The call prints each station name and its X/y shapes.
# NOTE(review): scaler=False presumably leaves the features unscaled so that
# scaling can be done inside each CV fold later on - confirm against
# SelectPFeatures.process_station_datasets.
station_feature_dict, feature_names = sf.process_station_datasets(train, test, scaler=False)

YHB
X shape: (2920, 47), y shape: (2920,)
X shape: (734, 47), y shape: (734,)
YDC
X shape: (2509, 47), y shape: (2509,)
X shape: (645, 47), y shape: (645,)
YWB
X shape: (3069, 47), y shape: (3069,)
X shape: (786, 47), y shape: (786,)
MCID
X shape: (2942, 47), y shape: (2942,)
X shape: (771, 47), y shape: (771,)
YHL
X shape: (2739, 47), y shape: (2739,)
X shape: (682, 47), y shape: (682,)
YMR
X shape: (3393, 47), y shape: (3393,)
X shape: (845, 47), y shape: (845,)
YHH
X shape: (4005, 47), y shape: (4005,)
X shape: (1002, 47), y shape: (1002,)
B207
X shape: (1609, 47), y shape: (1609,)
X shape: (380, 47), y shape: (380,)
YPP
X shape: (1334, 47), y shape: (1334,)
X shape: (338, 47), y shape: (338,)
YPM
X shape: (3358, 47), y shape: (3358,)
X shape: (843, 47), y shape: (843,)
YLT
X shape: (1275, 47), y shape: (1275,)
X shape: (291, 47), y shape: (291,)
QLMT
X shape: (792, 47), y shape: (792,)
X shape: (190, 47), y shape: (190,)
H17A
X shape: (527, 47), y shape: (527,)
X shape: (142, 47), 

In [6]:
# Features shared by both candidate feature sets. The two sets differ only in
# the pair of location features appended at the end.
common_feature_subset = [
    'amp_ratio_4', 'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7',
    'amp_1', 'amp_2', 'amp_3', 'amp_4', 'amp_5',
    'signal_variance',
    'signal_dominant_amplitude', 'signal_max_amplitude', 'signal_dominant_frequency',
    'noise_max_amplitude', 'noise_variance',
    'source_depth_km',
]

# "ll": event location given as latitude / longitude.
ll_feature_subset = common_feature_subset + ['source_latitude', 'source_longitude']

# "sr": event location given relative to the receiver (log distance and
# sine of the back azimuth).
sr_feature_subset = common_feature_subset + ['source_receiver_distance_logkm',
                                             'source_receiver_back_azimuth_sine']

In [7]:
# Reduce every station's arrays to the lat/lon feature subset. The call prints
# each station name with the resulting X_train / X_test shapes.
# NOTE(review): the order of the returned ll_feature_names is decided by the
# helper and may differ from the order of ll_feature_subset - confirm.
ll_station_feat_dict, ll_feature_names = sf.filter_station_dict_features(station_feature_dict,
                                                                        feature_names,
                                                                        ll_feature_subset)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [8]:
# Reduce every station's arrays to the source-receiver (distance / back
# azimuth) feature subset. The call prints each station name with the
# resulting X_train / X_test shapes.
# NOTE(review): the order of the returned sr_feature_names is decided by the
# helper and may differ from the order of sr_feature_subset - confirm.
sr_station_feat_dict, sr_feature_names = sf.filter_station_dict_features(station_feature_dict,
                                                                        feature_names,
                                                                        sr_feature_subset)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [9]:
# Sanity check on one station (YHB): apart from the trailing two columns,
# the two filtered training matrices must be identical.
ll_shared_cols = ll_station_feat_dict['YHB']['X_train'][:, :-2]
sr_shared_cols = sr_station_feat_dict['YHB']['X_train'][:, :-2]
assert np.array_equal(ll_shared_cols, sr_shared_cols), \
    'The filtered datasets are not the same, excluding the location columns'

# Recursive Feature Elimination

Do nested CV for Recursive Feature Elimination (RFE) to select the optimal number of features for each station and (I think?) the optimal hyperparameters.  

Do this for both the feature set with lat/lon and the feature set with SR distance and baz. Then use the location parameters that result in the best performance in the cross validation overall (across all stations). (I would rather just use sr-distance and backazimuth).  

Then, use the optimal number of features from the CV to select the features from each station's training dataset. Compare the selected features.  

Followed example from [Machine Learning Mastery](https://machinelearningmastery.com/rfe-feature-selection-in-python/)

# I NEED TO TURN THE SCALER OFF IN THE DATA LOADER IF I AM GOING TO USE CROSS-VALIDATION WITH AN SVR ESTIMATOR BECAUSE SCALING NEEDS TO HAPPEN IN EACH FOLD

In [69]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, KFold, GridSearchCV
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import time

In [None]:
# param_grid = [
#     {'svr__C': [0.1, 1, 10], 'svr__epsilon': [0.1, 1, 10], 'svr__kernel': ['linear']},
#     {'svr__C': [0.1, 1, 10], 'svr__epsilon': [0.1, 1, 10], 'svr__gamma': [0.1, 0.01, 0.001], 'svr__kernel': ['rbf']},
# ]

In [18]:
def run_grid_search(gs, Xtrain, ytrain, Xtest):
    """Fit a search object on the training data and predict the test data.

    Parameters
    ----------
    gs : object with ``fit(X, y)`` whose return value has ``predict(X)``
        In this notebook a ``GridSearchCV`` instance is passed.
    Xtrain, ytrain : training features and targets handed to ``gs.fit``.
    Xtest : features handed to ``predict``.

    Returns
    -------
    tuple
        ``(fitted, predictions)`` where ``fitted`` is whatever ``gs.fit``
        returned and ``predictions`` is ``fitted.predict(Xtest)``.
    """
    fitted = gs.fit(Xtrain, ytrain)
    return fitted, fitted.predict(Xtest)

In [19]:
def get_gs_best_results(gs_results):
    """Pull the summary of the winning candidate out of a fitted search.

    Parameters
    ----------
    gs_results : fitted search object exposing ``best_score_``,
        ``best_index_``, ``best_params_`` and a ``cv_results_`` mapping with a
        ``'std_test_score'`` entry.

    Returns
    -------
    tuple
        ``(cv_mean, cv_std, params)``: the best score, the
        ``'std_test_score'`` value at ``best_index_``, and the best
        parameter setting.
    """
    std_per_candidate = gs_results.cv_results_['std_test_score']
    return (gs_results.best_score_,
            std_per_candidate[gs_results.best_index_],
            gs_results.best_params_)

In [72]:
# Nested cross-validation for one station.
# For every outer fold:
#   1. RFECV (inner CV) picks the feature subset using `estimator_model`.
#   2. GridSearchCV (inner CV) tunes `model` on ALL features and scores the
#      refit best model on the held-out outer fold.
#   3. GridSearchCV tunes `model` on the SELECTED features only and scores the
#      refit best model on the same held-out outer fold.
# The per-fold results are collected in the `outer_*` lists below.

### Set Parameters ###
# Seed shared by the CV splitters and the importance estimator so that
# re-running the cell reproduces the same selected features
random_seed = 1
# Feature dict to use
feat_dict = feat_dict_default = ll_station_feat_dict
# Station whose training split is analysed
station = 'YNR'
# Training data to use (read from feat_dict so that switching the feature set
# only requires changing the line above)
X = feat_dict[station]['X_train']
y = feat_dict[station]['y_train']
# The model used to estimate feature importance. Tree construction is
# randomized, so it is seeded; otherwise RFECV keeps a different subset on
# every run.
estimator_model = DecisionTreeRegressor(random_state=random_seed)
# Boolean - True if data for estimator_model needs to be scaled
estimator_scaler = False
# The main model to fit
model = SVR()
# Boolean - True if data for model needs to be scaled
model_scaler = True
# The scoring method to use in RFECV and GridCV
scoring_method = 'r2'
# The function used to evaluate the performance on the test model
score_func = r2_score
# The parameter space for GridCV to search ('m' is the model's pipeline step)
param_grid = [{'m__C': [0.1, 1, 10],
               'm__epsilon': [0.1, 1, 10],
               'm__gamma': [0.1, 0.01, 0.001],
               'm__kernel': ['rbf']}]
# The number of jobs for RFECV and GridCV to use
n_jobs = 5
# number of folds for outer CV
cv_folds_outer = 10
# number of folds for inner cv
cv_folds_inner = 5
#####################

cv_outer = KFold(n_splits=cv_folds_outer, shuffle=True, random_state=random_seed)
cv_inner = KFold(n_splits=cv_folds_inner, shuffle=True, random_state=random_seed)


def get_step_importances(fitted_pipe):
    """Return the per-feature importances of the fitted 'm' pipeline step.

    Used as RFECV's ``importance_getter``. It looks at the *fitted* step, so
    it also works for estimators that only create ``coef_`` during ``fit``.

    Raises
    ------
    ValueError
        If the fitted step has neither ``feature_importances_`` nor ``coef_``.
    """
    fitted_model = fitted_pipe.named_steps['m']
    for attr in ('feature_importances_', 'coef_'):
        if hasattr(fitted_model, attr):
            return getattr(fitted_model, attr)
    raise ValueError('estimator_model must have coef_ or feature_importances_ attribute')


# The pipelines do not depend on the fold, so they are built once. RFECV and
# GridSearchCV fit clones, so every fold (and every inner fold) still gets
# its own independently fitted scaler.
# Selector pipeline (s_pipe): optional scaling + importance estimator
s_steps = [('scaler', StandardScaler())] if estimator_scaler else []
s_steps.append(('m', estimator_model))
s_pipe = Pipeline(s_steps)
# Model pipeline (m_pipe): optional scaling + main model
m_steps = [('scaler', StandardScaler())] if model_scaler else []
m_steps.append(('m', model))
m_pipe = Pipeline(m_steps)

# Settings shared by both grid searches in each fold
search_kwargs = dict(param_grid=param_grid,
                     scoring=scoring_method,
                     n_jobs=n_jobs,
                     cv=cv_inner,
                     refit=True)

### Lists to store the results of outer loop
# Store results of cross-validation and the best model when
# using all the features - I probably do not need all of these,
# really just the test score for comparison
outer_cv_params_all = []
outer_cv_mean_all = []
outer_cv_std_all = []
outer_test_score_all = []

# Store the CV results and the best model when using
# the selected subset of features
outer_cv_params_best = []
outer_cv_mean_best = []
outer_cv_std_best = []
outer_test_score_best = []

# Store the boolean array of features that were kept
outer_kept_feats = []
# Store the number of kept features
outer_n_feats = []

start_outer = time.time()

for i, (train_ix, test_ix) in enumerate(cv_outer.split(X)):
    start_inner = time.time()
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    # Do RFECV to select the optimal number of features
    rfe = RFECV(s_pipe,
                cv=cv_inner,
                scoring=scoring_method,
                n_jobs=n_jobs,
                importance_getter=get_step_importances)
    rfe.fit(X_train, y_train)

    # Get the best features from the RFECV
    n_feats = rfe.n_features_
    best_feats = rfe.support_

    outer_kept_feats.append(best_feats)
    outer_n_feats.append(n_feats)

    # Do model param. grid search when using all features
    search_all = GridSearchCV(m_pipe, **search_kwargs)
    gs_results_all, yhat_all = run_grid_search(search_all, X_train, y_train, X_test)
    score_all = score_func(y_test, yhat_all)
    outer_test_score_all.append(score_all)
    # Could probably remove the next 4 lines... I don't think I really need to save these
    cv_mean_all, cv_std_all, params_all = get_gs_best_results(gs_results_all)
    outer_cv_mean_all.append(cv_mean_all)
    outer_cv_std_all.append(cv_std_all)
    outer_cv_params_all.append(params_all)

    # Do GridCV using the optimal number of features. A separate search
    # object and separate arrays are used so the all-features results and the
    # full fold data stay intact.
    X_train_best = X_train[:, best_feats]
    X_test_best = X_test[:, best_feats]
    search_best = GridSearchCV(m_pipe, **search_kwargs)
    gs_results_best, yhat_best = run_grid_search(search_best, X_train_best, y_train, X_test_best)
    score_best = score_func(y_test, yhat_best)
    outer_test_score_best.append(score_best)

    cv_mean_best, cv_std_best, params_best = get_gs_best_results(gs_results_best)
    outer_cv_mean_best.append(cv_mean_best)
    outer_cv_std_best.append(cv_std_best)
    outer_cv_params_best.append(params_best)

    end_inner = time.time()
    print(f'Fold {i}: test score ({n_feats} best feats): {score_best:0.3f}, test score (all feats): {score_all:0.3f}, diff: {(score_best - score_all):0.3f}, time: {end_inner-start_inner:0.2f} s, best model params: {params_best}')

outer_time = (time.time() - start_outer)
print(f'Total time: {outer_time:0.2f} s ({outer_time/60:0.2f} min)')

Fold 0: test r^2 (10 best feats): 0.901, test r^2 (all feats): 0.889, diff: 0.012, time: 44.69 s, best model params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
Fold 1: test r^2 (12 best feats): 0.912, test r^2 (all feats): 0.885, diff: 0.027, time: 34.40 s, best model params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
Fold 2: test r^2 (7 best feats): 0.913, test r^2 (all feats): 0.899, diff: 0.013, time: 30.62 s, best model params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}


In [68]:
# Scratch check: is the attribute name that the nested-CV cell points RFECV's
# importance_getter at listed by dir() on the estimator?
'feature_importances_' in dir(estimator_model)

True

In [62]:
# Per outer fold: held-out score with the selected features vs. with all
# features, and the difference between the two.
for i, _ in enumerate(outer_cv_mean_all):
    ts_best, ts_all = outer_test_score_best[i], outer_test_score_all[i]
    diff = ts_best - ts_all
    print(f'test r^2 ({outer_n_feats[i]} best feats): {ts_best:0.3f}, test r^2 (all feats): {ts_all:0.3f}, diff: {diff:0.2f}')

test r^2 (9 best feats): 0.907, test r^2 (all feats): 0.889, diff: 0.02
test r^2 (7 best feats): 0.913, test r^2 (all feats): 0.885, diff: 0.03
test r^2 (7 best feats): 0.913, test r^2 (all feats): 0.899, diff: 0.01
test r^2 (7 best feats): 0.913, test r^2 (all feats): 0.887, diff: 0.03
test r^2 (8 best feats): 0.920, test r^2 (all feats): 0.894, diff: 0.03
test r^2 (8 best feats): 0.891, test r^2 (all feats): 0.868, diff: 0.02
test r^2 (6 best feats): 0.884, test r^2 (all feats): 0.871, diff: 0.01
test r^2 (8 best feats): 0.910, test r^2 (all feats): 0.892, diff: 0.02
test r^2 (11 best feats): 0.902, test r^2 (all feats): 0.875, diff: 0.03
test r^2 (9 best feats): 0.919, test r^2 (all feats): 0.907, diff: 0.01


In [63]:
# Per outer fold: held-out score with the selected features together with the
# hyperparameters chosen by the grid search on those features.
for i, _ in enumerate(outer_cv_mean_all):
    ts_best = outer_test_score_best[i]
    print(f'test r^2 ({outer_n_feats[i]} best feats): {ts_best}, Params: {outer_cv_params_best[i]}')

test r^2 (9 best feats): 0.9072732713057121, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (7 best feats): 0.9126117153482627, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (7 best feats): 0.9125958764634341, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (7 best feats): 0.9129984355627689, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (8 best feats): 0.9199351647214102, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (8 best feats): 0.8913088612400136, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (6 best feats): 0.8842033285542196, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (8 best feats): 0.9101431139899008, Params: {'m__C': 10, 'm__epsilon': 0.1, 'm__gamma': 0.1, 'm__kernel': 'rbf'}
test r^2 (11 best feats)

In [56]:
# Count, for every feature, in how many outer folds RFECV kept it: stack the
# per-fold boolean masks into one 2-D array and sum down the fold axis.
# np.stack raises a ValueError if no folds were recorded.
feat_sum = np.stack(outer_kept_feats).sum(axis=0)

In [60]:
# Table of how often each feature was kept, most frequently kept first.
feature_counts = pd.DataFrame({'Feature': ll_feature_names, 'Cnt': feat_sum})
feature_counts.sort_values(by='Cnt', ascending=False)

Unnamed: 0,Feature,Cnt
4,amp_1,10
5,amp_2,10
15,source_depth_km,10
16,source_latitude,10
17,source_longitude,10
13,signal_variance,10
6,amp_3,9
10,signal_dominant_amplitude,4
0,amp_ratio_4,3
7,amp_4,1
