In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
# Local packages (e.g. `src`) live one directory above this notebook, so the
# parent directory has to be on sys.path before they can be imported.
module_path = os.path.abspath(os.path.join('..'))
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from src.features import SelectPFeatures
from src.features import FeaturePlots as fp

In [2]:
# Helper object; its methods are used below to split the data by station
# and to filter each station's feature columns.
sf = SelectPFeatures()

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

# Load the data and process each station individually

In [4]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this directory is mounted; consider making it configurable.
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'
# Pre-split train and test feature tables (one CSV each).
train = pd.read_csv(f'{data_dir}/p.train.csv')
test = pd.read_csv(f'{data_dir}/p.test.csv')

In [5]:
# Build the per-station datasets (the call prints X/y shapes for each station).
# scaler=False: per the note in the RFE section, scaling has to happen inside
# each cross-validation fold rather than in the data loader.
station_feature_dict, feature_names = sf.process_station_datasets(train, test, scaler=False)

YHB
X shape: (2920, 47), y shape: (2920,)
X shape: (734, 47), y shape: (734,)
YDC
X shape: (2509, 47), y shape: (2509,)
X shape: (645, 47), y shape: (645,)
YWB
X shape: (3069, 47), y shape: (3069,)
X shape: (786, 47), y shape: (786,)
MCID
X shape: (2942, 47), y shape: (2942,)
X shape: (771, 47), y shape: (771,)
YHL
X shape: (2739, 47), y shape: (2739,)
X shape: (682, 47), y shape: (682,)
YMR
X shape: (3393, 47), y shape: (3393,)
X shape: (845, 47), y shape: (845,)
YHH
X shape: (4005, 47), y shape: (4005,)
X shape: (1002, 47), y shape: (1002,)
B207
X shape: (1609, 47), y shape: (1609,)
X shape: (380, 47), y shape: (380,)
YPP
X shape: (1334, 47), y shape: (1334,)
X shape: (338, 47), y shape: (338,)
YPM
X shape: (3358, 47), y shape: (3358,)
X shape: (843, 47), y shape: (843,)
YLT
X shape: (1275, 47), y shape: (1275,)
X shape: (291, 47), y shape: (291,)
QLMT
X shape: (792, 47), y shape: (792,)
X shape: (190, 47), y shape: (190,)
H17A
X shape: (527, 47), y shape: (527,)
X shape: (142, 47), 

In [6]:
# Features shared by both candidate sets; only the two location features
# appended at the end differ between them.
common_feature_subset = [
    'amp_ratio_4', 'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7',
    'amp_1', 'amp_2', 'amp_3', 'amp_4', 'amp_5',
    'signal_variance',
    'signal_dominant_amplitude', 'signal_max_amplitude', 'signal_dominant_frequency',
    'noise_max_amplitude', 'noise_variance',
    'source_depth_km',
]

# "ll" set: location described by source latitude / longitude.
ll_feature_subset = common_feature_subset + ['source_latitude',
                                             'source_longitude']

# "sr" set: location described by source-receiver distance and back azimuth.
sr_feature_subset = common_feature_subset + ['source_receiver_distance_logkm',
                                             'source_receiver_back_azimuth_sine']

In [7]:
# Keep only the lat/lon feature subset for every station
# (the call prints the resulting X_train / X_test shapes per station).
ll_station_feat_dict, ll_feature_names = sf.filter_station_dict_features(station_feature_dict,
                                                                        feature_names,
                                                                        ll_feature_subset)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [8]:
# Keep only the source-receiver distance / back-azimuth feature subset for every station
# (the call prints the resulting X_train / X_test shapes per station).
sr_station_feat_dict, sr_feature_names = sf.filter_station_dict_features(station_feature_dict,
                                                                        feature_names,
                                                                        sr_feature_subset)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [9]:
# Spot check on one station (YHB, training set only): the two filtered
# datasets must agree on every column except the last two, which hold the
# location features that differ between the subsets.
assert np.array_equal(ll_station_feat_dict['YHB']['X_train'][:, 0:-2], sr_station_feat_dict['YHB']['X_train'][:, 0:-2]),\
'The filtered datasets are not the same, excluding the location columns'

# Recursive Feature Elimination

Use nested cross-validation (CV) with Recursive Feature Elimination (RFE) to select the optimal number of features for each station and (I think?) the optimal hyperparameters as well.  

Do this for both the feature set with lat/lon and the feature set with source-receiver (SR) distance and back azimuth (baz). Then use the location parameters that give the best cross-validation performance overall (across all stations). (I would rather just use SR distance and back azimuth.)  

Then, use the optimal number of features from the CV to select the features from each station's training dataset. Compare the selected features.  

Followed the example from [Machine Learning Mastery](https://machinelearningmastery.com/rfe-feature-selection-in-python/)

# I NEED TO TURN THE SCALER OFF IN THE DATA LOADER IF I AM GOING TO USE CROSS-VALIDATION WITH AN SVR ESTIMATOR BECAUSE SCALING NEEDS TO HAPPEN IN EACH FOLD

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, KFold, GridSearchCV
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

In [11]:
# Single-station trial of RFECV: station YNR, lat/lon feature set.
X = ll_station_feat_dict['YNR']['X_train']
y = ll_station_feat_dict['YNR']['y_train']

# A decision tree permutes the candidate features at every split, so without a
# fixed random_state the selected features can change from run to run.
# Seeding it makes this cell reproducible under "Restart & Run All".
rfe = RFECV(DecisionTreeRegressor(random_state=1),
            scoring='r2',
            n_jobs=1)
rfe.fit(X, y)

In [12]:
# Cross-validation scores recorded by RFECV (mean, std and per-split arrays).
rfe.cv_results_

{'mean_test_score': array([-0.2043669 , -0.23778897,  0.53216759,  0.56470719,  0.62311772,
         0.6496774 ,  0.67101865,  0.66055609,  0.65927265,  0.65082419,
         0.6568674 ,  0.64698696,  0.64826863,  0.64708842,  0.64608246,
         0.65633823,  0.65509187,  0.64751194]),
 'std_test_score': array([0.13428017, 0.17541945, 0.02311186, 0.09945653, 0.06757163,
        0.03120139, 0.03812313, 0.04906294, 0.03516151, 0.04049811,
        0.03028124, 0.0323639 , 0.03635343, 0.0463777 , 0.0565227 ,
        0.0452284 , 0.04478739, 0.04728891]),
 'split0_test_score': array([0.0163012 , 0.06265244, 0.51471605, 0.49137295, 0.65986513,
        0.66596008, 0.66043302, 0.67415989, 0.67452917, 0.650914  ,
        0.64408208, 0.62062556, 0.64483674, 0.6414857 , 0.65764902,
        0.6742302 , 0.6719316 , 0.65453748]),
 'split1_test_score': array([-0.20952143, -0.15267227,  0.5333484 ,  0.47403625,  0.53657878,
         0.60375033,  0.61608884,  0.58375125,  0.61392172,  0.62857413,
       

In [13]:
# Number of features RFECV selected.
rfe.n_features_

7

In [14]:
# Names of the features RFECV kept (rfe.support_ is used as a mask over the names).
ll_feature_names[rfe.support_]

array(['amp_1', 'amp_2', 'amp_3', 'signal_variance', 'source_depth_km',
       'source_latitude', 'source_longitude'], dtype='<U33')

In [21]:
# Re-run plain RFE with the feature count found by RFECV above (7) to check
# that it selects the same features. The tree is seeded so the comparison is
# reproducible; an unseeded tree can rank features differently on each fit.
rfe = RFE(DecisionTreeRegressor(random_state=1), n_features_to_select=7)
rfe.fit(X, y)

In [22]:
# Names of the features kept by the fixed-size RFE fit.
ll_feature_names[rfe.support_]

array(['amp_1', 'amp_2', 'amp_3', 'signal_variance', 'source_depth_km',
       'source_latitude', 'source_longitude'], dtype='<U33')

In [24]:
# Shape of the training matrix after keeping only the selected feature columns.
X[:, rfe.support_].shape

(4009, 7)

In [63]:
def run_grid_search(gs, Xtrain, ytrain, Xtest):
    """Fit a search object on the training data and predict the test data.

    Parameters
    ----------
    gs : object with ``fit(X, y)`` returning a fitted object that has ``predict(X)``
        (e.g. a scikit-learn ``GridSearchCV``).
    Xtrain, ytrain : training features and targets passed to ``fit``.
    Xtest : features passed to ``predict`` on the fitted object.

    Returns
    -------
    tuple
        ``(fitted, predictions)`` -- whatever ``gs.fit`` returned, and its
        predictions for ``Xtest``.
    """
    fitted = gs.fit(Xtrain, ytrain)
    return fitted, fitted.predict(Xtest)

In [None]:
def get_gs_best_results(gs_results):
    """Summarise the best candidate of a fitted grid search.

    Parameters
    ----------
    gs_results : fitted search object exposing ``best_score_``, ``best_index_``,
        ``best_params_`` and ``cv_results_['std_test_score']``.

    Returns
    -------
    tuple
        ``(cv_mean, cv_std, params)``: the best score, the entry of
        ``std_test_score`` at ``best_index_``, and the best parameters.
    """
    best_row = gs_results.best_index_
    std_per_candidate = gs_results.cv_results_['std_test_score']
    return (gs_results.best_score_,
            std_per_candidate[best_row],
            gs_results.best_params_)

In [None]:
# Nested cross-validation for one station (YNR, lat/lon feature set).
#   outer loop : holds out a fold that is only used to score the tuned models
#   inner loop : RFECV chooses the features, GridSearchCV tunes the model
X = ll_station_feat_dict['YNR']['X_train']
y = ll_station_feat_dict['YNR']['y_train']

# --- Settings ----------------------------------------------------------------
# These names were referenced by the loop but never defined. The values mirror
# the single-station experiments in this notebook (tree-based RFECV, SVR grid).
estimator_model = DecisionTreeRegressor(random_state=1)  # ranks features inside RFECV
s_scaler = False      # no scaling step in the selector pipeline for the tree
model = SVR()         # model whose hyperparameters are tuned
m_scaler = True       # scale inside the model pipeline so it is refit per fold
scoring_method = 'r2'
score_func = r2_score
n_jobs = 1
# RFECV cannot locate feature importances on a Pipeline by itself; point it at
# the final step (named 'm'). Use 'named_steps.m.coef_' for a linear estimator.
importance_getter = 'named_steps.m.feature_importances_'
# The model step is named 'm', so the grid keys need the 'm__' prefix.
param_grid = [
    {'m__C': [0.1, 1, 10], 'm__epsilon': [0.1, 1, 10], 'm__kernel': ['linear']},
    {'m__C': [0.1, 1, 10], 'm__epsilon': [0.1, 1, 10], 'm__gamma': [0.1, 0.01, 0.001], 'm__kernel': ['rbf']},
]

cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)

fold_records = []
for fold, (train_ix, test_ix) in enumerate(cv_outer.split(X)):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    # y is 1-D (shape (n,) in the loader output), so it takes a single index.
    y_train, y_test = y[train_ix], y[test_ix]

    # Selector pipeline: optional scaler + the estimator that ranks features.
    # Keeping the scaler inside the pipeline means every RFECV fold is scaled
    # independently.
    s_steps = [('scaler', StandardScaler())] if s_scaler else []
    s_steps.append(('m', estimator_model))
    s_pipe = Pipeline(steps=s_steps)

    # Model pipeline: optional scaler + the model tuned by the grid search.
    m_steps = [('scaler', StandardScaler())] if m_scaler else []
    m_steps.append(('m', model))
    m_pipe = Pipeline(steps=m_steps)

    # RFECV selects the optimal number of features on the outer-training data.
    rfe = RFECV(s_pipe,
                cv=cv_inner,
                scoring=scoring_method,
                n_jobs=n_jobs,
                importance_getter=importance_getter)
    rfe.fit(X_train, y_train)

    n_feats = rfe.n_features_
    best_feats = rfe.support_

    # Two separate search objects: GridSearchCV.fit returns the object itself,
    # so reusing one instance would overwrite the all-feature results.
    search_kwargs = dict(param_grid=param_grid,
                         scoring=scoring_method,
                         n_jobs=n_jobs,
                         cv=cv_inner,
                         refit=True)
    search_all = GridSearchCV(m_pipe, **search_kwargs)
    search_best = GridSearchCV(m_pipe, **search_kwargs)

    # Grid search over all features.
    gs_results_all, yhat_all = run_grid_search(search_all, X_train, y_train, X_test)
    score_all = score_func(y_test, yhat_all)

    # Grid search using only the RFECV-selected features. New names are used so
    # the full-width fold matrices stay available.
    X_train_best = X_train[:, best_feats]
    X_test_best = X_test[:, best_feats]
    cv_results_best, yhat_best = run_grid_search(search_best, X_train_best, y_train, X_test_best)
    score_best = score_func(y_test, yhat_best)

    fold_records.append({
        'fold': fold,
        'n_feats': n_feats,
        'selected_features': list(ll_feature_names[best_feats]),
        'score_all': score_all,
        'params_all': gs_results_all.best_params_,
        'score_best': score_best,
        'params_best': cv_results_best.best_params_,
    })

# One row per outer fold.
nested_cv_results = pd.DataFrame(fold_records)
nested_cv_results






In [47]:
## Outer loop cv settings
# (placeholder -- nothing configured here yet)

## Inner loop settings
### Hyperparameter space, the CV procedure used to score it, and the grid search
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

linear_grid = {'svr__C': [0.1, 1, 10], 'svr__epsilon': [0.1, 1, 10], 'svr__kernel': ['linear']}
rbf_grid = {'svr__C': [0.1, 1, 10], 'svr__epsilon': [0.1, 1, 10], 'svr__gamma': [0.1, 0.01, 0.001], 'svr__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# Scaling lives inside the pipeline so it is refit on every CV training split.
svr_pipeline = Pipeline(steps=[('scale', StandardScaler()), ('svr', SVR())])
hp_search = GridSearchCV(estimator=svr_pipeline,
                         param_grid=param_grid,
                         scoring='r2',
                         n_jobs=1,
                         cv=cv_inner,
                         refit=True)
hp_search.fit(X, y)
### Set up feature selection

### Planned inner loop pipeline:
#### 1. RFECV - get features + optimal number.
#### 2. Tune model hyperparameters with selected features using CV
#### 3. Train model with selected features and hyperparameters
#### 4. Eval on the fold's test set.


In [48]:
# In-sample predictions: X is the same data hp_search was fitted on, so this is
# a sanity check rather than an estimate of generalization.
hp_search.predict(X)

array([3.01463458, 1.69425173, 0.88448447, ..., 0.94992239, 0.99755112,
       1.442211  ])

In [51]:
# Look at X after the fitted 'scale' step of the best pipeline.
hp_search.best_estimator_.named_steps['scale'].transform(X)

array([[ 0.79096608,  0.18190067,  0.51717547, ...,  0.69088028,
        -2.92798006, -0.98568311],
       [ 0.82000645,  1.37105573,  1.52367719, ..., -0.45802101,
        -0.93311124,  1.01508226],
       [ 0.75317058,  1.07396393,  1.25970456, ..., -0.79109011,
        -0.97002477,  0.99229834],
       ...,
       [ 0.45099418,  0.7000917 ,  0.68199508, ..., -0.95201114,
         0.36655221,  1.12647031],
       [-0.1189456 , -0.00845542, -0.5144648 , ..., -0.8584524 ,
        -0.90235027,  2.06904969],
       [ 0.96811234,  1.07010402,  1.03197437, ..., -0.96698053,
         0.74184004,  0.29781105]])

In [50]:
# Predict through best_estimator_ directly; the output matches hp_search.predict(X).
hp_search.best_estimator_.predict(X)

array([3.01463458, 1.69425173, 0.88448447, ..., 0.94992239, 0.99755112,
       1.442211  ])

In [39]:
# Parameter setting with the best mean cross-validated score.
hp_search.best_params_

{'svr__C': 10, 'svr__epsilon': 0.1, 'svr__gamma': 0.01, 'svr__kernel': 'rbf'}

In [40]:
# Mean cross-validated score (scoring='r2') of the best parameter setting.
hp_search.best_score_

0.8849830404626405

In [43]:
# List the fields available in the grid-search results.
print(hp_search.cv_results_.keys())

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_svr__C', 'param_svr__epsilon', 'param_svr__kernel', 'param_svr__gamma', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])


In [58]:
# Position of the best parameter setting within the cv_results_ arrays.
hp_search.best_index_

28

In [57]:
# Show the cv_results_ row of the best parameter setting. Looking the row up
# through best_index_ (instead of a hard-coded 28) keeps this cell correct if
# the grid or the data changes.
pd.DataFrame(hp_search.cv_results_).iloc[[hp_search.best_index_]]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svr__C,param_svr__epsilon,param_svr__kernel,param_svr__gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
28,0.93399,0.011615,0.372692,0.001778,10,0.1,rbf,0.01,"{'svr__C': 10, 'svr__epsilon': 0.1, 'svr__gamm...",0.883659,0.883507,0.887783,0.884983,0.001981,1


In [61]:
# Standard deviation of the test score across CV splits for the best parameter setting.
hp_search.cv_results_['std_test_score'][hp_search.best_index_]

0.001980586782341556