In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import sys
import os
# Put the parent of the current working directory (the folder above
# 'notebooks/' when the notebook is run from there) on sys.path so that the
# local `src` package can be imported.
module_path = os.path.abspath(os.pardir)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from src.features import SelectPFeatures
from src.features import FeaturePlots as fp

In [2]:
# Helper imported from src.features; its methods are used below to build the
# per-station datasets and to filter them down to a chosen feature subset.
sf = SelectPFeatures()

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

# Load the data and process each station individually

In [4]:
# Directory holding the train/test feature CSVs. The default is the original
# cluster location; set the MAGNITUDE_FEATURE_SPLITS_DIR environment variable
# to run the notebook against a copy of the data stored somewhere else.
data_dir = os.environ.get(
    'MAGNITUDE_FEATURE_SPLITS_DIR',
    '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits',
)
train = pd.read_csv(os.path.join(data_dir, 'p.train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'p.test.csv'))

In [5]:
# Build the per-station datasets from the train and test frames. The call
# returns the station container (presumably keyed by station code - verify in
# src.features) together with the feature names, and prints the X/y shapes
# for each station as it goes.
station_feature_dict, feature_names = sf.process_station_datasets(train, test)

YHB
X shape: (2920, 47), y shape: (2920,)
X shape: (734, 47), y shape: (734,)
YDC
X shape: (2509, 47), y shape: (2509,)
X shape: (645, 47), y shape: (645,)
YWB
X shape: (3069, 47), y shape: (3069,)
X shape: (786, 47), y shape: (786,)
MCID
X shape: (2942, 47), y shape: (2942,)
X shape: (771, 47), y shape: (771,)
YHL
X shape: (2739, 47), y shape: (2739,)
X shape: (682, 47), y shape: (682,)
YMR
X shape: (3393, 47), y shape: (3393,)
X shape: (845, 47), y shape: (845,)
YHH
X shape: (4005, 47), y shape: (4005,)
X shape: (1002, 47), y shape: (1002,)
B207
X shape: (1609, 47), y shape: (1609,)
X shape: (380, 47), y shape: (380,)
YPP
X shape: (1334, 47), y shape: (1334,)
X shape: (338, 47), y shape: (338,)
YPM
X shape: (3358, 47), y shape: (3358,)
X shape: (843, 47), y shape: (843,)
YLT
X shape: (1275, 47), y shape: (1275,)
X shape: (291, 47), y shape: (291,)
QLMT
X shape: (792, 47), y shape: (792,)
X shape: (190, 47), y shape: (190,)
H17A
X shape: (527, 47), y shape: (527,)
X shape: (142, 47), 

In [6]:
# Features shared by both candidate feature sets. Defining them once keeps the
# two subsets from drifting apart; only the two trailing location descriptors
# differ between them.
common_feature_subset = [
    'amp_ratio_4', 'amp_ratio_5', 'amp_ratio_6', 'amp_ratio_7',
    'amp_1', 'amp_2', 'amp_3', 'amp_4', 'amp_5',
    'signal_variance',
    'signal_dominant_amplitude', 'signal_max_amplitude', 'signal_dominant_frequency',
    'noise_max_amplitude', 'noise_variance',
    'source_depth_km',
]

# "ll": location expressed through the source latitude/longitude columns.
ll_feature_subset = common_feature_subset + [
    'source_latitude',
    'source_longitude',
]

# "sr": location expressed through the source-receiver distance and
# back-azimuth columns.
sr_feature_subset = common_feature_subset + [
    'source_receiver_distance_logkm',
    'source_receiver_back_azimuth_sine',
]

In [7]:
# Restrict each station's data to the latitude/longitude feature subset; the
# call prints the resulting X_train / X_test shapes per station.
ll_station_feat_dict = sf.filter_station_dict_features(
    station_feature_dict, feature_names, ll_feature_subset
)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [10]:
# Restrict each station's data to the source-receiver distance / back-azimuth
# feature subset; the call prints the resulting X_train / X_test shapes per
# station.
sr_station_feat_dict = sf.filter_station_dict_features(
    station_feature_dict, feature_names, sr_feature_subset
)

YHB
X_train: (2920, 18), X_test: (734, 18)
YDC
X_train: (2509, 18), X_test: (645, 18)
YWB
X_train: (3069, 18), X_test: (786, 18)
MCID
X_train: (2942, 18), X_test: (771, 18)
YHL
X_train: (2739, 18), X_test: (682, 18)
YMR
X_train: (3393, 18), X_test: (845, 18)
YHH
X_train: (4005, 18), X_test: (1002, 18)
B207
X_train: (1609, 18), X_test: (380, 18)
YPP
X_train: (1334, 18), X_test: (338, 18)
YPM
X_train: (3358, 18), X_test: (843, 18)
YLT
X_train: (1275, 18), X_test: (291, 18)
QLMT
X_train: (792, 18), X_test: (190, 18)
H17A
X_train: (527, 18), X_test: (142, 18)
B208
X_train: (526, 18), X_test: (134, 18)
LKWY
X_train: (1016, 18), X_test: (278, 18)
FLWY
X_train: (694, 18), X_test: (177, 18)
YGC
X_train: (1725, 18), X_test: (451, 18)
TPMT
X_train: (408, 18), X_test: (97, 18)
YMC
X_train: (3553, 18), X_test: (898, 18)
YML
X_train: (2678, 18), X_test: (638, 18)
B206
X_train: (671, 18), X_test: (175, 18)
B944
X_train: (360, 18), X_test: (95, 18)
YLA
X_train: (681, 18), X_test: (169, 18)
YUF
X_trai

In [13]:
# Sanity check on station YHB: the two feature subsets differ only in their
# last two (location) columns, so every other column of the filtered training
# matrices must be identical.
assert np.array_equal(
    ll_station_feat_dict['YHB']['X_train'][:, :-2],
    sr_station_feat_dict['YHB']['X_train'][:, :-2],
), 'The filtered datasets are not the same, excluding the location columns'

Run nested cross-validation (CV) with Recursive Feature Elimination (RFE) to select the optimal number of features for each station and, possibly, the optimal hyperparameters as well (TODO: confirm whether the nested CV also tunes the hyperparameters).  

Do this for both the feature set with latitude/longitude and the feature set with source-receiver (SR) distance and back azimuth (baz). Then use the location parameters that give the best overall cross-validation performance across all stations. (I would rather just use SR distance and back azimuth.)  

Then, use the optimal number of features from the CV to select the features from each station's training dataset. Compare the selected features across stations.  