In [1]:
import pandas as pd
import os
from joblib import load
import sys
import numpy as np
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from copy import deepcopy
import matplotlib.pyplot as plt
from datetime import datetime, timezone
# Local imports (e.g. the `src` package) are resolved from two levels above the
# current working directory, so put that directory on the import path once.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    # Only append when absent so re-running this cell does not add duplicates.
    sys.path.append(module_path)

from src.processing import GatherFeatureDatasets

Removing all or some of the 2017 events (which are likely dominated by the Maple Creek Sequence) does not dramatically change the model performance in either direction.

In [2]:
# Directory holding the pre-split P-arrival feature tables (absolute cluster path).
data_dir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'

# Build the three CSV paths up front; the files are named p.<split>.csv.
split_paths = {name: f'{data_dir}/p.{name}.csv' for name in ('train', 'test', '20230101')}

# Read each split into its own DataFrame: training, test, and the
# "20230101" file that is used as the holdout set below.
all_train_df = pd.read_csv(split_paths['train'])
all_test_df = pd.read_csv(split_paths['test'])
all_holdout_df = pd.read_csv(split_paths['20230101'])

In [3]:
# Station code whose examples are extracted from the full train/test/holdout tables.
stat = "YDC"

# Project-local helper; is_p=True presumably selects P-arrival handling — TODO confirm.
proc = GatherFeatureDatasets(is_p=True)

# Keyword options for the per-station extraction. The built-in scaler is
# switched off here; a StandardScaler is fit explicitly later in the notebook.
dataset_options = dict(
    test_df=all_test_df,
    holdout_df=all_holdout_df,
    scaler=False,
    linear_model=False,
    source_dist_type='dist',
)

# Returns (feature dict, metadata dict, feature names). The keys used later are
# "X_train"/"X_test"/"X_holdout" and "y_train"/"y_test"/"y_holdout".
station_feature_dict, station_meta_dict, feature_names = proc.process_station_datasets(
    stat,
    all_train_df,
    **dataset_options,
)

YDC
X shape: (2928, 45), y shape: (2928,)
X shape: (725, 45), y shape: (725,)
X shape: (29, 45), y shape: (29,)


In [4]:
# Subset of feature columns used to train the station model.
selected_features = np.array(['amp_1', 'amp_2', 'amp_4', 'amp_11','signal_variance',
                    'noise_variance', 'source_depth_km',
                    'source_receiver_distance_logkm',
                    'source_receiver_back_azimuth_deg'])

# np.isin silently ignores requested names that are absent from feature_names,
# which would train the model on fewer columns than intended without any error.
# Fail loudly instead so a typo or a renamed feature is caught here.
missing_features = np.setdiff1d(selected_features, feature_names)
if missing_features.size > 0:
    raise ValueError(
        f"Selected features not found in feature_names: {missing_features.tolist()}"
    )

# Column positions of the selected features, in feature_names order
# (not in the order they are listed in selected_features).
feature_inds = np.where(np.isin(feature_names, selected_features))[0]

In [5]:
# Show the column indices in feature_names that matched the selected features.
feature_inds

array([18, 19, 21, 28, 40, 41, 42, 43, 44])

In [6]:
# Keep only the selected feature columns of the training matrix and pair them
# with the training targets.
selected_X_train = station_feature_dict["X_train"][:, feature_inds]
selected_y_train = station_meta_dict["y_train"]

# Sanity check: the row counts must agree and the column count should equal
# the number of selected features.
print(selected_X_train.shape, selected_y_train.shape, sep="\n")

(2928, 9)
(2928,)


In [7]:
# Fit the standardization on the training features only; the same fitted
# scaler is reused for the test and holdout splits further down.
scaler = StandardScaler().fit(selected_X_train)
X = scaler.transform(selected_X_train)

# RBF-kernel support vector regressor with fixed hyperparameters.
model = SVR(kernel='rbf', C=1.0, gamma=0.1)

In [8]:
# Train the SVR on the standardized training features and their targets.
model.fit(X=X, y=selected_y_train)

In [9]:
# Apply the training-set scaler to the same feature columns of the test and
# holdout splits before predicting.
X_test_scaled = scaler.transform(station_feature_dict["X_test"][:, feature_inds])
X_holdout_scaled = scaler.transform(station_feature_dict["X_holdout"][:, feature_inds])

# NOTE: y_test and y_2023 hold model *predictions*; the reference values they
# are scored against are station_meta_dict["y_test"] and ["y_holdout"].
y_test = model.predict(X_test_scaled)
y_2023 = model.predict(X_holdout_scaled)

In [10]:
# R^2 on the training split (in-sample fit).
r2_score(y_true=selected_y_train, y_pred=model.predict(X))

0.8678993856622711

In [11]:
# R^2 on the test split, using the predictions computed above.
r2_score(y_true=station_meta_dict["y_test"], y_pred=y_test)

0.8185016307350825

In [12]:
# R^2 on the holdout split. A value below zero means the predictions are worse
# than always predicting the mean of the holdout targets.
# NOTE(review): the holdout set appears to be small (the loader printed 29 rows
# for the third dataset — presumably the holdout), so this score may be
# dominated by a few events or a shift in the target range — TODO confirm.
r2_score(y_true=station_meta_dict["y_holdout"], y_pred=y_2023)

-3.178241373540244