In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Directory that holds the pre-split P and S feature tables.
datadir = '/uufs/chpc.utah.edu/common/home/koper-group3/alysha/magnitudes/feature_splits'


def _read_split(filename):
    """Read a single CSV file located in ``datadir`` into a DataFrame."""
    return pd.read_csv(os.path.join(datadir, filename))


# P tables: train, test, and the 20230101 file used as the holdout set.
p_train = _read_split('p.train.csv')
p_test = _read_split('p.test.csv')
p_holdout = _read_split('p.20230101.csv')

# S tables, same three splits.
s_train = _read_split('s.train.csv')
s_test = _read_split('s.test.csv')
s_holdout = _read_split('s.20230101.csv')

In [3]:
# Smallest and largest value of each selected column in the P training split.
(
    p_train
    .loc[:, ["Event-Mean-YPML-S", "source_depth_km", "source_receiver_distance_km"]]
    .describe()
    .loc[['min', 'max'], :]
)

Unnamed: 0,Event-Mean-YPML-S,source_depth_km,source_receiver_distance_km
min,-0.041446,-3.47,0.044877
max,4.332082,23.48,112.392748


In [4]:
# Smallest and largest value of each selected column in the P testing split.
(
    p_test
    .loc[:, ["Event-Mean-YPML-S", "source_depth_km", "source_receiver_distance_km"]]
    .describe()
    .loc[['min', 'max'], :]
)

Unnamed: 0,Event-Mean-YPML-S,source_depth_km,source_receiver_distance_km
min,-0.303969,0.12,0.204153
max,3.446631,18.44,114.788943


In [5]:
# Smallest and largest value of each selected column in the P holdout split.
(
    p_holdout
    .loc[:, ["Event-Mean-YPML-S", "source_depth_km", "source_receiver_distance_km"]]
    .describe()
    .loc[['min', 'max'], :]
)

Unnamed: 0,Event-Mean-YPML-S,source_depth_km,source_receiver_distance_km
min,0.134884,0.81,0.760827
max,3.615056,18.53,112.354244


In [6]:
# Smallest and largest value of each selected column in the S training split.
(
    s_train
    .loc[:, ["Event-Mean-YPML-S", "source_depth_km", "source_receiver_distance_km"]]
    .describe()
    .loc[['min', 'max'], :]
)

Unnamed: 0,Event-Mean-YPML-S,source_depth_km,source_receiver_distance_km
min,-0.041446,-1.94,0.044877
max,4.326333,23.48,83.316436


In [7]:
# Smallest and largest value of each selected column in the S testing split.
(
    s_test
    .loc[:, ["Event-Mean-YPML-S", "source_depth_km", "source_receiver_distance_km"]]
    .describe()
    .loc[['min', 'max'], :]
)

Unnamed: 0,Event-Mean-YPML-S,source_depth_km,source_receiver_distance_km
min,-0.303969,0.54,0.331905
max,3.344134,18.44,63.004114


In [8]:
# Smallest and largest value of each selected column in the S holdout split.
(
    s_holdout
    .loc[:, ["Event-Mean-YPML-S", "source_depth_km", "source_receiver_distance_km"]]
    .describe()
    .loc[['min', 'max'], :]
)

Unnamed: 0,Event-Mean-YPML-S,source_depth_km,source_receiver_distance_km
min,0.134884,0.81,0.760827
max,3.615056,16.59,66.97591


In [9]:
# Number of distinct event identifiers in the P holdout split (shown as a 1-tuple).
pd.unique(p_holdout['event_identifier']).shape

(508,)

In [10]:
# Number of distinct event identifiers in the S holdout split (shown as a 1-tuple).
pd.unique(s_holdout['event_identifier']).shape

(436,)

In [11]:
# Number of distinct Evid values in the P training split (shown as a 1-tuple).
pd.unique(p_train['Evid']).shape

(5989,)

In [12]:
# Number of distinct Evid values in the S training split (shown as a 1-tuple).
pd.unique(s_train['Evid']).shape

(5170,)

In [13]:
# Number of distinct Evid values in the P testing split (shown as a 1-tuple).
pd.unique(p_test['Evid']).shape

(1498,)

In [14]:
# Number of distinct Evid values in the S testing split (shown as a 1-tuple).
pd.unique(s_test['Evid']).shape

(1312,)

In [15]:
# Leakage check: the train, test, and holdout splits must not share any event ID.
# `a.isin(b)` marks each row of `a` whose Evid also occurs in `b`; a single True
# means an event sits in two splits. The earlier form `np.any(~a.isin(b))` only
# required that at least ONE event be absent from the other split, so it passed
# even when the splits overlapped heavily.
assert not p_test.Evid.isin(p_train.Evid).any(), 'Training evid in the testing set'
assert not s_test.Evid.isin(s_train.Evid).any(), 'Training evid in the testing set'
assert not p_holdout.Evid.isin(p_train.Evid).any(), 'Training evid in the holdout set'
assert not s_holdout.Evid.isin(s_train.Evid).any(), 'Training evid in the holdout set'
assert not s_holdout.Evid.isin(s_test.Evid).any(), 'Testing evid in the holdout set'
assert not p_holdout.Evid.isin(p_test.Evid).any(), 'Testing evid in the holdout set'


# IQR of training data

In [16]:
# The file is read without a header row; its single column is labelled 'Evid'.
train_evids_path = os.path.join(datadir, 'evids.train.txt')
train_evids = pd.read_csv(train_evids_path, names=['Evid'])
train_evids.head()

Unnamed: 0,Evid
0,60332307
1,60514627
2,60444567
3,60070317
4,60226492


In [17]:
# Summary statistics over every row of the stacked P and S training tables
# (rows are not de-duplicated by event here).
train_rows = pd.concat([p_train, s_train])
train_rows["Event-Mean-YPML-S"].describe()

count    77769.000000
mean         1.398809
std          0.585878
min         -0.041446
25%          0.986231
50%          1.324647
75%          1.729424
max          4.332082
Name: Event-Mean-YPML-S, dtype: float64

In [18]:
# Keep one row per Evid so each event contributes once, then read off the quartiles.
p_event_values = p_train.drop_duplicates(subset='Evid')["Event-Mean-YPML-S"]
p_event_values.describe().loc[['25%', '75%']]

25%    0.903435
75%    1.614240
Name: Event-Mean-YPML-S, dtype: float64

In [19]:
# Keep one row per Evid so each event contributes once, then read off the quartiles.
s_event_values = s_train.drop_duplicates(subset='Evid')["Event-Mean-YPML-S"]
s_event_values.describe().loc[['25%', '75%']]

25%    0.900963
75%    1.604695
Name: Event-Mean-YPML-S, dtype: float64

# Count events < 0 and > 3.5

In [20]:
# Distinct P-training events whose Event-Mean-YPML-S is below 0.
len(p_train.loc[p_train["Event-Mean-YPML-S"] < 0].drop_duplicates(subset='Evid'))

3

In [21]:
# Distinct P-training events whose Event-Mean-YPML-S is above 3.5.
len(p_train.loc[p_train["Event-Mean-YPML-S"] > 3.5].drop_duplicates(subset='Evid'))

6

In [22]:
# Distinct S-training events whose Event-Mean-YPML-S is below 0.
len(s_train.loc[s_train["Event-Mean-YPML-S"] < 0].drop_duplicates(subset='Evid'))

3

In [23]:
# Distinct S-training events whose Event-Mean-YPML-S is above 3.5.
len(s_train.loc[s_train["Event-Mean-YPML-S"] > 3.5].drop_duplicates(subset='Evid'))

5

# Channel/station breakdown (P data)

In [90]:
# A "sensor" is a distinct (station, channel) pair in the P training data.
p_sensors = p_train.loc[:, ['station', 'channel']].drop_duplicates()
print(p_sensors.shape)

(48, 2)


In [94]:
# Distinct station codes in the P training data.
p_stats = p_train.loc[:, 'station'].drop_duplicates()
print(p_stats.shape)

(35,)


In [114]:
# Stations that carry more than one sensor (some carry 3).
# Count sensors per station once, then keep the stations whose count exceeds 1.
p_station_counts = p_sensors.value_counts('station')
p_duplicated = p_station_counts[p_station_counts > 1].reset_index()
print(p_duplicated.shape)
p_duplicated

(10, 2)


Unnamed: 0,station,count
0,YPP,3
1,YHH,3
2,YHB,3
3,YNR,2
4,YUF,2
5,YMR,2
6,YDD,2
7,YHL,2
8,YTP,2
9,YFT,2


In [123]:
# The stations that carry 3 sensors, with every sensor listed.
p_triple_stations = p_duplicated.loc[p_duplicated['count'] > 2, 'station']
p_sensors[p_sensors.station.isin(p_triple_stations)].sort_values('station')

Unnamed: 0,station,channel
0,YHB,EHZ
16,YHB,HHZ
5369,YHB,ENZ
6,YHH,HHZ
25,YHH,EHZ
6165,YHH,ENZ
8,YPP,EHZ
47,YPP,HHZ
21024,YPP,ENZ


In [136]:
# Channel counts restricted to the multi-sensor stations.
# All 10 of these stations have an HH channel. Of the two-sensor stations,
# 6 pair it with an EN channel and 1 pairs it with an EH channel;
# the remaining 3 stations have all three (HH, EN and EH).
p_multi_sensor = p_sensors[p_sensors.station.isin(p_duplicated.station)]
p_multi_sensor.value_counts('channel')

channel
HHZ    10
ENZ     9
EHZ     4
Name: count, dtype: int64

In [124]:
# How many P sensors there are of each channel type.
p_sensors.value_counts(subset='channel')

channel
EHZ    24
HHZ    12
ENZ     9
BHZ     3
Name: count, dtype: int64

In [126]:
# Stand-alone stations: keep=False discards every station that occurs more than
# once in p_sensors, leaving only stations with a single sensor.
p_unique = p_sensors.drop_duplicates(subset='station', keep=False)
p_unique.shape

(25, 2)

In [127]:
# Channel types found at the single-sensor P stations.
p_unique.value_counts(subset='channel')

channel
EHZ    20
BHZ     3
HHZ     2
Name: count, dtype: int64

In [144]:
# Every sensor at a multi-sensor station that also has an EHZ channel.
p_at_multi_station = p_sensors.station.isin(p_duplicated.station)
p_ehz_stations = p_sensors.loc[p_sensors.channel == 'EHZ', 'station']
p_at_ehz_station = p_sensors.station.isin(p_ehz_stations)
p_sensors[p_at_multi_station & p_at_ehz_station].sort_values('station')

Unnamed: 0,station,channel
0,YHB,EHZ
16,YHB,HHZ
5369,YHB,ENZ
6,YHH,HHZ
25,YHH,EHZ
6165,YHH,ENZ
8,YPP,EHZ
47,YPP,HHZ
21024,YPP,ENZ
193,YTP,EHZ


# Channel/station breakdown (S data)

In [146]:
# A "sensor" in the S data is a distinct (station, channel1) pair.
s_sensors = s_train.loc[:, ['station', 'channel1']].drop_duplicates()
print(s_sensors.shape)

(29, 2)


In [158]:
# Distinct station codes in the S training data.
s_stations = s_train.loc[:, 'station'].drop_duplicates()
s_stations.shape

(18,)

In [152]:
# How many S sensors there are of each channel1 type.
# Two stations appear with both a BH1 and a BHN entry, so those are not
# really 'duplicates'.
s_sensors.value_counts(subset='channel1')

channel1
HHN    11
ENN     9
EHN     5
BH1     2
BHN     2
Name: count, dtype: int64

In [159]:
# Stations with more than one (station, channel1) entry in the S data.
# Count entries per station once, then keep the stations whose count exceeds 1.
s_station_counts = s_sensors.value_counts('station')
s_duplicated = s_station_counts[s_station_counts > 1].reset_index()
s_duplicated.shape

(11, 2)

In [160]:
# Stand-alone stations: keep=False discards every station that occurs more than
# once in s_sensors, leaving only stations with a single entry.
s_unique = s_sensors.drop_duplicates(subset='station', keep=False)
s_unique.shape

(7, 2)

In [162]:
# Channel types found at the single-sensor S stations.
s_unique.value_counts(subset='channel1')

channel1
EHN    5
HHN    2
Name: count, dtype: int64

In [164]:
# Channel counts restricted to the S stations with more than one entry.
s_multi_sensor = s_sensors[s_sensors.station.isin(s_duplicated.station)]
s_multi_sensor.value_counts('channel1')

channel1
HHN    9
ENN    9
BHN    2
BH1    2
Name: count, dtype: int64

In [153]:
# Number of P training rows per channel type (rows, not distinct sensors).
p_train.value_counts(subset='channel')

channel
EHZ    38593
HHZ    23859
BHZ     2377
ENZ      202
Name: count, dtype: int64

In [155]:
# Number of S training rows per channel1 type (rows, not distinct sensors).
s_train.value_counts(subset='channel1')

channel1
HHN    9594
EHN    2554
BH1     379
BHN     149
ENN      62
Name: count, dtype: int64