In [None]:
import sys
# Candidate locations of the SeisRoutine checkout (presumably one per
# workstation / user profile — confirm). Entries that do not exist on the
# current machine are simply ignored by the import system.
lib_path = [r'C:\Users\ikahbasi\OneDrive\Applications\GitHub\SeisRoutine',
            r'C:\Users\ikahb\OneDrive\Applications\GitHub\SeisRoutine']
for path in lib_path:
    # Keep the cell idempotent: re-running it must not pile up duplicates
    # in sys.path.
    if path not in sys.path:
        sys.path.append(path)
##########################################################################
import SeisRoutine.catalog as src
import SeisRoutine.waveform as srw
import SeisRoutine.config as srconf
import SeisRoutine.statistics as srs

In [None]:
import seisbench.generate as sbg
import seisbench.models as sbm
import torch
from tqdm import tqdm
from scipy import signal
import os
import seisbench.data as sbd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import label
import pandas as pd


# Functions and Classes

In [None]:
def find_ps_pairs(metadata):
    """
    Flag rows whose P- and S-pick availability agree.

    A column is treated as a P (or S) pick column when its name, compared
    case-insensitively, starts with ``trace_P`` (or ``trace_S``) and ends
    with ``_arrival_sample``. A row "has" a phase when at least one of the
    matching columns is not null.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table containing the ``*_arrival_sample`` columns.

    Returns
    -------
    pandas.Series of bool
        True where a row has both a P and an S pick, or has neither of
        them; False where exactly one of the two phases is present.
    """
    suffix = '_arrival_sample'.upper()

    def _has_phase(prefix):
        # True per row when any arrival-sample column of this phase is set.
        wanted = prefix.upper()
        columns = [name for name in metadata.keys()
                   if name.upper().startswith(wanted)
                   and name.upper().endswith(suffix)]
        return metadata[columns].notna().any(axis=1)

    has_p = _has_phase('trace_P')
    has_s = _has_phase('trace_S')
    # Equality rather than AND: rows without any pick at all also pass.
    return has_s == has_p

# Run

In [None]:
# Load the merged dataset's metadata table from a hard-coded local path.
# `low_memory=False` disables pandas' chunked parsing, so column dtypes are
# inferred from the whole file.
path = r'F:\DataSets-Local\Merged_All_DataSets_2025-07-10 (Ahar-Ilam-Kaki-Qeshm)\metadata.csv'
df_metadata = pd.read_csv(path, low_memory=False)

In [None]:
# Boolean mask over df_metadata rows, as returned by find_ps_pairs.
cond_PS_pairs = find_ps_pairs(metadata=df_metadata)

In [None]:
# Keep only rows whose `trace_npts` value is exactly 3001.
key = 'trace_npts'
cond_data_available = (df_metadata[key] == 3001)

In [None]:
# Signal-to-noise filter: a row passes when exactly three of its `*_snr`
# columns reach the threshold (null SNR values never count as reaching it).
# NOTE(review): the literal 3 presumably equals the number of `*_snr`
# columns (one per component) — confirm against the metadata schema.
treshold_snr = 2
keys = [name for name in df_metadata.keys() if name.endswith('_snr')]
cond_snr_channel = df_metadata[keys].ge(treshold_snr)
numbers_of_good_snr_channel = cond_snr_channel.sum(axis=1)
# numbers_of_good_snr_channel.hist()
cond_snr = numbers_of_good_snr_channel.eq(3)

# AutoPicks (Unique Earthquake)

In [None]:
# Load the pickled table that (judging by its file name) carries the
# automatic picks alongside the metadata.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
path = r'F:\DataSets-Local\Merged_All_DataSets_2025-07-10 (Ahar-Ilam-Kaki-Qeshm)\metadata-with-AutoPicks.pkl'
df_autopicks = pd.read_pickle(path)

In [None]:
# Show every column name of the auto-pick table for inspection.
list(df_autopicks.keys())

In [None]:
# Pick out the auto-pick columns by their name suffix.
# NOTE(review): this assumes the first match is the P column and the second
# the S column, and that the suffix really is spelled 'AutoPik' in the
# table — verify against the column list printed above.
keys = [key for key in df_autopicks.keys() if key.endswith('AutoPik')]
key_p = keys[0]
key_s = keys[1]

In [None]:
def func(x):
    """Return the number of entries when `x` is a list, otherwise 0."""
    if isinstance(x, list):
        return len(x)
    return 0


# Number of automatic P / S picks stored in each row.
num_P_autopicks = df_autopicks[key_p].apply(func)
num_S_autopicks = df_autopicks[key_s].apply(func)

# Stricter alternative (exactly one P and one S auto-pick):
# cond_only_one_eq_in_window = (num_P_autopicks==1) & (num_S_autopicks==1)
# Rule in use: at most one S auto-pick; the P count is not consulted.
cond_only_one_eq_in_window = num_S_autopicks.le(1)

# High-error manual picks

In [None]:
def residual_pick_time(auto_picks, manual_pick=500, unkown=9999):
    """
    Smallest absolute offset between the automatic picks and a manual pick.

    Parameters
    ----------
    auto_picks : list or any
        Automatic pick positions. Anything that is not a non-empty list is
        treated as "no picks available".
    manual_pick : number, default 500
        Reference position the automatic picks are compared against.
    unkown : number, default 9999
        Placeholder returned when no automatic picks are available.

    Returns
    -------
    number
        ``min(|auto_pick - manual_pick|)`` over all automatic picks, or
        `unkown` when `auto_picks` is not a non-empty list.
    """
    has_picks = isinstance(auto_picks, list) and (len(auto_picks) > 0)
    if not has_picks:
        return unkown
    offsets = abs(np.array(auto_picks) - manual_pick)
    return min(offsets)

In [None]:
# Per row: distance from the closest automatic P pick to the default
# manual-pick position of residual_pick_time (its `manual_pick` default).
p_phase_time_difference = df_autopicks[key_p].apply(residual_pick_time)

# Largest accepted distance. Rows without any automatic P pick receive the
# 9999 placeholder from residual_pick_time and therefore fail this test too.
excepted_error = 200                                                            # in samples
cond_if_P_phase_is_not_outlier = p_phase_time_difference <= excepted_error

# Skewness

In [None]:
# Load the pickled table holding the skewness columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
path = r'F:\DataSets-Local\Merged_All_DataSets_2025-07-10 (Ahar-Ilam-Kaki-Qeshm)\metadata-with-skewness.pkl'
# Alternative (older) file:
# path = r'F:\DataSets-Local\Merged_All_DataSets_2025-07-10 (Ahar-Ilam-Kaki-Qeshm)\metadata-with-skewness-old.pkl'
df_skewness = pd.read_pickle(path)

In [None]:
# All skewness columns, then split by whether the column name contains
# 'no-filter' (keys1) or 'with-filter' (keys2).
keys = [key for key in df_skewness.keys() if key.endswith('skewness')]
keys1 = [key for key in keys if 'no-filter' in key]
keys2 = [key for key in keys if 'with-filter' in key]

In [None]:
# treshold_skewness = 2
# good_skewness = (df[keys].abs() <= treshold_skewness)
# good_skewness = good_skewness.sum(axis=1)
# cond_good_skewness = good_skewness == 3
# print(sum(~cond_good_skewness))

In [None]:
treshold_skewness = 5
treshold_channel = 0


def _exceeds_skewness_limit(columns):
    """Per row: do more than `treshold_channel` of `columns` have |skewness| >= `treshold_skewness`?"""
    n_bad_channels = df_skewness[columns].abs().ge(treshold_skewness).sum(axis=1)
    return n_bad_channels.gt(treshold_channel)


bad_skewness1 = _exceeds_skewness_limit(keys1)   # 'no-filter' columns
#
bad_skewness2 = _exceeds_skewness_limit(keys2)   # 'with-filter' columns
# A row is rejected only when it is bad in BOTH column groups.
bad_skewness_total = bad_skewness1 & bad_skewness2
print(sum(bad_skewness1), sum(bad_skewness2), sum(bad_skewness_total))
cond_good_skewness = ~bad_skewness_total

# Noisy Data (Frequency)

In [None]:
# Load the pickled table holding the per-trace spectral (`*_fft`) columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
path = r'F:\DataSets-Local\Merged_All_DataSets_2025-07-10 (Ahar-Ilam-Kaki-Qeshm)\metadata-with-frequency.pkl'
df_fft = pd.read_pickle(path)

In [None]:
# Names of all columns ending in 'fft' (uncomment `keys` to display them).
keys = [key for key in df_fft.keys() if key.endswith('fft')]
# keys

In [None]:
# Per-component noise indicator: ratio of the H-band to the M-band spectral
# maximum, written into df_fft as a `trace_<component>_noise_level` column.
for channel in ['E', 'N', 'Z']:
    m_band = df_fft[f'trace_{channel}_max_M-band_fft']
    h_band = df_fft[f'trace_{channel}_max_H-band_fft']
    df_fft[f'trace_{channel}_noise_level'] = h_band.div(m_band)

keys = [name for name in df_fft.keys() if name.endswith('_noise_level')]

# A component counts as noisy when its ratio reaches the threshold; a row is
# kept only when none of its components is noisy.
treshold_noisy_level = 1
noisy_channel = df_fft[keys].ge(treshold_noisy_level)
numbers_of_noisy_channel = noisy_channel.sum(axis=1)
cond_not_noisy_data = numbers_of_noisy_channel.eq(0)

# Flat Signal

In [None]:
# Load the pickled table holding the `*_std` columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
path = r'F:\DataSets-Local\Merged_All_DataSets_2025-07-10 (Ahar-Ilam-Kaki-Qeshm)\metadata-with-std.pkl'
df_std = pd.read_pickle(path)

In [None]:
# Flat-signal filter: a `*_std` value at or below the threshold marks that
# column as flat; a row is kept only when it has no flat column at all.
threshold_flatness = 0.01

keys = [name for name in df_std.keys() if name.endswith('_std')]
flat_signals = df_std[keys].le(threshold_flatness)
numbers_of_flat_signals = flat_signals.sum(axis=1)
cond_not_flat = numbers_of_flat_signals.le(0)

# Merge

In [None]:
# All individual filter masks, in the order they are printed below.
lst = (cond_PS_pairs,
            cond_data_available,
            cond_only_one_eq_in_window,
            cond_if_P_phase_is_not_outlier,
            cond_good_skewness,
            cond_not_noisy_data,
            cond_snr,
            cond_not_flat)

# Number of rows that pass each filter on its own.
for el in lst:
    print(sum(el))

In [None]:
# A row is accepted only when it passes every individual filter.
# NOTE(review): the masks originate from five separately loaded tables;
# `&` aligns them on their index, so this presumably relies on all tables
# sharing the same row index — confirm.
cond_all = (cond_PS_pairs &
            cond_data_available &
            cond_only_one_eq_in_window &
            cond_if_P_phase_is_not_outlier &
            cond_good_skewness &
            cond_not_noisy_data &
            cond_snr &
            cond_not_flat)
# Only the True entries of the combined mask (its index lists accepted rows).
accepted_data = cond_all[cond_all]
# Displayed cell output: number of accepted rows.
sum(cond_all)

# Visual Inspection (Statistics)

In [None]:
# Metadata rows that survived all filters.
df_accepted = df_metadata[cond_all]
print(df_accepted.shape)

In [None]:
# Histogram of accepted rows per network code; assigning to `_` keeps the
# returned object from being echoed as cell output.
_ = df_accepted['station_network_code'].hist()

In [None]:
# Count accepted rows per '<network>_<station>' identifier and print the
# complete, untruncated table.
result = (df_accepted['station_network_code']+ '_' + df_accepted['station_code']).value_counts()
print(result.to_string())

In [None]:
# Rich display of the per-station counts computed in the previous cell.
result

In [None]:
# Bar chart of the per-station counts, with vertical tick labels.
result.plot(kind='bar', figsize=(15,4), rot=90)

# Visual Inspection (Waveform)

In [None]:
from SeisRoutine.waveform.waveform import fft

In [None]:
# Two-stage configuration: the local '0-init-cfg.yml' only stores where the
# actual configuration file lives; that file is then loaded into `cfg`.
init_cfg = srconf.load_config('0-init-cfg.yml')
cfg_path = os.path.join(init_cfg.target_config_filepath,
                        init_cfg.target_config_filename)
cfg = srconf.load_config(cfg_path)

In [None]:
# Open the waveform dataset; path, sampling rate and component order all
# come from the loaded configuration.
dataset = sbd.WaveformDataset(
    path=cfg.dataset.path,
    sampling_rate=cfg.training.dataset.sampling_rate,
    component_order=cfg.training.dataset.component_order,
          )

In [None]:
# NOTE(review): `sps` is not referenced by any other visible cell —
# presumably the sampling rate in samples per second; confirm or remove.
sps = 100
# Augmentations applied to every sample drawn from the generator.
augmentations = [
    # Optional band-pass filter, currently disabled:
    # sbg.Filter(N=4,
    #            Wn=[1, 10],
    #            btype='bandpass',
    #            forward_backward=True,
    #            ),
    # Demean and peak-normalise along the last axis.
    sbg.Normalize(demean_axis=-1,
                  amp_norm_axis=-1,
                  amp_norm_type="peak"),
    # Cast the sample to float32.
    sbg.ChangeDtype(np.float32),
]
generator = sbg.GenericGenerator(dataset)
generator.add_augmentations(augmentations)

In [None]:
# cond_data_available

In [None]:
# Number of rows rejected by the `trace_npts` filter.
sum(~cond_data_available)

In [None]:
# Choose which rejected group to inspect: uncomment exactly one alternative.
# df_to_plot = cond_not_flat[~cond_not_flat]
# df_to_plot = cond_data_available[~cond_data_available]
# df_to_plot = cond_only_one_eq_in_window[~cond_only_one_eq_in_window]
# df_to_plot = cond_if_P_phase_is_not_outlier[~cond_if_P_phase_is_not_outlier]

# Active choice: rows rejected by the noise filter (only the index is used later).
df_to_plot = cond_not_noisy_data[~cond_not_noisy_data]
# accepted_data = cond_all[cond_all]

In [None]:
# Rows rejected by the noise filter, plus the `*_fft` columns that are shown
# next to each plotted trace.
df_to_plot = cond_not_noisy_data[~cond_not_noisy_data]
keys_additional = [key for key in df_fft.keys()
                   if key.endswith('_fft')]
keys_additional

In [None]:
# df_to_plot = cond_good_skewness[~cond_good_skewness]
# keys_additional = [key for key in df.keys()
#                    if key.endswith('_skewness')]

In [None]:
# Visual inspection of the rows selected in `df_to_plot`: for each of the
# first `max_plots` rows, show its metadata and plot every component's
# waveform (left) and amplitude spectrum (right).
max_plots = 5
num_ploted = 0
for index in df_to_plot.index:
    # NOTE(review): `index` is an index *label* of `df_to_plot`, yet it is
    # used positionally by `.iloc` and `generator[...]`; this presumably
    # relies on a default RangeIndex — confirm.
    metadata = df_fft.iloc[[index]]
    # Optional station filter:
    # if all(metadata['station_code'] != 'GLGL'):
    #     continue
    print('*'*100)
    with pd.option_context("display.max_columns", None):
        display(metadata[['station_network_code', 'station_code']+keys_additional])
    sample = generator[index]
    data_X = sample["X"]
    fig, axes = plt.subplots(1, 2,
            figsize=(10, 3))
    # Shift each component by one unit (-1, 0, +1, ...) so that the traces
    # do not overlap in the waveform panel.
    for jj, (_x, channel) in enumerate(zip(data_X, dataset.component_order),
                                       start=-1):
        # NOTE(review): delta=0.01 assumes 100 samples per second — confirm
        # against the dataset's configured sampling rate.
        freq, ampl = fft(array=_x, delta=0.01)
        axes[0].plot(_x+jj, label=channel)
        axes[1].semilogx(freq, ampl, label=channel)
    axes[0].set(title='Waveform', xlabel='Sample',
                ylabel='Amplitude (offset per channel)')
    axes[1].set(title='Spectrum', xlabel='Frequency', ylabel='Amplitude')
    # The channel labels were already attached to the lines; draw them.
    axes[0].legend(loc='upper right')
    axes[1].legend(loc='upper right')
    fig.suptitle(f'index {index}')
    fig.tight_layout()
    plt.show()
    # Release the figure so that repeated runs do not accumulate open figures.
    plt.close(fig)
    num_ploted += 1
    if num_ploted == max_plots:
        break