*This code is necessary on colab to install SeisBench. If SeisBench is already installed on your machine, you can skip this.*

In [None]:
# !pip -q install seisbench

*This cell is required to circumvent an issue with colab and obspy. For details, check this issue in the obspy documentation: https://github.com/obspy/obspy/issues/2547*

In [None]:
# try:
#     import obspy
#     obspy.read()
# except TypeError:
#     # Needs to restart the runtime once, because obspy only works properly after restart.
#     print('Stopping RUNTIME. If you run this code for the first time, this is expected. Colaboratory will restart automatically. Please run again.')
#     exit()

# Training PhaseNet

This tutorial shows how to train a model with SeisBench, using PhaseNet as an example. This brings together the three main components of SeisBench: data, models and generate.

The tutorial is intended to highlight the basic principles of training models in SeisBench. However, this will not necessarily be best practice for more elaborate experiments. As a reference for how to set up larger studies and which augmentations can be used for which models, we refer to the implementation of our pick benchmark at [https://github.com/seisbench/pick-benchmark](https://github.com/seisbench/pick-benchmark).

*Note: As this tutorial brings together different parts of SeisBench, it is recommended to go through the basic tutorials first before beginning this tutorial. In addition, this tutorial assumes some familiarity with PyTorch.*

In [None]:
import sys
# Candidate locations of the local SeisRoutine checkout; both are appended so
# the same notebook can be used from either user profile.
# NOTE(review): these are machine-specific absolute paths - consider moving
# them into the configuration file or installing SeisRoutine as a package.
lib_path = [r'C:\Users\ikahbasi\OneDrive\Applications\GitHub\SeisRoutine',
            r'C:\Users\ikahb\OneDrive\Applications\GitHub\SeisRoutine']
for path in lib_path:
    # Skip entries that are already present so re-running this cell does not
    # keep growing sys.path with duplicates.
    if path not in sys.path:
        sys.path.append(path)
##########################################################################
import SeisRoutine.catalog as src
import SeisRoutine.waveform as srw
import SeisRoutine.config as srconf
import SeisRoutine.statistics as srs

In [None]:
import myfuncs as mf

In [None]:
import seisbench.data as sbd
import seisbench.generate as sbg
import seisbench.models as sbm
from seisbench.util import worker_seeding
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
import logging
from datetime import datetime
import os
import ipynbname

# Define Some Functions

In [None]:
def getting_filename_and_path_of_the_running_code():
    """
    Get the directory path and filename of the currently executing code.

    For a regular Python script the location is taken from ``sys.argv[0]``.
    When the code runs inside a Jupyter kernel (``sys.argv[0]`` is
    ``ipykernel_launcher.py``) the notebook location is resolved instead:

    1. from the ``__vsc_ipynb_file__`` variable that VS Code places in the
       notebook namespace, if present;
    2. otherwise through ``ipynbname.path()``.

    Returns:
        tuple: ``(directory_path, filename)`` as strings. In every branch
        ``filename`` includes its extension and ``directory_path`` is the
        folder that contains the file.

    Raises:
        Whatever ``ipynbname.path()`` raises when the notebook cannot be
        located (only reachable inside a Jupyter kernel outside VS Code).
    """
    _file = sys.argv[0]
    name = os.path.basename(_file)
    path = os.path.dirname(_file)
    if name == "ipykernel_launcher.py":
        # Explicit lookup instead of a try/except around a dict access: the
        # only expected failure is that the variable is simply not defined.
        vscode_file = globals().get('__vsc_ipynb_file__')
        if vscode_file:
            _file = vscode_file
        else:
            # ipynbname.path() is the full path of the notebook *file*; split
            # it like the other branches so the return value is consistent.
            _file = str(ipynbname.path())
        name = os.path.basename(_file)
        path = os.path.dirname(_file)
    return path, name

# Initializing the init file and starting logging.

In [None]:
log_separator = "+" * 80

In [None]:
# Two-stage configuration: the small bootstrap file '0-init-cfg.yml' (resolved
# relative to the current working directory) only names the directory and file
# of the actual run configuration, which is then loaded into `cfg`.
init_cfg = srconf.load_config('0-init-cfg.yml')
cfg_path = os.path.join(init_cfg.target_config_filepath,
                        init_cfg.target_config_filename)
cfg = srconf.load_config(cfg_path)

In [None]:
srconf.configure_logging(level=cfg.log.level,
                         log_format=cfg.log.format,
                         mode=cfg.log.mode, colored_console=True,
                         filepath=cfg.log.filepath,
                         filename_prefix=cfg.log.filename_prefix,
                         filename=cfg.log.filename)

In [None]:
nb_path, nb_name = getting_filename_and_path_of_the_running_code()
msg = (f"Logging has started for notebook: {nb_name}.\n"
       f"This file is located at: {nb_path}\n")
logging.info(msg)
logging.info(f"Separator: {log_separator}")

In [None]:
msg = cfg.__str__()
logging.info(f'Configuration File:\n{msg}')
logging.info(f"Separator: {log_separator}")

# Loading Dataset

In [None]:
def find_ps_pairs(metadata):
    """
    Flag rows whose P-pick presence matches their S-pick presence.

    A row "has a P pick" when at least one column whose name starts with
    ``trace_P`` and ends with ``_arrival_sample`` (both compared
    case-insensitively) is not NaN; S picks are detected the same way with the
    ``trace_S`` prefix.

    Args:
        metadata: DataFrame-like object providing ``keys()`` and column
            selection by a list of names.

    Returns:
        Boolean Series aligned with ``metadata``: True where the P flag equals
        the S flag.

    NOTE(review): because the flags are compared with ``==``, rows that have
    neither a P nor an S pick are also marked True - confirm this is intended.
    """
    suffix = '_arrival_sample'.upper()

    def _has_any_pick(prefix):
        # Column names are matched case-insensitively on prefix and suffix.
        wanted = prefix.upper()
        columns = [name for name in metadata.keys()
                   if name.upper().startswith(wanted)
                   and name.upper().endswith(suffix)]
        return metadata[columns].notna().any(axis=1)

    has_p = _has_any_pick('trace_P')
    has_s = _has_any_pick('trace_S')
    return has_s == has_p

In [None]:
problem_data = [['bucket22$500,:3,:3001', '1386f728-f76a-417d-8dd5-711366dc4bcc,2014-09-04T10:35:37.400000Z']]

In [None]:
dataset = sbd.WaveformDataset(
    path=cfg.dataset.path,
    sampling_rate=cfg.training.dataset.sampling_rate,
    component_order=cfg.training.dataset.component_order,
   # dimension_order=cfg.training.dataset.dimension_order # must recheck!
   )
# dataset.filter(~(dataset.metadata['trace_name'] == "bucket2$268,:3,:3001").values, inplace=True)

In [None]:
dataset.metadata['PS-pairs'] = find_ps_pairs(metadata=dataset.metadata)

In [None]:
# df = dataset.metadata[dataset.metadata['PS-pairs']]
# network = 'QM'
# (df['station_network_code']==network).sum(), (dataset.metadata['station_network_code']==network).sum()

In [None]:
# Fractions for the train/dev/test split, read from the configuration file.
# ('precentages' is the keyword spelling used by the manual_spliter call below.)
precentages = {'train': cfg.dataset.spliter.train,
               'dev':   cfg.dataset.spliter.dev,
               'test':  cfg.dataset.spliter.test}
##################################################
# Assign the split via the project helper, with a fixed random_state.
# NOTE(review): the metadata column created earlier in this notebook is named
# 'PS-pairs' (lower-case "p") while the mode passed here is 'PS-Pairs' -
# confirm that manual_spliter maps this mode onto that column.
mf.dataset.manual_spliter(
   dataset=dataset,
   mode='PS-Pairs',
   # mode='All',
   precentages=precentages,
   random=True,
   random_state=42)

In [None]:
train, dev, test = dataset.train_dev_test()
print(train, dev, test, sep='\n')

In [None]:
# for ii in range(len(dataset.metadata)):
#     sample = dataset.get_sample(ii)
#     _data, _meta = sample
#     condition, quality = is_waveform_healthy(_data, axis=1, max_thr=1e-6, std_thr=1e-5)
#     if not condition:
#         # print(ii, quality)
#         dataset.metadata.loc[_meta['index'], 'split'] = 'Undefined'
#     else:
#         snr_thr = 3
#         snr = compute_snr(trace=_data, pick_idx=500, noise_window=100, signal_window=200)
#         if (snr < snr_thr).all():
#             # print(ii, snr)
#             dataset.metadata.loc[_meta['index'], 'split'] = 'Undefined'

In [None]:
# Quick visual sanity check of the first few waveforms in the dataset.
# Bounded by the metadata length so small datasets do not raise on indexing.
for ii in range(min(10, len(dataset.metadata))):
    sample = dataset.get_sample(ii)
    _data, _meta = sample
    plt.plot(_data.T)
    # NOTE(review): assumes the components are ordered E, N, Z - confirm
    # against cfg.training.dataset.component_order.
    plt.legend(['E', 'N', 'Z'])
    # Double quotes outside, single quotes inside: reusing the same quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    plt.title(f"{_meta['index']} {_meta['station_code']}")
    plt.show()
    # break

## Generation pipeline

In [None]:
import numpy as np
from scipy import signal

In [None]:
class Tapering:
    """
    Augmentation that multiplies a waveform by a Tukey window.

    Args:
        alpha: Shape parameter passed to ``scipy.signal.windows.tukey``.
        key: Either a single state-dict key (read from and written to the same
            entry) or a pair ``(source_key, target_key)``.
    """

    def __init__(self, alpha=0.3, key='X'):
        self.alpha = alpha  # Tapering coefficient
        # A plain string means "read and write the same entry".
        self.key = (key, key) if isinstance(key, str) else key

    def __call__(self, state_dict):
        """Taper ``state_dict[key[0]]`` along its last axis and store the
        result, with the unchanged metadata, under ``key[1]``."""
        waveform, metadata = state_dict[self.key[0]]
        n_samples = waveform.shape[-1]
        window = signal.windows.tukey(n_samples, self.alpha)
        state_dict[self.key[1]] = (waveform * window, metadata)

In [None]:
# Maps every supported pick column ("trace_<phase>_arrival_sample") to the
# label it is trained as: all P-type phases share "P", all S-type share "S".
phase_dict = {
    f"trace_{phase}_arrival_sample": label
    for label, phases in (
        ("P", ("p", "pP", "P", "P1", "Pg", "PG", "Pn", "PmP", "pwP", "pwPm")),
        ("S", ("s", "S", "S1", "Sg", "SG", "SmS", "Sn")),
    )
    for phase in phases
}

In [None]:
list(train.metadata.keys())

In [None]:
# tmp = {}
# for key, val in phase_dict.items():
#     tmp[key.replace('trace', 'trace_manual')] = val

# phase_dict = tmp

In [None]:
# Samples per second used to express the window lengths below.
# NOTE(review): hard-coded; the dataset is loaded with
# cfg.training.dataset.sampling_rate - confirm the two agree.
sps = 100
# Augmentation pipeline shared by the train/dev/test generators. The steps are
# applied in list order, so do not reorder them without re-checking the
# resulting window lengths.
augmentations = [
    # Tukey taper (class defined in this notebook, default alpha=0.3).
    Tapering(),
    # Demean and peak-normalise along the last axis.
    sbg.Normalize(
        demean_axis=-1,
        amp_norm_axis=-1,
        amp_norm_type="peak"),
    # Fixed window of 60 * sps samples starting at sample -15 * sps, padded.
    # NOTE(review): presumably the negative p0 adds 15 s of padding in front
    # of the trace - verify against the FixedWindow documentation.
    sbg.FixedWindow(
        p0=-15*sps,
        windowlen=1*60*sps,
        strategy="pad",
        key='X'),
    # 5000-sample window placed around one randomly selected pick taken from
    # the columns listed in phase_dict.
    sbg.WindowAroundSample(
        metadata_keys=list(phase_dict.keys()),
        samples_before=2000,
        windowlen=5000,
        selection="random",
        strategy="variable"),
    sbg.GaussianNoise(
        scale=(0, 0.02),
        key='X'),
    # Final crop to 3001 samples.
    # NOTE(review): presumably the PhaseNet input length - confirm.
    sbg.RandomWindow(
        windowlen=3001),
    sbg.ChangeDtype(np.float32),
    # Probabilistic pick labels with sigma=30, built from the phase_dict
    # columns for the phases configured in the hyperparameters.
    sbg.ProbabilisticLabeller(
        label_columns=phase_dict,
        model_labels=cfg.training.hyperparameters.phases,
        sigma=30,
        dim=0),
]

In [None]:
# One generator per split; all three receive the same augmentation pipeline.
train_generator, dev_generator, test_generator = (
    sbg.GenericGenerator(split) for split in (train, dev, test)
)
for _generator in (train_generator, dev_generator, test_generator):
    _generator.add_augmentations(augmentations)

Let's visualize a few training examples. Every time you run the cell below, you'll see a different training example.

In [None]:
# Draw the index from the generator that is actually indexed below. Using the
# dev generator's length here would either restrict the preview to part of
# the training set or raise IndexError when the dev split is the larger one.
sample_number = np.random.randint(len(train_generator))
sample = train_generator[sample_number]

# Top panel: augmented waveform; bottom panel: the generated labels.
fig = plt.figure(figsize=(15, 5))
axs = fig.subplots(2, 1, sharex=True, gridspec_kw={"hspace": 0, "height_ratios": [3, 1]})
axs[0].plot(sample["X"].T)
axs[1].plot(sample["y"].T)
plt.suptitle(sample_number)

SeisBench generators are PyTorch datasets. Therefore, we can pass them to PyTorch data loaders. These will automatically take care of parallel loading and batching. Here we create one loader each for training, validation (dev), and testing. The batch size, the shuffle flag, and the number of workers are read from the configuration file; choose a batch size that fits on your hardware.

In [None]:
# Options shared by all three loaders, read once from the configuration file
# (the shuffle flag is stored under the key spelled 'suffle').
loader_kwargs = dict(
    batch_size=cfg.training.hyperparameters.batch_size,
    shuffle=cfg.training.dataset.suffle,
    num_workers=cfg.training.num_workers,
    worker_init_fn=worker_seeding,
)
train_loader = DataLoader(train_generator, **loader_kwargs)
dev_loader = DataLoader(dev_generator, **loader_kwargs)
test_loader = DataLoader(test_generator, **loader_kwargs)

## Model Initialization

We create a randomly initialized PhaseNet model using `seisbench.models`. If available, you can move your model onto the GPU for faster training.

In [None]:
torch.manual_seed(cfg.training.hyperparameters.manual_seed)
model = sbm.PhaseNet(phases=cfg.training.hyperparameters.phases,
                     norm=cfg.training.hyperparameters.norm)

In [None]:
# Move the model to the GPU when CUDA is available and log which processor
# the model runs on.
if not torch.cuda.is_available():
    cpu_info = mf.resources.get_cpu_info()
    msg = ("Processor:\n"
           f"The {model.name} Model are running on CPU\n"
           + cpu_info)
else:
    model.cuda()
    msg = ("Processor:\n"
           f"The {model.name} Model are running on GPU\n"
           f"GPU Name: {torch.cuda.get_device_name(0)}\n"
           f"Number of GPUs: {torch.cuda.device_count()}\n"
           f"CUDA available: {torch.cuda.is_available()}\n"
           )

logging.info(msg)
logging.info(f"Separator: {log_separator}")

# Training

## Define Parameters

Now we have all the components for training the model. What we still need to do is define the optimizer and the loss, and write the training and validation loops.

In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=eval(cfg.training.hyperparameters.learning_rate))

In [None]:
def loss_fn(y_pred, y_true, eps=1e-5):
    """
    Vector cross-entropy loss.

    Computes ``y_true * log(y_pred + eps)``, averages it over the last axis,
    sums over the then-last axis, averages what remains and returns the
    negated result.

    Args:
        y_pred: Predicted probabilities (assumed layout: batch, phases,
            samples - TODO confirm against the model output).
        y_true: Target probabilities with the same shape as ``y_pred``.
        eps: Small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        Scalar tensor holding the loss.
    """
    log_pred = torch.log(y_pred + eps)
    # Mean along the sample dimension, then sum along the pick dimension.
    per_trace = (y_true * log_pred).mean(-1).sum(-1)
    # Mean over the batch axis, negated.
    return -per_trace.mean()

In [None]:
def train_loop(dataloader, optimizer, log_every=5):
    """
    Train the global ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Loader yielding batches with the keys ``"X"`` and ``"y"``.
        optimizer: Optimizer that updates the parameters of ``model``.
        log_every: Log and record the loss every ``log_every`` batches
            (default 5).

    Returns:
        list[tuple[int, float]]: ``(record_index, loss)`` pairs, one for every
        ``log_every``-th batch, where ``record_index = batch_id // log_every``.
    """
    # Explicitly enter training mode: if an earlier evaluation was interrupted
    # before it could switch back, the model would otherwise silently be
    # trained in eval mode.
    model.train()
    lst_loss = []
    size = len(dataloader.dataset)
    for batch_id, batch in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(batch["X"].to(model.device))
        loss = loss_fn(pred, batch["y"].to(model.device))
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Periodic progress report
        if batch_id % log_every == 0:
            loss_value = loss.item()
            # Approximate number of samples seen so far, based on the size of
            # the current batch.
            current = batch_id * batch["X"].shape[0]
            logging.info(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
            lst_loss.append((batch_id // log_every, loss_value))
    return lst_loss

In [None]:
def test_loop(dataloader):
    """
    Evaluate the global ``model`` on ``dataloader`` without gradient tracking.

    Args:
        dataloader: Loader yielding batches with the keys ``"X"`` and ``"y"``.

    Returns:
        float: Loss averaged over the number of batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    test_loss = 0

    model.eval()  # switch the model to evaluation mode
    try:
        with torch.no_grad():
            for batch in dataloader:
                pred = model(batch["X"].to(model.device))
                test_loss += loss_fn(pred, batch["y"].to(model.device)).item()
    finally:
        # Always return to training mode, even if evaluation fails or the
        # cell is interrupted, so later training is not run in eval mode.
        model.train()

    test_loss /= num_batches
    logging.info(f"Test avg loss: {test_loss:>8f} \n")
    return test_loss

# Training Model

In [None]:
# import logging
# logging.basicConfig(
#     level=logging.DEBUG,
#     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
#     force=True
# )

In [None]:
# ### Find problem in data set.
# batch_samples = []
# for i in range(len(test_loader.dataset)):
#     try:
#         sample = test_loader.dataset[i]
#         batch_samples.append(sample)
#     except Exception as e:
#         print(f"Error loading sample {i}: {e}")
#         data, metadata = dev.get_sample(i)


# i = 3624
# plt.plot(dev.get_sample(i)[0].T)
# plt.show()
# plt.plot(dev_loader.dataset[i]['X'].T)

In [None]:
# cfg.training.hyperparameters.learning_rates = ['1e-3', '1e-4']
# cfg.training.hyperparameters.epochs_for_each_learning_rate = [2, 2]

In [None]:
# Loss history: one row per logged training batch; the dev loss of an epoch
# is stored on the last row of that epoch.
loss_columns = ['epoch', 'batch', 'loss_train', 'loss_test']
df_loss = pd.DataFrame(columns=loss_columns)
loss_frames = []
###
# Training runs in stages: each stage starts from its own base learning rate
# and lasts for the configured number of epochs.
for stage_lr, epochs in zip(cfg.training.hyperparameters.learning_rates,
                            cfg.training.hyperparameters.epochs_for_each_learning_rate):
    logging.info(f"Main Learning-Rate: {stage_lr}\n" + "-"*70)
    # float() parses strings such as '1e-3' and also accepts values that are
    # already numeric, without executing arbitrary configuration text the way
    # eval() would.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=float(stage_lr))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer, mode='min',
        factor=cfg.training.hyperparameters.lr_scheduler.ReduceLROnPlateau.factor,
        patience=cfg.training.hyperparameters.lr_scheduler.ReduceLROnPlateau.patience,
        threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
    for epoch in range(epochs):
        # Separate name so the stage's base learning rate is not overwritten.
        current_lr = scheduler.get_last_lr()[0]
        logging.info(f"Learning-Rate: {current_lr} Epoch: {epoch+1}\n" + "-"*70)
        train_loss = train_loop(dataloader=train_loader,
                                optimizer=optimizer)
        test_loss = test_loop(dev_loader)
        scheduler.step(test_loss)
        #
        df_loss_tmp = pd.DataFrame(train_loss, columns=['batch', 'loss_train'])
        # NOTE: 'epoch' counts within the current learning-rate stage.
        df_loss_tmp['epoch'] = epoch
        df_loss_tmp['loss_test'] = None
        last_none_index = len(df_loss_tmp) - 1
        df_loss_tmp.at[last_none_index, "loss_test"] = test_loss
        # Rebuild the history from the per-epoch frames. This avoids
        # concatenating onto an empty frame, and df_loss stays usable if
        # training is interrupted part-way through.
        loss_frames.append(df_loss_tmp)
        df_loss = pd.concat(loss_frames, ignore_index=True)[loss_columns]

# Saving Trained Model

In [None]:
import os
import shutil

In [None]:
# cfg.training.output_model.filepath = r'F:\Models\PhaseNet_Trained_single_networks'
# cfg.training.output_model.version_str= r'14040318_1800_PS-Pairs'

In [None]:
os.makedirs(os.path.abspath(cfg.training.output_model.filepath), exist_ok=True)

In [None]:
# Archive the run configuration next to the trained model.
# The source is the same directory + file name the configuration was loaded
# from; the bare file name alone only resolves when the configuration lives
# in the current working directory.
shutil.copy(
    os.path.join(init_cfg.target_config_filepath,
                 init_cfg.target_config_filename),
    os.path.join(cfg.training.output_model.filepath,
                 f'cfg-{cfg.training.output_model.version_str}.yml'))

In [None]:
df_loss.to_csv(
    os.path.join(cfg.training.output_model.filepath,
                 f'loss_{cfg.training.output_model.version_str}.csv')
                 )

In [None]:
model.save(
    path=os.path.join(cfg.training.output_model.filepath,
                      cfg.training.output_model.filename_prefix),
    weights_docstring=cfg.__str__(),
    version_str=cfg.training.output_model.version_str
)

# Plot Loss

In [None]:
import plotly.graph_objects as go
import plotly
import plotly.express
import plotly.express as px

In [None]:
df_loss = pd.read_csv(
            os.path.join(
                cfg.training.output_model.filepath,
                f'loss_{cfg.training.output_model.version_str}.csv'
                )
            )
df_loss['index'] = df_loss.index.values

In [None]:
idx_train = df_loss.index.values
loss_train = df_loss['loss_train']
###
loss_test = df_loss['loss_test']

idx_test = df_loss.index.values[loss_test.notna()]
loss_test = loss_test.values[loss_test.notna()]

In [None]:
config = dict({'scrollZoom': True})

fig = px.line(
    df_loss, 
    x="index", 
    y=["loss_train", "loss_test"], 
    markers=True  # Adding point marker
)
fig.update_traces(connectgaps=True)
fig.update_layout(yaxis_type="log")

fig.show(config=config)

In [None]:
# config = dict({'scrollZoom': True})
# #
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=idx_train, y=loss_train, mode='lines', name='Train Loss'))
# fig.add_trace(go.Scatter(x=idx_test, y=loss_test, mode='lines+markers', name='Test Loss'))
# fig.update_layout(yaxis_type="log")
# fig.show(config=config)

## Remarks

As discussed in the data basics tutorial, loading a SeisBench dataset only means loading the metadata into memory. The waveforms are only loaded once they are requested to save memory. By default, waveforms are **not** cached in memory. For training, this means that the data needs to be read from the file in every epoch again. Depending on your hardware, this will take a lot of time. To solve this issue, you can set the `cache` option, when creating the dataset. Then, all you have to do is call `preload_waveforms` and the data will be loaded into memory and automatically cached. For most practical applications, this option is recommended.

# Experiment: making the initial model weights reproducible

In [None]:
import torch.nn as nn
torch.manual_seed(42)
cnn1 = nn.Conv1d(2, 1, 3, padding="same")
cnn2 = nn.Conv1d(2, 1, 3, padding="same")
cnn1.weight