## TCN vs LSTM - Using Full p4679 Dataset

Objectives

- Extract full p4679 dataset. Note this requires dealing with large files. Load in full file, but only display smoothed data?
- Think about pre-processing: how can we keep as much information as possible while filtering out noise?
- Create train/test split
- Create standardised way to do feature creation. Create features for shear stress, derivative and variance.
- Create simple LSTM and TCN models
- Create informative training loop
- Create test procedure 
- Features - instead of raw ss, put in smoothed, alongside variance and/or "noise" = raw - smoothed?
- Don't try to predict the full cycle, just 30 or so steps; this aligns better with theory, and predicting a whole cycle starting from 0 is probably impossible anyway
- Segment by cycle - about 230.. so ~150 to train on

In [None]:
# Directory configuration

# True when running on JASMIN; False when running on a local machine.
jasmin = True
jasmin_user_name = "trr26"

# Project root: the JASMIN group workspace, or a local checkout (edit the
# local path to match your own machine).
MAIN_DIR = (
    f"/gws/nopw/j04/ai4er/users/{jasmin_user_name}/earthquake-predictability"
    if jasmin
    else "/home/tom-ratsakatika/VSCode/earthquake-predictability"
)

# The data folder is named differently in the two environments.
DATA_DIR = (
    f"{MAIN_DIR}/data/gtc_quakes_data" if jasmin else f"{MAIN_DIR}/data_local"
)

# Full path of the p4679 experiment text file.
p4679_FILE_PATH = f"{DATA_DIR}/labquakes/Marone/p4679/p4679.txt"

# Imports

import os
import random
import sys
from math import sqrt

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import torch
import torch.nn as nn
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from IPython.display import Image, display
from scipy.io import loadmat
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

sys.path.append(MAIN_DIR)
import utils

In [None]:
# Check CUDA Availability

# Select the torch device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    dev = "cuda"
    map_location = None
    print(f"Total number of GPUs available: {torch.cuda.device_count()}")
    # IPython shell escape: runs `nvidia-smi` and shows its output in the cell.
    !nvidia-smi
else:
    dev = "cpu"
    map_location = "cpu"
    print("No GPU available.")
# `map_location` is not used in this cell; presumably it is meant for
# `torch.load` when restoring checkpoints - confirm against later cells.
device = torch.device(dev)

In [None]:
# Load the p4679 experiment file into a DataFrame. `read_csv` is called with
# its default delimiter; the first line of the file is skipped and the next
# line is consumed as the header row.
with open(p4679_FILE_PATH) as raw_file:
    df = pd.read_csv(raw_file, skiprows=1)

# Name the nine columns to align with the variable names used by Adriano's
# data loader (this replaces the header parsed from the file).
df.columns = [
    "id",
    "lp_disp",
    "shr_stress",
    "nor_disp",
    "nor_stress",
    "time",
    "mu",
    "layer_thick",
    "ec_disp",
]

# Keep only the time and shear-stress columns, with 'time' first; every
# other column is discarded.
df = df[["time", "shr_stress"]]

# Restrict the record to the analysis window (both bounds inclusive) and
# renumber the rows from zero.
start_time = 4233.364
end_time = 5159.292

in_window = (df["time"] >= start_time) & (df["time"] <= end_time)
df = df[in_window].reset_index(drop=True)

# Preview the first five rows and report the resulting shape.
print(df.head())
print(df.shape)

In [None]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show

# Render Bokeh output inline in the notebook.
output_notebook()

# Quick-look line plot of the shear stress in `df` against time.
source = ColumnDataSource(data=df)
p = figure(width=800, height=400)
p.line("time", "shr_stress", source=source)

show(p)

In [None]:
def find_peak_indices(data, threshold=100):
    """Return the positions of local maxima in a 1-D signal.

    A sample at position ``i`` is reported as a peak when it is strictly
    greater than each of the ``threshold`` samples before it and greater than
    or equal to each of the ``threshold`` samples after it. Only positions
    with a full window on both sides are examined.

    Position 0 is always the first entry of the result, whether or not it is
    a maximum (this also holds for empty input).

    Args:
        data: 1-D sequence of numbers (list, NumPy array or pandas Series).
            Samples are addressed by position, so a Series does not need a
            0..n-1 index.
        threshold: Number of samples compared on each side of a candidate.
            Must be at least 1. Defaults to 100.

    Returns:
        list[int]: Ascending positions, starting with 0.

    Raises:
        ValueError: If ``threshold`` is smaller than 1.
    """
    if threshold < 1:
        raise ValueError("threshold must be a positive integer")

    # Work on a plain array: positional access regardless of the Series
    # index, and much cheaper window maxima than Python's max() over slices.
    values = np.asarray(data)

    peak_indices = [0]
    for i in range(threshold, len(values) - threshold):
        candidate = values[i]
        # Left window: the `threshold` samples immediately before i.
        if not candidate > values[i - threshold : i].max():
            continue
        # Right window: the `threshold` samples immediately after i. The
        # slice end is i + threshold + 1 so that both windows have the same
        # length (the slice end itself is exclusive).
        if candidate >= values[i + 1 : i + threshold + 1].max():
            peak_indices.append(i)
    return peak_indices

In [None]:
# Locate local maxima of the shear stress, then local minima by running the
# same search on the negated signal.
shear_stress = df["shr_stress"]
peak_indices = find_peak_indices(shear_stress)
trough_indices = find_peak_indices(-shear_stress)

In [None]:
# Mark peaks and troughs: each column holds the shear stress on the detected
# rows and NaN everywhere else.
is_peak = df.index.isin(peak_indices)
is_trough = df.index.isin(trough_indices)
df["peaks"] = df["shr_stress"].where(is_peak)
df["troughs"] = df["shr_stress"].where(is_trough)

In [None]:
def _time_since_last_event(times, event_values):
    """Return, for every row, the time elapsed since the most recent event.

    Args:
        times: Series of sample times.
        event_values: Series aligned with ``times`` that is non-NaN on the
            rows where an event occurs and NaN elsewhere.

    Returns:
        Series holding ``times`` minus the time of the latest event at or
        before each row; NaN for rows that precede the first event.
    """
    # Keep the timestamp only on event rows, then carry it forward so every
    # row knows when the latest event happened.
    last_event_time = times.where(event_values.notna()).ffill()
    return times - last_event_time


# Time since the last peak / trough, computed column-wise instead of with a
# row-by-row `df.loc` loop.
df["time_since_last_peak"] = _time_since_last_event(df["time"], df["peaks"])
df["time_since_last_trough"] = _time_since_last_event(
    df["time"], df["troughs"]
)

In [None]:
output_notebook()

source = ColumnDataSource(data=df)

# Shear-stress trace with the detected turning points overlaid.
p = figure(
    title="Line Graph of Shear Stress with Peaks and Troughs Marked",
    x_axis_label="Time",
    y_axis_label="Shear Stress",
    width=1200,
    height=400,
)
p.line("time", "shr_stress", source=source)

# Overlay cross markers: peaks in red, then troughs in green.
for marker_column, marker_colour in (("peaks", "red"), ("troughs", "green")):
    p.cross(
        x="time", y=marker_column, source=source, color=marker_colour, size=8
    )

show(p)

In [None]:
output_notebook()

source = ColumnDataSource(df)

# Plot the elapsed time since the latest peak (red) and the latest trough
# (green). The title and y-axis label describe these series rather than the
# shear stress itself.
p = figure(
    width=1200,
    height=400,
    x_axis_label="Time",
    y_axis_label="Time Since Last Event",
    title="Time Since Last Peak and Last Trough",
)
p.line(
    x="time",
    y="time_since_last_peak",
    source=source,
    line_color="red",
    legend_label="Since last peak",
)
p.line(
    x="time",
    y="time_since_last_trough",
    source=source,
    line_color="green",
    legend_label="Since last trough",
)

show(p)

Next steps
- Try to work out the bug in the identification of peaks
- Calculate the derivative and the turning points
- Maybe apply min-max scaling first, plus slight smoothing?
- Think about what defines a tipping point

In [None]:
# `import scipy` alone does not guarantee that the `stats` submodule is
# loaded on every SciPy version, so import it explicitly.
import scipy.stats

# Smooth the shear stress with a trailing 30-sample trimmed mean: in each
# window the lowest and highest 5% of samples are discarded before averaging.
# The first 29 rows are NaN because their window is incomplete.
# `raw=True` hands each window to the function as a NumPy array instead of
# building a Series per window.
df["shr_stress_mv_avg"] = (
    df["shr_stress"]
    .rolling(30)
    .apply(lambda w: scipy.stats.trim_mean(w, 0.05), raw=True)
)

In [None]:
output_notebook()

source = ColumnDataSource(df)

# Compare the raw shear stress (blue) with its trimmed moving average
# (green). The title describes these two series.
p = figure(
    width=1200,
    height=400,
    x_axis_label="Time",
    y_axis_label="Shear Stress",
    title="Raw Shear Stress vs Trimmed Moving Average",
)
p.line(
    x="time",
    y="shr_stress",
    source=source,
    line_color="blue",
    legend_label="Raw",
)
p.line(
    x="time",
    y="shr_stress_mv_avg",
    source=source,
    line_color="green",
    legend_label="Moving average",
)

show(p)

In [None]:
# Cut the record into consecutive segments at every trough index. Slicing
# with `iloc` yields the same pieces as `np.split(df, trough_indices)` without
# routing a DataFrame through NumPy, which relies on the deprecated
# `DataFrame.swapaxes`.
# NOTE: `trough_indices` starts with 0, so the first segment is empty.
segment_starts = [0, *trough_indices]
segment_stops = [*trough_indices, len(df)]
df_segments = [
    df.iloc[start:stop] for start, stop in zip(segment_starts, segment_stops)
]
df_segments[228].head()

- The metric needs to work even when there is no failure, e.g. for random segments that don't contain a stress drop
- Base it on the 2nd derivative: weight the MSE by the magnitude of the second derivative
- So use a derivative-weighted RMSE
- Note that the derivative used as a model input needs to be trailing, but the one used in the loss function doesn't have to be

In [None]:
# Number of rows in each segment, in segment order.
segment_lengths = list(map(len, df_segments))

In [None]:
import matplotlib.pyplot as plt

# Line chart of segment length against segment index.
plt.plot(segment_lengths)

axes = plt.gca()
axes.set_xlabel("Segment Index")
axes.set_ylabel("Segment Length")
axes.set_title("Line Chart of Segment Lengths")

plt.show()