## TCN vs LSTM - Using Full p4679 Dataset

Objectives

- Extract full p4679 dataset. Note this requires dealing with large files. Load in full file, but only display smoothed data?
- Think about pre-processing: how can we keep as much information as possible while filtering out noise?
- Create train/test split
- Create standardised way to do feature creation. Create features for shear stress, derivative and variance.
- Create simple LSTM and TCN models
- Create an informative training loop
- Create test procedure 

In [None]:
# Set Directories
#
# `jasmin` selects between the JASMIN group-workspace layout and a local
# checkout; every other path below is derived from that single switch.

jasmin = True  # Set to True if running on JASMIN, False if on local machine
jasmin_user_name = "trr26"

# Project root (update the second path to match your local machine)
MAIN_DIR = (
    f"/gws/nopw/j04/ai4er/users/{jasmin_user_name}/earthquake-predictability"
    if jasmin
    else "/home/tom-ratsakatika/VSCode/earthquake-predictability"
)

# Data root lives in a different sub-folder on each machine
DATA_DIR = f"{MAIN_DIR}/data/gtc_quakes_data" if jasmin else f"{MAIN_DIR}/data_local"

# Raw text export of experiment p4679
p4679_FILE_PATH = f"{DATA_DIR}/labquakes/Marone/p4679/p4679.txt"

# Imports

import os
import random
import sys
from math import sqrt

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import torch
import torch.nn as nn
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from IPython.display import Image, display
from scipy.io import loadmat
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

sys.path.append(MAIN_DIR)
import utils

# Check CUDA Availability
#
# Picks the torch device for the session: "cuda" when torch reports a usable
# GPU, otherwise "cpu". The result is exposed as `device`.

if torch.cuda.is_available():
    dev = "cuda"
    # NOTE(review): map_location is not used anywhere in the visible code;
    # presumably intended for a later torch.load(..., map_location=...) — confirm.
    map_location = None
    print(f"Total number of GPUs available: {torch.cuda.device_count()}")
    # IPython shell escape (notebook-only syntax): prints the GPU status table
    !nvidia-smi
else:
    dev = "cpu"
    map_location = "cpu"
    print("No GPU available.")
device = torch.device(dev)

In [None]:
# Load the p4679 experiment file. pandas parses the open handle with its
# default settings, except that skiprows=1 discards the first line of the file.
with open(p4679_FILE_PATH, "r") as file:
    df = pd.read_csv(file, skiprows=1)

# Rename the columns of the DataFrame to align with the variable names used by Adriano's data loader
df.columns = [
    "id",
    "lp_disp",
    "shr_stress",
    "nor_disp",
    "nor_stress",
    "time",
    "mu",
    "layer_thick",
    "ec_disp",
]

# Only time and shear stress are needed for further analysis; every other
# column is discarded and 'time' is placed first.
df = df[["time", "shr_stress"]]

# Define start and end times for the data selection (both bounds inclusive)
start_time = 4233.364
end_time = 5159.292

# Keep the rows inside the window and renumber them from zero
df = df[df["time"].between(start_time, end_time)].reset_index(drop=True)

# Display the first 5 rows of the DataFrame, then its (rows, columns) shape
print(df.head())
print(df.shape)

In [None]:
def find_peak_indices(data, threshold=200):  # 1000 Hz data, so threshold = 200 is 0.2 seconds
    """Return the positions of local maxima in a 1-D signal.

    Position ``i`` is reported as a peak when ``data[i]`` is strictly greater
    than every one of the ``threshold`` samples before it and greater than or
    equal to every one of the ``threshold - 1`` samples after it. Only
    positions with a full window on both sides
    (``threshold <= i < len(data) - threshold``) are considered.

    The windowed maxima are computed with pandas rolling operations instead of
    a Python-level loop that re-scans both windows for every sample.

    Args:
        data: 1-D array-like of numbers (list, NumPy array or pandas Series).
            Values are read positionally; a Series index is ignored.
        threshold: Half-window size in samples. Must be at least 2 so that the
            forward window is non-empty.

    Returns:
        A list of int positions in ascending order. Position 0 is always the
        first entry, so the list doubles as a set of segment boundaries.

    Raises:
        ValueError: If ``threshold`` is smaller than 2.

    Note:
        A window containing NaN never produces a peak.
    """
    if threshold < 2:
        raise ValueError("threshold must be at least 2")

    values = pd.Series(np.asarray(data, dtype=float))
    n = len(values)

    # No position has a complete window on both sides
    if n <= 2 * threshold:
        return [0]

    # before_max[i] == max(values[i - threshold : i])
    before_max = values.rolling(threshold).max().shift(1).to_numpy()

    # after_max[i] == max(values[i + 1 : i + threshold]); obtained by running
    # the same trailing-window computation on the reversed signal.
    after_max = (
        values[::-1].rolling(threshold - 1).max().shift(1)[::-1].to_numpy()
    )

    raw = values.to_numpy()
    # Comparisons against the NaN entries of incomplete windows are False
    with np.errstate(invalid="ignore"):
        is_peak = (raw > before_max) & (raw >= after_max)

    # Restrict to positions with a full window on each side
    is_peak[:threshold] = False
    is_peak[n - threshold:] = False

    return [0] + np.flatnonzero(is_peak).tolist()

In [None]:
# Call the find_peak_indices function to get the peak and trough indices - takes ~1 minute to run on sci servers
# Troughs are found by running the same peak search on the negated signal.
peak_indices = find_peak_indices(df["shr_stress"])
trough_indices = find_peak_indices(-df["shr_stress"])

# 'peaks' / 'troughs' hold the shear stress value on the detected rows and NaN everywhere else
df["peaks"] = np.where(df.index.isin(peak_indices), df["shr_stress"], np.nan)
df["troughs"] = np.where(
    df.index.isin(trough_indices), df["shr_stress"], np.nan)

# Plots the shear stress time series with peaks and troughs marked by crosses (red and green)

# Route bokeh output to the notebook
output_notebook()

source = ColumnDataSource(df)

p = figure(
    width=1200,
    height=400,
    x_axis_label="Time",
    y_axis_label="Shear Stress",
    title="Shear Stress with Peaks and Troughs Marked",
)
p.line(x="time", y="shr_stress", source=source)


# Add peaks with red cross markers
p.cross(x="time", y="peaks", source=source, color="red", size=8)

# Add troughs with green cross markers
p.cross(x="time", y="troughs", source=source, color="green", size=8)

show(p)

In [None]:
# Time elapsed since the most recent peak / trough.
#
# Vectorised replacement for a row-by-row loop over df.loc: the time of every
# marked row is kept, forward-filled onto the rows that follow it, and
# subtracted from the current time. Rows that precede the first marked row
# have nothing to fill from and therefore stay NaN.

# Calculate time since last peak
df["time_since_last_peak"] = (
    df["time"] - df["time"].where(df["peaks"].notna()).ffill()
)

# Calculate time since last trough
df["time_since_last_trough"] = (
    df["time"] - df["time"].where(df["troughs"].notna()).ffill()
)

In [None]:
# Code to visualise data. Set visualise_data to True if you want to create plots of variance,
# first/second derivative, time since failure and moving average.
# Takes about 1 minute to run on sci servers.

visualise_data = False

if visualise_data:

    # Calculate moving average of shear stress
    df["shr_stress_mv_avg"] = (
        df["shr_stress"]
        .rolling(100)
        .apply(lambda w: scipy.stats.trim_mean(w, 0.05))
    )

    # Calculate variance of shear stress
    df["variance"] = df["shr_stress"].rolling(window=30).var()

    # Calculate first derivative of shear stress
    df["first_derivative"] = df["shr_stress_mv_avg"].diff()

    # Calculate second derivative of shear stress
    df["second_derivative"] = df["first_derivative"].diff()

    output_notebook()

    def _show_time_series(title, y_axis_label, lines, mark_events=False):
        """Show one 1200x400 bokeh figure of df columns plotted against time.

        Args:
            title: Figure title.
            y_axis_label: Label describing the plotted quantity.
            lines: Iterable of (column name, line colour) pairs to draw.
            mark_events: When True, draw crosses at y=0 at the time of every
                detected peak (red) and trough (green).
        """
        # A fresh data source per figure, built after the derived columns exist
        source = ColumnDataSource(df)
        p = figure(
            width=1200,
            height=400,
            x_axis_label="Time",
            y_axis_label=y_axis_label,
            title=title,
        )
        for column, colour in lines:
            p.line(x="time", y=column, source=source, line_color=colour)
        if mark_events:
            # Add crosses on the x-axis at each index in peak_indices
            p.cross(x=df.loc[peak_indices, "time"], y=0, size=10, color="red")
            # Add crosses on the x-axis at each index in trough_indices
            p.cross(x=df.loc[trough_indices, "time"], y=0, size=10, color="green")
        show(p)

    # Plots time since last peak and time since last trough
    _show_time_series(
        "Time Since Last Peak (red) and Last Trough (green)",
        "Time Since Last Event",
        [("time_since_last_peak", "red"), ("time_since_last_trough", "green")],
    )

    # Plots shear stress vs shear stress (moving average)
    _show_time_series(
        "shr_stress_mv_avg",
        "Shear Stress",
        [("shr_stress", "blue"), ("shr_stress_mv_avg", "green")],
    )

    # Plots variance of shear stress (raw), with peaks and troughs marked on x-axis
    _show_time_series(
        "variance",
        "Variance of Shear Stress",
        [("variance", "blue")],
        mark_events=True,
    )

    # Plots first derivative of shear stress (moving average), with peaks and troughs marked on x-axis
    _show_time_series(
        "first_derivative",
        "First Derivative of Shear Stress (moving average)",
        [("first_derivative", "blue")],
        mark_events=True,
    )

    # Plots second derivative of shear stress (moving average), with peaks and troughs marked on x-axis
    _show_time_series(
        "second_derivative",
        "Second Derivative of Shear Stress (moving average)",
        [("second_derivative", "blue")],
        mark_events=True,
    )

In [None]:
# Reorder the DataFrame columns to bring 'time since last peak/trough' to the front
df = df[["time_since_last_peak"] + [col for col in df.columns if col != "time_since_last_peak"]]
df = df[["time_since_last_trough"] + [col for col in df.columns if col != "time_since_last_trough"]]

# Cut the series into one segment per trough-to-trough interval.
# trough_indices[0] is the artificial 0 boundary, so the cut points are
# trough_indices[1:]; the first segment starts at row 0 and the last one runs
# to the end of the frame. Plain iloc slicing is used instead of np.split,
# which reaches the deprecated DataFrame.swapaxes when given a DataFrame.
segment_source = df.drop(["time", "peaks", "troughs"], axis=1)
segment_bounds = [0, *trough_indices[1:], len(df)]
df_segments = [
    segment_source.iloc[start:stop].reset_index(drop=True)
    for start, stop in zip(segment_bounds[:-1], segment_bounds[1:])
]

# Number of rows in each segment
segment_lengths = [len(segment) for segment in df_segments]

df_segments[1].head()

In [None]:
# Visualise the segments
# Interactive viewer: a dropdown picks a segment by its position in df_segments
# and the plot below it is redrawn for that segment.


# Create a dropdown widget to select the segment
segment_dropdown = widgets.Dropdown(
    options=list(range(len(df_segments))),
    description='Segment:',
    value=0,
)

# Define a function to update the plot based on the selected segment
def update_plot(segment):
    """Plot shear stress against time since the last trough for df_segments[segment]."""
    plt.figure(figsize=(12, 6))
    plt.plot(df_segments[segment]['time_since_last_trough'], df_segments[segment]['shr_stress'])
    plt.xlabel('Time')
    plt.ylabel('Shear Stress')
    plt.title(f'Segment {segment}')
    plt.show()

# Bind update_plot to the dropdown; as the last expression of the cell the
# resulting interactive widget is what the notebook displays
widgets.interactive(update_plot, segment=segment_dropdown)

In [None]:
# Create features

def create_features(df, mv_avg_window=100, variance_window=30, trim_fraction=0.05):
    """Return a copy of ``df`` with smoothed, variance and derivative features.

    Added columns:
        shr_stress_mv_avg: rolling trimmed mean of ``shr_stress``.
        variance: rolling variance of the raw ``shr_stress``.
        first_derivative: first difference of ``shr_stress_mv_avg``.
        second_derivative: first difference of ``first_derivative``.

    Rows that do not yet have a full rolling window (and the differences
    derived from them) are NaN.

    Args:
        df: DataFrame with a numeric ``shr_stress`` column. It is not modified.
        mv_avg_window: Window length, in rows, of the trimmed moving average.
        variance_window: Window length, in rows, of the rolling variance.
        trim_fraction: Fraction cut from each end of a window before averaging.

    Returns:
        A new DataFrame holding the original columns plus the four features.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.stats` available as an attribute.
    from scipy import stats

    # Work on a copy so the caller's segment is left untouched
    df = df.copy()

    # Calculate moving average of shear stress. raw=True hands each window to
    # trim_mean as a NumPy array rather than building a Series per window.
    df["shr_stress_mv_avg"] = (
        df["shr_stress"]
        .rolling(mv_avg_window)
        .apply(lambda w: stats.trim_mean(w, trim_fraction), raw=True)
    )

    # Calculate variance of shear stress
    df["variance"] = df["shr_stress"].rolling(window=variance_window).var()

    # Calculate first derivative of shear stress
    df["first_derivative"] = df["shr_stress_mv_avg"].diff()

    # Calculate second derivative of shear stress
    df["second_derivative"] = df["first_derivative"].diff()

    return df


In [None]:
# Add features columns to segments

# Run the feature builder over every segment, preserving segment order
df_segments_features = list(map(create_features, df_segments))

df_segments_features[0].head()

In [None]:
def min_max_scale(df, columns_to_scale="shr_stress"):
    """Return a copy of ``df`` with the chosen columns rescaled to [0, 1].

    Each column is scaled independently with its own minimum and maximum;
    NaN entries are ignored when finding them and remain NaN afterwards.

    Args:
        df: Source DataFrame. It is not modified.
        columns_to_scale: A column name or a list of column names.

    Returns:
        A new DataFrame in which every listed column is
        ``(value - min) / (max - min)``. A column with no spread (all values
        equal, or entirely NaN) is filled with 0.0 instead, which avoids a
        division by zero.
    """
    if isinstance(columns_to_scale, str):
        columns_to_scale = [columns_to_scale]  # Ensure it's a list if a single column name is passed

    # Scale a copy so the caller's frame keeps its raw values
    scaled = df.copy()

    for col in columns_to_scale:
        # Explicitly ignore NaN values in min and max calculation
        min_val = scaled[col].min(skipna=True)
        max_val = scaled[col].max(skipna=True)
        range_val = max_val - min_val

        if range_val > 0:
            # Apply min-max scaling
            scaled[col] = (scaled[col] - min_val) / range_val
        else:
            # Constant or all-NaN column (a NaN range also fails the test
            # above); use a float so the column dtype stays consistent
            scaled[col] = 0.0

    return scaled

In [None]:
# Apply min-max scaling

# Columns rescaled to [0, 1] independently within every segment
columns_to_scale = ["shr_stress", "shr_stress_mv_avg", "variance", "first_derivative", "second_derivative"]

# The [:-1] slice leaves the final segment out
# NOTE(review): presumably because it is cut short by the end of the time window — confirm
df_segments_features_scaled = [
    min_max_scale(segment, columns_to_scale)
    for segment in df_segments_features[:-1]
]

df_segments_features_scaled[0].head()

In [None]:
def add_padding(df):
    """Left-pad every segment with zero rows so all segments share one length.

    Every returned segment carries a ``mask`` column: 1 on rows of real data
    and 0 on padding rows. The column is added to all segments, including
    those that already have the maximum length, so that every segment ends up
    with the same set of columns.

    Args:
        df: List of DataFrames (one per segment) with identical columns.
            Neither the list nor its frames are modified.

    Returns:
        A new list of DataFrames, each as long as the longest input segment,
        with padding rows placed before the real data. An empty input gives an
        empty list.
    """
    if len(df) == 0:
        return []

    max_length = max(len(seg) for seg in df)

    padded = []
    for seg in df:
        if "mask" in seg.columns:
            # Already masked (e.g. the output of an earlier call): keep the
            # existing 0/1 flags rather than overwriting them with 1
            seg = seg.copy()
        else:
            # Mark every original row as real data
            seg = seg.assign(mask=1)

        additional_rows = max_length - len(seg)
        if additional_rows > 0:
            # All-zero rows; 'mask' is one of the columns, so padding gets mask 0
            zeros_df = pd.DataFrame(
                0, index=np.arange(additional_rows), columns=seg.columns
            )
            # Concatenate the padding dataframe at the beginning of the original dataframe
            seg = pd.concat([zeros_df, seg]).reset_index(drop=True)

        padded.append(seg)

    return padded


In [None]:
# Add padding so all the segments are the same length, and add a mask column = 1 for real data
# (padding rows are zero-filled, placed before the real data, and carry mask = 0)

df_segments_features_scaled_padded = add_padding(df_segments_features_scaled)
df_segments_features_scaled_padded[0].head()

In [None]:
# Define Xs and ys, then shuffle and split the segments into train/test/val sets

def train_val_test_split(df, y_size=500, val=0.2, test=0.1, seed=42):
    """Cut every segment into input/target windows and split them three ways.

    The last ``y_size`` rows of each segment become its target window and the
    rows before them its input window. Segments are then shuffled and divided
    into train, validation and test sets.

    Args:
        df: List of equal-length DataFrames (one per padded segment). The
            cut position is derived from the length of the first segment.
        y_size: Number of trailing rows that form the target window.
        val: Fraction of ALL segments placed in the validation set.
        test: Fraction of ALL segments placed in the test set.
        seed: Random state passed to scikit-learn for reproducible shuffling.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``, each a list of
        DataFrames. Matching positions in an X/y pair come from the same
        segment.

    Raises:
        ValueError: If ``y_size`` does not leave at least one input row and one
            target row, or if the fractions do not leave room for training data.
    """
    from sklearn.model_selection import train_test_split

    if not (val > 0 and test > 0 and val + test < 1):
        raise ValueError("val and test must be positive and sum to less than 1")

    sequence_length = len(df[0])
    if not 0 < y_size < sequence_length:
        raise ValueError(
            f"y_size must be between 1 and {sequence_length - 1}, got {y_size}"
        )

    y_start = sequence_length - y_size
    X = [df_i.iloc[:y_start].copy() for df_i in df]
    y = [df_i.iloc[y_start:].copy() for df_i in df]

    # Split inputs and targets in one call so each pair stays aligned
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test, random_state=seed
    )

    # `val` is a fraction of the whole data set, so rescale it to the share of
    # what remains once the test set has been removed
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val / (1 - test), random_state=seed
    )

    return X_train, y_train, X_val, y_val, X_test, y_test


In [69]:
# Apply train_val_test_split to segments

X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(df_segments_features_scaled_padded)

# Create a dropdown widget to select the segment
# (options are positions within the shuffled training set, not original segment numbers)
segment_dropdown = widgets.Dropdown(
    options=list(range(len(X_train))),
    description='Segment:',
    value=0,
)

# Define a function to update the plot based on the selected segment
def update_plot(segment):
    """Plot the input window and the target window of training sample `segment`."""
    plt.figure(figsize=(12, 6))
    # Both series are plotted against their row index, one line per window
    plt.plot(X_train[segment]['shr_stress'])
    plt.plot(y_train[segment]['shr_stress'])
    plt.xlabel('Time')
    plt.ylabel('Shear Stress')
    plt.title(f'Segment {segment}')
    plt.show()

# Bind update_plot to the dropdown; as the last expression of the cell the
# resulting interactive widget is what the notebook displays
widgets.interactive(update_plot, segment=segment_dropdown)

interactive(children=(Dropdown(description='Segment:', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …

To do:

- Convert dataframes into tensors
- Create a data loader
- Create a model that takes into account the mask variable

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class TemporalBlock(nn.Module):
    """Residual block of two dilated, causal 1-D convolutions.

    Each convolution pads both ends of the sequence by ``padding`` and the
    trailing ``padding`` outputs are then cut off. With
    ``padding == (kernel_size - 1) * dilation`` and ``stride == 1`` this keeps
    the sequence length unchanged and makes every output step depend only on
    the current and earlier input steps. Without the trimming each convolution
    lengthens the sequence by ``padding`` and the residual addition fails on
    mismatched shapes.

    Args:
        n_inputs: Number of input channels.
        n_outputs: Number of output channels.
        kernel_size: Convolution kernel width.
        stride: Convolution stride (length preservation assumes 1).
        dilation: Spacing between kernel taps.
        padding: Zero padding applied by each convolution.
        dropout: Dropout probability applied after the first convolution.

    Shape:
        Input ``(batch, n_inputs, length)``; output ``(batch, n_outputs, length)``
        under the padding/stride condition above.
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super().__init__()
        self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        # 1x1 convolution so the residual matches the output channel count
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # Number of trailing steps to remove after each convolution
        self.padding = padding

    def _chomp(self, x):
        """Drop the trailing ``padding`` steps produced by the right-hand padding."""
        return x[:, :, :-self.padding] if self.padding > 0 else x

    def forward(self, x):
        """Apply conv -> trim -> ReLU twice, then add the residual connection."""
        out = self.relu(self._chomp(self.conv1(x)))
        out = self.dropout(out)
        out = self.relu(self._chomp(self.conv2(out)))
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)

class TCN(nn.Module):
    """Stack of TemporalBlocks with a linear head on the final time step.

    Level ``i`` uses dilation ``2 ** i`` and padding
    ``(kernel_size - 1) * 2 ** i``.

    Args:
        num_inputs: Number of input features per time step.
        num_channels: Output channel count of each level, first to last.
        kernel_size: Kernel width shared by every level.
        dropout: Dropout probability passed to every level.
        out_steps: Size of the output vector. When omitted, the module-level
            variable ``out_steps`` is used, which keeps existing
            ``TCN(num_inputs=..., num_channels=...)`` calls working.

    Raises:
        ValueError: If ``num_channels`` is empty, or if ``out_steps`` is not
            given and no module-level ``out_steps`` exists.

    Shape:
        Input ``(batch, sequence, num_inputs)``; output ``(batch, out_steps)``.
    """

    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2, out_steps=None):
        super().__init__()
        if len(num_channels) == 0:
            raise ValueError("num_channels must contain at least one level")
        if out_steps is None:
            # Backward compatibility with callers that rely on the global
            out_steps = globals().get("out_steps")
            if out_steps is None:
                raise ValueError(
                    "out_steps must be passed to TCN or defined at module level"
                )

        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.tcn = nn.Sequential(*layers)
        self.linear = nn.Linear(num_channels[-1], out_steps)

    def forward(self, x):
        """Map a batch of sequences to ``out_steps`` values per sequence."""
        x = x.transpose(1, 2)  # (batch, sequence, features) -> (batch, features, sequence) for Conv1d
        y = self.tcn(x)
        y = y.transpose(1, 2)  # Transpose back to (batch, sequence, channels)
        y = self.linear(y[:, -1, :])  # Take the last output for each sequence
        return y

# Model dimensions.
# num_features: number of columns in a training input window (all columns of X_train[0]).
# out_steps: length of the model's output vector; equals the default y_size=500 of
# train_val_test_split, so it matches the target windows produced with that default.
num_features = X_train[0].shape[1]
out_steps = 500

# Convert DataFrames to tensors
def df_to_tensor(df_list, feature_columns=None, target_column=None):
    """Stack per-segment DataFrames into float32 tensors.

    Args:
        df_list: List of DataFrames with the same number of rows.
        feature_columns: Columns to use as model inputs, or None to skip
            building the feature tensor.
        target_column: Column to use as the target, or None to skip building
            the target tensor.

    Returns:
        ``(features, targets)``. ``features`` has shape
        ``(len(df_list), rows, len(feature_columns))`` and ``targets`` has
        shape ``(len(df_list), rows)``. A part that was not requested, or any
        part when ``df_list`` is empty, is returned as None; ``torch.stack``
        is never called on an empty list, which would raise.
    """
    features = None
    targets = None

    if len(df_list) == 0:
        return features, targets

    if feature_columns is not None:
        features = torch.stack(
            [
                torch.tensor(df[feature_columns].values, dtype=torch.float32)
                for df in df_list
            ]
        )

    if target_column is not None:
        targets = torch.stack(
            [
                torch.tensor(df[target_column].values, dtype=torch.float32)
                for df in df_list
            ]
        )

    return features, targets

# Assuming X_train, y_train, etc. are lists of DataFrames
# Inputs use every column of the input windows; targets are the 'shr_stress'
# column of the matching target windows.
# NOTE(review): the rolling features built in create_features are NaN until a
# full window is available, and min_max_scale keeps NaN entries, so the input
# tensors presumably contain NaN values — confirm and handle before training.
X_train_tensor, _ = df_to_tensor(X_train, feature_columns=X_train[0].columns)
_, y_train_tensor = df_to_tensor(y_train, target_column='shr_stress')

X_val_tensor, _ = df_to_tensor(X_val, feature_columns=X_val[0].columns)
_, y_val_tensor = df_to_tensor(y_val, target_column='shr_stress')

X_test_tensor, _ = df_to_tensor(X_test, feature_columns=X_test[0].columns)
_, y_test_tensor = df_to_tensor(y_test, target_column='shr_stress')

# Create DataLoaders
# Only the training loader shuffles; validation and test keep a fixed order
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=batch_size, shuffle=False)

# Model, loss, and optimizer
# The model is placed on `device` (chosen by the CUDA check at the top of the
# notebook) so a detected GPU is actually used.
model = TCN(num_inputs=num_features, num_channels=[25, 50, 100]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(1):
    model.train()  # enable dropout for training
    running_loss = 0.0
    num_samples = 0
    for inputs, targets in train_loader:
        # Batches must live on the same device as the model
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so the epoch figure is a per-sample mean
        running_loss += loss.item() * inputs.size(0)
        num_samples += inputs.size(0)
    # Report the mean loss over the epoch rather than the last batch only;
    # an empty loader yields NaN instead of an undefined `loss`
    epoch_loss = running_loss / num_samples if num_samples else float("nan")
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')
