## TCN - p4679 Full Dataset

Objectives

- Extract full p4679 dataset. Note this requires dealing with large files. Load in full file, but only display smoothed data?
- Think about pre-processing. Try to keep as much info as possible, while filtering out noise.. how?
- Create train/test split
- Create standardised way to do feature creation. Create features for shear stress, derivative and variance.
- Create simple LSTM and TCN models
- Create information training loop
- Create test procedure 

In [None]:
# Set Directories

jasmin = True  # True when running on JASMIN, False when running on a local machine
jasmin_user_name = "trr26"

# Project root and data folder for the selected environment.
# When running locally, update the two local paths to match your machine.
MAIN_DIR = (
    f"/gws/nopw/j04/ai4er/users/{jasmin_user_name}/earthquake-predictability"
    if jasmin
    else "/home/tom-ratsakatika/VSCode/earthquake-predictability"
)
DATA_DIR = MAIN_DIR + ("/data/gtc_quakes_data" if jasmin else "/data_local")

# Raw text files of the two lab-quake experiments read by this notebook
p4679_FILE_PATH = "/".join([DATA_DIR, "labquakes", "Marone", "p4679", "p4679.txt"])
p4581_FILE_PATH = "/".join([DATA_DIR, "labquakes", "Marone", "p4581", "p4581.txt"])

# Imports

import os
import random
import sys
from math import sqrt

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import torch
import torch.nn as nn
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show
from IPython.display import Image, display
from scipy.io import loadmat
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm

sys.path.append(MAIN_DIR)
import utils

# group utils

from utils.general_functions import set_seed, set_torch_device
# Set random seed through the project helper so repeated runs are comparable.
# NOTE(review): presumably set_seed seeds Python, NumPy and PyTorch RNGs - confirm in utils.general_functions.
SEED = 42 # random seed for the dataset and model
set_seed(SEED)


In [None]:

# Check CUDA Availability
# Chooses the torch device used by later cells. `map_location` is set to "cpu" on
# CPU-only machines and None otherwise; it is not used in the cells visible here
# (presumably intended for torch.load of GPU-saved checkpoints - TODO confirm).

if torch.cuda.is_available():
    dev = "cuda"
    map_location = None
    print(f"Total number of GPUs available: {torch.cuda.device_count()}")
    # IPython shell escape (only valid inside a notebook): prints the GPU status table
    !nvidia-smi
else:
    dev = "cpu"
    map_location = "cpu"
    print("No GPU available.")
# torch.device object built from the string chosen above
device = torch.device(dev)

In [None]:
def _load_experiment(file_path, column_names, start_time, end_time, **read_csv_kwargs):
    """Read one experiment file and return its (time, shr_stress) rows inside a time window.

    Parameters:
    - file_path: Path of the text file to read.
    - column_names: Names assigned to the file's columns, in file order. Must
      contain "time" and "shr_stress"; pandas raises ValueError if the number
      of names does not match the number of columns read.
    - start_time, end_time: Inclusive bounds applied to the "time" column.
    - read_csv_kwargs: Extra keyword arguments forwarded to pandas.read_csv
      (delimiter, number of header rows to skip, ...).

    Returns:
    - DataFrame with columns ["time", "shr_stress"] and a fresh 0..n-1 index.
    """
    frame = pd.read_csv(file_path, **read_csv_kwargs)
    frame.columns = column_names

    # Only time and shear stress are used downstream; time goes first
    frame = frame[["time", "shr_stress"]]

    in_window = frame["time"].between(start_time, end_time)  # inclusive on both ends
    frame = frame[in_window].reset_index(drop=True)

    # Quick sanity check of what was loaded
    print(frame.head())
    print(frame.shape)
    return frame


def data_loader(exp):
    """Load shear-stress time series for the requested experiment(s).

    Parameters:
    - exp: Experiment selector. Either a string or a list of strings; an
      experiment is loaded when its name ("p4679", "p4581") is found in `exp`
      using the `in` operator.

    Returns:
    - DataFrame with columns ["time", "shr_stress"]. When both experiments are
      requested their rows are stacked (p4679 first), so "time" is not
      continuous across the join. An empty DataFrame is returned when no known
      experiment is requested.
    """
    # Column names follow the variable names used by Adriano's data loader
    p4679_columns = [
        "id",
        "lp_disp",
        "shr_stress",
        "nor_disp",
        "nor_stress",
        "time",
        "mu",
        "layer_thick",
        "ec_disp",
    ]
    p4581_columns = [
        "id",            # Identifier
        "lp_disp",       # Lateral displacement
        "shr_stress",    # Shear stress
        "nor_disp",      # Normal displacement
        "nor_stress",    # Normal stress
        "time",          # Time of measurement
        "sync",          # Synchronization marker
        "samp_freq",     # Sampling frequency
    ]

    frames = []

    if "p4679" in exp:
        # Read with the pandas default (comma) delimiter, skipping the first row
        frames.append(
            _load_experiment(
                p4679_FILE_PATH,
                p4679_columns,
                start_time=4233.364,
                end_time=5159.292,
                skiprows=1,
            )
        )

    if "p4581" in exp:
        # Whitespace-delimited file whose first 4 rows are skipped.
        # sep=r"\s+" is the non-deprecated equivalent of delim_whitespace=True.
        frames.append(
            _load_experiment(
                p4581_FILE_PATH,
                p4581_columns,
                start_time=2100,
                end_time=4800,
                sep=r"\s+",
                skiprows=4,
            )
        )

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, axis=0).reset_index(drop=True)

In [None]:
# Load data from one or multiple experiments.
# data_loader checks membership with `in`, so `exp` may be a single name or a list of names.

exp = "p4679"       # or a list such as ["p4679", "p4581"]
df = data_loader(exp)

In [None]:
def find_peak_indices(data, threshold=500):
    """Return positions of local maxima that dominate `threshold` samples on each side.

    Position i (with threshold <= i < len(data) - threshold) is a peak when
    data[i] is strictly greater than every one of the `threshold` samples
    before it and greater than or equal to each of the `threshold - 1` samples
    after it. Position 0 is always included as the first entry.

    Parameters:
    - data: 1-D sequence of numbers (list, NumPy array or pandas Series).
      Positions are used rather than index labels, so a Series with a
      non-default index works too.
    - threshold: Half-width of the comparison window, in samples (the default
      of 500 is 0.5 seconds for 1000 Hz data). Must be at least 2.

    Returns:
    - List of integer positions, in increasing order, starting with 0.

    Raises:
    - ValueError: If threshold < 2 and the data is long enough to be scanned.

    Windows containing NaN never produce a peak.
    """
    values = np.asarray(data)
    n = len(values)

    # Too short to hold a full window on both sides of any sample
    if n <= 2 * threshold:
        return [0]
    if threshold < 2:
        raise ValueError("threshold must be at least 2")

    sliding = np.lib.stride_tricks.sliding_window_view

    # Candidate positions i = threshold .. n - threshold - 1
    candidates = values[threshold : n - threshold]
    # Row j of the first view is values[j : j + threshold], i.e. the window before i = j + threshold
    max_before = sliding(values, threshold)[: n - 2 * threshold].max(axis=1)
    # Row i + 1 of the second view is values[i + 1 : i + threshold], the window after i
    max_after = sliding(values, threshold - 1)[threshold + 1 : n - threshold + 1].max(axis=1)

    is_peak = (candidates > max_before) & (candidates >= max_after)
    return [0] + (np.flatnonzero(is_peak) + threshold).tolist()

In [None]:
# Locate peaks of the shear stress, and troughs as peaks of the negated signal
# - takes ~1 minute to run on sci servers
shear = df["shr_stress"]
peak_indices = find_peak_indices(shear)
trough_indices = find_peak_indices(-shear)

# Marker columns: the shear stress value at each detected row, NaN everywhere else
for marker_column, marker_rows in (("peaks", peak_indices), ("troughs", trough_indices)):
    df[marker_column] = np.where(df.index.isin(marker_rows), df["shr_stress"], np.nan)


In [None]:

# Shear stress over time, with detected peaks (red) and troughs (green) drawn as cross markers

output_notebook()

source = ColumnDataSource(df)

figure_options = dict(
    width=1200,
    height=400,
    x_axis_label="Time",
    y_axis_label="Shear Stress",
    title="Shear Stress with Peaks and Troughs Marked",
)
p = figure(**figure_options)
p.line(x="time", y="shr_stress", source=source)

# The marker columns are NaN away from the detected rows, so only those rows get a cross
for marker_column, marker_colour in (("peaks", "red"), ("troughs", "green")):
    p.cross(x="time", y=marker_column, source=source, color=marker_colour, size=8)

show(p)

In [None]:
# Time elapsed since the most recent peak / trough, for every row.
# Vectorised replacement for the original row-by-row loops (which took ~3 minutes).


def _time_since_last_event(times, event_marker):
    """Return, per row, the time elapsed since the latest row where `event_marker` is not NaN.

    Parameters:
    - times: Series of timestamps.
    - event_marker: Series aligned with `times`; non-NaN entries mark events.

    Returns:
    - Series of `times` minus the timestamp of the latest event at or before
      each row; NaN for rows that precede the first event.
    """
    # Keep the timestamp only on event rows, then carry it forward to later rows
    last_event_time = times.where(event_marker.notna()).ffill()
    return times - last_event_time


df["time_since_last_peak"] = _time_since_last_event(df["time"], df["peaks"])
df["time_since_last_trough"] = _time_since_last_event(df["time"], df["troughs"])

In [None]:
# Code to visualise data. Set visualise_data to True if you want to create plots of variance,
# first/second derivative, time since last peak/trough and moving average.
# Takes about 1 minute to run on sci servers.

# Imported explicitly: `import scipy` alone does not guarantee scipy.stats is loaded
from scipy.stats import trim_mean

visualise_data = True
calculate_features = True

if calculate_features:

    # Moving average of shear stress: 5%-trimmed mean over a 100-sample window.
    # raw=True hands each window to trim_mean as a NumPy array, which avoids
    # building a Series per window and gives the same values.
    df["shr_stress_mv_avg"] = (
        df["shr_stress"]
        .rolling(100)
        .apply(lambda w: trim_mean(w, 0.05), raw=True)
    )

    # Variance of the raw shear stress over a 30-sample window
    df["variance"] = df["shr_stress"].rolling(window=30).var()

    # First derivative: sample-to-sample difference of the moving average
    df["first_derivative"] = df["shr_stress_mv_avg"].diff()

    # Second derivative: difference of the first derivative
    df["second_derivative"] = df["first_derivative"].diff()

if visualise_data:

    output_notebook()

    def _show_time_series(title, y_label, series, mark_events=False):
        """Draw one or more df columns against time in a Bokeh figure.

        Parameters:
        - title: Figure title.
        - y_label: Label of the y-axis.
        - series: Iterable of (column_name, line_colour) pairs to draw.
        - mark_events: When True, draw crosses on the y=0 line at the times of
          the detected peaks (red) and troughs (green).
        """
        fig = figure(
            width=1200,
            height=400,
            x_axis_label="Time",
            y_axis_label=y_label,
            title=title,
        )
        source = ColumnDataSource(df)
        for column, colour in series:
            fig.line(x="time", y=column, source=source, line_color=colour)
        if mark_events:
            fig.cross(x=df.loc[peak_indices, "time"], y=0, size=10, color="red")
            fig.cross(x=df.loc[trough_indices, "time"], y=0, size=10, color="green")
        show(fig)

    # Time since last peak (red) and time since last trough (green)
    _show_time_series(
        "Time Since Last Peak (red) and Last Trough (green)",
        "Time since last event",
        [("time_since_last_peak", "red"), ("time_since_last_trough", "green")],
    )

    # Raw shear stress (blue) against its moving average (green)
    _show_time_series(
        "shr_stress_mv_avg",
        "Shear Stress",
        [("shr_stress", "blue"), ("shr_stress_mv_avg", "green")],
    )

    # Variance of the raw shear stress, with peaks and troughs marked on the x-axis
    _show_time_series("variance", "Variance", [("variance", "blue")], mark_events=True)

    # First derivative of the moving average, with peaks and troughs marked on the x-axis
    _show_time_series(
        "first_derivative",
        "First derivative",
        [("first_derivative", "blue")],
        mark_events=True,
    )

    # Second derivative of the moving average, with peaks and troughs marked on the x-axis
    _show_time_series(
        "second_derivative",
        "Second derivative",
        [("second_derivative", "blue")],
        mark_events=True,
    )

In [None]:
# Locate the most negative values of the first derivative (peaks of the negated
# derivative), using a 1000-sample window - takes ~1 minute to run on sci servers

max_velocity_indices = find_peak_indices(-df["first_derivative"], threshold=1000)

# Marker column: first derivative at the detected rows, NaN elsewhere.
# Built from max_velocity_indices (the original used peak_indices here, so the
# second plot below marked the stress peaks instead of the velocity extrema).
df["max_velocity"] = np.where(
    df.index.isin(max_velocity_indices), df["first_derivative"], np.nan
)

# Shear stress with peaks (red), troughs (green) and max velocity rows (black) marked by crosses

output_notebook()

source = ColumnDataSource(df)

p = figure(
    width=1200,
    height=400,
    x_axis_label="Time",
    y_axis_label="Shear Stress",
    title="Shear Stress with Peaks, Troughs, and Max Velocity Indices Marked",
)
p.line(x="time", y="shr_stress", source=source)

# Peaks: red crosses
p.cross(x="time", y="peaks", source=source, color="red", size=8)

# Troughs: green crosses
p.cross(x="time", y="troughs", source=source, color="green", size=8)

# Max velocity rows: black crosses placed on the shear stress curve
p.cross(
    x=df.loc[max_velocity_indices, "time"],
    y=df.loc[max_velocity_indices, "shr_stress"],
    color="black",
    size=8,
)

show(p)

# First derivative with the max velocity rows marked by black crosses

output_notebook()

source = ColumnDataSource(df)

p = figure(
    width=1200,
    height=400,
    x_axis_label="Time",
    y_axis_label="Velocity",
    title="Max Velocity Indices Marked",
)
p.line(x="time", y="first_derivative", source=source)

# Max velocity rows: black crosses
p.cross(x="time", y="max_velocity", source=source, color="black", size=8)

show(p)

In [None]:
# Split the full series into one DataFrame per cycle, cutting at every detected trough.

# Bring 'time since last trough/peak' to the front of the column order
leading_columns = ["time_since_last_trough", "time_since_last_peak"]
df = df[leading_columns + [col for col in df.columns if col not in leading_columns]]

# Columns that are not model features: absolute time and the sparse marker columns.
# max_velocity is NaN except on its marker rows (like peaks/troughs); it is only
# present when the max-velocity cell has been run.
non_feature_columns = ["time", "peaks", "troughs"]
if "max_velocity" in df.columns:
    non_feature_columns.append("max_velocity")
features = df.drop(non_feature_columns, axis=1)

# Segment boundaries: start of data, every trough after the leading 0, end of data.
# Plain positional slicing replaces np.split, whose use on DataFrames is deprecated.
boundaries = [0, *trough_indices[1:], len(features)]
df_segments = [
    features.iloc[start:stop].reset_index(drop=True)
    for start, stop in zip(boundaries[:-1], boundaries[1:])
]

segment_lengths = [len(segment) for segment in df_segments]

df_segments[1].head()

In [None]:
# Interactive viewer for the individual segments

# Dropdown listing every segment number
segment_dropdown = widgets.Dropdown(
    description='Segment:',
    options=list(range(len(df_segments))),
    value=0,
)


def update_plot(segment):
    """Plot shear stress against time since the last trough for segment number `segment`."""
    chosen = df_segments[segment]
    plt.figure(figsize=(12, 6))
    plt.plot(chosen['time_since_last_trough'], chosen['shr_stress'])
    plt.title(f'Segment {segment}')
    plt.xlabel('Time')
    plt.ylabel('Shear Stress')
    plt.show()


# Redraw the plot whenever a different segment is picked in the dropdown
widgets.interactive(update_plot, segment=segment_dropdown)

In [None]:
# Create features

def create_features(df):
    """Add moving-average, variance and derivative columns derived from "shr_stress".

    The columns are written into `df` itself (in place) and the same object is
    returned. Existing columns with the same names are overwritten.

    Columns added:
    - shr_stress_mv_avg: 5%-trimmed mean of shr_stress over a 100-row window.
    - variance: variance of shr_stress over a 30-row window.
    - first_derivative: row-to-row difference of shr_stress_mv_avg.
    - second_derivative: row-to-row difference of first_derivative.

    The leading rows of each column are NaN until its window is full.

    Parameters:
    - df: DataFrame with a numeric "shr_stress" column.

    Returns:
    - The same DataFrame, with the four feature columns set.
    """
    # Imported here so the function does not rely on scipy.stats having been
    # loaded as a side effect of `import scipy`
    from scipy.stats import trim_mean

    # Moving average of shear stress - takes about 30 seconds to run on sci servers.
    # raw=True passes each window as a NumPy array (same values, less overhead).
    df["shr_stress_mv_avg"] = (
        df["shr_stress"]
        .rolling(100)
        .apply(lambda w: trim_mean(w, 0.05), raw=True)
    )

    # Variance of the raw shear stress
    df["variance"] = df["shr_stress"].rolling(window=30).var()

    # First and second differences of the smoothed signal
    df["first_derivative"] = df["shr_stress_mv_avg"].diff()
    df["second_derivative"] = df["first_derivative"].diff()

    return df


In [None]:
# Compute the feature columns for every segment.
# create_features writes into the DataFrame it is given, so the entries of
# df_segments_features are the same objects as the entries of df_segments.

df_segments_features = [create_features(segment) for segment in df_segments]

df_segments_features[0].head()

In [None]:
def min_max_scale(df, columns_to_scale="shr_stress"):
    """Rescale the given columns of `df` to the [0, 1] range, in place.

    Parameters:
    - df: DataFrame to modify.
    - columns_to_scale: A column name or a list of column names.

    Returns:
    - The same DataFrame. NaN entries are ignored when finding the minimum and
      maximum and stay NaN. A column whose value range is not positive
      (constant, or all NaN) is set to 0.
    """
    names = [columns_to_scale] if isinstance(columns_to_scale, str) else columns_to_scale

    for name in names:
        column = df[name]
        lowest = column.min(skipna=True)
        span = column.max(skipna=True) - lowest

        # A zero (or NaN) span would mean dividing by zero, so fall back to 0
        df[name] = (column - lowest) / span if span > 0 else 0

    return df

In [None]:
# log scaling function

def log_scale(df, columns_to_scale="variance"):
    """Replace the given columns of `df` with their natural logarithm, in place.

    A column whose minimum (ignoring NaN) is zero or negative is first shifted
    by abs(minimum) + 1 so that every value is strictly positive; other columns
    are not shifted.

    Parameters:
    - df: DataFrame to modify.
    - columns_to_scale: A column name or a list of column names.

    Returns:
    - The same DataFrame.
    """
    names = [columns_to_scale] if isinstance(columns_to_scale, str) else columns_to_scale

    for name in names:
        lowest = df[name].min(skipna=True)
        # The shift is derived from this column's own minimum, so it differs between calls
        offset = abs(lowest) + 1 if lowest <= 0 else 0
        df[name] = np.log(df[name] + offset)

    return df


In [None]:
# Log-scale the variance and derivative columns, then min-max scale every feature.
# Both helpers modify the segment they are given; the last segment is left out.

log_columns = ["variance", "first_derivative", "second_derivative"]
columns_to_scale = ["shr_stress", "shr_stress_mv_avg", "variance", "first_derivative", "second_derivative"]

df_segments_features_scaled = []
for segment in df_segments_features[:-1]:
    log_scale(segment, log_columns)
    df_segments_features_scaled.append(min_max_scale(segment, columns_to_scale))

df_segments_features_scaled[0].head()

In [None]:
def add_padding(df):
    """Front-pad every segment with zero rows so that all segments share the longest length.

    Each segment gains a 'mask' column: 1 on its original rows and 0 on the
    padding rows that are placed before them.

    Parameters:
    - df: List of DataFrames (one per segment). The list is updated in place
      and the original segment objects receive the 'mask' column.

    Returns:
    - The same list, with shorter segments replaced by their padded versions.
    """
    target_length = max(len(segment) for segment in df)

    for position, segment in enumerate(df):
        # Real rows are flagged with 1
        segment['mask'] = 1

        shortfall = target_length - len(segment)
        if shortfall <= 0:
            continue

        # All-zero filler rows, flagged with 0, go in front of the real data
        filler = pd.DataFrame(0, index=np.arange(shortfall), columns=segment.columns)
        filler['mask'] = 0
        df[position] = pd.concat([filler, segment]).reset_index(drop=True)

    return df


In [None]:
# Add padding so all the segments are the same length, and add a mask column = 1 for real data
# (padding rows are zeros placed before the real rows, with mask = 0).
# add_padding updates the list it is given, so both names below refer to the same list.

df_segments_features_scaled_padded = add_padding(df_segments_features_scaled)
df_segments_features_scaled_padded[0].head()

In [None]:
# Define Xs and ys, then shuffle and split the segments into train/test/val sets

def train_val_test_split(df, y_size=500, val=0.2, test=0.1, seed=42, target="shr_stress_mv_avg"):
    """Cut each segment into input/target parts and shuffle the segments into train/val/test sets.

    For every segment the last `y_size` rows of the `target` column become the
    target (y) and all columns of the preceding rows become the input (X).
    The split position is taken from the length of the first segment, so all
    segments are expected to have the same length.

    Parameters:
    - df: List of equally long DataFrames, one per segment.
    - y_size: Number of trailing rows used as the prediction target.
    - val: Fraction of ALL segments assigned to the validation set.
    - test: Fraction of ALL segments assigned to the test set.
    - seed: Random state used for shuffling.
    - target: Name of the column to predict.

    Returns:
    - X_train, y_train, X_val, y_val, X_test, y_test: lists of DataFrames.

    Raises:
    - ValueError: If `df` is empty, if the fractions do not leave room for a
      training set, or if the segments are not longer than `y_size`.
    """
    from sklearn.model_selection import train_test_split

    if len(df) == 0:
        raise ValueError("df must contain at least one segment")
    if not (0 < val < 1 and 0 < test < 1 and val + test < 1):
        raise ValueError("val and test must be fractions in (0, 1) with val + test < 1")

    y_start = len(df[0]) - y_size
    if y_start <= 0:
        raise ValueError("segments must be longer than y_size")

    X = [df_i.iloc[:y_start].copy() for df_i in df]
    y = [df_i.iloc[y_start:][[target]].copy() for df_i in df]

    # Split X and y in the same call so each input stays paired with its target
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test, random_state=seed
    )

    # `val` is a fraction of the whole dataset, so rescale it to the part that
    # remains after the test set has been removed
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val / (1 - test), random_state=seed
    )

    return X_train, y_train, X_val, y_val, X_test, y_test


In [None]:
# Apply train_val_test_split to the padded segments, using the function's default
# target length, split fractions, seed and target column.

X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(df_segments_features_scaled_padded)


In [None]:

# Interactive viewer for the training segments: input part and target part on one plot

# Dropdown listing every training segment number
segment_dropdown = widgets.Dropdown(
    description='Segment:',
    options=list(range(len(X_train))),
    value=0,
)


def update_plot(segment):
    """Plot the smoothed shear stress of training segment `segment`: inputs first, then targets."""
    plt.figure(figsize=(12, 6))
    for part in (X_train[segment], y_train[segment]):
        plt.plot(part['shr_stress_mv_avg'])
    plt.title(f'Segment {segment}')
    plt.xlabel('Time')
    plt.ylabel('Shear Stress')
    plt.show()


# Redraw the plot whenever a different segment is picked in the dropdown
widgets.interactive(update_plot, segment=segment_dropdown)

In [None]:
# Convert each list of DataFrames to one stacked float32 PyTorch tensor


def _frames_to_tensor(frames):
    """Stack equally shaped DataFrames into a float32 tensor of shape (n_frames, n_rows, n_columns).

    NaN entries are replaced with 0. The rolling-window and diff features are
    NaN on the first rows of each segment, and a single NaN reaching the model
    would turn the loss into NaN; 0 is the value already used for padding rows.
    """
    stacked = torch.stack(
        [torch.tensor(frame.to_numpy(), dtype=torch.float32) for frame in frames]
    )
    return stacked.masked_fill(torch.isnan(stacked), 0.0)


X_train_tensor = _frames_to_tensor(X_train)
X_val_tensor = _frames_to_tensor(X_val)
X_test_tensor = _frames_to_tensor(X_test)

y_train_tensor = _frames_to_tensor(y_train)
y_val_tensor = _frames_to_tensor(y_val)
y_test_tensor = _frames_to_tensor(y_test)

# Print size and datatype of the X tensors, then of the y tensors
for tensor_name, tensor in (
    ("X_train_tensor", X_train_tensor),
    ("X_val_tensor", X_val_tensor),
    ("X_test_tensor", X_test_tensor),
    ("y_train_tensor", y_train_tensor),
    ("y_val_tensor", y_val_tensor),
    ("y_test_tensor", y_test_tensor),
):
    print(f"{tensor_name} size:", tensor.size(), "dtype:", tensor.dtype)


In [None]:
# Cell 1: Import Libraries

# Import PyTorch libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

# For building the TCN model
from torch.nn.utils import weight_norm  # Weight normalization can be useful in TCN layers
from torch.nn.utils.parametrizations import weight_norm



# Additional libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Check if a GPU is available and set PyTorch to use the GPU, otherwise use CPU.
# This rebinds the notebook-level `device` that later cells use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Display the device to confirm PyTorch device configuration
print(f'Using device: {device}')

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import weight_norm

class CausalConv1d(nn.Module):
    """1-D convolution whose output at step t depends only on inputs at steps <= t.

    The wrapped nn.Conv1d pads both ends by (kernel_size - 1) * dilation; the
    extra steps this adds at the end are cut off in forward(), so the output
    has the same length as the input.

    Parameters:
    - in_channels, out_channels: Channel counts of the convolution.
    - kernel_size: Width of the convolution kernel.
    - dilation: Spacing between kernel taps.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
        super(CausalConv1d, self).__init__()
        self.trim = (kernel_size - 1) * dilation
        self.conv1d = nn.Conv1d(in_channels, out_channels, kernel_size,
                                padding=self.trim, dilation=dilation)

    def forward(self, x):
        """Apply the convolution to x of shape (batch, in_channels, length)."""
        out = self.conv1d(x)
        # With kernel_size == 1 there is nothing to cut; a slice ending at -0
        # would return an empty tensor
        if self.trim == 0:
            return out
        return out[:, :, :-self.trim]

class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, dilation, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           padding=(kernel_size-1) * dilation, dilation=dilation))
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           padding=(kernel_size-1) * dilation, dilation=dilation))
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else nn.Identity()

    def forward(self, x):
        res = self.downsample(x)
        x = self.relu1(self.conv1(x))
        x = self.dropout1(x)
        x = self.relu2(self.conv2(x))
        x = self.dropout2(x)
        return x + res

class TCN(nn.Module):
    """Temporal convolutional network: a stack of TemporalBlocks with doubling dilation.

    The stack is followed by a 1x1 convolution that maps to `output_size`
    channels and by adaptive average pooling over the time axis, which resizes
    the sequence to `output_length` steps.

    Parameters:
    - input_size: Number of input channels (features per time step).
    - output_size: Number of output channels.
    - num_channels: Non-empty list with the channel count of each TemporalBlock;
      block i uses dilation 2 ** i.
    - kernel_size: Kernel width of every block.
    - dropout: Dropout probability of every block.
    - output_length: Number of time steps in the output (default 500).

    Raises:
    - ValueError: If `num_channels` is empty.
    """

    def __init__(self, input_size, output_size, num_channels, kernel_size=2, dropout=0.2,
                 output_length=500):
        super(TCN, self).__init__()
        if len(num_channels) == 0:
            raise ValueError("num_channels must contain at least one entry")

        layers = []
        in_channels = input_size
        for level, out_channels in enumerate(num_channels):
            layers.append(TemporalBlock(in_channels, out_channels, kernel_size,
                                        dilation=2 ** level, dropout=dropout))
            in_channels = out_channels
        self.tcn = nn.Sequential(*layers)
        self.final_layer = nn.Conv1d(num_channels[-1], output_size, 1)
        # Averages neighbouring time steps so the output always has output_length steps
        self.adaptive_pool = nn.AdaptiveAvgPool1d(output_length)

    def forward(self, x):
        """Map x of shape (batch, input_size, length) to (batch, output_size, output_length)."""
        x = self.tcn(x)
        x = self.final_layer(x)
        x = self.adaptive_pool(x)
        return x



In [None]:
# Cell 3: Custom RMSE Loss Function

def _require_same_shape(output, target):
    """Raise ValueError when output and target shapes differ, so they cannot broadcast silently."""
    if output.shape != target.shape:
        raise ValueError(
            f"output shape {tuple(output.shape)} does not match target shape {tuple(target.shape)}"
        )


def custom_rmse_loss(output, target):
    """
    Custom Root Mean Squared Error loss function.

    Parameters:
    - output: The predictions from the model.
    - target: The true values, with the same shape as `output`. Moved to the
      device of `output` if it lives elsewhere.

    Returns:
    - loss: Scalar tensor with the root mean squared error over all values.

    Raises:
    - ValueError: If the shapes differ. Without this check tensors such as
      (batch, 1, n) and (batch, n, 1) would broadcast to (batch, n, n) and the
      loss would compare unrelated entries.
    """
    target = target.to(output.device)
    _require_same_shape(output, target)

    mse_loss = torch.mean((output - target) ** 2)
    return torch.sqrt(mse_loss)


def masked_mse_loss(output, target, mask):
    """
    Mean squared error over the entries selected by a 0/1 mask.

    Parameters:
    - output: The predictions from the model.
    - target: The true values, with the same shape as `output`.
    - mask: Tensor of 0/1 weights, expandable to the shape of `output`;
      entries with 0 do not contribute to the loss.

    Returns:
    - loss: Scalar tensor with the sum of masked squared errors divided by the
      number of selected entries (0 when the mask selects nothing).

    Raises:
    - ValueError: If `output` and `target` shapes differ.
    """
    target = target.to(output.device)
    _require_same_shape(output, target)

    weights = mask.to(output.device).expand_as(output)
    squared_error = weights * (output - target) ** 2
    # clamp avoids 0 / 0 when every entry is masked out
    return squared_error.sum() / weights.sum().clamp_min(1.0)


In [None]:
# Cell 4: Data loaders with channel-first inputs and targets, plus target-aligned masks

from torch.utils.data import DataLoader, TensorDataset

# Padding mask of the INPUT rows (last column of X): shape (batch, input_length, 1).
# It follows the input sequence length, not the target length.
mask_train_tensor = X_train_tensor[:, :, -1].unsqueeze(-1)
mask_val_tensor = X_val_tensor[:, :, -1].unsqueeze(-1)
mask_test_tensor = X_test_tensor[:, :, -1].unsqueeze(-1)

# Input features without the mask column, transposed to (batch, channels, input_length)
X_train_features = X_train_tensor[:, :, :-1].transpose(1, 2)
X_val_features = X_val_tensor[:, :, :-1].transpose(1, 2)
X_test_features = X_test_tensor[:, :, :-1].transpose(1, 2)

# Targets transposed to (batch, 1, target_length) so they line up with the
# (batch, output_size, length) output of the TCN. Left as (batch, length, 1),
# `outputs - targets` would broadcast to (batch, length, length).
y_train_targets = y_train_tensor.transpose(1, 2)
y_val_targets = y_val_tensor.transpose(1, 2)
y_test_targets = y_test_tensor.transpose(1, 2)

# Masks with the same shape as the targets, as the loss and evaluation code multiply
# the mask with the prediction error.
# NOTE(review): all ones assumes every target row is real data, i.e. each segment has at
# least as many real rows as the target length (padding is placed at the front and the
# target is taken from the end) - TODO confirm for other thresholds / target lengths.
y_mask_train = torch.ones_like(y_train_targets)
y_mask_val = torch.ones_like(y_val_targets)
y_mask_test = torch.ones_like(y_test_targets)

# Create TensorDatasets to hold inputs, targets, and target masks
train_dataset = TensorDataset(X_train_features, y_train_targets, y_mask_train)
val_dataset = TensorDataset(X_val_features, y_val_targets, y_mask_val)
test_dataset = TensorDataset(X_test_features, y_test_targets, y_mask_test)

# DataLoader parameters
batch_size = 32
shuffle = True  # Shuffle for the training dataset; not necessary for validation/test datasets

# Create DataLoaders for each dataset.
# drop_last stays on for training only: for validation/test it would discard
# samples, and leave the loader empty when a set holds fewer than batch_size segments.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

print("Data loaders prepared with adjusted input dimensions and masks for training, validation, and test sets.")


In [None]:
# Cell 5: Define Training and Evaluation Function

import torch.optim as optim

def train_and_evaluate_model(model, train_loader, val_loader, optimizer, loss_fn, epochs, device):
    """
    Trains the model and reports the training and validation loss after every epoch.

    Parameters:
    - model: The model to train and evaluate.
    - train_loader: DataLoader yielding (inputs, targets, mask) training batches.
    - val_loader: DataLoader yielding (inputs, targets, mask) validation batches.
    - optimizer: Optimizer to use for training.
    - loss_fn: Loss function called as loss_fn(outputs, targets, mask); used for
      both training and validation so the two reported losses are comparable.
    - epochs: Number of training epochs.
    - device: Device to train on ('cuda' or 'cpu').

    Raises:
    - ValueError: If either loader yields no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    def _run_epoch(loader, training):
        """Run one pass over `loader` and return the mean per-batch loss."""
        running_loss = 0.0
        for X_batch, y_batch, mask_batch in loader:
            X_batch, y_batch, mask_batch = X_batch.to(device), y_batch.to(device), mask_batch.to(device)

            if training:
                optimizer.zero_grad()  # Clear gradients
            loss = loss_fn(model(X_batch), y_batch, mask_batch)
            if training:
                loss.backward()  # Backward pass
                optimizer.step()  # Update weights

            running_loss += loss.item()
        return running_loss / len(loader)

    min_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()  # Enables dropout
        avg_train_loss = _run_epoch(train_loader, training=True)

        model.eval()  # Disables dropout
        with torch.no_grad():  # No need to track gradients
            avg_val_loss = _run_epoch(val_loader, training=False)

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

        # Track the best validation loss seen so far (training is not stopped early)
        if avg_val_loss < min_val_loss:
            print(f'Validation Loss Improved ({min_val_loss:.4f} ---> {avg_val_loss:.4f})')
            min_val_loss = avg_val_loss
            # A checkpoint could be written here, e.g. torch.save(model.state_dict(), path)
        else:
            print('No Improvement in Validation Loss')



In [None]:
# Cell 6: Model Evaluation

def evaluate_model(model, data_loader, loss_fn, device=None):
    """
    Evaluate the model on a given dataset.

    Parameters:
    - model: The PyTorch model to evaluate.
    - data_loader: DataLoader yielding (inputs, targets, mask) batches.
    - loss_fn: Loss function called as loss_fn(outputs, targets, mask).
    - device: Device to run on. Defaults to the device of the model's first
      parameter; batches are left where they are if the model has no parameters.

    Returns:
    - average_loss: Batch losses weighted by batch size, divided by the number
      of samples.
    - average_mae: Mean absolute error over the entries where the mask is 1
      (NaN if the mask selects nothing).

    Raises:
    - ValueError: If the loader yields no batches, or if model outputs and
      targets have different shapes (which would otherwise broadcast silently).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    total_abs_error = 0.0
    total_valid = 0.0
    num_samples = 0

    with torch.no_grad():  # No gradients needed
        for X_batch, y_batch, mask_batch in data_loader:
            if device is not None:
                X_batch, y_batch, mask_batch = X_batch.to(device), y_batch.to(device), mask_batch.to(device)

            outputs = model(X_batch)
            if outputs.shape != y_batch.shape:
                raise ValueError(
                    f"output shape {tuple(outputs.shape)} does not match target shape {tuple(y_batch.shape)}"
                )

            loss = loss_fn(outputs, y_batch, mask_batch)
            # Expand the mask to the output shape so errors and counts cover the same entries
            weights = mask_batch.expand_as(outputs)

            batch_samples = X_batch.size(0)
            total_loss += loss.item() * batch_samples
            total_abs_error += (torch.abs(outputs - y_batch) * weights).sum().item()
            total_valid += weights.sum().item()  # Count only non-padded entries
            num_samples += batch_samples

    if num_samples == 0:
        raise ValueError("data_loader yielded no batches")

    average_loss = total_loss / num_samples
    average_mae = total_abs_error / total_valid if total_valid > 0 else float("nan")

    return average_loss, average_mae




In [None]:
# Cell 7: Initialize Model and Hyperparameters

# Note: the TCN model class and the loss function must be defined in previous cells.

# Number of input channels, read from the prepared features, which are laid out as
# (batch, channels, length). A hard-coded count breaks whenever the set of feature
# columns changes (it depends on which feature cells were run).
num_input_channels = X_train_features.shape[1]

# Initialize the model
model = TCN(
    input_size=num_input_channels,
    output_size=1,
    num_channels=[16, 32, 64],
    kernel_size=2,
    dropout=0.2,
).to(device)

# Define the optimizer with a specified learning rate
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Specify the number of epochs for training
epochs = 1

print("Model, optimizer, and hyperparameters initialized.")

In [None]:
# Cell 8: Execute Training

# Runs the training/validation loop with the components created in the previous cells.
# model, optimizer, epochs, device, train_loader and val_loader must already exist.
# masked_mse_loss must already be defined in the kernel; the training function calls the
# loss it is given as loss_fn(outputs, targets, mask).

# Start training and validation process
print("Starting training...")
train_and_evaluate_model(model, train_loader, val_loader, optimizer, masked_mse_loss, epochs, device)

print("Training completed.")


In [None]:
# Cell 9: Evaluation and Results Visualization

import matplotlib.pyplot as plt

def plot_predictions_vs_actuals(outputs, targets):
    """
    Draws predicted and actual values as two lines on one figure.

    Parameters:
    - outputs: Tensor with the model's predictions.
    - targets: Tensor with the actual values.

    Both tensors are detached, moved to the CPU and flattened to 1-D before plotting.
    """
    curves = (
        ('Predicted', outputs.detach().cpu().numpy().flatten()),
        ('Actual', targets.detach().cpu().numpy().flatten()),
    )

    plt.figure(figsize=(10, 6))
    for curve_label, curve_values in curves:
        plt.plot(curve_values, label=curve_label, linewidth=2)
    plt.title('Predictions vs. Actual Values')
    plt.xlabel('Sample Index')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

# Score the trained model on the test set
test_loss, test_mae = evaluate_model(model, test_loader, masked_mse_loss)
print(f'Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}')


# Plot prediction against truth for the first sequence of the first test batch
model.eval()  # Ensure model is in evaluation mode
first_batch = next(iter(test_loader), None)  # None when the loader yields nothing
if first_batch is not None:
    with torch.no_grad():
        X_batch, y_batch, mask_batch = (tensor.to(device) for tensor in first_batch)
        outputs = model(X_batch)
        plot_predictions_vs_actuals(outputs[0], y_batch[0])


To do:

- Convert dataframes into tensors
- Create a data loader
- Create a model that takes into account the mask variable
- Just try and predict the moving average
- Bring in data from p4581
- Maybe downsample - so predicting out 50 steps instead of 500 and include other experiments - but first see if something useful in high res data
- How to utilise GPUs - look up vptop = it's a task manager, sees cpus memory etc.
- Look into CNN / FNO - could be useful for long term trends
- Look into clustering peaks to determine similarity
- Look at Gaussian processes - the initial kernel would be a sine wave, and it would be quite efficient with little data

Other features
- 

Statistics to look at
- From red star to peak of velocity (i.e. min of derivative) - have a recursive map: the next time between the two and the previous one
- Lorenz map? To go from a high dimensional system to a 1D map - a function that brings you from a one value to the next - discrete map
- Plot maxima vs previous maxima
- Add other past stats as constants - amplitude and cycle length