In [1]:
import sys
import numpy as np
import pandas as pd

# Absolute path of the project checkout. It is appended to sys.path so that
# the project-local `utils` package imported right below can be resolved.
MAIN_DICT = "/gws/nopw/j04/ai4er/users/pn341/earthquake-predictability"
sys.path.append(MAIN_DICT)

from utils.dataset import SlowEarthquakeDataset

## Raw Data

In [12]:
# Directory layout of the raw lab-quake data, rooted at the project directory
# (MAIN_DICT is set in the import cell at the top of the notebook).
GTC_DATA_DIR = f"{MAIN_DICT}/data/gtc_quakes_data"
LABQUAKES_DATA_DIR = f"{GTC_DATA_DIR}/labquakes"
MELEVEEDU_DATA_DIR = f"{LABQUAKES_DATA_DIR}/MeleVeeduetal2020"

# Raw text file of experiment b698
B698_FILE_PATH = f"{MELEVEEDU_DATA_DIR}/b698/b698.txt"
# Former (misleading) name of the same path, kept in case another cell uses it
i417_FILE_PATH = B698_FILE_PATH

# Read the whitespace-delimited file into a dataframe.
# `sep=r"\s+"` is the replacement for the deprecated `delim_whitespace=True`.
with open(B698_FILE_PATH, "r") as file:
    df = pd.read_csv(
        file, sep=r"\s+", header=0, index_col=0, low_memory=False
    )

# The "# Rec" header contains a space, so it is parsed as two column names and
# every name ends up one position to the right of its data. The correct names
# are therefore the parsed names shifted left by one.
shifted_names = list(df.columns[1:])

# Drop the first row (it holds the units) and the last column, which is left
# without a name once the names are shifted. `.copy()` makes the result an
# independent frame rather than a slice of the parsed one.
df = df.iloc[1:, :-1].copy()
df.columns = shifted_names

# Get the last index
last_index = df.index[-1]

# Get the value of the 'Time' column at the last index
last_time = df.loc[last_index, "Time"]
print(f"Last time: {last_time}")

df.head()

Last time value: 29316.2860000000


Unnamed: 0_level_0,lp_disp,LT,Tau,SigN,dcdtOB,slip,Time,Rec.1,timedcdt,ec_disp,mu,Shear_Strain,Slip,velocity
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.0,3.1636940114,-0.0,1e-07,0.0,0.0,0.0,-0.1,0.0,-0.0,0.0,0.0,,
1,0.0,3.1636930114,-0.0,1e-07,0.0,1.0,1.0,0.9,0.0,-0.0,0.0,0.0,,
2,0.0,3.1636935114,-0.0,1e-07,0.0,2.0,2.0,1.9,0.0,-0.0,0.0,0.0,,
3,0.0,3.1636940114,-0.0,1e-07,0.0,3.0,3.0,2.9,0.0,-0.0,0.0,0.0,,
4,0.0,3.1636955114,-0.0,1e-07,0.0,4.0,4.0,3.9,0.0,-0.0,0.0,0.0,,


## Pre-processed Data

In [9]:
# Load experiment b698 with Pritt's data loader (which utilises Adriano's
# loading + pre-processing) so that the dataframe head can be inspected.
dataset = SlowEarthquakeDataset(["b698"])
dataset.load()

# Get data outputs
ds_exp = dataset["b698"]
X = ds_exp["X"]
Y = ds_exp["Y"]
t = ds_exp["t"]

# Column names, in the same order as the stacked arrays: X, Y columns, time
hdrs = ds_exp["hdrs"]
column_names = [hdrs["X"], *hdrs["Y"], hdrs["t"]]

# Stack X, Y and time (as a column vector) side by side into one dataframe
stacked_values = np.hstack((X, Y, t.reshape(-1, 1)))
df = pd.DataFrame(stacked_values, columns=column_names)

df.head()

Unnamed: 0,det_shear_stress,obs_shear_stress,obs_normal_stress,obs_ecdisp,obs_shear_strain,time
0,0.105776,11.225116,17.382861,20.088637,22.167371,0.0
1,0.103531,11.22287,17.375571,20.088683,22.167447,0.01
2,0.105632,11.224972,17.385328,20.088975,22.167941,0.02
3,0.101784,11.221124,17.373017,20.088549,22.167221,0.03
4,0.10612,11.225461,17.386512,20.089116,22.16818,0.04


## Notes on Pre-processing

### General notes:

* We have sampled 3.78% of dataset (in the 3650-3850 window).
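* Downsampling frequency = **TODO: value missing** (from Mele Veedu). Note that `downsample_factor` is 1 in the experiment parameters quoted below, i.e. no downsampling is applied.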
* Original columns were: [RecNum, lp_disp, LT, Tau, SigN, dcdtOB, Time, recN, timedcdt, ec_disp, mu, etrain, slipVelocity].
* Pre-processed columns: [det_shear_stress, obs_shear_stress, obs_normal_stress, obs_ecdisp, obs_shear_strain, time], where:
    * Tau + polyfit -> det_shear_stress &emsp; (processed - detrended)
    * Tau -> obs_shear_stress &emsp; &emsp; &emsp; &emsp; (not processed)
    * SigN -> obs_normal_stress &emsp; &emsp; &emsp; (not processed)
    * ec_disp -> obs_ecdisp &emsp; &emsp; &emsp; &emsp; &emsp;(processed - handles exceptions)
    * etrain -> obs_shear_strain &emsp; &emsp;&emsp;&emsp;(processed - handles exceptions)
    * Time -> time &emsp; &emsp; &emsp; &emsp; &emsp; &emsp; &emsp;&emsp;&emsp;(not processed)
* Pre-processing steps:
    * Handling exceptions for ec_disp and etrain is done by discarding the data and creating empty columns for obs_ecdisp and obs_shear_strain. (See load_data(), lines 41-47.)
    * De-trending for det_shear_stress is done by fitting np.polyfit to obs_shear_stress (degree=1), and then subtracting it from obs_shear_stress. (See load_data(), lines 62-63.)



### Annotated Code

#### Setting Experiment Parameter
From _params.py_: 

```python
elif exp == "i417":
        parameters = {
            "t0": 3650.0,           # Starting time window loaded - Note: raw data min = 0
            "tend": 3850.0,         # Ending time window loaded - Note: raw data max = 5285.9
            "Nheaders": 2,          # Header that np array starts with in import_data
            "dir_data": "gtc_quakes_data/labquakes/",
            "case_study": "MeleVeeduetal2020/i417",
            "data_type": "lab",
            "struct_type": "MeleVeeduetal2020",
            "file_format": "txt",
            "downsample_factor": 1, # No downsampling (in != 1, no code has been written for it)
            "vl": 10,               # Loading velocity
            "segment": None,        # Only relevant for gnss data to segment the data
            "obs_unit": "MPa",
            "time_unit": "s",
        }

        [...] # Assigns new params for obs and time labels with units
```


#### Importing Data
Relevant parts from _load.py_: 

```python
def import_data(dirs, filename, parameters):
    [...] # sets format

    if struct == "MeleVeeduetal2020":
        [...] # accesses file

            Nheaders = parameters["Nheaders"] # From parameters, for "i417" =2
            L = L - Nheaders

            [...] # Creates new array columns, one per quantity in data (see below for assignment)

            [...] # loads data, for loop to assign columns from data

                # For each header, assign quantity from column - see comments for data column headers
                Rec[tt] = int(columns[0])               # RecNum
                LPDisp[tt] = float(columns[1])          # lp_disp (mic)
                LayerThick[tt] = float(columns[2])      # LT (mic) - micrometer?
                ShearStress[tt] = float(columns[3])     # Tau (MPa)
                NormStress[tt] = float(columns[4])      # SigN (MPa)
                OnBoard[tt] = float(columns[5])         # dcdtOB (mic) - micrometer?
                Time[tt] = float(columns[6])            # Time (sec)
                Rec_float[tt] = float(columns[7])       # recN
                TimeOnBoard[tt] = float(columns[8])     # timedcdt (sec)
                ecDisp[tt] = float(columns[9])          # ec_disp
                mu[tt] = float(columns[10])             # mu
                ShearStrain[tt] = float(columns[11])    # etrain
                slip_velocity[tt] = float(columns[12])  # slipVelocity (micrometer/sec)

            [...] # only keep indices with time between time range chosen (3650-3850) as set in parameters

```

#### Loading and Pre-processing
Note: load_data() first calls import_data(), which contains the actual file-loading code; the rest of load_data() then processes the imported data and returns it as X, Y, t, dt, vl.

Relevant parts from _load.py_: 

```python
def load_data(exp, dirs, params):

    if params["data_type"] == "lab":
            [...] # choose data based on params set and run import_data()

            #---- Copy obs_shear_stress, obs_normal_stress as is!
            ShearStressobs = data["ShearStress"]
            NormalStressobs = data["NormStress"]

            #---- Copy obs_ecdisp and obs_shear_strain, if error create an empty (NaN) column
            try:
                ecDispobs = data["ecDisp"]
            except Exception:
                ecDispobs = np.nan * np.ones(ShearStressobs.shape)
            try:
                ShearStrainobs = data["ShearStrain"]
            except Exception:
                ShearStrainobs = np.nan * np.ones(ShearStressobs.shape)

            [...] # about n of samples, only relevant for Marone

            #----  Reassign time for new range
            if params["struct_type"] == "MeleVeeduetal2020":
                t = data["Time"] - data["Time"][0]
                
            [...] # handle time for other experiments

            #---- Detrend shear stress (into our det_shear_stress) and normal stress
            p = np.polyfit(t, ShearStressobs, deg=1)
            ShearStressobs_det = ShearStressobs - (p[0] * t + p[1]) # our det_shear_stress
            del p

            [...] #---- Detrend normal stress, displacement and strain in same way,
            #           but they are already commented out? No need?

            #---- Assign outputs 
            # observed data
            X = np.array([ShearStressobs_det]).T # our det_shear_stress, note it will be 1st column

            # observed time step
            dt = t[1] - t[0]

            vl = params["vl"]
            [...] #---- Estimate loading velocity from loading displacement if not present, but in i417 vl=10

            # Y = np.array([ShearStressobs_det, NormalStressobs_det]).T
            Y = np.array(
                [ShearStressobs, NormalStressobs, ecDispobs, ShearStrainobs]
            ).T
            # our [obs_shear_stress, obs_normal_stress, obs_ecdisp, obs_shear_strain]
            
    
    return X, Y, t, dt, vl # note we read the first 3 in as our 6-column dataset [X, Y, t]
```