# Explore data from the Appolo Grade 12 and Mars

In [46]:
# Import libraries
import numpy as np
import pandas as pd
from obspy import read
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import glob
import os
import re
from tqdm import tqdm

## Read labels

In [63]:
# planet = "lunar"
planet = "lunar"

data_prefix = "../data"
plot_prefix = "../plots"
out_prefix = "../out"

In [64]:
if planet == "lunar":
    catalog_directory = os.path.join(data_prefix, 'lunar/training/catalogs/')
    catalog_file = catalog_directory + 'apollo12_catalog_GradeA_final.csv'
elif planet == "mars": 
    catalog_directory = os.path.join(data_prefix, 'mars/training/catalogs/')
    catalog_file = catalog_directory + 'Mars_InSight_training_catalog_final.csv'
else:
    raise RuntimeError("Don't know this planet")

In [65]:

labels = pd.read_csv(catalog_file, parse_dates=['time_abs(%Y-%m-%dT%H:%M:%S.%f)'])
labels.rename(columns={"time_abs(%Y-%m-%dT%H:%M:%S.%f)":"datetime"}, inplace=True)
labels.set_index("datetime", inplace=True)

# In the case of mars data, we don't have any mq_type
try:
    mq_type_dict = {l:i+1 for i,l in enumerate(labels["mq_type"].unique())}
    display(mq_type_dict)
    labels["mq_type_id"] = labels["mq_type"].map(mq_type_dict)
except KeyError:
    labels["mq_type_id"] = 1
labels

{'impact_mq': 1, 'deep_mq': 2, 'shallow_mq': 3}

Unnamed: 0_level_0,filename,time_rel(sec),evid,mq_type,mq_type_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970-01-19 20:25:00,xa.s12.00.mhz.1970-01-19HR00_evid00002,73500.0,evid00002,impact_mq,1
1970-03-25 03:32:00,xa.s12.00.mhz.1970-03-25HR00_evid00003,12720.0,evid00003,impact_mq,1
1970-03-26 20:17:00,xa.s12.00.mhz.1970-03-26HR00_evid00004,73020.0,evid00004,impact_mq,1
1970-04-25 01:14:00,xa.s12.00.mhz.1970-04-25HR00_evid00006,4440.0,evid00006,impact_mq,1
1970-04-26 14:29:00,xa.s12.00.mhz.1970-04-26HR00_evid00007,52140.0,evid00007,deep_mq,2
...,...,...,...,...,...
1974-10-14 17:43:00,xa.s12.00.mhz.1974-10-14HR00_evid00156,63780.0,evid00156,impact_mq,1
1975-04-12 18:15:00,xa.s12.00.mhz.1975-04-12HR00_evid00191,65700.0,evid00191,impact_mq,1
1975-05-04 10:05:00,xa.s12.00.mhz.1975-05-04HR00_evid00192,36300.0,evid00192,impact_mq,1
1975-06-24 16:03:00,xa.s12.00.mhz.1975-06-24HR00_evid00196,57780.0,evid00196,impact_mq,1


## Read data

In [50]:
planet

'mars'

In [51]:
data_directory = os.path.normpath(os.path.join(data_prefix, planet,'training/data'))
print(data_directory)
files  =  glob.glob(data_directory+"/**/*.csv", recursive=True)
dates = [re.search("\d{4}-\d{2}-\d{2}", file)[0] for file in files]

print("Available dates:")
print(dates)
print(f"{len(dates)} elements")

..\data\mars\training\data
Available dates:
['2022-01-02', '2022-02-03']
2 elements


In [52]:
i_file = 0
files[i_file]

file = files[i_file]
file

'..\\data\\mars\\training\\data\\XB.ELYSE.02.BHV.2022-01-02HR04_evid0006.csv'

In [53]:
def load_data(file, planet):
    date_col_dict = {"lunar": "time_abs(%Y-%m-%dT%H:%M:%S.%f)", 
                     "mars": "time(%Y-%m-%dT%H:%M:%S.%f)"}
    
    vel_col_dict = {"lunar": "velocity(m/s)", 
                     "mars": "velocity(c/s)"}
    
    data = pd.read_csv(file, parse_dates=[date_col_dict[planet]])
    data.rename(columns={date_col_dict[planet]:"datetime"}, inplace=True)
    data.rename(columns={vel_col_dict[planet]:"velocity"}, inplace=True)
    data.set_index("datetime", inplace=True)
    mask = (labels.index > data.index[0]) & (labels.index <= data.index[-1])
    data = data.join(labels.loc[mask, "mq_type_id"], how="outer")

    return data.loc[:, ["velocity", "mq_type_id"]]

### Export all dataframes

In [54]:
out_path = os.path.join(out_prefix, planet)
os.makedirs(out_path, exist_ok=True)

for file in (pbar := tqdm(files, unit="files")):
    sample_name = os.path.split(file)[1] 

    data = load_data(file, planet)

    out_file = os.path.split(file.replace(data_prefix, out_prefix).replace(".csv", ".parquet"))[1]
    pbar.set_description(f"Writing {out_file}")   
    data.to_parquet(os.path.join(out_path, out_file))
    

Writing XB.ELYSE.02.BHV.2022-02-03HR08_evid0005.parquet: 100%|██████████| 2/2 [00:00<00:00, 10.50files/s]


## Load all data (as an example and also to measure)

In [55]:
for file in (pbar := tqdm(files, unit="files")):
    
    out_file = os.path.split(file.replace(data_prefix, out_prefix).replace(".csv", ".parquet"))[1]
    pd.read_parquet(os.path.join(out_path, out_file))

100%|██████████| 2/2 [00:00<00:00, 221.85files/s]


In [61]:
file = "../out/lunar/xa.s12.00.mhz.1970-01-19HR00_evid00002.parquet"
data = pd.read_parquet(file)
data.describe()


Unnamed: 0,velocity,mq_type_id
count,572415.0,1.0
mean,-8.443134e-13,1.0
std,3.530059e-10,
min,-8.185283e-09,1.0
25%,-5.50483e-11,1.0
50%,-1.633815e-17,1.0
75%,5.443508e-11,1.0
max,7.874026e-09,1.0


### Plots

In [56]:
plot_path = os.path.join(plot_prefix, planet)
os.makedirs(plot_path, exist_ok=True)

c_dict = {1:"red", 
          2:"orange",
          3:"black"}

for file in tqdm(files, unit="files"):
    sample_name = os.path.split(file)[1] 

    data = load_data(file, planet)

    fig, ax = plt.subplots(figsize=(16, 5))

    data["quakes"] = data["velocity"].max()*data["mq_type_id"]

    data.plot(y=["velocity"], ax=ax, ylabel="Velocity", title=os.path.splitext(sample_name)[0])

    qtimes = data["mq_type_id"].dropna().index
    qtypes = data["mq_type_id"].dropna().values
    for qtime, qtype in zip(qtimes, qtypes):
        ax.axvline(x=qtime, c=c_dict[qtype])

    plot_file = os.path.split(file.replace(data_prefix, plot_prefix).replace(".csv", ".png"))[1]    
    fig.savefig(os.path.join(plot_path, plot_file), bbox_inches="tight")
    plt.close(fig)

100%|██████████| 2/2 [00:00<00:00,  3.14files/s]


In [57]:
def clean_A(dataframe, fc):
    do stuff with data and fc

    return data


def clean_B(dataframe, fc1, fc2):

    do other stuff with data and fc1 and fc2

    return data


def clean_B(dataframe, params):

    do other stuff with data and fc1 and fc2

    return data


SyntaxError: invalid syntax (1430814128.py, line 2)