# Data Overview

In [13]:
import glob
import os
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio

In [None]:
# Gather every path matched under the processed-data directory, in sorted
# order, then echo the list as the cell's output.
files = sorted(glob.glob("../data/processed/*"))
files

In [None]:
# Load MSL_train.csv with pandas' default settings and display the index
# of the resulting frame.
msl_train_csv = '../data/processed/MSL_train.csv'
df = pd.read_csv(msl_train_csv)
# pd.infer_freq(df.index)
df.index

In [None]:
# Supported file suffixes and the pandas reader used for each.
_readers_by_suffix = {".parquet": pd.read_parquet, ".csv": pd.read_csv}
# (row-count threshold, stride) pairs, checked from the largest threshold down;
# frames at or below the smallest threshold are plotted in full (stride 1).
_downsample_steps = ((1000000, 1000), (100000, 100), (10000, 10))

# Draw one figure per file in `files`, with one line trace per column.
for file_path in files:
    reader = next(
        (
            read_fn
            for suffix, read_fn in _readers_by_suffix.items()
            if file_path.endswith(suffix)
        ),
        None,
    )
    if reader is None:
        print(f"Skipping '{file_path}' (unsupported file type).")
        continue
    df = reader(file_path)

    # Best effort: if the first column parses as "%Y-%m-%d %H:%M:%S"
    # timestamps, use it as the index and remove it from the data columns.
    # When parsing raises ValueError/TypeError the frame is left as loaded.
    try:
        timestamps = pd.to_datetime(df.iloc[:, 0], format="%Y-%m-%d %H:%M:%S")
        df.set_index(timestamps, inplace=True)
        df.drop(columns=df.columns[0], inplace=True)
    except (ValueError, TypeError):
        pass

    # Thin out long frames for plotting: keep every `downsample_factor`-th row.
    row_count = df.shape[0]
    downsample_factor = next(
        (stride for threshold, stride in _downsample_steps if row_count > threshold),
        1,
    )
    df_downsampled = df.iloc[::downsample_factor]

    fig = go.Figure()
    for col in df_downsampled.columns:
        line = go.Scatter(
            x=df_downsampled.index,
            y=df_downsampled[col],
            mode='lines',
            name=col,
        )
        fig.add_trace(line)

    if isinstance(df_downsampled.index, pd.DatetimeIndex):
        index_label = "Datetime"
    else:
        index_label = "Index"

    # NOTE: df.shape in the title is read after the timestamp column (if one
    # was detected) has been moved out of the data columns.
    fig.update_layout(
        title=f"{os.path.basename(file_path)} | original shape: {df.shape} | plot downsample factor: {downsample_factor}",
        xaxis_title=index_label,
        yaxis_title="Value",
        legend={
            "title": "Columns",
            "orientation": "v",
            "x": 1.02,
            "xanchor": "left",
            "y": 1,
        },
    )
    fig.show()