# Data exploration

In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sb

# Apply seaborn's default theme to the plots drawn in this notebook.
sb.set()

## Load a first data set

In [None]:
# Read a single sample file to get a first look at the table layout.
first_sample_csv = "/nesi/project/uoo03699/G1_S1.csv"
dset = pd.read_csv(first_sample_csv)
dset

In [None]:
# Summarise the table: column names, dtypes, non-null counts and memory use.
dset.info()

In [None]:
# For each column, report whether it contains at least one missing value.
dset.isnull().any()

In [None]:
# Column plotted as the quantity of interest in the wear curves below.
target = "Wear Loss [mm]"

# Remaining measured columns, plotted together against time.
features = [
    "Line Distance [m]",
    "Normal Load [N]",
    "Friction Force [N]",
    "Friction Coeff.",
]

In [None]:
# Draw every feature against time, one stacked panel per feature.
# Assigning to `_` keeps the returned axes out of the cell output.
_ = dset.plot(
    x="Time [sec]",
    y=features,
    subplots=True,
    figsize=(15, 7),
)

In [None]:
# Draw the target column against time on a single panel.
# Assigning to `_` keeps the returned axes out of the cell output.
_ = dset.plot(
    x="Time [sec]",
    y=target,
    figsize=(15, 7),
)

## Compare wear curves for all materials

In [None]:
# Load every "<material>_<sample>.csv" file and stack them into one long table
# with extra `material` and `sample` columns identifying the source file.
data_dir = Path("/nesi/project/uoo03699")

samples = []

# Path.glob yields files in a filesystem-dependent order; sorting keeps the
# row order of the combined table reproducible from one run to the next.
for path in sorted(data_dir.glob("*_*.csv")):
    # The file stem encodes material and sample, e.g. "G1_S1" -> ("G1", "S1").
    material, sample = path.stem.split("_")
    dset_sample = (
        pd.read_csv(path, low_memory=False)
        .assign(sample=sample, material=material)
        .dropna()  # remove few malformed lines at the end with no data
        .astype({"Time [sec]": int})
    )
    samples.append(dset_sample)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the data directory holds no matching file.
if not samples:
    raise FileNotFoundError(f"no '*_*.csv' files found in {data_dir}")

dset = pd.concat(samples).reset_index(drop=True)
dset

In [None]:
# Summarise the combined table: column names, dtypes, non-null counts and
# memory use.
dset.info()

In [None]:
# Fix the panel and colour order so that materials and samples always appear
# in sorted order, whatever order the rows were loaded in.
material_order = sorted(dset["material"].unique())
sample_order = sorted(dset["sample"].unique())

# One panel per material (two panels per row), one line per sample; each panel
# gets its own y-axis range.
_ = sb.relplot(
    data=dset,
    kind="line",
    x="Time [sec]",
    y=target,
    hue="sample",
    hue_order=sample_order,
    col="material",
    col_order=material_order,
    col_wrap=2,
    aspect=2,
    facet_kws={"sharey": False},
)

## Reduce curve resolution

Let's resample from one-second data to 10-minute bins (600 s each), taking the average within each bin.

In [None]:
# Label each row with its time bin by floor-dividing elapsed seconds by the
# bin width. NOTE: 600 s is 10 minutes, so despite the "[min]" in the column
# name the values are 10-minute bin indices, not elapsed minutes.
bin_width_sec = 600
dset["Time [min]"] = dset["Time [sec]"] // bin_width_sec

In [None]:
# Columns averaged and kept in the resampled dataset.
cols = ["Line Distance [m]", target]

# Average within each (material, sample, time bin) group, then turn the group
# keys back into ordinary columns.
group_keys = ["material", "sample", "Time [min]"]
dset_small = dset.groupby(group_keys)[cols].mean().reset_index()
dset_small

In [None]:
# Fix the panel and colour order so that materials and samples always appear
# in sorted order.
material_order = sorted(dset_small["material"].unique())
sample_order = sorted(dset_small["sample"].unique())

# Same layout as the full-resolution figure, drawn from the resampled table:
# one panel per material, one line per sample, independent y-axes.
_ = sb.relplot(
    data=dset_small,
    kind="line",
    x="Time [min]",
    y=target,
    hue="sample",
    hue_order=sample_order,
    col="material",
    col_order=material_order,
    col_wrap=2,
    aspect=2,
    facet_kws={"sharey": False},
)

Save the prepared data in a `results` folder.

In [None]:
# Create the output folder (and any missing parents) if it does not exist yet,
# then write the resampled table without the row index.
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)

output_csv = results_dir / "dataset_minutes_10m.csv"
dset_small.to_csv(output_csv, index=False)