In [None]:
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

from common.data import load_data

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set_theme()

In [None]:
# Directory that holds the battery recording files used by this notebook.
base_path = Path("../data/5. Battery Data Set/1. BatteryAgingARC-FY08Q4")

def make_paths(names: Iterable[str]):
    """Return one path per entry of ``names``, each joined onto ``base_path``.

    The result is a list in the same order as ``names``.
    """
    return [base_path / file_name for file_name in names]

# Hold-out is done per file: two files for fitting, one for validation and
# one that is only touched for the final test report.
train_paths = make_paths(("B0005.mat", "B0006.mat"))
valid_paths = make_paths(("B0007.mat",))
test_paths = make_paths(("B0018.mat",))

In [None]:
def load_paths(paths: Iterable[Path]) -> pd.DataFrame:
    """Load every file in ``paths`` and stack the results into one frame.

    Each path is read with ``load_data(path, "discharge")``. The frames are
    concatenated under a key equal to the file's stem, and the two resulting
    index levels are named ``"file"`` and ``"index"``.
    """
    frames_by_stem = {}
    for path in paths:
        frames_by_stem[Path(path).stem] = load_data(path, "discharge")

    combined = pd.concat(frames_by_stem)
    combined.index.names = ["file", "index"]
    return combined

In [None]:
# Load the three splits in train / validation / test order.
train_data, valid_data, test_data = map(
    load_paths, (train_paths, valid_paths, test_paths)
)

In [None]:
def window_dataframe(df: pd.DataFrame, size: int) -> pd.DataFrame:
    """Put each row next to its ``size - 1`` predecessors.

    For every lag ``0 .. size - 1`` the frame is shifted down by that many
    rows and its columns are suffixed with ``_b<lag>``. The shifted copies are
    joined column-wise. The first ``size - 1`` rows, which lack a full history,
    are dropped and the index is reset to ``0..n-1``.
    """
    def lagged_copy(lag: int) -> pd.DataFrame:
        frame = df.shift(lag)
        frame.columns = frame.columns.map(lambda label: f"{label}_b{lag}")
        return frame

    side_by_side = pd.concat([lagged_copy(lag) for lag in range(size)], axis=1)
    first_complete_row = size - 1
    return side_by_side.iloc[first_complete_row:].reset_index(drop=True)

def process_data(data: pd.DataFrame, window_size: int, eol_capacity: float = 1.4):
    """Build windowed per-cycle features and remaining-useful-life targets.

    For every ``file`` group the rows are aggregated per ``operation_id`` into
    the first ``Capacity`` value and the min/max of ``Temperature_measured``.
    A cycle counts as "alive" while its capacity is above ``eol_capacity``.
    The target for the cycle at position ``i`` is ``alive_cycles - i``, so it
    reaches zero and then turns negative once the alive budget is used up.

    Windows are built separately for each file, so a window never mixes
    cycles that belong to two different files.

    Args:
        data: Frame that can be grouped by ``"file"`` and ``"operation_id"``
            and has ``"Capacity"`` and ``"Temperature_measured"`` columns.
        window_size: Number of consecutive cycles placed side by side in one
            feature row (see ``window_dataframe``).
        eol_capacity: Capacity threshold above which a cycle is "alive".

    Returns:
        ``(X_win, y_win)``: the windowed feature frame and the matching
        ``"rul"`` series, both re-indexed from zero.
    """
    X = []
    y = []

    for _, group_df in data.groupby("file"):
        operation_df = group_df.groupby("operation_id").agg({
            "Capacity": ["first"],
            "Temperature_measured": ["min", "max"],
        })
        # Flatten the (column, aggregation) pairs into e.g. "Capacity_first".
        operation_df.columns = operation_df.columns.map(lambda c: "_".join(c))

        is_alive = operation_df["Capacity_first"] > eol_capacity
        alive_cycles = is_alive.sum()
        rul_cycles = pd.Series(alive_cycles - np.arange(len(is_alive)), name="rul")

        # Window inside the group: windowing after concatenation would let the
        # first rows of one file pick up lagged features from the previous one.
        X.append(window_dataframe(operation_df, window_size))
        # Drop the same leading rows that windowing removes from the features.
        y.append(rul_cycles.iloc[window_size - 1:])

    X_win = pd.concat(X, ignore_index=True)
    y_win = pd.concat(y, ignore_index=True)

    return X_win, y_win

In [None]:
# Number of consecutive cycles that make up one feature row.
window_size = 3
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = (
    process_data(split, window_size=window_size)
    for split in (train_data, valid_data, test_data)
)

In [None]:
# Fit a LightGBM regressor with its default hyperparameters on the training
# windows. The validation windows are passed as `eval_set`; no early-stopping
# arguments are given in this call.
model = LGBMRegressor()
model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

In [None]:
def plot_prediction(y, pred):
    """Scatter predictions against real values with a dashed identity line.

    Args:
        y: Real target values; must provide ``.min()`` and ``.max()``.
        pred: Predicted values, one per entry of ``y``.
    """
    # The identity line must span the whole range of both inputs, so take the
    # overall minimum and the overall maximum of the two.
    plot_min = min(pred.min(), y.min())
    plot_max = max(pred.max(), y.max())

    plt.scatter(pred, y, alpha=0.2)
    plt.plot(
        [plot_min, plot_max], [plot_min, plot_max],
        color="orange",
        linestyle="dashed",
    )
    plt.title("Prediction vs Real")
    plt.xlabel("Predictions")
    plt.ylabel("Real")
    plt.show()


def regression_report(model, X, y):
    """Print error statistics for ``model`` on ``(X, y)`` and plot the fit.

    Prints the root-mean-squared error, the largest positive error
    (``pred - y > 0``) and the most negative error (``pred - y < 0``), then
    calls ``plot_prediction``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Features passed to ``model.predict``.
        y: Real target values, one per row of ``X``.
    """
    pred = model.predict(X)
    diff = pred - y

    # mean_squared_error returns the MSE; take the square root so the printed
    # number matches its "rmse" label and is in the units of the target.
    rmse = np.sqrt(mean_squared_error(y, pred))

    # NOTE(review): diff > 0 means the prediction is above the real value;
    # confirm that "early"/"late" are attached to the intended sign.
    over = diff[diff > 0]
    under = diff[diff < 0]

    print(f"rmse: {rmse:0.4f}")
    # Reducing an empty NumPy selection raises, so report nan when there is
    # no error of that sign.
    print(f"max early: {over.max() if len(over) else float('nan')}")
    print(f"max late: {under.min() if len(under) else float('nan')}")
    plot_prediction(y, pred)


In [None]:
# Fit quality on the data the model was trained on.
regression_report(model=model, X=train_X, y=train_y)

In [None]:
# Same report on the validation file.
regression_report(model=model, X=valid_X, y=valid_y)

In [None]:
# Same report on the held-out test file.
regression_report(model=model, X=test_X, y=test_y)