# Notebook

## Setup

In [1]:
In [1]: %load_ext kedro.ipython
import pandas as pd

## Load Datasets

In [2]:
# Load the datasets used by this notebook from the Kedro catalog
# (`catalog` is injected by the `kedro.ipython` extension loaded in Setup).
alarms = catalog.load("clean_alarms")
status = catalog.load("clean_station_overviews")
# sessions = catalog.load("clean_charging_sessions")

# ID lookup tables. `ss_map` and `a_map` are displayed by later cells, so they
# must be loaded here; otherwise a fresh Restart & Run All fails with NameError.
sn_map = catalog.load("map_serial_number")
ss_map = catalog.load("map_station_status")
# ns_map = catalog.load("map_network_status")
a_map = catalog.load("map_alarm")

# Domain-level status and alarm tables that the training data is assembled from.
domain_alarms = catalog.load("domain_alarms")
domain_status = catalog.load("domain_status")

## Assemble Training Data

1) Combine the domain status and alarms datasets.
2) Remove all rows with CIDs that are not present in the domain status dataset.
3) For each CID, remove all rows with timestamps outside the min and max timestamps found in the domain status dataset for that CID.
4) For each CID, assemble training groups. Each training group consists of a starting status, followed by all alarms that occur before the next status update for that CID, and finally the next status update.

Additionally, assemble training triplets, which are training groups that consist of exactly 3 rows: a starting status, a single alarm, and an ending status.

In [None]:
# Snapshot the result of the previous run of this cell (if any) so the
# idempotency check further down can compare against it.
last_training_data = None
try:
    last_training_data = training_data.copy()
except NameError:
    # First run in this kernel: `training_data` does not exist yet.
    # Any other failure is unexpected and should surface instead of being hidden.
    pass

# Stack the status and alarm rows into one frame, ordered by its index.
domain_status_alarms = pd.concat([domain_status, domain_alarms], axis=0).sort_index()

# Turn the index levels into ordinary columns.
domain_status_alarms_flat = domain_status_alarms.reset_index()

# Keep only rows whose CID appears in the first index level of domain_status.
known_cids = domain_status.index.levels[0]
has_known_cid = domain_status_alarms_flat["cid"].isin(known_cids)
training_data = domain_status_alarms_flat[has_known_cid]

# Earliest and latest status timestamp per CID.
status_time_range = (
    domain_status.reset_index()
    .groupby("cid")["timestamp"]
    .agg(min_status_timestamp="min", max_status_timestamp="max")
)

# Attach each CID's status time range to its rows ...
training_data = training_data.merge(status_time_range, on="cid", how="left")

# ... and keep only the rows that fall inside that range (both ends inclusive).
within_status_range = training_data["timestamp"].between(
    training_data["min_status_timestamp"], training_data["max_status_timestamp"]
)
training_data = (
    training_data[within_status_range]
    .reset_index(drop=True)
    .drop(columns=["min_status_timestamp", "max_status_timestamp"])
)

# Create status timestamp column: rows with a non-null ssid (status rows) carry
# their own timestamp; every other row starts out missing.
training_data.loc[~training_data.ssid.isna(), "status_timestamp"] = training_data.loc[
    ~training_data.ssid.isna(), "timestamp"
]

# Forward fill within CID groups so each row carries the timestamp of the most
# recent status row of the same CID.
training_data["status_timestamp"] = training_data.groupby("cid")["status_timestamp"].ffill()

# Create training groups: the counter increases on every row whose
# status_timestamp differs from the previous row's. Comparisons against a
# missing value evaluate to False, so they never start a new group.
training_data["training_group"] = (
    (abs((training_data.status_timestamp - training_data.status_timestamp.shift(1)).dt.total_seconds()) > 0)
    .astype(int)
    .cumsum()
)

# Reindex training groups within CID groups so each CID's numbering starts at 0.
training_data["training_group"] = training_data.groupby("cid")["training_group"].transform(lambda x: x - x.min())

# For each (CID, training_group) group, append the first row of
# (CID, training_group + 1) -- expected to be the status row that opens the next
# group -- to the end of the group. `assign` builds a new frame rather than
# writing into the selection returned by `nth`, which avoids chained-assignment
# warnings.
# NOTE(review): relies on `nth` keeping the group keys as columns (pandas >= 2.0).
appended_statuses = (
    training_data.groupby(["cid", "training_group"])
    .nth(0)
    .assign(training_group=lambda rows: rows["training_group"] - 1)
)
# The opening row of group 0 has no preceding group to close.
appended_statuses = appended_statuses[appended_statuses["training_group"] >= 0]
training_data = (
    pd.concat([training_data, appended_statuses], axis=0, ignore_index=True)
    .sort_values(by=["cid", "training_group", "timestamp"])
    .reset_index(drop=True)
)

# Keep only the columns used downstream, in a fixed order.
training_data = training_data[["cid", "training_group", "timestamp", "ssid", "nsid", "aid"]]

# Drop groups with fewer than 3 rows.
training_data = (
    training_data.groupby(["cid", "training_group"])
    .filter(lambda rows: rows.shape[0] >= 3)
    .reset_index(drop=True)
)

# Keep only groups whose first-to-last timestamp span is shorter than 1.5 hours.
training_data = training_data.groupby(["cid", "training_group"]).filter(
    lambda rows: rows["timestamp"].max() - rows["timestamp"].min() < pd.Timedelta(minutes=90)
)
display(training_data)

# Idempotency check: re-running this cell in the same kernel must reproduce the
# result captured at the top of the cell.
if last_training_data is not None:
    assert training_data.equals(last_training_data), "Training data has changed since last run!"

# Select observation model training data: groups with exactly 3 rows that follow
# the pattern status -> alarm -> status.
def _is_status_alarm_status(group):
    """Return True if `group` has exactly 3 rows shaped status, alarm, status.

    A row count of 3 alone is not sufficient: if the middle row has no `aid`, or
    an outer row has no `ssid`, the integer casts below would turn a missing
    value into a meaningless number instead of failing.
    NOTE(review): assumes status rows are the ones with a non-null `ssid` and
    alarm rows the ones with a non-null `aid` -- confirm against the domain tables.
    """
    if len(group) != 3:
        return False
    ssid = group["ssid"]
    return bool(pd.notna(ssid.iloc[0]) and pd.notna(ssid.iloc[-1]) and pd.notna(group["aid"].iloc[1]))


training_triplets = (
    training_data.groupby(["cid", "training_group"]).filter(_is_status_alarm_status).reset_index(drop=True)
)

# Collapse each triplet into one row: starting status, alarm, ending status.
training_triplets = training_triplets.groupby(["cid", "training_group"]).agg(
    ssid_1=("ssid", lambda x: int(x.iloc[0])),
    ssid_2=("ssid", lambda x: int(x.iloc[-1])),
    aid=("aid", lambda x: int(x.iloc[1])),
)
display(training_triplets)

## Determine Null Events

Now, we determine which events are null events (i.e., events that do not change the state). To do this, we use the training triplets (groups with exactly 3 rows: 2 statuses, 1 alarm). A null event is an alarm for which the starting and ending statuses are the same in every triplet in which it appears.

In [None]:
# Flag, per triplet, whether the ending status differs from the starting status.
training_triplets_changed_status = training_triplets.assign(
    changed_status=training_triplets["ssid_1"].ne(training_triplets["ssid_2"])
)

# Count how often each (ssid_1, aid, ssid_2) combination occurs and show the
# counts ordered by that combination.
transition_columns = ["ssid_1", "aid", "ssid_2"]
transition_counts = training_triplets_changed_status.reset_index(drop=True)[transition_columns].value_counts()
transition_counts.reset_index().set_index(transition_columns).sort_index()

In [None]:
# Display the alarm lookup table for reference.
# NOTE(review): `a_map` must be loaded in the "Load Datasets" cell
# (catalog entry "map_alarm") before this cell can run.
a_map

In [None]:
# Display the station status lookup table for reference.
# NOTE(review): `ss_map` must be loaded in the "Load Datasets" cell
# (catalog entry "map_station_status") before this cell can run.
ss_map

In [None]:
# `training_triplets` already holds one row per (cid, training_group) with the
# integer columns ssid_1, aid and ssid_2, so the observation components can be
# read directly. (It no longer has an "ssid" column, so selecting one here
# would raise a KeyError.)
observation_components = training_triplets[["ssid_1", "aid", "ssid_2"]].astype(int)
display(observation_components)

# Concatenate to form the observation index [ssid_1, aid, ssid_2] per triplet.
observation_model_training_data = pd.Series(
    observation_components.to_numpy().tolist(),
    index=observation_components.index,
    name="observation_index",
)
display(observation_model_training_data)

In [None]:
# Count occurrences of each observation index. The entries are sequences
# (lists are unhashable), so convert them to tuples before counting.
observation_model_training_data.map(tuple).value_counts()

Reward function $R(s, a)$:
- if faulted / unreachable, but we do nothing: -100
- if not faulted, and we alert: -50
- if faulted, and we already alerted within the last xxx time, and then we alert again: -10
- else (we did good): +10


In [None]:
# Display the alarm lookup table again for reference.
# NOTE(review): `a_map` must be loaded in the "Load Datasets" cell
# (catalog entry "map_alarm") before this cell can run.
a_map