# Jupyter Notebook for Transforming Dataset

Dataset used: [AnoML-IoT](https://www.kaggle.com/datasets/hkayan/anomliot)

Sample dataset format:

| Time       | Temperature | Humidity | ... |
|------------|-------------|----------|-----|
| 1623781306 | 37.94       | 28.94    | ... |
| ...        | ...         | ...      | ... |

## Load Dataset

In [1]:
import pandas as pd

# Read the AnoML-IoT sensor readings from the CSV export (path is relative to
# this notebook) and display the resulting frame as the cell output.
anoml_iot_dataset = pd.read_csv(
    filepath_or_buffer="../datasets/dataset_final.csv",
)
anoml_iot_dataset

Unnamed: 0,Time,Temperature,Humidity,Air Quality,Light,Loudness
0,1623781306,37.94,28.94,75,644,106
1,1623781316,37.94,29.00,75,645,145
2,1623781326,37.88,28.88,75,644,146
3,1623781336,37.72,28.94,75,646,139
4,1623781346,37.69,29.19,75,644,155
...,...,...,...,...,...,...
6553,1623846836,26.16,57.38,75,630,169
6554,1623846846,26.22,57.38,75,630,169
6555,1623846856,26.22,57.38,75,630,142
6556,1623846866,26.22,57.31,75,630,127


## Sort Data by Timestamp (in case the rows are not already in order)

In [2]:
# Order the readings chronologically by their "Time" column.
# A stable algorithm (mergesort) is requested explicitly: the default
# quicksort does not guarantee the relative order of rows that share the same
# timestamp, whereas a stable sort keeps such rows in their original file order.
anoml_iot_dataset_sorted = anoml_iot_dataset.sort_values(
    by=["Time"], kind="mergesort"
)
anoml_iot_dataset_sorted

Unnamed: 0,Time,Temperature,Humidity,Air Quality,Light,Loudness
0,1623781306,37.94,28.94,75,644,106
1,1623781316,37.94,29.00,75,645,145
2,1623781326,37.88,28.88,75,644,146
3,1623781336,37.72,28.94,75,646,139
4,1623781346,37.69,29.19,75,644,155
...,...,...,...,...,...,...
6553,1623846836,26.16,57.38,75,630,169
6554,1623846846,26.22,57.38,75,630,169
6555,1623846856,26.22,57.38,75,630,142
6556,1623846866,26.22,57.31,75,630,127


## Generate Input for IoT Device Simulator

In [3]:
# Dataset columns that are left out of the generated simulator schedule.
IGNORE_KEYS = [
    "Air Quality",
    "Loudness",
]

In [4]:
from datetime import datetime
from collections import defaultdict
from typing import Union
import pytz

# Build one time series per dataset column for the IoT device simulator.
#
# Resulting shape:
#     {column name: [{"timestamp": ..., "timeDifference": ..., "value": ...}, ...]}
# Entries appear in the row order of `anoml_iot_dataset_sorted`. Columns named in
# IGNORE_KEYS are skipped, and "Time" itself is consumed as the timestamp.
#
# "timestamp" is an ISO 8601 string while the other fields are numeric, hence
# the Union in the annotation.
#
# NOTE(review): the displayed frame lists an "Unnamed: 0" header. If that is a
# real CSV column rather than the frame index, it is emitted here as a series
# of its own -- confirm and add it to IGNORE_KEYS if unwanted.
simulator_schedule: dict[str, list[dict[str, Union[str, float]]]] = defaultdict(list)

# Resolve the timezone once; it does not change between rows.
utc_timezone = pytz.timezone("UTC")

# There is no reading before the first row. Starting from None (instead of
# indexing row 0 up front) lets an empty dataset produce an empty schedule
# rather than raising IndexError.
previous_timestamp = None

for _, row in anoml_iot_dataset_sorted.iterrows():
    row_dict = row.to_dict()
    # Remove "Time" so only the remaining columns are emitted as values.
    current_timestamp = row_dict.pop("Time")
    if previous_timestamp is None:
        # First row is measured against itself, giving a difference of zero.
        previous_timestamp = current_timestamp
    # Gap to the preceding row, in the same unit as the "Time" column.
    time_difference = current_timestamp - previous_timestamp
    previous_timestamp = current_timestamp
    # "Time" is interpreted as seconds since the Unix epoch and rendered in UTC.
    ts_iso8601_format = datetime.fromtimestamp(
        current_timestamp, tz=utc_timezone
    ).isoformat()
    for key, value in row_dict.items():
        if key in IGNORE_KEYS:
            continue
        simulator_schedule[key].append(
            {
                "timestamp": ts_iso8601_format,
                "timeDifference": time_difference,
                "value": value,
            }
        )

## Save Result as JSON

In [5]:
import json

# Write the per-column schedule to disk as a single JSON document,
# replacing any existing file at that path.
with open("../datasets/schedule.json", mode="w") as schedule_file:
    json.dump(obj=simulator_schedule, fp=schedule_file)