# Building Data Genome Project 2.0
## Cleaned datasets

Biam! (pic.biam@gmail.com)

In [9]:
# data and numbers
import numpy as np
import pandas as pd
import datetime as dt

In [10]:
# Directory holding the raw meter files; "<meter>.csv" is read from here.
path_raw = "..//data//meters//raw//"
# Output directory; "<meter>_cleaned.csv" is written here.
path_cleaned = "..//data//meters//cleaned//"
# Directory with the anomaly-detection results; "<meter>_anoms.csv" is read from here.
path_anom = "..//data//meters//screening//anomalies//"

# Introduction

In this notebook, the cleaned meters dataset is created. Outliers in the raw meters dataset were detected using the [Seasonal Hybrid ESD (S-H-ESD)](https://github.com/twitter/AnomalyDetection) algorithm developed by Twitter. The detection was implemented in R; the process can be found [here](https://github.com/buds-lab/building-data-genome-project-2/blob/master/notebooks/04_Anomaly-detection.ipynb).

# Functions

In [4]:
# This function removes outliers and 24 hours zero readings
def removeBadData(df, metername):
    """Return a copy of ``df`` with outliers and zero-reading days set to NaN.

    Two cleaning steps are applied, in this order:

    1. Outliers: every (timestamp, column) cell that holds a non-null value
       in ``path_anom + metername + "_anoms.csv"`` is blanked in the output.
       Anomaly rows or columns that do not exist in ``df`` are ignored.
    2. Zero gaps: for each column, all readings of a calendar day are
       blanked when that day's mean (computed after step 1) is not strictly
       positive, i.e. zero, negative or undefined.

    Parameters
    ----------
    df : pandas.DataFrame
        Meter readings indexed by a ``DatetimeIndex``, one column per meter.
        Any date range is accepted.
    metername : str
        Meter type; selects which anomalies file is loaded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``. The input
        frame is not modified.

    Raises
    ------
    FileNotFoundError
        If the anomalies file for ``metername`` does not exist.
    """
    # Load anomalies df
    df_anom = pd.read_csv(path_anom + metername + "_anoms.csv")
    # Transform timestamp to datetime object type
    df_anom["timestamp"] = pd.to_datetime(
        df_anom["timestamp"], format="%Y-%m-%d %H:%M:%S"
    )
    # Drop any timezone information, keeping the wall-clock time
    df_anom["timestamp"] = df_anom["timestamp"].apply(
        lambda d: d.replace(tzinfo=None)
    )
    df_anom = df_anom.set_index("timestamp")

    # --- Remove outliers ---
    # Build a boolean mask instead of writing a sentinel value into the data:
    # a sentinel would also erase genuine readings that happen to equal it.
    is_outlier = (
        df_anom.notna()
        # Collapse repeated timestamps so the reindex below cannot fail
        .groupby(level=0)
        .any()
        # Align to the meter data; cells without anomaly info are not outliers
        .reindex(index=df.index, columns=df.columns, fill_value=False)
        .astype(bool)
    )
    # The mask already has df's exact shape, so pass raw values (no re-alignment)
    cleaned = df.mask(is_outlier.to_numpy())

    # --- Remove zero gaps ---
    # Broadcast each day's mean back onto every reading of that day. This works
    # for any date range and sampling, so no tail rows have to be patched in.
    daily_mean = cleaned.groupby(cleaned.index.normalize()).transform("mean")
    # Keep readings only on days whose mean is strictly positive
    cleaned = cleaned.where(daily_mean > 0)

    return cleaned

# Export cleaned datasets

In [8]:
# Meter types to process; each one maps to "<meter>.csv" in the raw folder
metername = [
    "electricity",
    "water",
    "chilledwater",
    "hotwater",
    "gas",
    "steam",
    "solar",
    "irrigation",
]

for meter in metername:
    # Read the raw file and index it by its parsed timestamp column
    df = (
        pd.read_csv(f"{path_raw}{meter}.csv")
        .assign(
            timestamp=lambda frame: pd.to_datetime(
                frame["timestamp"], format="%Y-%m-%d %H:%M:%S"
            )
        )
        .set_index("timestamp")
    )

    # Strip bad data, then export the cleaned readings for this meter type
    df_clean = removeBadData(df, meter)
    df_clean.to_csv(f"{path_cleaned}{meter}_cleaned.csv")