# Building Data Genome Project 2.0
## Cleaned datasets

Biam! (pic.biam@gmail.com)

In [1]:
# data and numbers
import numpy as np
import pandas as pd
import datetime as dt
import glob

In [2]:
# Folders used by this notebook (Windows-style relative paths). Each one ends
# with a separator so that a file name can be appended directly.
path_raw, path_cleaned, path_anom = (
    "..\\data\\meters\\raw\\",
    "..\\data\\meters\\cleaned\\",
    "..\\data\\meters\\screening\\anomalies\\",
)

# Introduction

In this notebook, the cleaned meter data sets will be created.
- Outliers will be replaced by `NaN`. These were detected using the [Seasonal Hybrid ESD (S-H-ESD)](https://github.com/twitter/AnomalyDetection) algorithm developed by Twitter. The detection was implemented in R; the process can be found [here](https://github.com/buds-lab/building-data-genome-project-2/blob/master/notebooks/05_Anomaly-detection.ipynb).
- Zero readings in the `electricity` meter will be replaced by `NaN`.
- Zero-readings longer than 24 continuous hours (in all meters) will be replaced by `NaN`.

This data can be removed using `df.dropna()`.

# Functions

The original code for the `find_zero_gaps` function was written by this [Kaggle user](https://www.kaggle.com/kevincastro).

In [3]:
def find_zero_gaps(df, metername):
    """Find every run of consecutive zero readings, per building, in one meter table.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide meter table: a ``timestamp`` column plus one column per building.
        The scan follows row order, so rows are expected to be chronological.
    metername : str
        Meter name; copied into the ``meter`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per zero run, with columns ``building_id``, ``meter``,
        ``ts_from``, ``ts_to`` and ``cnt``. The columns are present even when
        no run is found, so callers can always filter on ``cnt``.

        * ``ts_from`` is the timestamp of the first zero of the run.
        * ``ts_to`` is the timestamp of the first following row that is not
          zero (NaN also ends a run). For a run that reaches the last row of a
          building, ``ts_to`` is the timestamp of that last zero row instead.
        * ``cnt`` is the number of zeros in the run minus one: the first zero
          sets the counter to 0 and each further zero adds 1.
    """
    gap_columns = ["building_id", "meter", "ts_from", "ts_to", "cnt"]

    # Long format: one row per (timestamp, building), buildings one after another
    long_df = df.melt(id_vars="timestamp", var_name="building_id", value_name="meter_reading")

    status = {}  # per-building scan state, keyed by "<building_id>_<metername>"
    gaps = []
    total_rows = len(long_df)

    # Iterating the three columns directly avoids building a Series per row
    rows = zip(long_df["timestamp"], long_df["building_id"], long_df["meter_reading"])
    for pos, (timestamp, building_id, reading) in enumerate(rows):

        bmid = str(building_id) + "_" + metername
        meter_status = status.get(bmid)
        if meter_status is None:
            meter_status = {
                "building_id": building_id,
                "meter": metername,
                "start": None,
                "count": 0,
                "last_ts": None,
            }
            status[bmid] = meter_status

        if reading == 0:
            if meter_status["start"] is None:  # first zero of a new run
                meter_status["start"] = timestamp
                meter_status["count"] = 0
            else:
                meter_status["count"] += 1
        elif meter_status["start"] is not None:  # run ended on this row
            gaps.append(
                (building_id, metername, meter_status["start"], timestamp, meter_status["count"])
            )
            meter_status["start"] = None
            meter_status["count"] = 0

        meter_status["last_ts"] = timestamp

        if pos % 10000 == 0:
            progress = round((pos / total_rows) * 100, 2)
            print(f"\rProgress: {progress}%", end="", flush=True)

    # The scan is complete (also covers an input without any row)
    print("\rProgress: 100.0%", end="", flush=True)

    # Close runs that were still open when a building's rows ended
    for meter_status in status.values():
        if meter_status["start"] is not None:
            gaps.append(
                (
                    meter_status["building_id"],
                    meter_status["meter"],
                    meter_status["start"],
                    meter_status["last_ts"],
                    meter_status["count"],
                )
            )

    return pd.DataFrame(gaps, columns=gap_columns)

In [4]:
def removeZeroGaps(df, df_gaps, limit):
    """Replace with NaN the zero readings that belong to gaps longer than ``limit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide meter table: a ``timestamp`` column plus one column per building.
        It is not modified.
    df_gaps : pandas.DataFrame
        Gap table with columns ``building_id``, ``ts_from``, ``ts_to`` and
        ``cnt`` (the layout returned by ``find_zero_gaps``). A table without a
        ``cnt`` column is treated as "no gaps".
    limit : int
        Only gaps with ``cnt > limit`` are replaced.

    Returns
    -------
    pandas.DataFrame
        Table with the same columns, in the same order, as ``df``. Rows come out
        ordered by timestamp because the table is rebuilt with ``pivot``.
    """
    # Kept to restore the original column order at the end
    col_order = df.columns

    # Long format: one row per (timestamp, building)
    long_df = df.melt(id_vars="timestamp", var_name="building_id", value_name="meter_reading")

    # Gaps longer than the limit
    if "cnt" in df_gaps.columns:
        long_gaps = df_gaps[df_gaps["cnt"] > limit]
        gap_rows = list(
            zip(long_gaps["building_id"], long_gaps["ts_from"], long_gaps["ts_to"])
        )
    else:
        gap_rows = []

    is_zero = long_df["meter_reading"] == 0
    to_clear = pd.Series(False, index=long_df.index)

    for i, (building_id, ts_from, ts_to) in enumerate(gap_rows, start=1):

        # ts_to is included on purpose: for a gap that runs to the end of the
        # data, ts_to is the timestamp of the last zero itself. For a gap that
        # ended inside the data, the row at ts_to is not zero, so the is_zero
        # guard leaves it untouched.
        to_clear |= (
            (long_df["building_id"] == building_id)
            & (long_df["timestamp"] >= ts_from)
            & (long_df["timestamp"] <= ts_to)
            & is_zero
        )

        # print progress
        percentage = round((i / len(gap_rows)) * 100, 2)
        print(f"\rProgress: {percentage}%", end="", flush=True)

    # One masking step for all gaps; mask() upcasts integer readings to float
    long_df["meter_reading"] = long_df["meter_reading"].mask(to_clear)

    # Back to wide format, same column order as the input
    wide = long_df.pivot(
        index="timestamp", columns="building_id", values="meter_reading"
    ).reset_index(level=0)

    return wide[col_order]

In [5]:
def removeAnoms(df, metername):
    """Replace the readings flagged as anomalies with NaN.

    The anomaly table is read from ``path_anom + metername + "_anoms.csv"``.
    Every non-null cell in it marks the reading with the same timestamp and
    column as an outlier.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Wide meter table with a ``timestamp`` column. It is not modified.
        When ``None``, the raw table ``path_raw + metername + ".csv"`` is loaded.
    metername : str
        Meter name, used to build the file names.

    Returns
    -------
    pandas.DataFrame
        Copy of the meter table with ``timestamp`` as datetime and the flagged
        readings set to NaN. All other values, including NaN already present,
        are kept.
    """
    # Work on the table that was passed in, so earlier cleaning is preserved
    if df is None:
        df = pd.read_csv(path_raw + metername + ".csv")
    else:
        df = df.copy()
    # Transform timestamp to datetime object type
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S")

    # Load anomalies
    df_anom = pd.read_csv(path_anom + metername + "_anoms.csv")
    df_anom["timestamp"] = pd.to_datetime(df_anom["timestamp"], format="%Y-%m-%d %H:%M:%S")
    # Drop any timezone information so both tables align on naive timestamps
    df_anom["timestamp"] = df_anom["timestamp"].dt.tz_localize(None)

    df = df.set_index("timestamp")

    # Boolean table shaped like df: True where an anomaly was reported.
    # Timestamps/columns missing from the anomaly table count as "no anomaly".
    # A boolean mask is used instead of a sentinel value so that genuine
    # readings can never be mistaken for flagged ones.
    is_anomaly = (
        df_anom.set_index("timestamp")
        .notna()
        .reindex(index=df.index, columns=df.columns, fill_value=False)
    )

    # Replace outliers with NaN
    df = df.mask(is_anomaly)

    return df.reset_index()

# Zero gaps

## Electricity meter

In this meter all zero-readings will be replaced by `NaN`.

In [6]:
# Read the raw electricity meter table
meter = pd.read_csv(f"{path_raw}electricity.csv")

In [7]:
# Missing values before cleaning: count per column, then the grand total
_nan_per_column = meter.isna().sum()
_nan_per_column.sum()

1312095

In [8]:
# Every zero reading becomes NaN
meter = meter.replace(to_replace=0, value=np.nan)

In [9]:
# Missing values after cleaning: count per column, then the grand total
_nan_per_column = meter.isna().sum()
_nan_per_column.sum()

2471853

In [10]:
# Write the cleaned electricity table (without the row index)
meter.to_csv(f"{path_cleaned}electricity_cleaned.csv", index=False)

## All remaining meters

This takes a long time, but it can be run in several kernels at the same time.

In [11]:
# Raw meter files. glob does not guarantee any order, and the cells below pick
# files by position, so the list is sorted to make that selection repeatable.
files = sorted(glob.glob(path_raw + "*.csv"))

In [12]:
# Electricity is cleaned separately (all zeros removed), so leave it out here.
# Filtering instead of list.remove() keeps the cell safe to re-run: remove()
# raises ValueError once the entry is gone.
files = [f for f in files if f != path_raw + "electricity.csv"]
files

['..\\data\\meters\\raw\\chilledwater.csv',
 '..\\data\\meters\\raw\\gas.csv',
 '..\\data\\meters\\raw\\hotwater.csv',
 '..\\data\\meters\\raw\\irrigation.csv',
 '..\\data\\meters\\raw\\solar.csv',
 '..\\data\\meters\\raw\\steam.csv',
 '..\\data\\meters\\raw\\water.csv']

In [15]:
# Preview the slice of files selected for processing
files[:1]

['..\\data\\meters\\raw\\chilledwater.csv']

In [17]:
# Only the slice files[0:1] is processed in this run; change the slice to
# handle other meters.
for file in files[0:1]:
    # Meter name = file name without folder and extension. Taking the last path
    # component (for either "\\" or "/" separators) keeps this independent of
    # how deep the data folder is.
    metername = file.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]
    print("Current meter: " + metername)

    # Load data
    df = pd.read_csv(file)

    # Original number of NaN
    print(f'Original number of missing values: {df.isna().sum().sum()}')

    # Detect gaps
    print("detecting gaps for " + metername + " meter")
    df_gaps = find_zero_gaps(df, metername)
    print()

    # Replace gaps longer than 24 hours with NaN
    print("Replacing in " + metername + " meter")
    df_zeros = removeZeroGaps(df, df_gaps, 24)
    print()

    # Final number of NaN
    print(f'Final number of missing values: {df_zeros.isna().sum().sum()}')
    print()
    print()

    # Release the raw table before exporting, then export
    del df
    df_zeros.to_csv(path_cleaned + metername + "_cleaned.csv", index=False)

Current meter: chilledwater
Original number of missing values: 676512
detecting gaps for chilledwater meter
Progress: 100.0%
Replacing in chilledwater meter
Progress: 100.0%
Final number of missing values: 1919556




# Anomalies

In [18]:
# Zero-cleaned meter files. The pattern matches only "<meter>_cleaned.csv", so
# the "<meter>_cleaned1.csv" files written further down are not picked up when
# this notebook is run again. Sorted because glob guarantees no order and the
# list is paired by position with the meter names below.
files = sorted(glob.glob(path_cleaned + "*_cleaned.csv"))

In [19]:
# Show the cleaned files that were found
files

['..\\data\\meters\\cleaned\\chilledwater_cleaned.csv',
 '..\\data\\meters\\cleaned\\electricity_cleaned.csv',
 '..\\data\\meters\\cleaned\\gas_cleaned.csv',
 '..\\data\\meters\\cleaned\\hotwater_cleaned.csv',
 '..\\data\\meters\\cleaned\\irrigation_cleaned.csv',
 '..\\data\\meters\\cleaned\\solar_cleaned.csv',
 '..\\data\\meters\\cleaned\\steam_cleaned.csv',
 '..\\data\\meters\\cleaned\\water_cleaned.csv']

In [21]:
# Meter names, listed alphabetically
meters = [
    "chilledwater",
    "electricity",
    "gas",
    "hotwater",
    "irrigation",
    "solar",
    "steam",
    "water",
]

In [22]:
# The input path is built from the meter name instead of being paired by
# position with a glob result, so a file can never be matched with the wrong
# meter. The loop variable is not called `meter`, which already names the
# electricity table loaded earlier.
for metername in meters:
    print("Processing meter: " + metername)
    df = pd.read_csv(path_cleaned + metername + "_cleaned.csv")
    df_cleaned = removeAnoms(df, metername)
    df_cleaned.to_csv(path_cleaned + metername + "_cleaned1.csv", index=False)

Processing meter: chilledwater
Processing meter: electricity
Processing meter: gas
Processing meter: hotwater
Processing meter: irrigation
Processing meter: solar
Processing meter: steam
Processing meter: water
