# Residuals summary datasets
In this notebook the following task will be done:
- Create summary datasets, summarised by regresion metric RMSLE.

In [1]:
import pandas as pd
import numpy as np
import glob
import gzip
import pickle
import sys
import datetime as dt
from sklearn.metrics import mean_squared_log_error

sys.path.append("..\\source\\")
import utils as utils

In [2]:
path_in = "..\\data\\original\\"
path_out = "..\\data\\processed\\"

## Load files list

In [4]:
# Susb + solution files
files = glob.glob(path_out + "merged\\*")
len(files)

50

In [5]:
files[0]

'..\\data\\processed\\merged\\sub13577404_merged.pickle.gz'

## Summary datasets

In [21]:
# General script

# Groupby
#gb = "groupby_bdg_meter"
#gb = "groupby_bdg_meter_date"
gb = "groupby_meter_date"

# Files to convert
start = 1
end = 51

for datafile in files[start:end]:

    # file id
    name = datafile.split("\\")[-1].split("_")[0]

    # Print progress
    number = files.index(datafile) + 1
    total = len(files)
    progress = round(number * 100 / total,2)
    print(f"{name} - {progress}% ({number} of {len(files)})")

    # Load data
    df = pd.read_pickle(datafile)
    print(f"Data loaded")

    # Rename
    df = df.rename(columns={"res":"meter_reading"})
    # Replace all negative values in submission
    df.loc[df.submission < 0, "submission"] = 0
    # Drop NaN
    df.dropna(inplace=True)
    print("Data transformed")
    # Reduce memory use
    df = utils.reduce_mem_usage(df)

    # Get summary metrics
    if gb == "groupby_bdg_date":
        print("Grouping by bdg, meter")
        #drop useless columns
        df.drop("timestamp",axis=1,inplace=True)
        # Group
        df = df.groupby(["building_id","meter"])["submission","meter_reading"].apply(lambda group: utils.RMSLE(group["meter_reading"], group["submission"]))

    elif gb == "groupby_bdg_meter_date":
        print("Grouping by bdg, meter, date")
        # Convert to timestamp
        df.timestamp = pd.to_datetime(df.timestamp, format="%Y-%m-%d %H:%M:%S")
        # Group
        df = pd.DataFrame(df.groupby(["building_id","meter",df.timestamp.dt.date])["submission","meter_reading"].apply(lambda group: utils.RMSLE(group["meter_reading"], group["submission"]))).reset_index()

    elif gb == "groupby_meter_date":
        print("Grouping by meter, date")
        # Convert to timestamp
        df.timestamp = pd.to_datetime(df.timestamp, format="%Y-%m-%d %H:%M:%S")
        #drop useless columns
        df.drop("building_id",axis=1,inplace=True)
        # Group
        df = df.groupby(["meter",df.timestamp.dt.date])["submission","meter_reading"].apply(lambda group: utils.RMSLE(group["meter_reading"], group["submission"]))

    # Rename
    df = df.reset_index()
    # Rename columns
    df = df.rename(columns={0:"rmsle"})
    print("Data summary created")

    # Export df
    file = gzip.GzipFile(path_out + f'summary\\{gb}\\{name}.pickle.gz', 'wb', 6)
    file.write(pickle.dumps(df))
    file.close()
    print(f"Data exported")
    
    print("")

sub13768618 - 4.0% (2 of 50)
Data loaded
Data transformed
Mem. usage decreased to 1051.92 Mb (0.0% reduction)
Grouping by meter, date
Data summary created
Data exported

sub13768660 - 6.0% (3 of 50)
Data loaded
Data transformed
Mem. usage decreased to 1051.92 Mb (0.0% reduction)
Grouping by meter, date
Data summary created
Data exported

sub13806350 - 8.0% (4 of 50)
Data loaded
Data transformed
Mem. usage decreased to 1051.92 Mb (0.0% reduction)
Grouping by meter, date
Data summary created
Data exported

sub13808838 - 10.0% (5 of 50)
Data loaded
Data transformed
Mem. usage decreased to 1051.92 Mb (0.0% reduction)
Grouping by meter, date
Data summary created
Data exported

sub13810582 - 12.0% (6 of 50)
Data loaded
Data transformed
Mem. usage decreased to 1051.92 Mb (0.0% reduction)
Grouping by meter, date
Data summary created
Data exported

sub13812777 - 14.0% (7 of 50)
Data loaded
Data transformed
Mem. usage decreased to 1051.92 Mb (0.0% reduction)
Grouping by meter, date
Data summary 

## Concat all

In [22]:
# Groupby
#gb = "groupby_bdg_meter"
#gb = "groupby_bdg_meter_date"
gb = "groupby_meter_date"

In [23]:
files = glob.glob(path_out + f"summary\\{gb}\\*" )
len(files)

50

In [33]:
dfs = []
for datafile in files:
    # File id
    #name = int(datafile.split("\\")[-1].split("_")[1].split(".")[0].split("sub")[1])
    name = int(datafile.split("\\")[-1].split(".")[0].split("sub")[1])
    # Load dataset
    df = pd.read_pickle(datafile)
    # Add id
    df["file_id"] = name
    dfs.append(df)

In [34]:
df_concat = pd.concat(dfs)

In [35]:
df_concat.head()

Unnamed: 0,meter,timestamp,rmsle,file_id
0,0,2017-01-01,0.466707,13577404
1,0,2017-01-02,0.465356,13577404
2,0,2017-01-03,0.486281,13577404
3,0,2017-01-04,0.490924,13577404
4,0,2017-01-05,0.476774,13577404


In [36]:
del(dfs)

In [37]:
df_concat = utils.reduce_mem_usage(df_concat)

Mem. usage decreased to  3.20 Mb (42.5% reduction)


In [14]:
file = gzip.GzipFile(path_out + f"summary\\{gb}.pickle.gz", 'wb', 6)
file.write(pickle.dumps(df_concat))
file.close()

In [38]:
df_concat.to_csv(path_out + f"summary\\{gb}.csv", index=False)