## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import os

In [None]:
#!pip install dask

## Extract data from gcp

In [2]:
#this does not group by tariff type
def process_group_data(g):
    #not required for Chris
    df=pd.read_csv(f"https://storage.googleapis.com/energy_usage_prediction_903/df_{g}_v1.csv")
    df.drop(columns='Unnamed: 0',inplace=True)
    #imputing zeros and nans on group avverage as opposed to per house for speed
    df1=df.loc[:,['LCLid','Acorn_Group','DateTime','KWH/hh','Tariff']]
    df1.loc[:,'DateTime']=pd.to_datetime(df1['DateTime'])
    #group average
    df2=df1.groupby(by=df.DateTime).mean()
    df2.sort_index(inplace=True)
    #fill 0s with nans
    df2['KWH/hh'].replace(0,np.nan,inplace=True)
    #fill nans for both existing ones and any created due to missing HH 
    date_range = pd.DataFrame(pd.date_range(df2.index[0],df2.index[-1], freq='30 min'),columns=['DateTime'])
    df2.reset_index(inplace=True)
    df2.loc[:,'DateTime']=pd.to_datetime(df2['DateTime'])
    df3=date_range.merge(df2,on='DateTime',how='outer')
    if np.sum(df3['KWH/hh'].isna())!=0:
        df3.fillna(method='bfill',inplace=True)
    return df3

## Check start and end date for each group

In [2]:
groups=['A','B','C','D','F','G','H','I','J','K','L','M','N','O','P','Q']
def extract_start_end_time_bygroup(groups):
    g_dict={}
    g_dict['E']=[['2011-11-24 11:00:00', '2014-02-28 00:00:00']]
    for g in groups:
        df=pd.read_csv(f"https://storage.googleapis.com/energy_usage_prediction_903/df_{g}_v1.csv",usecols=['DateTime','KWH/hh'])
        df.set_index('DateTime',inplace=True)
        df_av=df.groupby(by=df.index).mean()
        df_av.sort_index(inplace=True)
        g_dict[g]=[[df_av.index[0],df_av.index[-1]]]
        
    return g_dict

In [None]:
# df=pd.read_csv('https://storage.googleapis.com/energy_usage_prediction_903/df_E_v1.csv',usecols=['DateTime','KWH/hh'])
# df.set_index('DateTime',inplace=True)
# t=[df.index[0],df.index[-1]]#['2012-07-09 11:30:00', '2014-02-28 00:00:00']

In [None]:
# #read data in chunks of 1 million rows at a time
# chunk = pd.read_csv('https://storage.googleapis.com/energy_usage_prediction_903/df_E_v1.csv',chunksize=1000000)
# pd_df = pd.concat(chunk)

## merge_all_data

In [3]:
groups=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q']
def merge_groups(groups,use_tariff=True):
    df=pd.DataFrame()
    for g  in groups:
        if use_tariff==True:
            tariffs=['Std','ToU']
            for tr in tariffs:
                df2=pd.read_csv(f"https://storage.googleapis.com/energy_usage_prediction_903/df_{g}_avg_{tr}_v1.csv").drop(columns='Unnamed: 0')
                df2['Acorn_Group']=g
                df=pd.concat([df, df2])                
        else:
            df1=pd.read_csv(f"https://storage.googleapis.com/energy_usage_prediction_903/df_{g}_avg_v1.csv").drop(columns='Unnamed: 0')
            df1['Acorn_Group']=g
            df=pd.concat([df, df1])
    return df
    

## Save files

In [4]:
df_all_avg_tariff=merge_groups(groups,use_tariff=True)
df_all_avg=merge_groups(groups,use_tariff=False)

path=os.path.join(os.getcwd(),"../Team_Energy/data/")
print(path)
df_all_avg_tariff.to_csv(path+"df_all_avg_tariff_v3.csv",header=True,index=False)
df_all_avg.to_csv(path+"df_all_avg_v3.csv",header=True,index=False)

/home/zenanahmed/code/ZenanAH/Team_Energy/notebooks/../Team_Energy/data/


In [3]:
## group by tariffs
df=pd.read_csv(f"https://storage.googleapis.com/energy_usage_prediction_903/df_{'B'}_v1.csv")
df.drop(columns='Unnamed: 0',inplace=True)
#imputing zeros and nans on group avverage as opposed to per house for speed
df1=df.loc[:,['LCLid','Acorn_Group','DateTime','KWH/hh','Tariff']]
df1.loc[:,'DateTime']=pd.to_datetime(df1['DateTime'])
#group average
df2=df1.groupby(by=[df.DateTime,'Tariff']).mean()
df2.reset_index(inplace=True)
#fill 0s with nans
df2['KWH/hh'].replace(0,np.nan,inplace=True)
df3=df2.pivot(index='DateTime', columns='Tariff', values='KWH/hh').reset_index()
df3.rename_axis(None, axis=1,inplace=True)
#fill nans for both existing ones and any created due to missing HH 
#std
df_std=df3[['DateTime','Std']].set_index('DateTime')
date_range = pd.DataFrame(pd.date_range(df_std.index[0],df_std.index[-1], freq='30 min'),columns=['DateTime'])
df_std.reset_index(inplace=True)
df_std.loc[:,'DateTime']=pd.to_datetime(df_std['DateTime'])
df3_std=date_range.merge(df_std,on='DateTime',how='outer')
if np.sum(df3_std['KWH/hh'].isna())!=0:
    df3_std.fillna(method='bfill',inplace=True)
#tou
df_tou=df3[['DateTime','ToU']].set_index('DateTime').dropna(inplace=True)
date_range2 = pd.DataFrame(pd.date_range(df_tou.index[0],df_tou.index[-1], freq='30 min'),columns=['DateTime'])
df_tou.reset_index(inplace=True)
df_tou.loc[:,'DateTime']=pd.to_datetime(df_tou['DateTime'])
df3_tou=date_range2.merge(df_tou,on='DateTime',how='outer')
if np.sum(df3_tou['KWH/hh'].isna())!=0:
    df3_tou.fillna(method='bfill',inplace=True)

KeyError: 'KWH/hh'