In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning and Preprocessing by ACORN Class

In [2]:
# Save the Group Letter Name to be processed here

groupletter = 'A'

In [3]:
# Import Summary Datasheet

df_summary = pd.read_csv('../raw_data/summary_v1_selection.csv')
df_summary = df_summary[['LCLid', 'start_date', 'end_date', 'number_of_days', 'data_points', 'missing_hh', 'zeros', 'tariff', 'Block', 'Acorn', 'Group', 'Classification', 'Select']]

In [34]:
len(df_summary[(df_summary['tariff'] == 'ToU') & (df_summary['Acorn'] == 'ACORN-G')])

47

In [5]:
# Reduce selection to chosen group which has been selected
string = f'ACORN-{groupletter}'
df_hot = df_summary[(df_summary['Acorn'] == string) 
                    & (df_summary['number_of_days'] >= 365) 
                    & (df_summary['zeros'] + df_summary['missing_hh'] <= 48)]
df_hot

Unnamed: 0,LCLid,start_date,end_date,number_of_days,data_points,missing_hh,zeros,tariff,Block,Acorn,Group,Classification,Select
1,MAC000246,03/12/2011 09:00,28/02/2014 00:00,817,39244,3,0,Std,block_0,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
4,MAC003223,17/09/2012 12:30,28/02/2014 00:00,528,25366,2,0,Std,block_0,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
5,MAC003239,18/09/2012 09:00,28/02/2014 00:00,527,25326,1,0,Std,block_0,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
7,MAC003281,21/09/2012 08:30,28/02/2014 00:00,524,25178,6,0,Std,block_0,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
8,MAC003305,24/09/2012 10:30,28/02/2014 00:00,521,25034,2,0,Std,block_0,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,MAC005421,29/02/2012 10:30,28/02/2014 00:00,729,35011,9,2,Std,block_2,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
169,MAC003211,17/09/2012 09:30,28/02/2014 00:00,528,25372,2,0,ToU,block_3,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
191,MAC004543,21/12/2011 14:30,28/02/2014 00:00,799,38365,7,8,ToU,block_3,ACORN-A,Lavish Lifestyles,Affluent Achievers,True
196,MAC004955,23/01/2012 11:30,28/02/2014 00:00,766,36794,0,0,ToU,block_3,ACORN-A,Lavish Lifestyles,Affluent Achievers,True


In [6]:
# Create a list of houses matching criteria

houselist = df_hot['LCLid'].tolist()

In [7]:
# Create a second list of the blocks where the data for each house is found

blocklist = df_hot['Block'].tolist()

In [8]:
# Create a third list of the tariff type

tarifflist = df_hot['tariff'].tolist()

In [9]:
# Iterable loop concactinating the data into lists
LCL = []
Acorn_Group = []
DateTime = []
KWH = []
tariff = []


for i in range (0,int(len(houselist))):
    print(f'Now adding house {i+1} of {len(houselist)}, {houselist[i]} to the dataset...')
    blockstr = f'../raw_data/halfhourly_dataset/{blocklist[i]}.csv'
    if blocklist[i] != blocklist[i-1]:
        df_house = pd.read_csv(blockstr, dtype = {'LCLid': object, 'tstp': object , 'energy(kWh/hh)': object})
    df_house = df_house[df_house['LCLid'] == houselist[i]]
    df_house['tariff'] = tarifflist[i]
    LCL.extend(df_house['LCLid'])
    DateTime.extend(df_house['tstp'])
    KWH.extend(df_house['energy(kWh/hh)'])
    tariff.extend(df_house['tariff'])
    

Now adding house 1 of 81, MAC000246 to the dataset...
Now adding house 2 of 81, MAC003223 to the dataset...
Now adding house 3 of 81, MAC003239 to the dataset...
Now adding house 4 of 81, MAC003281 to the dataset...
Now adding house 5 of 81, MAC003305 to the dataset...
Now adding house 6 of 81, MAC003388 to the dataset...
Now adding house 7 of 81, MAC003400 to the dataset...
Now adding house 8 of 81, MAC003428 to the dataset...
Now adding house 9 of 81, MAC003482 to the dataset...
Now adding house 10 of 81, MAC003553 to the dataset...
Now adding house 11 of 81, MAC003557 to the dataset...
Now adding house 12 of 81, MAC003579 to the dataset...
Now adding house 13 of 81, MAC003613 to the dataset...
Now adding house 14 of 81, MAC003646 to the dataset...
Now adding house 15 of 81, MAC003668 to the dataset...
Now adding house 16 of 81, MAC003686 to the dataset...
Now adding house 17 of 81, MAC003719 to the dataset...
Now adding house 18 of 81, MAC003737 to the dataset...
Now adding house 19

In [52]:
# Creating a new Data Frame with lists

block_data = pd.DataFrame({'LCLid': LCL, 'DateTime': DateTime, 'KWH/hh': KWH, 'Tariff': tariff})

In [53]:
# Remove Null values

block_data = block_data[block_data['KWH/hh'] != 'Null']

# Convert KWH/hh to numeric

block_data['KWH/hh'] = pd.to_numeric(block_data['KWH/hh'])

# Convert DateTime to DateTime format

block_data['DateTime'] = pd.to_datetime(block_data['DateTime'])

# Create ACORN group label

block_data['Acorn_Group'] = groupletter
block_data['Group'] = df_hot['Group'].iloc[0]
block_data['Classification'] = df_hot['Classification'].iloc[0]

In [54]:
block_data

Unnamed: 0,LCLid,DateTime,KWH/hh,Tariff,Acorn_Group,Group,Classification
0,MAC000246,2011-12-03 09:00:00,0.149,Std,A,Lavish Lifestyles,Affluent Achievers
1,MAC000246,2011-12-03 09:30:00,0.154,Std,A,Lavish Lifestyles,Affluent Achievers
2,MAC000246,2011-12-03 10:00:00,0.768,Std,A,Lavish Lifestyles,Affluent Achievers
3,MAC000246,2011-12-03 10:30:00,1.179,Std,A,Lavish Lifestyles,Affluent Achievers
4,MAC000246,2011-12-03 11:00:00,0.588,Std,A,Lavish Lifestyles,Affluent Achievers
...,...,...,...,...,...,...,...
2365795,MAC005159,2014-02-27 22:00:00,0.424,ToU,A,Lavish Lifestyles,Affluent Achievers
2365796,MAC005159,2014-02-27 22:30:00,0.388,ToU,A,Lavish Lifestyles,Affluent Achievers
2365797,MAC005159,2014-02-27 23:00:00,0.276,ToU,A,Lavish Lifestyles,Affluent Achievers
2365798,MAC005159,2014-02-27 23:30:00,0.255,ToU,A,Lavish Lifestyles,Affluent Achievers


In [55]:
block_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2365719 entries, 0 to 2365799
Data columns (total 7 columns):
 #   Column          Dtype         
---  ------          -----         
 0   LCLid           object        
 1   DateTime        datetime64[ns]
 2   KWH/hh          float64       
 3   Tariff          object        
 4   Acorn_Group     object        
 5   Group           object        
 6   Classification  object        
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 144.4+ MB


In [56]:
block_data

Unnamed: 0,LCLid,DateTime,KWH/hh,Tariff,Acorn_Group,Group,Classification
0,MAC000246,2011-12-03 09:00:00,0.149,Std,A,Lavish Lifestyles,Affluent Achievers
1,MAC000246,2011-12-03 09:30:00,0.154,Std,A,Lavish Lifestyles,Affluent Achievers
2,MAC000246,2011-12-03 10:00:00,0.768,Std,A,Lavish Lifestyles,Affluent Achievers
3,MAC000246,2011-12-03 10:30:00,1.179,Std,A,Lavish Lifestyles,Affluent Achievers
4,MAC000246,2011-12-03 11:00:00,0.588,Std,A,Lavish Lifestyles,Affluent Achievers
...,...,...,...,...,...,...,...
2365795,MAC005159,2014-02-27 22:00:00,0.424,ToU,A,Lavish Lifestyles,Affluent Achievers
2365796,MAC005159,2014-02-27 22:30:00,0.388,ToU,A,Lavish Lifestyles,Affluent Achievers
2365797,MAC005159,2014-02-27 23:00:00,0.276,ToU,A,Lavish Lifestyles,Affluent Achievers
2365798,MAC005159,2014-02-27 23:30:00,0.255,ToU,A,Lavish Lifestyles,Affluent Achievers


In [57]:
# Fill 0 with nans
block_data['KWH/hh'].replace(0,np.nan,inplace=True)

# Fill nan's using a back fill
date_range = pd.DataFrame(pd.date_range(block_data.index[0],block_data.index[-1], freq='30 min'),columns=['DateTime'])
block_data = date_range.merge(block_data,on='DateTime',how='outer')
if np.sum(block_data['KWH/hh'].isna())!=0:
    block_data.fillna(method='bfill',inplace=True)


Unnamed: 0,DateTime,LCLid,KWH/hh,Tariff,Acorn_Group,Group,Classification
0,1970-01-01 00:00:00,MAC000246,0.149,Std,A,Lavish Lifestyles,Affluent Achievers
1,2011-12-03 09:00:00,MAC000246,0.149,Std,A,Lavish Lifestyles,Affluent Achievers
2,2011-12-03 09:30:00,MAC000246,0.154,Std,A,Lavish Lifestyles,Affluent Achievers
3,2011-12-03 10:00:00,MAC000246,0.768,Std,A,Lavish Lifestyles,Affluent Achievers
4,2011-12-03 10:30:00,MAC000246,1.179,Std,A,Lavish Lifestyles,Affluent Achievers
...,...,...,...,...,...,...,...
2365715,2013-12-09 10:30:00,MAC005421,0.599,Std,A,Lavish Lifestyles,Affluent Achievers
2365716,2013-12-09 10:30:00,MAC003211,0.730,ToU,A,Lavish Lifestyles,Affluent Achievers
2365717,2013-12-09 10:30:00,MAC004543,0.394,ToU,A,Lavish Lifestyles,Affluent Achievers
2365718,2013-12-09 10:30:00,MAC004955,0.110,ToU,A,Lavish Lifestyles,Affluent Achievers


In [59]:
block_data = block_data[block_data['DateTime'] >= '2012-01-01']

In [60]:
# Identify and remove outliers

u = np.mean(block_data['KWH/hh'])
q3 = np.quantile(block_data['KWH/hh'], 0.75)
q1 = np.quantile(block_data['KWH/hh'], 0.25)
IQR = q3-q1
ub = q3 + 1.5 * IQR
lb = q1 - 1.5 * IQR
block_data[(block_data['KWH/hh'] <= ub) & (block_data['KWH/hh'] >= lb)]

Unnamed: 0,DateTime,LCLid,KWH/hh,Tariff,Acorn_Group,Group,Classification
7771,2012-01-01 00:00:00,MAC000246,0.509,Std,A,Lavish Lifestyles,Affluent Achievers
7772,2012-01-01 00:00:00,MAC004529,0.219,Std,A,Lavish Lifestyles,Affluent Achievers
7773,2012-01-01 00:00:00,MAC000030,0.531,Std,A,Lavish Lifestyles,Affluent Achievers
7774,2012-01-01 00:00:00,MAC000040,0.489,Std,A,Lavish Lifestyles,Affluent Achievers
7775,2012-01-01 00:00:00,MAC000103,0.264,Std,A,Lavish Lifestyles,Affluent Achievers
...,...,...,...,...,...,...,...
2365715,2013-12-09 10:30:00,MAC005421,0.599,Std,A,Lavish Lifestyles,Affluent Achievers
2365716,2013-12-09 10:30:00,MAC003211,0.730,ToU,A,Lavish Lifestyles,Affluent Achievers
2365717,2013-12-09 10:30:00,MAC004543,0.394,ToU,A,Lavish Lifestyles,Affluent Achievers
2365718,2013-12-09 10:30:00,MAC004955,0.110,ToU,A,Lavish Lifestyles,Affluent Achievers


In [67]:
# Create average time
block_data_avg=block_data.groupby(by=block_data.DateTime).mean()
block_data_avg.reset_index(inplace = True)

In [68]:
# View the final dataset

block_data_avg

Unnamed: 0,DateTime,KWH/hh
0,2012-01-01 00:00:00,0.471778
1,2012-01-01 00:30:00,0.483222
2,2012-01-01 01:00:00,0.457222
3,2012-01-01 01:30:00,0.380556
4,2012-01-01 02:00:00,0.452889
...,...,...
37868,2014-02-27 22:00:00,0.532726
37869,2014-02-27 22:30:00,0.515342
37870,2014-02-27 23:00:00,0.457260
37871,2014-02-27 23:30:00,0.405000


In [69]:
# Save and Export dataset to CSV
filename = f'df_{groupletter}__avg_v1.csv'

block_data_avg.to_csv(f'../raw_data/{filename}')

In [71]:
block_data['LCLid'].nunique()

81