In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning and Preprocessing by ACORN Class

In [2]:
# ACORN group letter to process; it is used below to filter the summary sheet,
# label the rows and name the exported file

groupletter = "L"

In [3]:
# Load the household summary sheet and restrict it to a fixed set of columns

df_summary = pd.read_csv('../raw_data/summary_v1_selection.csv')[
    [
        'LCLid', 'start_date', 'end_date', 'number_of_days', 'data_points',
        'missing_hh', 'zeros', 'tariff', 'Block', 'Acorn', 'Group',
        'Classification', 'Select',
    ]
]

In [4]:
# Narrow the summary down to households of the chosen ACORN group that have at
# least 365 days of data and at most 48 zero-or-missing entries in total
# (presumably counted in half-hourly readings — TODO confirm against the summary sheet)
string = f'ACORN-{groupletter}'
in_group = df_summary['Acorn'] == string
enough_days = df_summary['number_of_days'] >= 365
few_gaps = (df_summary['zeros'] + df_summary['missing_hh']) <= 48
df_hot = df_summary[in_group & enough_days & few_gaps]
df_hot

Unnamed: 0,LCLid,start_date,end_date,number_of_days,data_points,missing_hh,zeros,tariff,Block,Acorn,Group,Classification,Select
3848,MAC000068,09/12/2011 13:00,28/02/2014 00:00,811,38933,18,1,Std,block_77,ACORN-L,Modest Means,Financially Stretched,True
3849,MAC000117,14/12/2011 12:00,28/02/2014 00:00,806,38712,1,0,Std,block_77,ACORN-L,Modest Means,Financially Stretched,True
3850,MAC000373,12/03/2012 11:30,28/02/2014 00:00,717,34442,0,0,Std,block_77,ACORN-L,Modest Means,Financially Stretched,True
3856,MAC000893,27/04/2012 09:30,28/02/2014 00:00,671,32230,8,0,Std,block_77,ACORN-L,Modest Means,Financially Stretched,True
3866,MAC001464,21/05/2012 12:30,28/02/2014 00:00,647,31074,6,2,Std,block_77,ACORN-L,Modest Means,Financially Stretched,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4220,MAC003398,28/09/2012 12:00,28/02/2014 00:00,517,24823,18,0,Std,block_84,ACORN-L,Modest Means,Financially Stretched,True
4231,MAC003535,08/10/2012 09:30,28/02/2014 00:00,507,24366,0,0,Std,block_84,ACORN-L,Modest Means,Financially Stretched,True
4232,MAC003537,08/10/2012 09:30,28/02/2014 00:00,507,24361,5,0,Std,block_84,ACORN-L,Modest Means,Financially Stretched,True
4241,MAC003623,11/10/2012 11:30,28/02/2014 00:00,504,24208,10,0,Std,block_84,ACORN-L,Modest Means,Financially Stretched,True


In [5]:
# Household identifiers (LCLid) of every row that passed the selection

houselist = df_hot['LCLid'].to_list()

In [6]:
# Block name for each selected household, in the same order as houselist;
# the loading loop uses it to pick the half-hourly CSV file to read

blocklist = df_hot['Block'].to_list()

In [7]:
# Tariff type for each selected household, in the same order as houselist

tarifflist = df_hot['tariff'].to_list()

In [8]:
# Loop over the selected houses, concatenating their half-hourly readings into
# flat, parallel lists (one entry per reading)
LCL = []
Acorn_Group = []  # not filled by any cell shown here; kept so the name stays defined
DateTime = []
KWH = []
tariff = []

# Every column is read as text; a later cell filters out the literal 'Null'
# readings before converting the values to numbers.
block_dtypes = {'LCLid': object, 'tstp': object, 'energy(kWh/hh)': object}

# The displayed selection lists houses of the same block next to each other, so
# remembering the most recently read block avoids re-parsing the same CSV once
# per house. Only one block is held in memory at a time, and houses are still
# processed in houselist order, so the resulting row order is unchanged.
loaded_block = None
df_block = None

for position, (house, block, house_tariff) in enumerate(
        zip(houselist, blocklist, tarifflist), start=1):
    print(f'Now adding house {position} of {len(houselist)}, {house} to the dataset...')
    if block != loaded_block:
        blockstr = f'../raw_data/halfhourly_dataset/{block}.csv'
        df_block = pd.read_csv(blockstr, dtype=block_dtypes)
        loaded_block = block
    df_house = df_block[df_block['LCLid'] == house]
    LCL.extend(df_house['LCLid'])
    DateTime.extend(df_house['tstp'])
    KWH.extend(df_house['energy(kWh/hh)'])
    # One tariff label per reading, built directly rather than through a
    # temporary column written onto the filtered frame
    tariff.extend([house_tariff] * len(df_house))
    

Now adding house 1 of 174, MAC000068 to the dataset...
Now adding house 2 of 174, MAC000117 to the dataset...
Now adding house 3 of 174, MAC000373 to the dataset...
Now adding house 4 of 174, MAC000893 to the dataset...
Now adding house 5 of 174, MAC001464 to the dataset...
Now adding house 6 of 174, MAC002013 to the dataset...
Now adding house 7 of 174, MAC002038 to the dataset...
Now adding house 8 of 174, MAC002079 to the dataset...
Now adding house 9 of 174, MAC003049 to the dataset...
Now adding house 10 of 174, MAC004234 to the dataset...
Now adding house 11 of 174, MAC005193 to the dataset...
Now adding house 12 of 174, MAC000101 to the dataset...
Now adding house 13 of 174, MAC000106 to the dataset...
Now adding house 14 of 174, MAC000146 to the dataset...
Now adding house 15 of 174, MAC000187 to the dataset...
Now adding house 16 of 174, MAC000303 to the dataset...
Now adding house 17 of 174, MAC000354 to the dataset...
Now adding house 18 of 174, MAC000395 to the dataset...
N

Now adding house 147 of 174, MAC003224 to the dataset...
Now adding house 148 of 174, MAC003242 to the dataset...
Now adding house 149 of 174, MAC003520 to the dataset...
Now adding house 150 of 174, MAC003526 to the dataset...
Now adding house 151 of 174, MAC003533 to the dataset...
Now adding house 152 of 174, MAC003577 to the dataset...
Now adding house 153 of 174, MAC003593 to the dataset...
Now adding house 154 of 174, MAC003598 to the dataset...
Now adding house 155 of 174, MAC003604 to the dataset...
Now adding house 156 of 174, MAC003609 to the dataset...
Now adding house 157 of 174, MAC003612 to the dataset...
Now adding house 158 of 174, MAC003726 to the dataset...
Now adding house 159 of 174, MAC003773 to the dataset...
Now adding house 160 of 174, MAC003821 to the dataset...
Now adding house 161 of 174, MAC004033 to the dataset...
Now adding house 162 of 174, MAC004490 to the dataset...
Now adding house 163 of 174, MAC004497 to the dataset...
Now adding house 164 of 174, MA

In [9]:
# Assemble the collected lists into a single DataFrame, one row per reading

block_data = pd.DataFrame(
    {
        'LCLid': LCL,
        'DateTime': DateTime,
        'KWH/hh': KWH,
        'Tariff': tariff,
    }
)

In [10]:
# Clean the combined readings in one chain:
#   1. drop rows whose reading is the literal text 'Null'
#   2. convert KWH/hh from text to numeric
#   3. parse DateTime into datetime values
#   4. add the ACORN labels: the group letter, plus the Group and Classification
#      taken from the first selected household
#      (assumes every selected household shares them — TODO confirm)

block_data = block_data[block_data['KWH/hh'] != 'Null'].assign(
    **{
        'KWH/hh': lambda frame: pd.to_numeric(frame['KWH/hh']),
        'DateTime': lambda frame: pd.to_datetime(frame['DateTime']),
        'Acorn_Group': groupletter,
        'Group': df_hot['Group'].iloc[0],
        'Classification': df_hot['Classification'].iloc[0],
    }
)

In [11]:
# View the final dataset: as the last expression of the cell, the frame is
# rendered by the notebook as a truncated preview (first and last rows)

block_data

Unnamed: 0,LCLid,DateTime,KWH/hh,Tariff,Acorn_Group,Group,Classification
0,MAC000068,2011-12-09 13:00:00,0.000,Std,L,Modest Means,Financially Stretched
1,MAC000068,2011-12-09 14:00:00,0.300,Std,L,Modest Means,Financially Stretched
2,MAC000068,2011-12-09 14:30:00,0.805,Std,L,Modest Means,Financially Stretched
3,MAC000068,2011-12-09 15:00:00,0.601,Std,L,Modest Means,Financially Stretched
4,MAC000068,2011-12-09 15:30:00,0.257,Std,L,Modest Means,Financially Stretched
...,...,...,...,...,...,...,...
4977547,MAC003793,2014-02-27 22:00:00,0.316,Std,L,Modest Means,Financially Stretched
4977548,MAC003793,2014-02-27 22:30:00,0.283,Std,L,Modest Means,Financially Stretched
4977549,MAC003793,2014-02-27 23:00:00,0.297,Std,L,Modest Means,Financially Stretched
4977550,MAC003793,2014-02-27 23:30:00,0.388,Std,L,Modest Means,Financially Stretched


In [12]:
# Export the cleaned dataset for this ACORN group to CSV; the file name carries
# the group letter, and the DataFrame index is written as the first column
filename = f'df_{groupletter}_v1.csv'
export_path = f'../raw_data/{filename}'

block_data.to_csv(export_path)