In [1]:
import pandas as pd
import numpy as np
import holidays
import config as cfg

In [2]:
# Collect one DataFrame per raw block file
house = []

# Sort the matches so the load order does not depend on filesystem ordering
block_files = sorted(cfg.RAW_DIR.glob('block_*.csv'))
if not block_files:
    # Fail early with a descriptive message; pd.concat on an empty list would
    # otherwise raise an opaque "No objects to concatenate" error further down
    raise FileNotFoundError(f"No 'block_*.csv' files found in {cfg.RAW_DIR}")

for file_path in block_files:
    # The block number is the integer that follows the first underscore
    # in the file name, e.g. block_12.csv -> 12
    block_number = int(file_path.stem.split('_')[1])

    # Read the CSV and parse the 'day' column as dates
    df = pd.read_csv(file_path, parse_dates=['day'])

    # Tag every row with the block it was read from
    df['block_number'] = block_number

    house.append(df)

# Stack all blocks into a single frame with a fresh 0..n-1 index
house_df = pd.concat(house, ignore_index=True)

# Build the block-level frame: average energy_mean over all rows sharing the
# same (block_number, day) pair, make sure 'day' is datetime, then order the
# rows by block and day
block_df = (
    house_df
    .groupby(['block_number', 'day'])['energy_mean']
    .mean()
    .reset_index()
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
    .sort_values(['block_number', 'day'])
)

# Apply the same (block, day) ordering to the household-level frame
house_df = house_df.sort_values(['block_number', 'day'])

# Holiday calendar for the UK, restricted to the England subdivision
uk_holidays = holidays.country_holidays('GB', subdiv='England')


def is_holiday_or_sunday(date):
    """Return True when ``date`` is in ``uk_holidays`` or falls on a Sunday.

    Parameters
    ----------
    date
        A date-like object that supports membership tests against
        ``uk_holidays`` and exposes a ``weekday()`` method.

    Returns
    -------
    bool
        True for calendar holidays and for Sundays, False otherwise.
    """
    if date in uk_holidays:
        return True
    # weekday() counts from Monday == 0, so Sunday is 6
    return date.weekday() == 6

def add_calendar_features(frame):
    """Add ``DOY``, ``day_of_week`` and ``is_holiday`` columns to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a datetime ``day`` column (the ``.dt`` accessor is used on it).

    Notes
    -----
    ``is_holiday`` is True for dates flagged by ``is_holiday_or_sunday``,
    i.e. calendar holidays and Sundays.
    """
    days = frame['day']
    frame['DOY'] = days.dt.dayofyear
    frame['day_of_week'] = days.dt.weekday

    # The holiday check is a Python-level call, so evaluate it once per
    # distinct date and map the result back instead of calling it for every
    # row (the frames repeat each date many times).
    flag_by_day = {day: is_holiday_or_sunday(day) for day in days.drop_duplicates()}
    frame['is_holiday'] = days.map(flag_by_day)


# Add DOY, day of week and is_holiday to the block-level frame
add_calendar_features(block_df)

# Add DOY, day of week and is_holiday to the household-level frame
add_calendar_features(house_df)

# Drop the per-day aggregate columns that are not used further on
house_df = house_df.drop(
    columns=['energy_median', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']
)

# Make sure 'day' is a datetime column in both frames.
# pandas' own dtype check is used here because np.issubdtype cannot interpret
# pandas extension dtypes (e.g. timezone-aware datetimes) and raises TypeError.
if not pd.api.types.is_datetime64_any_dtype(house_df['day']):
    house_df['day'] = pd.to_datetime(house_df['day'])

if not pd.api.types.is_datetime64_any_dtype(block_df['day']):
    block_df['day'] = pd.to_datetime(block_df['day'])

# Date window to keep, inclusive on both ends.
# NOTE(review): both bounds are read from block_df itself, so the window
# currently spans its full date range; the trailing comments look like
# previously used fixed bounds - confirm which window is intended.
start_date = block_df['day'].min()  #'2012-06-30'
end_date = block_df['day'].max()    #'2014-02-20'

# Row masks selecting days inside [start_date, end_date]
block_in_window = block_df['day'].between(start_date, end_date)
house_in_window = house_df['day'].between(start_date, end_date)

# Keep the selected rows as new frames with a fresh 0..n-1 index
block_filtered = block_df.loc[block_in_window].reset_index(drop=True)
house_filtered = house_df.loc[house_in_window].reset_index(drop=True)

# Preview the first rows of the block-level frame, then the household-level frame
for preview_frame in (block_filtered, house_filtered):
    print(preview_frame.head())


   block_number        day  energy_mean  DOY  day_of_week  is_holiday
0             0 2011-12-03     0.300233  337            5       False
1             0 2011-12-04     0.432563  338            6        True
2             0 2011-12-05     0.125417  339            0       False
3             0 2011-12-06     0.277542  340            1       False
4             0 2011-12-07     0.188792  341            2       False
       LCLid        day  energy_mean  block_number  DOY  day_of_week  \
0  MAC000246 2011-12-03     0.300233             0  337            5   
1  MAC000246 2011-12-04     0.432563             0  338            6   
2  MAC000246 2011-12-05     0.125417             0  339            0   
3  MAC000246 2011-12-06     0.277542             0  340            1   
4  MAC000246 2011-12-07     0.188792             0  341            2   

   is_holiday  
0       False  
1        True  
2       False  
3       False  
4       False  


In [3]:
# Print a heading followed by the summary statistics (describe()) of
# block_filtered and then of house_filtered
for heading, described_frame in (
    ("Block DataFrame Description:", block_filtered),
    ("Household DataFrame Description:", house_filtered),
):
    print(heading)
    print(described_frame.describe())

Block DataFrame Description:
       block_number                            day   energy_mean  \
count  90015.000000                          90015  90015.000000   
mean      55.626429  2013-01-22 00:36:18.836860672      0.217181   
min        0.000000            2011-11-23 00:00:00      0.006708   
25%       28.000000            2012-07-06 00:00:00      0.166784   
50%       56.000000            2013-01-23 00:00:00      0.200512   
75%       84.000000            2013-08-12 00:00:00      0.249699   
max      111.000000            2014-02-28 00:00:00      1.065250   
std       32.345699                            NaN      0.076001   

                DOY   day_of_week  
count  90015.000000  90015.000000  
mean     178.419952      2.999278  
min        1.000000      0.000000  
25%       76.000000      1.000000  
50%      176.000000      3.000000  
75%      277.000000      5.000000  
max      366.000000      6.000000  
std      111.220761      1.997974  
Household DataFrame Description:
 

In [4]:
# Make sure the dataset directory exists before writing
# (presumably the pickle paths below live inside it - verify against config)
cfg.DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Write block_filtered and then house_filtered to their configured pickle
# paths, reporting each destination after the write
for frame_to_save, pickle_path in (
    (block_filtered, cfg.PICKLE_BLOCK),
    (house_filtered, cfg.PICKLE_HOUSE),
):
    frame_to_save.to_pickle(pickle_path)
    print(f"Data saved as Pickle to: {pickle_path}")

Data saved as Pickle to: z:\University Folders\Deep Learning\Project\archive\Project-v1\data\processed\block_daily_means.pkl
Data saved as Pickle to: z:\University Folders\Deep Learning\Project\archive\Project-v1\data\processed\house_daily_means.pkl
