# Generating Day profiles from seasonal profiles
Please extract the seasonal data from CKW's monthly data first, as done in `CKW.ipynb`.

In [6]:
import polars as pl
import glob
import re
import os

In [7]:
#load spring files first
# Function to extract numeric part from file name
season= "spring"
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None

# Get the list of file paths
file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv') #location of spring season files
file_paths = sorted(file_paths[:10], key=extract_numeric_part)
print(file_paths[:10])

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\0_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10000_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10001_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10002_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10003_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10004_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10005_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10006_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10007_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10008_spring_data.csv']


In [8]:
def week_extraction(file_path):
    """ This function extracts the energy consumption data per days of the week from the season"""
    
    # Load the data into a Polars DataFrame
    data = pl.read_csv(file_path)

    # Ensure the timestamp column is treated as a string
    data = data.with_columns(pl.col('timestamp').cast(pl.Utf8))
    #print(data, "1")
    # Parse the timestamp column to datetime with the correct format and strict=False
    data = data.with_columns(pl.col('timestamp').str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%.fZ", strict=False))
    #print(data, "2")
    # Extract the day of the week and the time of day from the timestamp
    data = data.with_columns([
        pl.col('timestamp').dt.strftime('%Y-%m-%d').alias('date'),
        pl.col('timestamp').dt.weekday().alias('day_of_week'),
        pl.col('timestamp').dt.strftime('%H:%M:%S').alias('time_of_day')
    ])
    #print(data, "3")

    # Group by day of the week and time of day
    grouped = data.group_by(['day_of_week', 'time_of_day'])

    # Calculate the mean for each group
    daily_averages = grouped.agg(pl.col('value_kwh').mean().alias('average_kwh'))
    #print(data,"3")
    # Pivot the data to have days of the week as columns and time of day as rows
    pivot_table = daily_averages.pivot(
        values='average_kwh',
        index='time_of_day',
        columns='day_of_week'
    ).sort('time_of_day')

    # Replace the day_of_week index with actual day names
    day_mapping = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7:'Sunday'}
    pivot_table = pivot_table.rename({str(day): name for day, name in day_mapping.items()})
    #pivot_table= pivot_table.with_columns(pl.lit(f'{season}').alias('season'))
    #print(f'{season} table',pivot_table)
    # Show the resulting pivot table
    return(pivot_table)

In [9]:
#saves the average week for each household in the 'week' folder
for idx,file_path in enumerate(file_paths):
    #print(idx,file_path)
    df = week_extraction(file_path)
    df.write_csv(rf'C:\Users\pana\Desktop\DATA\ckw\2022\week\{idx}_week_data.csv') #path were the file will be saved
    if idx%1000 == 0:
        print(f"generated week profiles from {season}: {idx}/{len(file_paths)}")

generated week profiles from spring: 0/10


  pivot_table = daily_averages.pivot(


In [10]:
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_week_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None

# Get the list of file paths
file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\week\*_week_data.csv') #same path as before
file_paths = sorted(file_paths, key=extract_numeric_part)
print(file_paths[:10])

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\0_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\1_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\2_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\3_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\4_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\5_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\6_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\7_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\8_week_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2022\\week\\9_week_data.csv']


In [11]:
season= "summer"
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None
    
#path were summer season files are
toadd_file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv') 
toadd_file_paths = sorted(toadd_file_paths[:10], key=extract_numeric_part)
print(toadd_file_paths[:10])

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\0_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10000_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10001_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10002_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10003_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10004_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10005_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10006_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10007_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\10008_summer_data.csv']


In [12]:
def append_season_week(existing_file_path, summer_file_path):
    """ This function adds the week profile of the next season to the existing week profile"""
    # Load existing data into a Polars DataFrame
    existing_data = pl.read_csv(existing_file_path)
    #print("existing data", existing_data)

    # Load summer data
    table_summer = week_extraction(summer_file_path)
    #print(table_summer)
    # Ensure columns match between existing_data and pivot_table_summer
    existing_data = existing_data.select(table_summer.columns)

    # Concatenate existing data with summer pivot_table
    concatenated_data = pl.concat([existing_data, table_summer])

    # Return the concatenated DataFrame
    return concatenated_data


for i,file_path in enumerate(file_paths):
   
    existing_file = file_path
    summer_file = toadd_file_paths[i]
    #print(summer_file,existing_file)
    result = append_season_week(existing_file, summer_file)

    # Print or save the result
    result.write_csv(rf'C:\Users\pana\Desktop\DATA\ckw\2021\week\{i}_week_data.csv')
    if i%1000 == 0:
        print(f"Added {season} week data to previous seasons {i}/{len(file_paths)} done ")

Added summer week data to previous seasons 0/10


  pivot_table = daily_averages.pivot(


In [13]:
season= "autumn"
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None
    
#path were summer season files are
toadd_file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv') 
toadd_file_paths = sorted(toadd_file_paths[:10], key=extract_numeric_part)
print(toadd_file_paths[:10])

for i,file_path in enumerate(file_paths):
   
    existing_file = file_path
    summer_file = toadd_file_paths[i]
    #print(summer_file,existing_file)
    result = append_season_week(existing_file, summer_file)

    # Print or save the result
    result.write_csv(rf'C:\Users\pana\Desktop\DATA\ckw\2021\week\{i}_week_data.csv')
    if i%1000 == 0:
        print(f"Added {season} week data to previous seasons {i}/{len(file_paths)} done ")

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\0_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10000_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10001_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10002_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10003_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10004_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10005_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10006_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10007_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\10008_autumn_data.csv']
Added autumn week data to previous seasons 0/10 done 


  pivot_table = daily_averages.pivot(


In [14]:
season= "winter"
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None
    
#path were summer season files are
toadd_file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv') 
toadd_file_paths = sorted(toadd_file_paths[:10], key=extract_numeric_part)
print(toadd_file_paths[:10])

for i,file_path in enumerate(file_paths):
   
    existing_file = file_path
    summer_file = toadd_file_paths[i]
    #print(summer_file,existing_file)
    result = append_season_week(existing_file, summer_file)

    # Print or save the result
    result.write_csv(rf'C:\Users\pana\Desktop\DATA\ckw\2021\week\{i}_week_data.csv')
    if i%1000 == 0:
        print(f"Added {season} week data to previous seasons {i}/{len(file_paths)} done ")
""" The files saved contain week profiles of spring, summer, autumn and winter, total with 385 rows"""

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\0_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10000_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10001_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10002_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10003_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10004_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10005_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10006_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10007_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\10008_winter_data.csv']
Added winter week data to previous seasons 0/10 done 


  pivot_table = daily_averages.pivot(


' The files saved contain week profiles of spring, summer, autumn and winter, total with 386 rows'

## Creating Week avg and normalized day columns 

In [15]:
standard_order = ['time_of_day', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def week_avg(data):
    df= data
    #df = df[standard_order]
    df = df.with_columns(
        #weekdays_avg = pl.mean_horizontal("Monday","Tuesday","Wednesday","Thursday","Friday")
        #weekend_avg = pl.mean_horizontal("Saturday","Sunday")
        week_avg = pl.mean_horizontal("Monday","Tuesday","Wednesday","Thursday","Friday", "Saturday", "Sunday")
        )
    max_df= df.select(pl.max("week_avg"))
    max_df= max_df['week_avg'][0]
    
    if max_df ==0:
        df = df.with_columns(
            normalized_day = pl.col("week_avg") 
            #normalized day is the representative normalized day profile of each household
        )
    else:
        df = df.with_columns(
            normalized_day = pl.col("week_avg")/max_df
        )

    df = df.sort("time_of_day")
    return df

In [16]:
# Function to process each file
def process_file(df):
    """ This function groups by the time of day for each day of the week"""
    # Group by the 'time' column and calculate the mean for each group
    grouped_df = df.group_by("time_of_day").agg([
        pl.mean("Monday").alias("Monday"),
        pl.mean("Tuesday").alias("Tuesday"),
        pl.mean("Wednesday").alias("Wednesday"),
        pl.mean("Thursday").alias("Thursday"),
        pl.mean("Friday").alias("Friday"),
        pl.mean("Saturday").alias("Saturday"),
        pl.mean("Sunday").alias("Sunday")
    ])

    return grouped_df

#saves the average week for each household in the 'week' folder
for idx,file_path in enumerate(file_paths):
    df = pl.read_csv(file_path)
    df = df[standard_order]
    #df = df.with_columns(pl.col('time_of_day').str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%.fZ", strict=False))
    
    df_reduced = process_file(df)
    
    df_normalized = week_avg(df_reduced)
    
    df_normalized.write_csv(rf'C:\Users\pana\Desktop\DATA\ckw\2022\week\{idx}_week_data.csv')
    if idx%1000 == 0:
        print(f"Processed {idx}/{len(file_paths)}")

#Average week profiles for the whole year is processed

Processed 0/10
