# Day-Averaged Yearly files

The average (mean) of energy consumed in a day is calculated for per season then appended together

In [12]:
import polars as pl
import glob
import re
from pathlib import Path
import os

In [13]:
import polars as pl
from datetime import datetime

# Define seasonal timestamp ranges
season_ranges = {
    "spring": pl.datetime_range(datetime(2021, 3, 1), datetime(2021, 5, 31), "1d", eager=True).cast(pl.Date),
    "summer": pl.datetime_range(datetime(2021, 6, 1), datetime(2021, 8, 31), "1d", eager=True).cast(pl.Date),
    "autumn": pl.datetime_range(datetime(2021, 9, 1), datetime(2021, 11, 30), "1d", eager=True).cast(pl.Date),
    "winter": pl.datetime_range(datetime(2021, 12, 1), datetime(2022, 2, 28), "1d", eager=True).cast(pl.Date)
}

In [14]:
#load spring files first
# Function to extract numeric part from file name
season= "spring"
timestamp_range = season_ranges[season]

def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None

# Get the list of file paths
file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv') #location of spring season files
file_paths = sorted(file_paths, key=extract_numeric_part)
print(file_paths)


['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\0_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10000_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10001_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10002_spring_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\spring\\10003_spring_data.csv']


In [15]:
def day_avg_preprocess(df, timestamp_range):
    
    """
    Preprocess the input dataframe by converting the 'timestamp' column to date, 
    calculating the daily average of 'value_kwh', and filtering based on a timestamp range.

    Parameters:
    - df: DataFrame containing a 'timestamp' column and 'value_kwh' column.
    - timestamp_range: Tuple or list of two timestamps representing the start and end date range (inclusive).

    Returns:
    - day_avg: DataFrame containing the day-average of 'value_kwh' within the specified range sorted by date
    """
    # Convert the 'timestamp' column to date format
    df = df.with_columns(
        pl.col("timestamp").str.strptime(pl.Date, "%Y-%m-%dT%H:%M:%S.%fZ").alias("date")
        )

    # Calculate daily average
    day_avg = df.group_by("date").agg(
        pl.col("value_kwh").mean().alias("day_avg_kwh")
    )

    # Sort by date and filter by timestamp range
    day_avg = day_avg.sort("date").filter(pl.col("date").is_in(timestamp_range))

    # Processed data can be saved, printed, or returned here as needed
    return day_avg


In [16]:
# Define the output directory
output_dir = Path(r'C:\Users\pana\Desktop\DATA\ckw\2022\day_avg')
output_dir.mkdir(parents=True, exist_ok=True)  # Create if it doesn't exist

for idx, file_path in enumerate(file_paths):
    # Read the data
    df = pl.read_csv(file_path)

    day_avg= day_avg_preprocess(df,timestamp_range)

    # Save the daily average to a CSV file
    output_file = os.path.join(output_dir, f"{idx}_day_avg_data.csv")
    
    day_avg.write_csv(output_file)
    
    if idx%1000 ==0:
        print(f"{season} day-averaged files generated {idx} of {len(file_paths)}")

spring day-averaged files generated 0 of 5


  pl.col("timestamp").str.strptime(pl.Date, "%Y-%m-%dT%H:%M:%S.%fZ").alias("date")


In [17]:
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_day_avg_data\.csv', file_path.name)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None

# Get the list of file paths
file_paths = list(output_dir.glob('*_day_avg_data.csv')) #same path as previous cell
file_paths = sorted(file_paths, key=extract_numeric_part)
print(file_paths[:5])

[WindowsPath('C:/Users/pana/Desktop/DATA/ckw/2022/day_avg/0_day_avg_data.csv')]


## Adding summer values to the existing day-averaged spring values

In [18]:
season= "summer"
timestamp_range = season_ranges[season]
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None
    
#path were summer season files are
toadd_file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv')
toadd_file_paths = sorted(toadd_file_paths, key=extract_numeric_part)
print(toadd_file_paths[:5])

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\0_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\1_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\2_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\3_summer_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\summer\\4_summer_data.csv']


In [19]:
def append_day_avg(existing_file_path, summer_file_path):
    # Load existing data into a Polars DataFrame
    existing_data = pl.read_csv(existing_file_path)
    existing_data = existing_data.with_columns(
    pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").alias("date")
    )
   
    summer_file = pl.read_csv(summer_file_path)
    
    # Load summer data
    table_summer = day_avg_preprocess(summer_file,timestamp_range)
   
    # Ensure columns match between existing_data and pivot_table_summer
    existing_data = existing_data.select(table_summer.columns)
    #print(table_summer,existing_data)
    # Concatenate existing data with summer pivot_table
    concatenated_data = pl.concat([existing_data, table_summer])

    # Return the concatenated DataFrame
    return concatenated_data

for idx,file_path in enumerate(file_paths):
   
    existing_file = file_path
    #print(pl.read_csv(existing_file))
    summer_file = toadd_file_paths[idx]
    #print(summer_file,existing_file)
    result = append_day_avg(existing_file, summer_file)

    # Print or save the result
    output_file = os.path.join(output_dir, f"{idx}_day_avg_data.csv")
    result.write_csv(output_file)

    if idx%1000 ==0:
        print(f"{season} day-average files add to existing day-avg file {idx} of {len(file_paths)}")

summer day-average files add to existing day-avg file 0 of 1


  pl.col("timestamp").str.strptime(pl.Date, "%Y-%m-%dT%H:%M:%S.%fZ").alias("date")


## Adding Autumn

In [20]:
season= "autumn"
timestamp_range = season_ranges[season]
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None
    
#path were summer season files are
toadd_file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv')
toadd_file_paths = sorted(toadd_file_paths, key=extract_numeric_part)
print(toadd_file_paths[:5])

for idx,file_path in enumerate(file_paths):
   
    existing_file = file_path
    #print(pl.read_csv(existing_file))
    summer_file = toadd_file_paths[idx]
    #print(summer_file,existing_file)
    result = append_day_avg(existing_file, summer_file)

    # Print or save the result
    output_file = os.path.join(output_dir, f"{idx}_day_avg_data.csv")
    result.write_csv(output_file)

    if idx%1000 ==0:
        print(f"{season} weekday files add to existing day-avg file {idx} of {len(file_paths)}")

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\0_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\1_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\2_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\3_autumn_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\autumn\\4_autumn_data.csv']
autumn weekday files add to existing day-avg file 0 of 1


  pl.col("timestamp").str.strptime(pl.Date, "%Y-%m-%dT%H:%M:%S.%fZ").alias("date")


## Adding Winter

In [21]:
season= "winter"
timestamp_range = season_ranges[season]
def extract_numeric_part(file_path):
    match = re.search(rf'(\d+)_{season}_data\.csv', file_path)
    if match:
        return int(match.group(1))
    else:
        print(f"Warning: Filename {file_path} does not match pattern")
        return None
    
#path were summer season files are
toadd_file_paths = glob.glob(rf'C:\Users\pana\Desktop\DATA\ckw\2021\{season}\*_{season}_data.csv')
toadd_file_paths = sorted(toadd_file_paths, key=extract_numeric_part)
print(toadd_file_paths[:5])

for idx,file_path in enumerate(file_paths):
   
    existing_file = file_path
    #print(pl.read_csv(existing_file))
    summer_file = toadd_file_paths[idx]
    #print(summer_file,existing_file)
    result = append_day_avg(existing_file, summer_file)

    # Print or save the result
    output_file = os.path.join(output_dir, f"{idx}_day_avg_data.csv")
    result.write_csv(output_file)

    if idx%1000 ==0:
        print(f"{season} weekday files add to existing day-avg file {idx} of {len(file_paths)}")

['C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\0_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\1_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\2_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\3_winter_data.csv', 'C:\\Users\\pana\\Desktop\\DATA\\ckw\\2021\\winter\\4_winter_data.csv']
winter weekday files add to existing day-avg file 0 of 1


  pl.col("timestamp").str.strptime(pl.Date, "%Y-%m-%dT%H:%M:%S.%fZ").alias("date")


# Normalizing

In [24]:
def normalize(df):
    """ This function normalized the energy consumed with the maximum energy consumed in a day as saves
        it column named "normalized_kwh"        """
    
    max_df= df.select(pl.max("day_avg_kwh"))
    max_df= max_df['day_avg_kwh'][0]
    #print(max_df)
    if max_df ==0:
        df = df.with_columns(
            normalized_kwh = pl.col("day_avg_kwh")
        )
    else:
        df = df.with_columns(
            normalized_kwh = pl.col("day_avg_kwh")/max_df
        )

    df = df.sort("date")
    return df

In [25]:
for idx,file_path in enumerate(file_paths):
   
    df = pl.read_csv(file_path)
    normalized_df = normalize(df)

    # Print or save the result
    output_file = os.path.join(output_dir, f"{idx}_day_avg_data.csv")
    normalized_df.write_csv(output_file)

    if idx%1000 ==0:
        print(f"{season} weekday files add to existing day-avg file {idx} of {len(file_paths)}")

print("Day-averaged profiles created")

winter weekday files add to existing day-avg file 0 of 1
