## Preprocessing script. It:

- Parses the 'date_time' column, sorts the rows by it, and fills missing values (interpolation + extrapolation).

- Extracts **standard calendar/time fields**.

- Builds **Fourier (cyclical) encodings** for hour, weekday and month.

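- Creates specified **lags** *(1 h, 24 h, 168 h)* and **rolling‐window means/std** *(3 h, 24 h, 168 h)* for the external weather variables only (the remaining features, including the target 'kwh', are obtained in real time and get no lag/rolling features).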

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned / aligned / merged dataset from disk.
source_csv = 'Cleaned_Aligned_Merged_Corrected_Method_0.csv'
df = pd.read_csv(source_csv)
# Last expression of the cell: render the loaded frame for a quick visual check.
df

Unnamed: 0,date_time,v_red,current,power_factor,kwh,v_blue,v_yellow,consumer_device_9,consumer_device_x,Temperature (°C),Dewpoint Temperature (°C),U Wind Component (m/s),V Wind Component (m/s),Total Precipitation (mm),Snowfall (mm),Snow Cover (%)
0,2024-07-22 19:00:00,125.629543,1.256716,0.257529,0.009407,106.511045,107.392295,0.022277,21.527228,11.742883,9.245966,0.209122,-0.491055,0.006502,0.00000,0.000000
1,2024-07-22 20:00:00,131.438261,1.570021,0.293663,0.011940,108.350976,109.446880,0.022277,21.527228,11.413995,9.097879,0.145172,-0.539001,0.006545,0.00000,0.000000
2,2024-07-22 21:00:00,133.606395,1.623356,0.337952,0.013546,114.896351,116.076771,0.022277,21.527228,10.794870,8.901697,0.192856,-0.485039,0.006595,0.00000,0.000000
3,2024-07-22 22:00:00,130.571961,1.573983,0.295072,0.012767,112.902697,113.931385,0.022277,21.527228,10.399469,8.749979,0.157013,-0.303240,0.006626,0.00000,0.000000
4,2024-07-22 23:00:00,134.383937,1.929517,0.367702,0.015238,116.606275,118.257191,0.022277,21.527228,10.401529,8.646371,0.177490,-0.574604,0.006639,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1278,2024-09-14 01:00:00,153.166360,1.311676,0.428511,0.013831,123.235015,127.066644,0.025281,20.637640,6.418542,3.930261,-0.238800,-0.254710,0.000076,0.00000,0.000000
1279,2024-09-14 02:00:00,154.429679,1.390145,0.442519,0.014450,124.869917,128.664296,0.025281,20.637640,6.850275,4.471368,-0.358810,-0.158602,0.000180,0.00000,0.000000
1280,2024-09-14 03:00:00,153.347107,1.335543,0.434412,0.013929,123.849137,127.600490,0.025281,20.637640,7.650003,4.275613,-0.455002,0.319778,0.000771,0.00001,0.000000
1281,2024-09-14 04:00:00,154.433743,1.359368,0.442128,0.014209,125.007733,128.811525,0.025281,20.637640,8.839426,5.121332,-0.315643,0.565828,0.001038,0.00001,0.103516


In [3]:
# Normalise the column labels: every space becomes an underscore
# (e.g. 'Snow Cover (%)' -> 'Snow_Cover_(%)') so later cells can use one naming scheme.
df.columns = [label.replace(' ', '_') for label in df.columns]
# Show the resulting labels.
df.columns

Index(['date_time', 'v_red', 'current', 'power_factor', 'kwh', 'v_blue',
       'v_yellow', 'consumer_device_9', 'consumer_device_x',
       'Temperature_(°C)', 'Dewpoint_Temperature_(°C)',
       'U_Wind_Component_(m/s)', 'V_Wind_Component_(m/s)',
       'Total_Precipitation_(mm)', 'Snowfall_(mm)', 'Snow_Cover_(%)'],
      dtype='object')

In [4]:
# 1. DATETIME INDEX
# Parse the timestamp strings, promote them to the index and order the rows chronologically.
df = (
    df.assign(date_time=pd.to_datetime(df['date_time']))
      .set_index('date_time')
      .sort_index()
)


# 2. CALENDAR & TIME FEATURES
# Each field is read straight off the DatetimeIndex; dayofweek is Monday=0 … Sunday=6.
for field in ('hour', 'day', 'month', 'year', 'dayofweek'):
    df[field] = getattr(df.index, field)
# Saturday (5) and Sunday (6) are flagged as weekend.
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)

# 3. FOURIER / CYCLICAL ENCODINGS
# (both folk knowledge and the literature report that these features exhibit seasonality)
# Each tuple is (output prefix, source column, period, offset). The month uses
# an offset of 1 so that month 1 -> 0 and month 12 -> 11 before mapping onto the circle.
for prefix, source, period, offset in (
    ('hour',  'hour',      24, 0),
    ('dow',   'dayofweek',  7, 0),
    ('month', 'month',     12, 1),
):
    angle = 2 * np.pi * (df[source] - offset) / period
    df[f'{prefix}_sin'] = np.sin(angle)
    df[f'{prefix}_cos'] = np.cos(angle)

# Define seasons manually based on month
def get_season(month):
    """Map a calendar month number to a season label.

    Parameters
    ----------
    month : int
        Calendar month, 1 (January) through 12 (December).

    Returns
    -------
    str
        'Winter' for months 12, 1, 2; 'Summer' for 3, 4, 5; 'Monsoon' for
        6, 7, 8; 'Post-monsoon' for any other value (i.e. 9, 10, 11).
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Summer'
    if month in (6, 7, 8):
        return 'Monsoon'
    # Fallback branch for every remaining value. The label must not carry a
    # leading space, otherwise the category is stored as ' Post-monsoon' and
    # will not match a lookup or group key spelled 'Post-monsoon'.
    return 'Post-monsoon'

# Season label for every row, derived from the calendar month.
df['season'] = df['month'].apply(get_season)

# Square-root transformation of the target.
df['kwh_sqrt'] = np.sqrt(df['kwh'])

# Bin precipitation into ordered categories. pd.cut intervals are right-closed
# by default, so the bins are (-0.1, 0] -> None, (0, 2] -> Light,
# (2, 10] -> Moderate, (10, inf] -> Heavy.
df['precip_bin'] = pd.cut(
    df['Total_Precipitation_(mm)'],
    bins=[-0.1, 0, 2, 10, np.inf],
    labels=['None', 'Light', 'Moderate', 'Heavy'],
)

# Binary snow flag: 1 where any snow cover is reported, else 0.
# Computed once, as a vectorised comparison (previously this column was built
# twice with a row-wise lambda, before and after the Yeo-Johnson step).
df['snow_present'] = (df['Snow_Cover_(%)'] > 0).astype(int)

from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform; applied to numeric columns only.
# NOTE(review): the transformer is fitted on the whole frame, including the
# target 'kwh'. If the data is later split into train/test, consider fitting
# on the training portion only to avoid leakage — confirm against the modelling notebook.
pt = PowerTransformer(method='yeo-johnson')

# Features to transform.
features_to_transform = ['Snow_Cover_(%)', 'Total_Precipitation_(mm)', 'kwh', 'current']

transformed = pt.fit_transform(df[features_to_transform])

# Wrap the transformed array in a frame that reuses the original datetime
# index, so the concat below aligns row-for-row and the index stays a DatetimeIndex.
df_yeo = pd.DataFrame(
    transformed,
    index=df.index,
    columns=[f"{col}_yeo" for col in features_to_transform],
)

# Append the '<name>_yeo' columns next to the untransformed originals.
df = pd.concat([df, df_yeo], axis=1)

# Ordered snow-cover categories: (-0.1, 0] -> None, (0, 25] -> Low,
# (25, 50] -> Medium, (50, 100] -> High.
df['snow_bin'] = pd.cut(
    df['Snow_Cover_(%)'],
    bins=[-0.1, 0, 25, 50, 100],
    labels=['None', 'Low', 'Medium', 'High'],
)

# 4. LAGS & ROLLING AGGREGATES
# Built only for the external weather variables, since all the other features
# are obtained in real time.
cols = [
    'Temperature_(°C)',
    'Dewpoint_Temperature_(°C)',
    'U_Wind_Component_(m/s)',
    'V_Wind_Component_(m/s)',
    'Total_Precipitation_(mm)',
    'Snowfall_(mm)',
    'Snow_Cover_(%)',
]
lags    = [1, 24, 168]    # in rows (hours): 1 h, 1 day, 1 week
windows = [3, 24, 168]    # rolling window lengths: 3 h, 24 h, 168 h

for weather_col in cols:
    series = df[weather_col]

    # Plain lagged copies of the variable.
    for hours_back in lags:
        df[f'{weather_col}_lag_{hours_back}'] = series.shift(hours_back)

    # Rolling statistics are computed on the series shifted by one row, so the
    # window ending at row t only covers rows strictly before t (no peeking
    # at the current value).
    previous = series.shift(1)
    for span in windows:
        roller = previous.rolling(span)
        df[f'{weather_col}_roll_mean_{span}'] = roller.mean()
        df[f'{weather_col}_roll_std_{span}']  = roller.std()

# 5. FILL THE NaNs INTRODUCED BY shift()/rolling()
# The lag and rolling features start with a run of NaNs (up to the longest
# lag/window). They are filled by time-based interpolation in both directions.
# NOTE(review): this step was originally described as linear extrapolation.
# pandas' 'time' interpolation interpolates between valid points; leading and
# trailing NaNs appear to be padded with the nearest valid value rather than
# extended along a trend — confirm before relying on the earliest rows of the
# long-lag features.

# 5.1. Let pandas convert any object-dtype columns to more specific dtypes where it can.
df = df.infer_objects()

# 5.2. Names of the numeric columns only.
num_cols = df.select_dtypes(include='number').columns

# 5.3. Interpolate just those columns; categorical / string columns
#      (which do not support interpolate) are left untouched.
df[num_cols] = df[num_cols].interpolate(method='time', limit_direction='both')

  df = df.interpolate(method='time', limit_direction='both')


NotImplementedError: Categorical does not implement interpolate

In [None]:
# Production-ready frame: same data minus the raw external weather columns
# listed in `cols` (their lag / rolling derivatives are kept). Labels that are
# not present are silently skipped.
pr = df.drop(labels=cols, axis='columns', errors='ignore')
pr

In [None]:
# Persist the engineered features (datetime index written as the first column),
# then read the file back as the training frame.
features_csv = 'Training-ready_Production-ready-features-engineered.csv'
pr.to_csv(features_csv, index=True)

tr = pd.read_csv(features_csv)
tr

In [None]:
# DATETIME INDEX
# The CSV round trip leaves 'date_time' as a plain column read by read_csv:
# parse it, make it the index again and sort chronologically.
tr = (
    tr.assign(date_time=pd.to_datetime(tr['date_time']))
      .set_index('date_time')
      .sort_index()
)
tr

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns of the training frame.
corr_matrix = tr.corr(numeric_only=True)

# Create the figure first, then apply the seaborn style, then add the axes
# (same order of effects as creating the figure and letting heatmap pick the
# current axes). Note: 128 x 64 inches at 300 dpi is a 38400 x 19200 pixel image.
fig = plt.figure(figsize=(128, 64))
sns.set(style='white')
ax = fig.add_subplot(111)

# Annotated heatmap with square cells and thin separators.
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    square=True,
    linewidths=0.5,
    ax=ax,
)

# Title, layout, export to TIFF, then display.
ax.set_title('Correlation Heatmap (Training-Ready-Data)')
fig.tight_layout()
fig.savefig('Correlation_Heatmap_Training_Ready_Data.tiff', dpi=300)
plt.show()


In [None]:
# Count of missing values per column in the training frame.
tr.isna().sum()