# 02_feature_engineering.ipynb - Feature Engineering
Create features for solar power forecasting models.

In [1]:
import pandas as pd
import numpy as np

# Load the merged and cleaned dataset
df = pd.read_csv('../data/processed/merged_cleaned_data.csv', parse_dates=['DATE_TIME'])
df = df.sort_values('DATE_TIME')
df = df.set_index('DATE_TIME')
df.head()

Unnamed: 0_level_0,PLANT_ID_x,SOURCE_KEY_x,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_y,SOURCE_KEY_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-05-15,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
2020-05-15,4135001,zVJPv84UY57bAof,0.0,0.0,0.0,7116151.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
2020-05-15,4135001,zBIq5rxdHJRwDNY,0.0,0.0,0.0,6339380.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
2020-05-15,4135001,z9Y9gH1T5YWrNuG,0.0,0.0,0.0,7007866.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
2020-05-15,4135001,wCURE6d3bPkepu2,0.0,0.0,0.0,6782598.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0


In [2]:
# Create time-based features
df['hour'] = df.index.hour
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek
df['month'] = df.index.month
df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)
df[['hour', 'day', 'dayofweek', 'month', 'is_weekend']].head()

Unnamed: 0_level_0,hour,day,dayofweek,month,is_weekend
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-15,0,15,4,5,0
2020-05-15,0,15,4,5,0
2020-05-15,0,15,4,5,0
2020-05-15,0,15,4,5,0
2020-05-15,0,15,4,5,0


In [7]:
df

Unnamed: 0_level_0,PLANT_ID_x,SOURCE_KEY_x,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_y,SOURCE_KEY_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,...,AC_POWER_lag1,IRRADIATION_lag1,AC_POWER_lag2,IRRADIATION_lag2,AC_POWER_lag3,IRRADIATION_lag3,AC_POWER_roll_mean_3h,IRRADIATION_roll_mean_3h,AC_POWER_roll_mean_6h,IRRADIATION_roll_mean_6h
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-15 00:00:00,4135001,uHbuxQJl8lW7ozc,0.0,0.0,0.0,7038681.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0
2020-05-15 00:00:00,4135001,rGa61gmuvPhdLxV,0.0,0.0,0.0,7111493.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0
2020-05-15 00:00:00,4135001,pkci93gMrogZuBj,0.0,0.0,0.0,7169102.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0
2020-05-15 00:00:00,4135001,ih0vzX44oOqAx2f,0.0,0.0,0.0,6185184.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0
2020-05-15 00:00:00,4135001,iCRJl6heRkivqQ3,0.0,0.0,0.0,7177992.0,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-17 23:45:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,6034.0,6433566.0,4135001,HmiyD2TTLFNqkNe,21.909288,20.427972,...,0.0,0.0,0.0,0.0,0.0,0.0,1.231015e-12,0.0,0.0,0.0
2020-06-17 23:45:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5521.0,6485319.0,4135001,HmiyD2TTLFNqkNe,21.909288,20.427972,...,0.0,0.0,0.0,0.0,0.0,0.0,1.231015e-12,0.0,0.0,0.0
2020-06-17 23:45:00,4135001,zBIq5rxdHJRwDNY,0.0,0.0,5817.0,6583369.0,4135001,HmiyD2TTLFNqkNe,21.909288,20.427972,...,0.0,0.0,0.0,0.0,0.0,0.0,1.231015e-12,0.0,0.0,0.0
2020-06-17 23:45:00,4135001,ZoEaEvLYb1n2sOq,0.0,0.0,5871.0,7341753.0,4135001,HmiyD2TTLFNqkNe,21.909288,20.427972,...,0.0,0.0,0.0,0.0,0.0,0.0,1.231015e-12,0.0,0.0,0.0


In [3]:
# Create lag features (1h, 2h, 3h)
for lag in [1, 2, 3]:
    df[f'AC_POWER_lag{lag}'] = df['AC_POWER'].shift(lag)
    df[f'IRRADIATION_lag{lag}'] = df['IRRADIATION'].shift(lag)
df[[f'AC_POWER_lag{lag}' for lag in [1,2,3]]].head()

Unnamed: 0_level_0,AC_POWER_lag1,AC_POWER_lag2,AC_POWER_lag3
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-15,,,
2020-05-15,0.0,,
2020-05-15,0.0,0.0,
2020-05-15,0.0,0.0,0.0
2020-05-15,0.0,0.0,0.0


In [4]:
# Create rolling average features
for window in [3, 6]:
    df[f'AC_POWER_roll_mean_{window}h'] = df['AC_POWER'].rolling(window=window).mean()
    df[f'IRRADIATION_roll_mean_{window}h'] = df['IRRADIATION'].rolling(window=window).mean()
df[[f'AC_POWER_roll_mean_{window}h' for window in [3,6]]].head()

Unnamed: 0_level_0,AC_POWER_roll_mean_3h,AC_POWER_roll_mean_6h
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-15,,
2020-05-15,,
2020-05-15,0.0,
2020-05-15,0.0,
2020-05-15,0.0,


In [5]:
# Drop rows with NaN values from lags/rolling
df = df.dropna()
print("After dropping NA:", df.shape)

After dropping NA: (45675, 26)


In [6]:
# Save engineered features
df.to_csv('../data/processed/feature_engineered_data.csv')
print("✅ Saved: feature_engineered_data.csv")

✅ Saved: feature_engineered_data.csv
