# In [24]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def generate_pv_scenarios(csv_file, num_scenarios=3, num_periods=4):
    """Cluster daily PV profiles into scenarios and average them over periods.

    The CSV is read with ``;`` as the separator and must provide a ``time``
    column that ``pandas.to_datetime`` can parse and an ``electricity``
    column. Every calendar day is turned into a 24-value hourly profile
    (hours without data count as 0), the days are clustered with KMeans, and
    the mean profile of each cluster is averaged over ``num_periods``
    equal-length, consecutive blocks of hours starting at hour 0.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the
            hourly production data.
        num_scenarios: Number of KMeans clusters to form; must be >= 1.
        num_periods: Number of periods each representative day is split
            into; must be a positive divisor of 24.

    Returns:
        A dict mapping each cluster label to a 1-D numpy array of length
        ``num_periods`` holding the mean ``electricity`` value per period.

    Raises:
        ValueError: If ``num_scenarios`` is smaller than 1 or ``num_periods``
            is not a positive divisor of 24.
    """
    hours_per_day = 24

    # Fail fast on bad arguments, before any file I/O or clustering happens.
    if num_scenarios < 1:
        raise ValueError(
            f"num_scenarios must be at least 1, got {num_scenarios!r}"
        )
    if num_periods < 1 or hours_per_day % num_periods != 0:
        raise ValueError(
            f"num_periods must be a positive divisor of {hours_per_day}, "
            f"got {num_periods!r}"
        )
    period_length = hours_per_day // num_periods

    # Load the PV production data; any column containing a missing value
    # is dropped entirely.
    df = pd.read_csv(csv_file, sep=";")
    df.dropna(axis="columns", inplace=True)

    # Ensure proper datetime parsing
    df["time"] = pd.to_datetime(df["time"])
    df.set_index("time", inplace=True)

    # Reshape data into one row per day and one column per hour of the day
    df["date"] = df.index.date
    df["hour"] = df.index.hour
    daily_profiles = df.pivot_table(
        index="date",
        columns="hour",
        values="electricity",
        aggfunc="mean",  # string form; passing np.mean is deprecated in pandas
    )
    # Guarantee exactly 24 ordered hour columns even when some hour never
    # occurs in the data, so the period reshape below always fits.
    daily_profiles = daily_profiles.reindex(columns=range(hours_per_day))
    daily_profiles = daily_profiles.fillna(0)

    # Cluster days into `num_scenarios` using KMeans. Labels are kept in a
    # separate Series so the feature matrix stays purely numeric hours.
    kmeans = KMeans(n_clusters=num_scenarios, random_state=42, n_init=10)
    labels = pd.Series(
        kmeans.fit_predict(daily_profiles),
        index=daily_profiles.index,
        name="scenario",
    )

    # Representative day of each scenario = mean hourly profile of its days
    scenario_representatives = daily_profiles.groupby(labels).mean()

    # Break each representative day into `num_periods` by averaging over
    # consecutive blocks of `period_length` hours.
    return {
        scenario: row.to_numpy()
        .reshape(num_periods, period_length)
        .mean(axis=1)
        for scenario, row in scenario_representatives.iterrows()
    }

# Example usage: guarded so the CSV is only read and printed when this file
# is executed directly (or run as a notebook cell), not when it is imported.
if __name__ == "__main__":
    scenarios = generate_pv_scenarios(
        "/Users/chris/Downloads/pv_hourly2019_cologne.csv",
        num_scenarios=3,
        num_periods=4,
    )
    print(scenarios)


# Output of the example run above:
# {0: array([  2.55405405, 290.09159159, 203.95945946,   0.43093093]), 1: array([ 11.9291939 , 521.13616558, 393.82788671,   2.42810458]), 2: array([2.07920792e-01, 7.63844884e+01, 6.44389439e+01, 5.61056106e-02])}


# Remainder of the cell output (presumably a pandas warning echoing this source line):
#   daily_profiles = df.pivot_table(index="date", columns="hour", values="electricity", aggfunc=np.mean)
