Todo 
- calcualte industry-household for country selection (already defined in first cell)
- calculate European demand for each of the indicators, return which countries are included in each calculation

In [11]:
# Import necessary libraries
import pandas as pd

# Define paths 
EUROSTAT_PATH = "src/data/raw/eurostat/latest_data.csv"
BNETZA_PATH = "src/data/raw/germany_household/latest_data.csv"


In [12]:
# 1. Load and preprocess daily demand data
df = pd.read_csv("src/data/processed/daily_demand_all.csv")
df['date'] = pd.to_datetime(df['date'].astype(str).str[:10])
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by relevant columns and convert to TWh
df = df.groupby(['country', 'type', 'source', 'year', 'month'], as_index=False).agg({'demand': 'sum'})
df['demand'] = df['demand'] / 1000000000

In [13]:
# 2. Load and process Eurostat data
eurostat_df = pd.read_csv(EUROSTAT_PATH)
eurostat_df['date'] = pd.to_datetime(eurostat_df['date'])
eurostat_df['month'] = eurostat_df['date'].dt.month
eurostat_df['year'] = eurostat_df['date'].dt.year
del eurostat_df['date']
df = pd.concat([df, eurostat_df], ignore_index=True)

In [14]:
# 3. Load and process BNetzA data 
bnetza_df = pd.read_csv(BNETZA_PATH)
bnetza_df['date'] = pd.to_datetime(bnetza_df['date'])
bnetza_df['month'] = bnetza_df['date'].dt.month
bnetza_df['year'] = bnetza_df['date'].dt.year
bnetza_append = bnetza_df[['country', 'type', 'source', 'year', 'month', 'demand']]
df = pd.concat([df, bnetza_append], ignore_index=True)


# 4. Aggregate data
aggregated_df = (
    df.groupby(['country', 'type', 'year', 'month'], as_index=False)
    .agg({
        'demand': 'sum',
        'source': lambda x: ', '.join(sorted(set(x))) if len(set(x)) > 1 else x.iloc[0]
    })
)

# add filtered code here

In [15]:
import pandas as pd

def track_available_demand(df):
    pivot_df = df.pivot_table(index=["country", "year", "month"], columns="type", values="demand", aggfunc="sum")
    
    records = []
    
    for index, row in pivot_df.iterrows():
        country, year, month = index
        existing_types = set(row.dropna().index)
        
        records.append({
            "country": country,
            "year": year,
            "month": month,
            "available_types": list(existing_types)
        })
    
    tracking_df = pd.DataFrame(records)
    return tracking_df


def adjust_demand(aggregated_df, new_type, operation, required_types_in=None, required_types_not_in=None):
    """
    Adjusts demand based on whether required types are in or not in available_types.
    """
    tracking_available_demand = track_available_demand(aggregated_df)
    
    required_types_in = required_types_in if required_types_in is not None else set()
    required_types_not_in = required_types_not_in if required_types_not_in is not None else set()
    
    valid_entries = tracking_available_demand[
        tracking_available_demand['available_types'].apply(
            lambda x: required_types_in.issubset(set(x)) and required_types_not_in.isdisjoint(set(x))
        )
    ]
    
    new_rows = []
    
    for _, row in valid_entries.iterrows():
        country, year, month = row['country'], row['year'], row['month']
        subset = aggregated_df[(aggregated_df['country'] == country) &
                               (aggregated_df['year'] == year) &
                               (aggregated_df['month'] == month)]
        
        new_demand = operation(subset)
        if new_demand is not None:
            new_rows.append({
                'country': country,
                'year': year,
                'month': month,
                'type': new_type,
                'demand': new_demand,
                'source': 'calculated'
            })
    
    if new_rows:
        new_df = pd.DataFrame(new_rows)
        aggregated_df = pd.concat([aggregated_df, new_df], ignore_index=True)
        
        # Remove 'industry-power' rows after calculating industry_str_demand_operation
        if new_type == 'industry':
            aggregated_df = aggregated_df[aggregated_df['type'] != 'industry-power'].reset_index(drop=True)
            aggregated_df = aggregated_df[aggregated_df['type'] != 'distribution'].reset_index(drop=True)
    
    return aggregated_df


def industry_str_demand_operation(subset):
    industry_power_demand = subset[subset['type'] == 'industry-power']['demand'].values
    power_demand = subset[subset['type'] == 'power']['demand'].values
    return industry_power_demand[0] - power_demand[0] if len(industry_power_demand) > 0 and len(power_demand) > 0 else None


def industry_sub_demand_operation(subset):
    total_demand = subset[subset['type'] == 'total']['demand'].values
    power_demand = subset[subset['type'] == 'power']['demand'].values
    household_demand = subset[subset['type'] == 'household']['demand'].values
    return total_demand[0] - power_demand[0] - household_demand[0] if len(total_demand) > 0 and len(power_demand) > 0 and len(household_demand) > 0 else None


def total_demand_operation(subset):
    return subset['demand'].sum()


def industry_household_demand_operation(subset):
    total_demand = subset[subset['type'] == 'total']['demand'].values
    power_demand = subset[subset['type'] == 'power']['demand'].values
    return total_demand[0] - power_demand[0] if len(total_demand) > 0 and len(power_demand) > 0 else None


# Apply operations dynamically based on missing data
# Apply operations dynamically based on missing data
updated_aggregated_df = adjust_demand(aggregated_df, 'industry', industry_sub_demand_operation, {'total', 'power', 'household'}, {'industry'})
updated_aggregated_df = adjust_demand(updated_aggregated_df, 'industry', industry_str_demand_operation, {'industry-power', 'power', 'household'}, {'industry'})
updated_aggregated_df = adjust_demand(updated_aggregated_df, 'total', total_demand_operation, None, {'total'})
final_aggregated_df = adjust_demand(updated_aggregated_df, 'industry-household', industry_household_demand_operation, {'power', 'total'},  {'industry', 'household'})



# Recalculate tracking_available_demand
tracking_available_demand = track_available_demand(final_aggregated_df)


In [16]:
tracking_available_demand 

Unnamed: 0,country,year,month,available_types
0,AT,2016,1,[total]
1,AT,2016,2,[total]
2,AT,2016,3,[total]
3,AT,2016,4,[total]
4,AT,2016,5,[total]
...,...,...,...,...
3145,UK,2024,9,"[power, total, household, industry]"
3146,UK,2024,10,"[power, total, household, industry]"
3147,UK,2024,11,"[power, total, household, industry]"
3148,UK,2024,12,"[power, total, household, industry]"


In [17]:
# Remove unwanted countries
final_aggregated_df = final_aggregated_df[~final_aggregated_df['country'].isin(["EU27_2020", "CY", "MT"])]

def compute_country_contributions(group):
    total_demand = group['demand'].sum()  # Compute total demand for the group
    
    contributions = {
        country: round(float(group[group['country'] == country]['demand'].sum() / total_demand) * 100, 10)  
        for country in group['country'].unique()
    }
    
    return pd.Series({'country_contributions': contributions, 'demand': total_demand})  # Rename total_demand to demand

# Group by type, year, and month and compute demand + country-wise contributions
df_pivot = final_aggregated_df.groupby(['type', 'year', 'month']).apply(compute_country_contributions).reset_index()

# Assign 'EU' as the country
df_pivot['country'] = 'EU'

# Add source column
df_pivot['source'] = 'calculated'

# Convert country contributions to a properly formatted dictionary with float values
df_pivot['country_contributions'] = df_pivot['country_contributions'].apply(lambda x: {k: float(v) for k, v in x.items()})

# Concatenate with the original dataset
final_aggregated_df = pd.concat([final_aggregated_df, df_pivot], ignore_index=True)


  df_pivot = final_aggregated_df.groupby(['type', 'year', 'month']).apply(compute_country_contributions).reset_index()


In [18]:
final_aggregated_df.head()

Unnamed: 0,country,type,year,month,demand,source,country_contributions
0,AT,power,2018,12,0.002566,energy-charts,
1,AT,power,2019,1,3.147638,energy-charts,
2,AT,power,2019,2,2.488796,energy-charts,
3,AT,power,2019,3,1.331403,energy-charts,
4,AT,power,2019,4,1.05502,energy-charts,


In [22]:
# 9. Final processing and export
final_aggregated_df = final_aggregated_df[final_aggregated_df['year'] >= 2019]
final_aggregated_df['demand'] = final_aggregated_df['demand'].round(2)

#### filter until 2025 - can be cleaner
final_aggregated_df = final_aggregated_df[final_aggregated_df['year'] < 2025]


In [23]:
# Save the results
final_aggregated_df.to_csv("src/data/analyzed/monthly_demand_clean.csv", index=False)
final_aggregated_df.to_json("src/data/analyzed/monthly_demand_clean.json", orient='records', indent=4)
with pd.ExcelWriter("src/data/analyzed/monthly_demand_clean.xlsx", engine='openpyxl') as writer:
    final_aggregated_df.to_excel(writer, sheet_name="Aggregated Data", index=False)


# add output to to_xlsx 