# Monthly data cleaning

In [1]:
import pandas as pd
import numpy as np


#data that has had initial cleaning performed on it
df = pd.read_json("src/data/analyzed/monthly_demand_clean.json")

In [2]:
# Define a mapping of EU country codes to their full names
country_mapping = {
    "AT": "Austria", "BE": "Belgium", "BG": "Bulgaria", "HR": "Croatia", 
    "CY": "Cyprus", "CZ": "Czech Republic", "DK": "Denmark", "EE": "Estonia", 
    "FI": "Finland", "FR": "France", "DE": "Germany", "GR": "Greece", 
    "HU": "Hungary", "IS": "Iceland", "IE": "Ireland", "IT": "Italy", 
    "LV": "Latvia", "LI": "Liechtenstein", "LT": "Lithuania", "LU": "Luxembourg", 
    "MT": "Malta", "NL": "Netherlands", "NO": "Norway", "PL": "Poland", 
    "PT": "Portugal", "RO": "Romania", "SK": "Slovakia", "SI": "Slovenia", 
    "ES": "Spain", "SE": "Sweden", "CH": "Switzerland", "TR": "Turkey", 
    "UK": "United Kingdom", "EU": "Europe*"
}

# Replace country codes with full names
df["country_full"] = df["country"].map(country_mapping)
df["country"] = df["country_full"]

# Create the 'countryType' column with full country names
df["country_type"] = df["country_full"] + " - " + df["type"]

In [3]:
# Define the years to average
years_to_average = [2019, 2020, 2021]

# Filter for rows corresponding to the specified years
filtered_df = df[df['year'].isin(years_to_average)]

# Group by country, type, and month, then calculate the average demand
index_df = (
    filtered_df
    .groupby(['country', 'type', 'month'])['demand']
    .mean()
    .reset_index(name='monthly_index')  # Calculate the average demand
)


df= df.merge(index_df, on=['country', 'type', 'month'], how='left')

In [4]:
# Calculate indexed demand values for all rows except "AVG-2019-2021"
df['demand_sector'] = (df['demand'] - df['monthly_index'])
df['demand_indexed'] = (df['demand'] / df['monthly_index'])*100


# Line

In [5]:
df_i = df.copy()
index_df_i = index_df.copy()

In [6]:

df_i["month"] = df_i["month"].astype(str).str.zfill(2)  # Add leading zero to month if needed
df_i["year"] = df_i["year"].astype(str)

# Create the new column 'monthb' as a concatenation of 'month' and 'year'
df_i["x_value"] =  df_i["month"] + "/"+ df_i["year"]

#Drop type=total
# Remove rows with years 2019, 2020, or 2021
df_i= df_i[~df_i['year'].isin(years_to_average)]

#filtering only to show totals for now 
df_i = df_i[df_i['type'] == 'total'].copy()


# Stacked

In [7]:
df_s = df.copy()
index_df_s = index_df.copy()

In [8]:
df_s = df_s[df_s['type'] != "total"]
df_s["month"] = df_s["month"].astype(str).str.zfill(2)  # Ensure two-digit month
df_s["year"] = df_s["year"].astype(str)  # Ensure year is string
df_s["x_value"] = df_s["month"] + "/" + df_s["year"]  # Format as "MM/YYYY"


In [9]:
# Create a mask for Europe* to keep only Europe* power, household, industry
europe_mask = (df_s['country'] == 'Europe*') & (~df_s['type'].isin(['household', 'industry', 'power']))
df_s = df_s.drop(df_s[europe_mask].index)

# Save .json

In [10]:
import pandas as pd
import json

# Create a copy to avoid modifying df_s in place
df_json = df_s.copy()

# Rename columns to match output structure
df_json["y_value"] = df_json["demand_sector"]
df_json["y_value"] = df_json["y_value"].round(2)
df_json["group_value"] = df_json["type"]
df_json["group_b_value"] = df_json["country"]

df_json = df_json[["x_value", "y_value", "group_value", "group_b_value"]]
# Convert all known NaN representations to actual NaN (ensuring full detection)
df_json.replace({"": None, "NaN": None, "nan": None, "NULL": None, pd.NA: None, float("nan"): None}, inplace=True)

# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain

# Convert DataFrame to JSON format (forces NaN to be removed)
json_data = df_json.to_dict(orient="records")

# Save JSON without escaping forward slashes
file_path = "highcharts/data/monthly_demand_sector.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)


In [11]:
df_i

Unnamed: 0,country,type,year,month,demand,source,country_contributions,country_full,country_type,monthly_index,demand_sector,demand_indexed,x_value
72,Austria,total,2019,01,27.60,"aggm, eurostat",,Austria,Austria - total,26.916667,0.683333,102.538700,01/2019
73,Austria,total,2019,02,20.83,"aggm, eurostat",,Austria,Austria - total,20.686667,0.143333,100.692878,02/2019
74,Austria,total,2019,03,15.33,"aggm, eurostat",,Austria,Austria - total,18.386667,-3.056667,83.375635,03/2019
75,Austria,total,2019,04,13.65,"aggm, eurostat",,Austria,Austria - total,14.300000,-0.650000,95.454545,04/2019
76,Austria,total,2019,05,11.93,"aggm, eurostat",,Austria,Austria - total,11.133333,0.796667,107.155689,05/2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,Europe*,total,2024,08,302.66,calculated,"{'AT': 2.0719594338, 'BE': 2.4493779433, 'BG':...",Europe*,Europe* - total,374.383333,-71.723333,80.842274,08/2024
6680,Europe*,total,2024,09,350.90,calculated,"{'AT': 2.3220284785, 'BE': 2.5123477836, 'BG':...",Europe*,Europe* - total,421.640000,-70.740000,83.222654,09/2024
6681,Europe*,total,2024,10,448.90,calculated,"{'AT': 2.5858366105, 'BE': 2.549173655, 'BG': ...",Europe*,Europe* - total,556.133333,-107.233333,80.718053,10/2024
6682,Europe*,total,2024,11,673.94,calculated,"{'AT': 3.0639486043, 'BE': 2.5736032625, 'BG':...",Europe*,Europe* - total,728.890000,-54.950000,92.461140,11/2024


In [12]:
df_json = df_i.copy()
df_json["y_value"] = df_json["demand_indexed"]
df_json["y_value"] = df_json["demand_indexed"]

# Replace infinite values with NaN before rounding
df_json["y_value"].replace([np.inf, -np.inf], np.nan, inplace=True)

# Now apply rounding safely
df_json["y_value"] = df_json["y_value"].round(2)

#renaming total as country (change in future if we add industry/household/power etc)
df_json["group_value"] = df_json["country"]
df_json[df_json['type'] == 'total']

# Keep only required columns
df_json = df_json[["x_value", "y_value", "group_value"]]

# Convert all known NaN representations to actual NaN (ensuring full detection)
df_json.replace({"": None, "NaN": None, "nan": None, "NULL": None, pd.NA: None, float("nan"): None}, inplace=True)


# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain

# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain

# Convert DataFrame to JSON format (forces NaN to be removed)
json_data = df_json.to_dict(orient="records")

# Save JSON without escaping forward slashes
file_path = "highcharts/data/monthly_demand_indexed.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)


print(f"The file has been saved as: {file_path}")

The file has been saved as: highcharts/data/monthly_demand_indexed.json


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_json["y_value"].replace([np.inf, -np.inf], np.nan, inplace=True)


In [14]:
df_i

Unnamed: 0,country,type,year,month,demand,source,country_contributions,country_full,country_type,monthly_index,demand_sector,demand_indexed,x_value
72,Austria,total,2019,01,27.60,"aggm, eurostat",,Austria,Austria - total,26.916667,0.683333,102.538700,01/2019
73,Austria,total,2019,02,20.83,"aggm, eurostat",,Austria,Austria - total,20.686667,0.143333,100.692878,02/2019
74,Austria,total,2019,03,15.33,"aggm, eurostat",,Austria,Austria - total,18.386667,-3.056667,83.375635,03/2019
75,Austria,total,2019,04,13.65,"aggm, eurostat",,Austria,Austria - total,14.300000,-0.650000,95.454545,04/2019
76,Austria,total,2019,05,11.93,"aggm, eurostat",,Austria,Austria - total,11.133333,0.796667,107.155689,05/2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,Europe*,total,2024,08,302.66,calculated,"{'AT': 2.0719594338, 'BE': 2.4493779433, 'BG':...",Europe*,Europe* - total,374.383333,-71.723333,80.842274,08/2024
6680,Europe*,total,2024,09,350.90,calculated,"{'AT': 2.3220284785, 'BE': 2.5123477836, 'BG':...",Europe*,Europe* - total,421.640000,-70.740000,83.222654,09/2024
6681,Europe*,total,2024,10,448.90,calculated,"{'AT': 2.5858366105, 'BE': 2.549173655, 'BG': ...",Europe*,Europe* - total,556.133333,-107.233333,80.718053,10/2024
6682,Europe*,total,2024,11,673.94,calculated,"{'AT': 3.0639486043, 'BE': 2.5736032625, 'BG':...",Europe*,Europe* - total,728.890000,-54.950000,92.461140,11/2024
