# Monthly data cleaning

In [158]:
import pandas as pd
import numpy as np


#data that has had initial cleaning performed on it
df = pd.read_json("src/data/analyzed/monthly_demand_clean.json")

In [159]:
# Define a mapping of EU country codes to their full names
country_mapping = {
    "AT": "Austria", "BE": "Belgium", "BG": "Bulgaria", "HR": "Croatia", 
    "CY": "Cyprus", "CZ": "Czech Republic", "DK": "Denmark", "EE": "Estonia", 
    "FI": "Finland", "FR": "France", "DE": "Germany", "GR": "Greece", 
    "HU": "Hungary", "IS": "Iceland", "IE": "Ireland", "IT": "Italy", 
    "LV": "Latvia", "LI": "Liechtenstein", "LT": "Lithuania", "LU": "Luxembourg", 
    "MT": "Malta", "NL": "Netherlands", "NO": "Norway", "PL": "Poland", 
    "PT": "Portugal", "RO": "Romania", "SK": "Slovakia", "SI": "Slovenia", 
    "ES": "Spain", "SE": "Sweden", "CH": "Switzerland", "TR": "Turkey", 
    "UK": "United Kingdom", "EU": "Europe*"
}

# Replace country codes with full names
df["country_full"] = df["country"].map(country_mapping)
df["country"] = df["country_full"]

# Create the 'countryType' column with full country names
df["country_type"] = df["country_full"] + " - " + df["type"]

In [160]:
# Define the years to average
years_to_average = [2019, 2020, 2021]

# Filter for rows corresponding to the specified years
filtered_df = df[df['year'].isin(years_to_average)]

# Group by country, type, and month, then calculate the average demand
index_df = (
    filtered_df
    .groupby(['country', 'type', 'month'])['demand']
    .mean()
    .reset_index(name='monthly_index')  # Calculate the average demand
)


df= df.merge(index_df, on=['country', 'type', 'month'], how='left')


# Group by type, week, and country, then calculate the average demand
average_df = (
    filtered_df
    .groupby(['type', 'month','country'])['demand']
    .mean()
    .reset_index(name='demand_average') 
)

# Add a new column for the year and set it to "AVG-2019-2021"
average_df['year'] = "AVG-2019-2021"
df["demand_average"] =df["demand"]
df= pd.concat([df, average_df], ignore_index=True)

# Replace all 0s in the 'demand' column with NaN
#df['demand_average'] = df['demand_average'].replace(0, np.nan)

In [161]:
# Calculate indexed demand values for all rows except "AVG-2019-2021"
df['demand_sector'] = (df['demand'] - df['monthly_index'])
df['demand_indexed'] = (df['demand'] / df['monthly_index'])*100


# Indexed Line

In [162]:
df_i = df.copy()
index_df_i = index_df.copy()

In [163]:

df_i["month"] = df_i["month"].astype(str).str.zfill(2)  # Add leading zero to month if needed
df_i["year"] = df_i["year"].astype(str)

# Create the new column 'monthb' as a concatenation of 'month' and 'year'
df_i["x_value"] =  df_i["month"] + "/"+ df_i["year"]

#Drop type=total
# Remove rows with years 2019, 2020, or 2021
df_i= df_i[~df_i['year'].isin(years_to_average)]
df_i = df_i[df_i['type'] == 'total'].copy()



# Stacked

In [164]:
df_s = df.copy()
index_df_s = index_df.copy()

In [165]:
df_s = df_s[df_s['type'] != "total"]
df_s["month"] = df_s["month"].astype(str).str.zfill(2)  # Ensure two-digit month
df_s["year"] = df_s["year"].astype(str)  # Ensure year is string
df_s["x_value"] = df_s["month"] + "/" + df_s["year"]  # Format as "MM/YYYY"


# Save .json

In [166]:
import pandas as pd
import json
df_a = df.copy()
index_df_a = index_df.copy()



df_a = df_a[~df_a['year'].isin([2019, 2020, 2021])]

df_a["month"] = df_a["month"].astype(str).str.zfill(2)  # Ensure two-digit month
df_a["year"] = df_a["year"].astype(str)  # Ensure year is strin

df_json = df_a.copy()
df_json["y_value"] = df_json["demand_average"]



# Now apply rounding safely
df_json["y_value"] = df_json["y_value"].round(2)


df_json["x_value"] = df_json["year"]
df_json["x_b_value"] = df_json["month"]
df_json["group_value"] = df_json["type"]
df_json["group_b_value"] = df_json["country"]



# Keep only required columns
df_json = df_json[["x_value", "y_value", "x_b_value", "group_value", "group_b_value"]]

# Convert all known NaN representations to actual NaN (ensuring full detection)
df_json.replace({"": None, "NaN": None, "nan": None, "NULL": None, pd.NA: None, float("nan"): None}, inplace=True)


# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain

# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain

# Convert DataFrame to JSON format (forces NaN to be removed)
json_data = df_json.to_dict(orient="records")

# Save JSON without escaping forward slashes
file_path = "highcharts/data/monthly_demand_average.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)


print(f"The file has been saved as: {file_path}")

The file has been saved as: highcharts/data/monthly_demand_average.json


In [167]:


# Create a copy to avoid modifying df_s in place
df_json = df_s.copy()

# Rename columns to match output structure
df_json["y_value"] = df_json["demand_sector"]
df_json["y_value"] = df_json["y_value"].round(2)
df_json["group_value"] = df_json["type"]
df_json["group_b_value"] = df_json["country"]

df_json = df_json[["x_value", "y_value", "group_value", "group_b_value"]]
# Convert all known NaN representations to actual NaN (ensuring full detection)
df_json.replace({"": None, "NaN": None, "nan": None, "NULL": None, pd.NA: None, float("nan"): None}, inplace=True)

# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain

# Convert DataFrame to JSON format (forces NaN to be removed)
json_data = df_json.to_dict(orient="records")

# Save JSON without escaping forward slashes
file_path = "highcharts/data/monthly_demand_sector.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)


In [168]:
df_json = df_i.copy()
df_json["y_value"] = df_json["demand_indexed"]


# Replace infinite values with NaN before rounding
df_json["y_value"].replace([np.inf, -np.inf], np.nan, inplace=True)

# Now apply rounding safely
df_json["y_value"] = df_json["y_value"].round(2)

df_json["group_value"] = df_json["country"]

df_json[df_json['type'] == 'total']
# Keep only required columns
df_json = df_json[["x_value", "y_value", "group_value"]]

# Convert all known NaN representations to actual NaN (ensuring full detection)
df_json.replace({"": None, "NaN": None, "nan": None, "NULL": None, pd.NA: None, float("nan"): None}, inplace=True)


# Drop all rows where y_value is NaN, None, or missing
df_json = df_json.dropna(subset=["y_value"])  # Ensures only valid rows remain


# Convert DataFrame to JSON format (forces NaN to be removed)
json_data = df_json.to_dict(orient="records")

# Save JSON without escaping forward slashes
file_path = "highcharts/data/monthly_demand_indexed.json"
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)


print(f"The file has been saved as: {file_path}")

The file has been saved as: highcharts/data/monthly_demand_indexed.json


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_json["y_value"].replace([np.inf, -np.inf], np.nan, inplace=True)
