In [2]:
import pandas as pd
import numpy as np
import glob
import os

# Descriptive columns that are dropped right after loading.
DROP_COLS = ["Country Name", "Series Name"]

# Identifier columns kept through the melt and used as group keys.
ID_COLS = ["Country Code", "Series Code"]

# Column order of the CSV written for each workbook.
OUTPUT_COLS = ID_COLS + ["Year", "Value"]


def find_quarter_cols(df):
    """Return the column labels of *df* that contain the substring "YR".

    Labels such as "2021Q1 [YR2021Q1]" match. Non-string labels (for example
    integer or datetime headers produced by ``read_excel``) are ignored
    instead of raising ``TypeError`` on the substring test.
    """
    return [c for c in df.columns if isinstance(c, str) and "YR" in c]


def annual_means(long_df):
    """Average the melted quarterly values per country, series and year.

    Args:
        long_df: DataFrame with the columns "Country Code", "Series Code",
            "Quarter" (labels such as "2021Q1 [YR2021Q1]") and "Value".

    Returns:
        A new DataFrame with the columns "Country Code", "Series Code",
        "Year" (integer) and "Value" (mean of the numeric values in that
        year). Groups whose values are all non-numeric keep a NaN "Value";
        the caller decides whether to drop them.
    """
    # First run of four digits in the label is taken as the year.
    years = long_df["Quarter"].str.extract(r"(\d{4})", expand=False)

    # Filter rows without a year explicitly. Calling .dropna() on the
    # extracted years and assigning the result back would be re-aligned on
    # the index, re-introducing NaN and turning "Year" into a float column.
    has_year = years.notna()
    out = long_df.loc[has_year].copy()
    out["Year"] = years[has_year].astype(int)

    # Non-numeric placeholders (e.g. "..") become NaN and are skipped by mean.
    out["Value"] = pd.to_numeric(out["Value"], errors="coerce")

    return out.groupby(ID_COLS + ["Year"], as_index=False).agg({"Value": "mean"})


# Look for all .xlsx files in the current working directory
xlsx_files = glob.glob("*.xlsx")

for xlsx_file in xlsx_files:
    print(f"\n=== Processing {xlsx_file} ===")

    # A workbook without a "Data" sheet should skip this file, not abort
    # the whole batch.
    try:
        df = pd.read_excel(xlsx_file, sheet_name="Data")
    except ValueError as e:
        print("ERROR reading sheet 'Data':", e)
        continue
    print("Initial columns:", df.columns.tolist())

    # 1. Drop unwanted columns if they exist
    df = df.drop(columns=DROP_COLS, errors="ignore")
    print("Columns after dropping:", df.columns.tolist())

    # 2. Identify columns that look like "2021Q1 [YR2021Q1]", etc.
    quarter_cols = find_quarter_cols(df)
    print("Detected quarter columns:", quarter_cols)

    # If no quarter columns found, skip this file
    if not quarter_cols:
        print("No quarter columns found. Skipping this file.")
        continue

    # 3. Melt (unpivot) these quarterly columns into rows
    try:
        df = df.melt(
            id_vars=ID_COLS,  # columns to keep
            value_vars=quarter_cols,
            var_name="Quarter",  # e.g. "2021Q1 [YR2021Q1]"
            value_name="Value",
        )
    except KeyError as e:
        print("ERROR in melt:", e)
        print("Check that 'Country Code' and 'Series Code' exist in df.columns.")
        continue

    print("Columns after melt:", df.columns.tolist())
    print("Sample data after melt:\n", df.head())

    # 4-6. Extract the year, coerce "Value" to numeric and average per
    #      (Country Code, Series Code, Year)
    df = annual_means(df)
    print("Columns after groupby:", df.columns.tolist())
    print("Sample data after groupby:\n", df.head())

    # 7. Drop any rows where "Value" is NaN
    df = df.dropna(subset=["Value"])

    if df.empty:
        print("No data after dropping NaN values in 'Value'. Skipping reorder/save.")
        continue

    # 8. Fix the column order of the output
    df = df[OUTPUT_COLS]

    # 9. Save to CSV (same base name as the original XLSX)
    out_csv = os.path.splitext(xlsx_file)[0] + ".csv"
    df.to_csv(out_csv, index=False)
    print(f"Saved to '{out_csv}'")



=== Processing Budgetary Central Gov.(% of GDP) Data - P_Data_Extract_From_Quarterly_Public_Sector_Debt.xlsx ===
Initial columns: ['Country Name', 'Country Code', 'Series Name', 'Series Code', '2012Q2 [YR2012Q2]', '2012Q3 [YR2012Q3]', '2012Q4 [YR2012Q4]', '2013Q1 [YR2013Q1]', '2013Q2 [YR2013Q2]', '2013Q3 [YR2013Q3]', '2013Q4 [YR2013Q4]', '2014Q1 [YR2014Q1]', '2014Q2 [YR2014Q2]', '2014Q3 [YR2014Q3]', '2014Q4 [YR2014Q4]', '2015Q1 [YR2015Q1]', '2015Q2 [YR2015Q2]', '2015Q3 [YR2015Q3]', '2015Q4 [YR2015Q4]', '2016Q1 [YR2016Q1]', '2016Q2 [YR2016Q2]', '2016Q3 [YR2016Q3]', '2016Q4 [YR2016Q4]', '2017Q1 [YR2017Q1]', '2017Q2 [YR2017Q2]', '2017Q3 [YR2017Q3]', '2017Q4 [YR2017Q4]', '2018Q1 [YR2018Q1]', '2018Q2 [YR2018Q2]', '2018Q3 [YR2018Q3]', '2018Q4 [YR2018Q4]', '2019Q1 [YR2019Q1]', '2019Q2 [YR2019Q2]', '2019Q3 [YR2019Q3]', '2019Q4 [YR2019Q4]', '2020Q1 [YR2020Q1]', '2020Q2 [YR2020Q2]', '2020Q3 [YR2020Q3]', '2020Q4 [YR2020Q4]', '2021Q1 [YR2021Q1]', '2021Q2 [YR2021Q2]', '2021Q3 [YR2021Q3]', '2021Q4 