In [3]:
import pandas as pd
import numpy as np
import glob
import os

# Columns to drop
DROP_COLS = ["Country Name", "Series Name"]

# Identifier columns carried onto every output row
ID_COLS = ["Country Code", "Series Code"]

# Year columns to transform
YEAR_COLS = [f"{year} [YR{year}]" for year in range(2004, 2024)]


def tidy_indicator_frame(df):
    """Reshape a wide "Data" sheet into one row per code pair and year.

    The input is expected to contain the columns in ``DROP_COLS``,
    ``ID_COLS`` and ``YEAR_COLS``. The steps are:

    1. Drop the ``DROP_COLS`` columns.
    2. Turn every ``".."`` cell into NaN.
    3. Unpivot the year columns into ``Year`` / ``Value`` rows.
    4. Discard rows whose ``Value`` is NaN.
    5. Reduce ``Year`` labels such as ``"2021 [YR2021]"`` to the int 2021.
    6. Add a 1-based ``Index`` column as the first column.

    Args:
        df: Wide DataFrame as read from the workbook. It is not modified.

    Returns:
        A new DataFrame with columns ``Index``, ``Country Code``,
        ``Series Code``, ``Year`` and ``Value``, in that order.

    Raises:
        KeyError: If a column named in ``DROP_COLS``, ``ID_COLS`` or
            ``YEAR_COLS`` is missing from ``df``.
    """
    # Non-inplace calls return new frames, so the caller's data is untouched.
    wide = df.drop(columns=DROP_COLS).replace("..", np.nan)

    long = wide.melt(
        id_vars=ID_COLS,       # columns to keep as identifiers
        value_vars=YEAR_COLS,  # columns to unpivot
        var_name="Year",
        value_name="Value",
    )

    # Drop missing observations, then renumber rows 0..n-1 so the 1-based
    # "Index" column below is contiguous.
    long = long.dropna(subset=["Value"]).reset_index(drop=True)

    # expand=False makes extract() return a Series (the default returns a
    # one-column DataFrame), which is what a single-column assignment expects.
    long["Year"] = long["Year"].str.extract(r"(\d{4})", expand=False).astype(int)

    # melt() emits the id columns followed by "Year" and "Value"; inserting
    # "Index" at position 0 therefore yields the final column order.
    long.insert(0, "Index", range(1, len(long) + 1))
    return long


# Get list of .xlsx files in current directory.
# - Names starting with "~$" are the owner/lock files Excel writes next to a
#   workbook that is currently open; they are not readable spreadsheets and
#   would make read_excel fail, so they are skipped.
# - glob returns names in arbitrary order; sorting keeps runs reproducible.
xlsx_files = sorted(
    path
    for path in glob.glob("*.xlsx")
    if not os.path.basename(path).startswith("~$")
)

for xlsx_file in xlsx_files:
    # Read the "Data" sheet and reshape it into long form
    df = tidy_indicator_frame(pd.read_excel(xlsx_file, sheet_name="Data"))

    # Save to CSV (same base name as the original .xlsx)
    out_csv = os.path.splitext(xlsx_file)[0] + ".csv"
    df.to_csv(out_csv, index=False)

    print(f"Processed '{xlsx_file}' → '{out_csv}'")


Processed 'Economic Data - P_Data_Extract_From_World_Development_Indicators.xlsx' → 'Economic Data - P_Data_Extract_From_World_Development_Indicators.csv'
Processed 'Education Data - P_Data_Extract_From_World_Development_Indicators.xlsx' → 'Education Data - P_Data_Extract_From_World_Development_Indicators.csv'
Processed 'Environment Data - P_Data_Extract_From_World_Development_Indicators.xlsx' → 'Environment Data - P_Data_Extract_From_World_Development_Indicators.csv'
Processed 'Financial Sector Data - P_Data_Extract_From_World_Development_Indicators.xlsx' → 'Financial Sector Data - P_Data_Extract_From_World_Development_Indicators.csv'
Processed 'Gender Data - P_Data_Extract_From_World_Development_Indicators.xlsx' → 'Gender Data - P_Data_Extract_From_World_Development_Indicators.csv'
Processed 'Health Data - P_Data_Extract_From_World_Development_Indicators.xlsx' → 'Health Data - P_Data_Extract_From_World_Development_Indicators.csv'
Processed 'Infrastructure Data - P_Data_Extract_From_W