## Importation des données

In [1]:
import sys
from pathlib import Path
import importlib
import ipynbname 
import pandas as pd
import geopandas as gpd
from datetime import datetime

# Project layout: `code_path` is the parent of the folder holding this notebook,
# `scripts_path` is its "scripts" sub-folder and `base_path` is one level above.
code_path = ipynbname.path().parent.parent
scripts_path = code_path / "scripts"
base_path = code_path.parent

# Make the local scripts folder importable. The membership check keeps
# sys.path from growing by one duplicate entry every time this cell is re-run.
scripts_dir = str(scripts_path.resolve())
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

import data_utils  # first import of the local helper module

# Reload so that edits made to data_utils.py after the first import are picked up.
importlib.reload(data_utils)

# Bind the (possibly refreshed) helper functions in the notebook namespace.
from data_utils import (
    import_data_raw,
    import_data_sig,
    melt_long_format,
    clean_year_column,
    save_long_dataframe,
    concat_intermediate_files,
)


In [2]:
# Country data (long format: one row per Year / Country / Indicator).
filename = "data_final_all.csv"
filepath = base_path / "Data" / "data_final" / filename

# pandas treats the string "NA" as missing by default, but "NA" is also the
# two-letter country code of Namibia. Use the documented default markers
# minus "NA" so that code is kept as text in `Country_code`.
CSV_NA_VALUES = [
    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
    "1.#IND", "1.#QNAN", "<NA>", "N/A", "NULL", "NaN", "None", "n/a",
    "nan", "null",
]

df_data = pd.read_csv(filepath, keep_default_na=False, na_values=CSV_NA_VALUES)
df_data.head()

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code,Type
0,1949,Afghanistan,0.004,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
1,1950,Afghanistan,0.023,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
2,1951,Afghanistan,0.025,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
3,1952,Afghanistan,0.025,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
4,1953,Afghanistan,0.029,MtC/year,Fossil CO2 emissions,GCB,AF,Annual


In [3]:
# Global GIS layer: load world.geojson through the project's import helper.
world_layer_file = 'world.geojson'
gdf_world = import_data_sig(world_layer_file, base_path)
gdf_world.head()

Unnamed: 0,geo_point_2d,iso3,status,color_code,name,continent,region,Country_code,french_short,Country,geometry
0,"{ ""lon"": -16.984917430414384, ""lat"": 32.747916...",,PT Territory,PRT,Madeira Islands,Europe,Southern Europe,,Madeira Islands,,"POLYGON ((-17.1025 32.82333, -17.05306 32.8094..."
1,"{ ""lon"": 33.743791080217562, ""lat"": 21.8927401...",,Adm. by EGY,EGY,Ma'tan al-Sarra,Africa,Northern Africa,,Ma'tan al-Sarra,,"POLYGON ((33.25104 21.99977, 34.15064 21.99603..."
2,"{ ""lon"": 9.5613358449883421, ""lat"": 34.1108585...",TUN,Member State,TUN,Tunisia,Africa,Northern Africa,TN,Tunisie,Tunisia,"MULTIPOLYGON (((10.99361 33.75, 10.93778 33.72..."
3,"{ ""lon"": 43.77213543247138, ""lat"": 33.04802449...",IRQ,Member State,IRQ,Iraq,Asia,Western Asia,IQ,Iraq,Iraq,"POLYGON ((44.78734 37.14971, 44.76617 37.11228..."
4,"{ ""lon"": -6.3178452255610269, ""lat"": 31.883624...",MAR,Member State,MAR,Morocco,Africa,Northern Africa,MA,Maroc,Morocco,"POLYGON ((-2.94694 35.32916, -2.96618 35.31663..."


In [4]:
# Fetch data from world bank
from pathlib import Path
import requests
import pandas as pd
import time

# --- WDI (World Bank API) settings ---
WDI_BASE = "https://api.worldbank.org/v2"
PER_PAGE = 20000
START_YEAR, END_YEAR = 1850, 2025
# Maps each WDI indicator code to the column name it gets in the output table.
INDICATORS = dict([
    ("SP.POP.TOTL", "population_total"),
    ("NY.GDP.MKTP.CD", "gdp_current_usd"),
    ("NY.GDP.PCAP.CD", "gdp_percapita_current_usd"),
    ("NY.GDP.MKTP.PP.CD", "gdp_ppp_current_intl"),
    ("NY.GDP.PCAP.PP.CD", "gdp_percapita_ppp_current_intl"),
])

# --- Output location ---
filepath = base_path / "Data" / "countries"
# Create the folder (and any missing parents) if it does not exist yet.
filepath.mkdir(parents=True, exist_ok=True)
output_file = filepath / "countries_wdi_raw.csv"

# --- Fonction pour récupérer un indicateur ---
def fetch_indicator(indicator, start=START_YEAR, end=END_YEAR, timeout=60):
    """Download one World Bank indicator for all countries over a year range.

    Requests ``{WDI_BASE}/country/all/indicator/{indicator}`` page by page
    (``PER_PAGE`` records per page) until the page count reported by the API
    is reached, pausing 0.2 s between pages.

    Parameters
    ----------
    indicator : str
        WDI indicator code, e.g. ``"SP.POP.TOTL"``.
    start, end : int
        First and last year requested (sent as ``date=start:end``).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``country``, ``countryiso3``,
        ``Year`` (int) and ``value`` (float, or None when the API has no
        value). The four columns exist even when nothing was returned.

    Raises
    ------
    RuntimeError
        If the HTTP status is not 200, or if the API answers with an error
        message payload instead of data.
    """
    columns = ["country", "countryiso3", "Year", "value"]
    url = f"{WDI_BASE}/country/all/indicator/{indicator}"
    rows = []
    page = 1
    while True:
        params = {"format": "json", "per_page": PER_PAGE, "page": page, "date": f"{start}:{end}"}
        r = requests.get(url, params=params, timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError(f"Erreur HTTP {r.status_code} pour {indicator} page {page}")
        data = r.json()
        if not isinstance(data, list) or not data:
            break
        meta = data[0]
        # An error answer is a one-element list whose dict carries a "message"
        # entry; surface it instead of silently returning an empty table.
        if isinstance(meta, dict) and "message" in meta:
            raise RuntimeError(f"Erreur API WDI pour {indicator} page {page}: {meta['message']}")
        if len(data) < 2:
            break
        # The record list can be null when the query matches no data.
        records = data[1] or []
        for item in records:
            rows.append({
                "country": item["country"]["value"],
                # NOTE(review): despite the column name, country.id holds the
                # 2-character code (the saved table shows "1A", "ZW"). The
                # name is kept because later cells join on it.
                "countryiso3": item["country"]["id"],
                "Year": int(item["date"]),
                "value": None if item["value"] is None else float(item["value"])
            })
        total_pages = int(meta.get("pages", 1))
        if page >= total_pages:
            break
        page += 1
        time.sleep(0.2)
    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# --- Download every configured indicator ---
dfs = {}
for wdi_code, column_name in INDICATORS.items():
    print(f"Téléchargement de {wdi_code} ...")
    # Store each table under its output column name, with "value" renamed to it.
    dfs[column_name] = fetch_indicator(wdi_code).rename(columns={"value": column_name})

# --- Build the complete country x year grid ---
id_columns = ['country', 'countryiso3']
country_frames = [frame[id_columns].drop_duplicates() for frame in dfs.values()]
countries = pd.concat(country_frames).drop_duplicates()
years = list(range(START_YEAR, END_YEAR + 1))
cart = pd.MultiIndex.from_product(
    [countries['countryiso3'], years],
    names=['countryiso3', 'Year'],
).to_frame(index=False)
name_by_code = countries.set_index('countryiso3')['country']
cart['country'] = cart['countryiso3'].map(name_by_code)

# Left-join each indicator onto the grid so every (code, year) pair keeps a row.
merge_keys = ['countryiso3', 'Year']
df_all = cart.copy()
for column_name, frame in dfs.items():
    df_all = df_all.merge(frame[merge_keys + [column_name]], on=merge_keys, how='left')

# --- Add one boolean "missing" flag per indicator ---
for column_name in INDICATORS.values():
    df_all[f"missing_{column_name}"] = df_all[column_name].isna()

# --- Save ---
df_all = df_all.sort_values(merge_keys).reset_index(drop=True)
df_all.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Données enregistrées dans {output_file}")
print("Colonnes disponibles :", ", ".join(df_all.columns))
print(f"Nombre total de lignes : {len(df_all):,}")


Téléchargement de SP.POP.TOTL ...
Téléchargement de NY.GDP.MKTP.CD ...
Téléchargement de NY.GDP.PCAP.CD ...
Téléchargement de NY.GDP.MKTP.PP.CD ...
Téléchargement de NY.GDP.PCAP.PP.CD ...

✅ Données enregistrées dans C:\Users\Aubin\Documents\NetZero\Data\countries\countries_wdi_raw.csv
Colonnes disponibles : countryiso3, Year, country, population_total, gdp_current_usd, gdp_percapita_current_usd, gdp_ppp_current_intl, gdp_percapita_ppp_current_intl, missing_population_total, missing_gdp_current_usd, missing_gdp_percapita_current_usd, missing_gdp_ppp_current_intl, missing_gdp_percapita_ppp_current_intl
Nombre total de lignes : 46,816


In [5]:
# Fetch data from world bank
from pathlib import Path
import requests
import pandas as pd
import time

# --- Output location (same file as the previous download cell) ---
filepath = base_path / "Data" / "countries"
filepath.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing
output_file = filepath / "countries_wdi_raw.csv"

# --- WDI settings: indicator code -> output column name ---
INDICATORS = {
    "SP.POP.TOTL": "population_total",
    "NY.GDP.MKTP.CD": "gdp_current_usd",
    "NY.GDP.PCAP.CD": "gdp_percapita_current_usd",
    "NY.GDP.MKTP.PP.CD": "gdp_ppp_current_intl",
    "NY.GDP.PCAP.PP.CD": "gdp_percapita_ppp_current_intl",
    "AG.LND.TOTL.K2": "area_km2",  # added: land area
}

# --- Download every configured indicator ---
dfs = {}
for wdi_code, column_name in INDICATORS.items():
    print(f"Téléchargement de {wdi_code} ...")
    raw_table = fetch_indicator(wdi_code)
    dfs[column_name] = raw_table.rename(columns={"value": column_name})

# --- Build the complete country x year grid ---
id_columns = ['country', 'countryiso3']
countries = pd.concat(
    [table[id_columns].drop_duplicates() for table in dfs.values()]
).drop_duplicates()
years = list(range(START_YEAR, END_YEAR + 1))
grid_index = pd.MultiIndex.from_product(
    [countries['countryiso3'], years],
    names=['countryiso3', 'Year'],
)
cart = grid_index.to_frame(index=False)
cart['country'] = cart['countryiso3'].map(countries.set_index('countryiso3')['country'])

# Left-join each indicator onto the grid so every (code, year) pair keeps a row.
join_keys = ['countryiso3', 'Year']
df_all = cart.copy()
for column_name, table in dfs.items():
    df_all = df_all.merge(table[[*join_keys, column_name]], on=join_keys, how='left')

# --- Add one boolean "missing" flag per indicator ---
for column_name in INDICATORS.values():
    df_all[f"missing_{column_name}"] = df_all[column_name].isna()

# --- Extra derived columns ---
df_all = df_all.assign(
    gdp_ppp_percapita_calc=df_all['gdp_ppp_current_intl'] / df_all['population_total'],
    population_density=df_all['population_total'] / df_all['area_km2'],
)

# --- Save ---
df_all = df_all.sort_values(join_keys).reset_index(drop=True)
df_all.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Données enregistrées dans {output_file}")
print("Colonnes disponibles :", ", ".join(df_all.columns))
print(f"Nombre total de lignes : {len(df_all):,}")


Téléchargement de SP.POP.TOTL ...
Téléchargement de NY.GDP.MKTP.CD ...
Téléchargement de NY.GDP.PCAP.CD ...
Téléchargement de NY.GDP.MKTP.PP.CD ...
Téléchargement de NY.GDP.PCAP.PP.CD ...
Téléchargement de AG.LND.TOTL.K2 ...

✅ Données enregistrées dans C:\Users\Aubin\Documents\NetZero\Data\countries\countries_wdi_raw.csv
Colonnes disponibles : countryiso3, Year, country, population_total, gdp_current_usd, gdp_percapita_current_usd, gdp_ppp_current_intl, gdp_percapita_ppp_current_intl, area_km2, missing_population_total, missing_gdp_current_usd, missing_gdp_percapita_current_usd, missing_gdp_ppp_current_intl, missing_gdp_percapita_ppp_current_intl, missing_area_km2, gdp_ppp_percapita_calc, population_density
Nombre total de lignes : 46,816


In [6]:
# Inspect the combined WDI table built in the previous cell.
df_all

Unnamed: 0,countryiso3,Year,country,population_total,gdp_current_usd,gdp_percapita_current_usd,gdp_ppp_current_intl,gdp_percapita_ppp_current_intl,area_km2,missing_population_total,missing_gdp_current_usd,missing_gdp_percapita_current_usd,missing_gdp_ppp_current_intl,missing_gdp_percapita_ppp_current_intl,missing_area_km2,gdp_ppp_percapita_calc,population_density
0,1A,1850,Arab World,,,,,,,True,True,True,True,True,True,,
1,1A,1851,Arab World,,,,,,,True,True,True,True,True,True,,
2,1A,1852,Arab World,,,,,,,True,True,True,True,True,True,,
3,1A,1853,Arab World,,,,,,,True,True,True,True,True,True,,
4,1A,1854,Arab World,,,,,,,True,True,True,True,True,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46811,ZW,2021,Zimbabwe,15797210.0,2.724051e+10,1724.387271,5.031071e+10,3184.784602,386850.0,False,False,False,False,False,False,3184.784602,40.835492
46812,ZW,2022,Zimbabwe,16069056.0,3.278966e+10,2040.546587,5.720647e+10,3560.039403,386850.0,False,False,False,False,False,False,3560.039403,41.538209
46813,ZW,2023,Zimbabwe,16340822.0,3.523137e+10,2156.034093,6.242782e+10,3820.359922,386850.0,False,False,False,False,False,False,3820.359922,42.240719
46814,ZW,2024,Zimbabwe,16634373.0,4.418770e+10,2656.409377,6.523499e+10,3921.697956,,False,False,False,False,False,True,3921.697956,


In [7]:
import pandas as pd
from pathlib import Path
import ipynbname

# --- Paths ---
code_path = ipynbname.path().parent.parent
base_path = code_path.parent
data_path = base_path / "Data" / "countries"

# pandas treats the string "NA" as missing by default, but "NA" is also the
# two-letter code of Namibia. Read both tables with the documented default
# markers minus "NA": otherwise that code becomes NaN on both sides, and since
# pandas matches NaN join keys with each other, any row with a genuinely
# missing code would be joined to Namibia's WDI values in the merge below.
CSV_NA_VALUES = [
    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
    "1.#IND", "1.#QNAN", "<NA>", "N/A", "NULL", "NaN", "None", "n/a",
    "nan", "null",
]

# --- Load WDI data ---
df_wdi = pd.read_csv(
    data_path / "countries_wdi_raw.csv",
    keep_default_na=False,
    na_values=CSV_NA_VALUES,
)

# --- Compute derived indicators ---
df_wdi['gdp_percapita'] = df_wdi['gdp_current_usd'] / df_wdi['population_total']
df_wdi['gdp_ppp_percapita'] = df_wdi['gdp_ppp_current_intl'] / df_wdi['population_total']
df_wdi['population_density'] = df_wdi['population_total'] / df_wdi['area_km2']

# --- Normalisation specifications ---
# key -> (reference column in df_wdi, divisor applied to that column,
#         suffix appended to the original unit string)
norm_specs = {
    'area': ('area_km2', 1e6, '/million km²'),
    'population': ('population_total', 1e6, '/million inhabitants'),
    'gdp': ('gdp_current_usd', 1e9, '/billion USD'),
    'ppp': ('gdp_ppp_current_intl', 1e9, '/billion PPP$'),
    'gdp_hab': ('gdp_percapita', 1e3, '/thousand USD/hab'),
    'ppp_hab': ('gdp_ppp_percapita', 1e3, '/thousand PPP$/hab'),
    'densite': ('population_density', 1, '/hab/km²')
}

# --- Load the indicator data ---
# Columns: Year, Country, Value, Unit, Indicator, Source, Country_code, Type
filename = "data_final_all.csv"
filepath = base_path / "Data" / 'data_final' / filename
df_data = pd.read_csv(filepath, keep_default_na=False, na_values=CSV_NA_VALUES)
df_data.head()


Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code,Type
0,1949,Afghanistan,0.004,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
1,1950,Afghanistan,0.023,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
2,1951,Afghanistan,0.025,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
3,1952,Afghanistan,0.025,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
4,1953,Afghanistan,0.029,MtC/year,Fossil CO2 emissions,GCB,AF,Annual


In [8]:
# Inspect the indicator table loaded in the previous cell.
df_data

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code,Type
0,1949,Afghanistan,0.00400,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
1,1950,Afghanistan,0.02300,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
2,1951,Afghanistan,0.02500,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
3,1952,Afghanistan,0.02500,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
4,1953,Afghanistan,0.02900,MtC/year,Fossil CO2 emissions,GCB,AF,Annual
...,...,...,...,...,...,...,...,...
74227,2019,Zimbabwe,225.32013,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative
74228,2020,Zimbabwe,226.85144,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative
74229,2021,Zimbabwe,228.33659,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative
74230,2022,Zimbabwe,229.85032,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative


In [9]:
 # --- Merge df_data with WDI ---
# `validate` makes pandas raise if df_wdi ever holds more than one row per
# (code, year); without it such duplicates would silently multiply df_data rows.
df_norm = df_data.merge(
    df_wdi,
    left_on=['Country_code', 'Year'],
    right_on=['countryiso3', 'Year'],
    how='left',
    validate='many_to_one',
)

# --- Apply all normalizations dynamically ---
for norm, (col_ref, divisor, unit_suffix) in norm_specs.items():
    val_col = f"Value_norm_{norm}"
    unit_col = f"Unit_norm_{norm}"
    # Value divided by the reference column expressed in units of `divisor`.
    df_norm[val_col] = df_norm['Value'] / (df_norm[col_ref] / divisor)
    df_norm[unit_col] = df_norm['Unit'] + unit_suffix

# --- Handle infinite values ---
# Replace +/-inf (division by a zero reference) with NaN. A float NaN is used
# rather than pd.NA so the normalised columns stay numeric, not object dtype.
value_cols = [f"Value_norm_{n}" for n in norm_specs]
df_norm[value_cols] = df_norm[value_cols].replace(
    [float('inf'), -float('inf')], float('nan')
)

# --- Drop intermediate columns ---
# A broader drop of all WDI columns was previously computed here and then
# immediately overwritten by this one; only the effective drop is kept, so the
# saved file has the same columns as before.
df_final = df_norm.drop(columns=['area_km2','countryiso3','population_total','gdp_current_usd','gdp_ppp_current_intl'])

output_dir = base_path / 'Data' / 'data_final'
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "data_final_all_norm.csv"
df_final.to_csv(output_file, index=False, encoding='utf-8')

print(f"✅ All normalizations applied and saved to {output_file}")
print(df_final.head())

✅ All normalizations applied and saved to C:\Users\Aubin\Documents\NetZero\Data\data_final\data_final_all_norm.csv
   Year      Country  Value      Unit             Indicator Source  \
0  1949  Afghanistan  0.004  MtC/year  Fossil CO2 emissions    GCB   
1  1950  Afghanistan  0.023  MtC/year  Fossil CO2 emissions    GCB   
2  1951  Afghanistan  0.025  MtC/year  Fossil CO2 emissions    GCB   
3  1952  Afghanistan  0.025  MtC/year  Fossil CO2 emissions    GCB   
4  1953  Afghanistan  0.029  MtC/year  Fossil CO2 emissions    GCB   

  Country_code    Type      country  gdp_percapita_current_usd  ...  \
0           AF  Annual  Afghanistan                        NaN  ...   
1           AF  Annual  Afghanistan                        NaN  ...   
2           AF  Annual  Afghanistan                        NaN  ...   
3           AF  Annual  Afghanistan                        NaN  ...   
4           AF  Annual  Afghanistan                        NaN  ...   

   Value_norm_gdp         Unit_norm_g

In [10]:
# Inspect the normalised table written to data_final_all_norm.csv.
df_final

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code,Type,country,gdp_percapita_current_usd,...,Value_norm_gdp,Unit_norm_gdp,Value_norm_ppp,Unit_norm_ppp,Value_norm_gdp_hab,Unit_norm_gdp_hab,Value_norm_ppp_hab,Unit_norm_ppp_hab,Value_norm_densite,Unit_norm_densite
0,1949,Afghanistan,0.00400,MtC/year,Fossil CO2 emissions,GCB,AF,Annual,Afghanistan,,...,,MtC/year/billion USD,,MtC/year/billion PPP$,,MtC/year/thousand USD/hab,,MtC/year/thousand PPP$/hab,,MtC/year/hab/km²
1,1950,Afghanistan,0.02300,MtC/year,Fossil CO2 emissions,GCB,AF,Annual,Afghanistan,,...,,MtC/year/billion USD,,MtC/year/billion PPP$,,MtC/year/thousand USD/hab,,MtC/year/thousand PPP$/hab,,MtC/year/hab/km²
2,1951,Afghanistan,0.02500,MtC/year,Fossil CO2 emissions,GCB,AF,Annual,Afghanistan,,...,,MtC/year/billion USD,,MtC/year/billion PPP$,,MtC/year/thousand USD/hab,,MtC/year/thousand PPP$/hab,,MtC/year/hab/km²
3,1952,Afghanistan,0.02500,MtC/year,Fossil CO2 emissions,GCB,AF,Annual,Afghanistan,,...,,MtC/year/billion USD,,MtC/year/billion PPP$,,MtC/year/thousand USD/hab,,MtC/year/thousand PPP$/hab,,MtC/year/hab/km²
4,1953,Afghanistan,0.02900,MtC/year,Fossil CO2 emissions,GCB,AF,Annual,Afghanistan,,...,,MtC/year/billion USD,,MtC/year/billion PPP$,,MtC/year/thousand USD/hab,,MtC/year/thousand PPP$/hab,,MtC/year/hab/km²
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74227,2019,Zimbabwe,225.32013,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative,Zimbabwe,1683.913136,...,8.761982,TgC/year/billion USD,4.594598,TgC/year/billion PPP$,133.807454,TgC/year/thousand USD/hab,70.165797,TgC/year/thousand PPP$/hab,5.707746,TgC/year/hab/km²
74228,2020,Zimbabwe,226.85144,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative,Zimbabwe,1730.453910,...,8.443006,TgC/year/billion USD,4.161658,TgC/year/billion PPP$,131.093604,TgC/year/thousand USD/hab,64.617594,TgC/year/thousand PPP$/hab,5.651968,TgC/year/hab/km²
74229,2021,Zimbabwe,228.33659,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative,Zimbabwe,1724.387271,...,8.382244,TgC/year/billion USD,4.538528,TgC/year/billion PPP$,132.416073,TgC/year/thousand USD/hab,71.696086,TgC/year/thousand PPP$/hab,5.591621,TgC/year/hab/km²
74230,2022,Zimbabwe,229.85032,TgC/year,LULUCF Net Emissions,GCB,ZW,Cumulative,Zimbabwe,2040.546587,...,7.009842,TgC/year/billion USD,4.017908,TgC/year/billion PPP$,112.641545,TgC/year/thousand USD/hab,64.563982,TgC/year/thousand PPP$/hab,5.533467,TgC/year/hab/km²
