## Importation des données

In [1]:
import sys
from pathlib import Path
import importlib
import ipynbname 
import pandas as pd
import geopandas as gpd
from datetime import datetime

code_path = ipynbname.path().parent.parent
# Ajouter le dossier scripts au path
scripts_path = code_path  / "scripts"
base_path=code_path.parent
sys.path.append(str(scripts_path.resolve()))

import data_utils  # importe le module une première fois

# Après avoir modifié data_utils.py
importlib.reload(data_utils)

# Maintenant tu peux accéder aux fonctions mises à jour
from data_utils import import_data_raw, import_data_sig, melt_long_format, clean_year_column, save_long_dataframe, concat_intermediate_files


In [42]:
#Country data
filename="data_final_all.csv"
filepath= base_path/ "Data" / 'data_final' / filename

df_data = pd.read_csv(filepath)
df_data.head()

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code
0,1960,Afghanistan,4.17891,TgC/year,LULUCF Net emissions,GCB,AF
1,1961,Afghanistan,3.59662,TgC/year,LULUCF Net emissions,GCB,AF
2,1962,Afghanistan,3.32416,TgC/year,LULUCF Net emissions,GCB,AF
3,1963,Afghanistan,3.23023,TgC/year,LULUCF Net emissions,GCB,AF
4,1964,Afghanistan,2.79609,TgC/year,LULUCF Net emissions,GCB,AF


In [3]:
#Global sig
gdf_world=import_data_sig('world.geojson',base_path)
gdf_world.head()

Unnamed: 0,geo_point_2d,iso3,status,color_code,name,continent,region,Country_code,french_short,Country,geometry
0,"{ ""lon"": -16.984917430414384, ""lat"": 32.747916...",,PT Territory,PRT,Madeira Islands,Europe,Southern Europe,,Madeira Islands,,"POLYGON ((-17.1025 32.82333, -17.05306 32.8094..."
1,"{ ""lon"": 33.743791080217562, ""lat"": 21.8927401...",,Adm. by EGY,EGY,Ma'tan al-Sarra,Africa,Northern Africa,,Ma'tan al-Sarra,,"POLYGON ((33.25104 21.99977, 34.15064 21.99603..."
2,"{ ""lon"": 9.5613358449883421, ""lat"": 34.1108585...",TUN,Member State,TUN,Tunisia,Africa,Northern Africa,TN,Tunisie,Tunisia,"MULTIPOLYGON (((10.99361 33.75, 10.93778 33.72..."
3,"{ ""lon"": 43.77213543247138, ""lat"": 33.04802449...",IRQ,Member State,IRQ,Iraq,Asia,Western Asia,IQ,Iraq,Iraq,"POLYGON ((44.78734 37.14971, 44.76617 37.11228..."
4,"{ ""lon"": -6.3178452255610269, ""lat"": 31.883624...",MAR,Member State,MAR,Morocco,Africa,Northern Africa,MA,Maroc,Morocco,"POLYGON ((-2.94694 35.32916, -2.96618 35.31663..."


In [28]:
# Fetch data from world bank
from pathlib import Path
import requests
import pandas as pd
import time

# --- Chemin de sortie ---
filepath = base_path/ "Data" / "countries"
filepath.mkdir(parents=True, exist_ok=True)  # crée le dossier si inexistant
output_file = filepath / "countries_wdi_1960_2024_raw.csv"

# --- Paramètres WDI ---
WDI_BASE = "https://api.worldbank.org/v2"
INDICATORS = {
    "SP.POP.TOTL": "population_total",
    "NY.GDP.MKTP.CD": "gdp_current_usd",
    "NY.GDP.PCAP.CD": "gdp_percapita_current_usd",
    "NY.GDP.MKTP.PP.CD": "gdp_ppp_current_intl",
    "NY.GDP.PCAP.PP.CD": "gdp_percapita_ppp_current_intl"
}
START_YEAR = 1960
END_YEAR = 2024
PER_PAGE = 20000

# --- Fonction pour récupérer un indicateur ---
def fetch_indicator(indicator, start=START_YEAR, end=END_YEAR):
    rows = []
    page = 1
    while True:
        url = f"{WDI_BASE}/country/all/indicator/{indicator}"
        params = {"format": "json", "per_page": PER_PAGE, "page": page, "date": f"{start}:{end}"}
        r = requests.get(url, params=params)
        if r.status_code != 200:
            raise RuntimeError(f"Erreur HTTP {r.status_code} pour {indicator} page {page}")
        data = r.json()
        if not isinstance(data, list) or len(data) < 2:
            break
        meta, records = data[0], data[1]
        for item in records:
            rows.append({
                "country": item["country"]["value"],
                "countryiso3": item["country"]["id"],
                "Year": int(item["date"]),
                "value": None if item["value"] is None else float(item["value"])
            })
        total_pages = int(meta.get("pages", 1))
        if page >= total_pages:
            break
        page += 1
        time.sleep(0.2)
    return pd.DataFrame(rows)

# --- Télécharger les indicateurs ---
dfs = {}
for code, name in INDICATORS.items():
    print(f"Téléchargement de {code} ...")
    df = fetch_indicator(code)
    df = df.rename(columns={"value": name})
    dfs[name] = df

# --- Construire tableau complet pays x année ---
countries = pd.concat([df[['country','countryiso3']].drop_duplicates() for df in dfs.values()]).drop_duplicates()
years = list(range(START_YEAR, END_YEAR + 1))
cart = pd.MultiIndex.from_product([countries['countryiso3'], years], names=['countryiso3','Year']).to_frame(index=False)
cart['country'] = cart['countryiso3'].map(countries.set_index('countryiso3')['country'])

df_all = cart.copy()
for name, df in dfs.items():
    df_all = df_all.merge(df[['countryiso3','Year', name]], on=['countryiso3','Year'], how='left')

# --- Ajouter colonne de manquants ---
for name in INDICATORS.values():
    df_all[f"missing_{name}"] = df_all[name].isna()

# --- Sauvegarder ---
df_all = df_all.sort_values(['countryiso3','Year']).reset_index(drop=True)
df_all.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Données enregistrées dans {output_file}")
print("Colonnes disponibles :", ", ".join(df_all.columns))
print(f"Nombre total de lignes : {len(df_all):,}")


Téléchargement de SP.POP.TOTL ...
Téléchargement de NY.GDP.MKTP.CD ...
Téléchargement de NY.GDP.PCAP.CD ...
Téléchargement de NY.GDP.MKTP.PP.CD ...
Téléchargement de NY.GDP.PCAP.PP.CD ...
Téléchargement de AG.LND.TOTL.K2 ...

✅ Données enregistrées dans C:\Users\Aubin\Documents\NetZero\Data\countries\countries_area.csv
Colonnes disponibles : countryiso3, Year, country, population_total, gdp_current_usd, gdp_percapita_current_usd, gdp_ppp_current_intl, gdp_percapita_ppp_current_intl, surface_km2, missing_population_total, missing_gdp_current_usd, missing_gdp_percapita_current_usd, missing_gdp_ppp_current_intl, missing_gdp_percapita_ppp_current_intl, missing_surface_km2, gdp_ppp_percapita_calc, population_density
Nombre total de lignes : 17,290


In [31]:
# Fetch data from world bank
from pathlib import Path
import requests
import pandas as pd
import time

# --- Chemin de sortie ---
filepath = base_path/ "Data" / "countries"
filepath.mkdir(parents=True, exist_ok=True)  # crée le dossier si inexistant
output_file = filepath / "countries_wdi_1960_2024_raw.csv"

# --- Paramètres WDI ---
INDICATORS = {
    "SP.POP.TOTL": "population_total",
    "NY.GDP.MKTP.CD": "gdp_current_usd",
    "NY.GDP.PCAP.CD": "gdp_percapita_current_usd",
    "NY.GDP.MKTP.PP.CD": "gdp_ppp_current_intl",
    "NY.GDP.PCAP.PP.CD": "gdp_percapita_ppp_current_intl",
    "AG.LND.TOTL.K2": "area_km2"   # <-- ajout superficie
}
# --- Télécharger les indicateurs ---
dfs = {}
for code, name in INDICATORS.items():
    print(f"Téléchargement de {code} ...")
    df = fetch_indicator(code)
    df = df.rename(columns={"value": name})
    dfs[name] = df

# --- Construire tableau complet pays x année ---
countries = pd.concat([df[['country','countryiso3']].drop_duplicates() for df in dfs.values()]).drop_duplicates()
years = list(range(START_YEAR, END_YEAR + 1))
cart = pd.MultiIndex.from_product([countries['countryiso3'], years], names=['countryiso3','Year']).to_frame(index=False)
cart['country'] = cart['countryiso3'].map(countries.set_index('countryiso3')['country'])

df_all = cart.copy()
for name, df in dfs.items():
    df_all = df_all.merge(df[['countryiso3','Year', name]], on=['countryiso3','Year'], how='left')

# --- Ajouter colonne de manquants ---
for name in INDICATORS.values():
    df_all[f"missing_{name}"] = df_all[name].isna()

# --- Calculs supplémentaires ---
df_all['gdp_ppp_percapita_calc'] = df_all['gdp_ppp_current_intl'] / df_all['population_total']
df_all['population_density'] = df_all['population_total'] / df_all['area_km2']

# --- Sauvegarder ---
df_all = df_all.sort_values(['countryiso3','Year']).reset_index(drop=True)
df_all.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Données enregistrées dans {output_file}")
print("Colonnes disponibles :", ", ".join(df_all.columns))
print(f"Nombre total de lignes : {len(df_all):,}")


Téléchargement de SP.POP.TOTL ...
Téléchargement de NY.GDP.MKTP.CD ...
Téléchargement de NY.GDP.PCAP.CD ...
Téléchargement de NY.GDP.MKTP.PP.CD ...
Téléchargement de NY.GDP.PCAP.PP.CD ...
Téléchargement de AG.LND.TOTL.K2 ...

✅ Données enregistrées dans C:\Users\Aubin\Documents\NetZero\Data\countries\countries_wdi_1960_2024_raw.csv
Colonnes disponibles : countryiso3, Year, country, population_total, gdp_current_usd, gdp_percapita_current_usd, gdp_ppp_current_intl, gdp_percapita_ppp_current_intl, area_km2, missing_population_total, missing_gdp_current_usd, missing_gdp_percapita_current_usd, missing_gdp_ppp_current_intl, missing_gdp_percapita_ppp_current_intl, missing_area_km2, gdp_ppp_percapita_calc, population_density
Nombre total de lignes : 17,290


In [32]:
df_all

Unnamed: 0,countryiso3,Year,country,population_total,gdp_current_usd,gdp_percapita_current_usd,gdp_ppp_current_intl,gdp_percapita_ppp_current_intl,area_km2,missing_population_total,missing_gdp_current_usd,missing_gdp_percapita_current_usd,missing_gdp_ppp_current_intl,missing_gdp_percapita_ppp_current_intl,missing_area_km2,gdp_ppp_percapita_calc,population_density
0,1A,1960,Arab World,91540853.0,,,,,,False,True,True,True,True,True,,
1,1A,1961,Arab World,93931683.0,1.999708e+10,212.889663,,,11235451.0,False,False,False,True,True,False,,8.360295
2,1A,1962,Arab World,96428599.0,2.032767e+10,210.805415,,,11235451.0,False,False,False,True,True,False,,8.582530
3,1A,1963,Arab World,99038509.0,2.236296e+10,225.800630,,,11235451.0,False,False,False,True,True,False,,8.814823
4,1A,1964,Arab World,101729760.0,2.481163e+10,243.897432,,,11235451.0,False,False,False,True,True,False,,9.054355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,ZW,2020,Zimbabwe,15526888.0,2.686856e+10,1730.453910,5.450987e+10,3510.676040,386850.0,False,False,False,False,False,False,3510.676040,40.136714
17286,ZW,2021,Zimbabwe,15797210.0,2.724051e+10,1724.387271,5.031071e+10,3184.784602,386850.0,False,False,False,False,False,False,3184.784602,40.835492
17287,ZW,2022,Zimbabwe,16069056.0,3.278966e+10,2040.546587,5.720647e+10,3560.039403,386850.0,False,False,False,False,False,False,3560.039403,41.538209
17288,ZW,2023,Zimbabwe,16340822.0,3.523137e+10,2156.034093,6.242782e+10,3820.359922,386850.0,False,False,False,False,False,False,3820.359922,42.240719


In [35]:
import pandas as pd
from pathlib import Path

# Ajouter le dossier scripts au path
code_path = ipynbname.path().parent.parent
base_path=code_path.parent
data_path= base_path/ "Data" / 'countries' 



# --- Load WDI data ---
df_wdi = pd.read_csv(data_path / "countries_wdi_1960_2024_raw.csv")  # countryiso3, Year, population_total, gdp_current_usd, gdp_ppp_current_intl

# --- Merge area and WDI ---
df_norm = df_data.merge(
    df_area[['Country_code', 'area_km2']],
    on='Country_code',
    how='left'
)
df_wdi_small = df_wdi[['countryiso3','Year','population_total','gdp_current_usd','gdp_ppp_current_intl']]
df_norm = df_norm.merge(
    df_wdi_small,
    left_on=['Country_code','Year'],
    right_on=['countryiso3','Year'],
    how='left'
)

# --- Compute normalizations in readable units ---
# Divisor for each normalization
norm_specs = {
    'area': ('area_km2', 1e6, '/million km²'),        # divide by 1e6
    'population': ('population_total', 1e6, '/million inhabitants'),
    'gdp': ('gdp_current_usd', 1e9, '/billion GDP$'),
    'ppp': ('gdp_ppp_current_intl', 1e9, '/billion PPP$')
    'density': ('gdp_ppp_current_intl', 1e9, '/billion PPP$')
}
}

for norm, (col_ref, divisor, unit_suffix) in norm_specs.items():
    val_col = f"Value_norm_{norm}"
    unit_col = f"Unit_norm_{norm}"
    df_norm[val_col] = df_norm['Value'] / (df_norm[col_ref] / divisor)
    df_norm[unit_col] = df_norm['Unit'] + unit_suffix

# --- Handle missing / infinite values ---
for col in [f"Value_norm_{n}" for n in norm_specs.keys()]:
    df_norm[col] = df_norm[col].replace([float('inf'), -float('inf')], pd.NA)

# --- Drop intermediate columns ---
df_final = df_norm.drop(columns=['area_km2','countryiso3','population_total','gdp_current_usd','gdp_ppp_current_intl'])

# --- Save CSV ---
output_dir = base_path / 'Data' / 'data_final'
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "data_final_all_norm.csv"
df_final.to_csv(output_file, index=False, encoding='utf-8')

print(f"✅ Normalized emissions saved to {output_file}")
print(f"Sample:\n{df_final.head()}")


✅ Normalized data saved to C:\Users\Aubin\Documents\NetZero\Data\data_final\data_final_all_norm.csv
Sample:
  countryiso3  Year     country  gdp_percapita_current_usd  \
0          1A  1960  Arab World                        NaN   
1          1A  1961  Arab World                 212.889663   
2          1A  1962  Arab World                 210.805415   
3          1A  1963  Arab World                 225.800630   
4          1A  1964  Arab World                 243.897432   

   gdp_percapita_ppp_current_intl  missing_population_total  \
0                             NaN                     False   
1                             NaN                     False   
2                             NaN                     False   
3                             NaN                     False   
4                             NaN                     False   

   missing_gdp_current_usd  missing_gdp_percapita_current_usd  \
0                     True                               True   
1         

In [54]:
import pandas as pd
from pathlib import Path
import ipynbname

# --- Paths ---
code_path = ipynbname.path().parent.parent
base_path = code_path.parent
data_path = base_path / "Data" / "countries"

# --- Load WDI data ---
df_wdi = pd.read_csv(data_path / "countries_wdi_1960_2024_raw.csv")

# --- Compute derived indicators ---
df_wdi['gdp_percapita'] = df_wdi['gdp_current_usd'] / df_wdi['population_total']
df_wdi['gdp_ppp_percapita'] = df_wdi['gdp_ppp_current_intl'] / df_wdi['population_total']
df_wdi['population_density'] = df_wdi['population_total'] / df_wdi['area_km2']

# --- Normalisation specifications ---
norm_specs = {
    'area': ('area_km2', 1e6, '/million km²'),
    'population': ('population_total', 1e6, '/million inhabitants'),
    'gdp': ('gdp_current_usd', 1e9, '/billion USD'),
    'ppp': ('gdp_ppp_current_intl', 1e9, '/billion PPP$'),
    'gdp_hab': ('gdp_percapita', 1e3, '/thousand USD/hab'),
    'ppp_hab': ('gdp_ppp_percapita', 1e3, '/thousand PPP$/hab'),
    'densite': ('population_density', 1, '/hab/km²')
}

# --- Load your data ---
filename="data_final_all.csv"
filepath= base_path/ "Data" / 'data_final' / filename
df_data = pd.read_csv(filepath)# Year, Country, Value, Unit, Indicator, Source, Country_code

# --- Merge df_data with WDI ---
df_norm = df_data.merge(df_wdi, left_on=['Country_code','Year'], right_on=['countryiso3','Year'], how='left')

# --- Apply all normalizations dynamically ---
for norm, (col_ref, divisor, unit_suffix) in norm_specs.items():
    val_col = f"Value_norm_{norm}"
    unit_col = f"Unit_norm_{norm}"
    # Normalize: df_data Value relative to the reference column
    df_norm[val_col] = df_norm['Value'] / (df_norm[col_ref] / divisor)
    df_norm[unit_col] = df_norm['Unit'] + unit_suffix

# --- Handle missing / infinite values ---
for col in [f"Value_norm_{n}" for n in norm_specs.keys()]:
    df_norm[col] = df_norm[col].replace([float('inf'), -float('inf')], pd.NA)

# --- Drop intermediate WDI columns if desired ---
drop_cols = [c for c in df_wdi.columns if c not in ['countryiso3','Year']]
df_final = df_norm.drop(columns=drop_cols)

# --- Drop intermediate columns ---
df_final = df_norm.drop(columns=['area_km2','countryiso3','population_total','gdp_current_usd','gdp_ppp_current_intl'])

output_dir = base_path / 'Data' / 'data_final'
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "data_final_all_norm.csv"
df_final.to_csv(output_file, index=False, encoding='utf-8')

print(f"✅ All normalizations applied and saved to {output_file}")
print(df_final.head())


✅ All normalizations applied and saved to C:\Users\Aubin\Documents\NetZero\Data\data_final\data_final_all_norm.csv
   Year      Country    Value      Unit             Indicator Source  \
0  1960  Afghanistan  4.17891  TgC/year  LULUCF Net emissions    GCB   
1  1961  Afghanistan  3.59662  TgC/year  LULUCF Net emissions    GCB   
2  1962  Afghanistan  3.32416  TgC/year  LULUCF Net emissions    GCB   
3  1963  Afghanistan  3.23023  TgC/year  LULUCF Net emissions    GCB   
4  1964  Afghanistan  2.79609  TgC/year  LULUCF Net emissions    GCB   

  Country_code      country  gdp_percapita_current_usd  \
0           AF  Afghanistan                        NaN   
1           AF  Afghanistan                        NaN   
2           AF  Afghanistan                        NaN   
3           AF  Afghanistan                        NaN   
4           AF  Afghanistan                        NaN   

   gdp_percapita_ppp_current_intl  ... Value_norm_gdp         Unit_norm_gdp  \
0                       

In [53]:
df_final

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code,country,gdp_percapita_current_usd,gdp_percapita_ppp_current_intl,...,Value_norm_gdp,Unit_norm_gdp,Value_norm_ppp,Unit_norm_ppp,Value_norm_gdp_hab,Unit_norm_gdp_hab,Value_norm_ppp_hab,Unit_norm_ppp_hab,Value_norm_densite,Unit_norm_densite
0,1960,Afghanistan,4.17891,TgC/year,LULUCF Net emissions,GCB,AF,Afghanistan,,,...,,TgC/year/billion USD,,TgC/year/billion PPP$,,TgC/year/thousand USD/hab,,TgC/year/thousand PPP$/hab,,TgC/year/hab/km²
1,1961,Afghanistan,3.59662,TgC/year,LULUCF Net emissions,GCB,AF,Afghanistan,,,...,,TgC/year/billion USD,,TgC/year/billion PPP$,,TgC/year/thousand USD/hab,,TgC/year/thousand PPP$/hab,0.254591,TgC/year/hab/km²
2,1962,Afghanistan,3.32416,TgC/year,LULUCF Net emissions,GCB,AF,Afghanistan,,,...,,TgC/year/billion USD,,TgC/year/billion PPP$,,TgC/year/thousand USD/hab,,TgC/year/thousand PPP$/hab,0.230543,TgC/year/hab/km²
3,1963,Afghanistan,3.23023,TgC/year,LULUCF Net emissions,GCB,AF,Afghanistan,,,...,,TgC/year/billion USD,,TgC/year/billion PPP$,,TgC/year/thousand USD/hab,,TgC/year/thousand PPP$/hab,0.219361,TgC/year/hab/km²
4,1964,Afghanistan,2.79609,TgC/year,LULUCF Net emissions,GCB,AF,Afghanistan,,,...,,TgC/year/billion USD,,TgC/year/billion PPP$,,TgC/year/thousand USD/hab,,TgC/year/thousand PPP$/hab,0.185820,TgC/year/hab/km²
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12603,2019,Zimbabwe,1.64642,TgC/year,LULUCF Net emissions,GCB,ZW,Zimbabwe,1683.913136,3211.253048,...,0.064024,TgC/year/billion USD,0.033573,TgC/year/billion PPP$,0.977735,TgC/year/thousand USD/hab,0.512703,TgC/year/thousand PPP$/hab,0.041707,TgC/year/hab/km²
12604,2020,Zimbabwe,1.53131,TgC/year,LULUCF Net emissions,GCB,ZW,Zimbabwe,1730.453910,3510.676040,...,0.056993,TgC/year/billion USD,0.028092,TgC/year/billion PPP$,0.884918,TgC/year/thousand USD/hab,0.436187,TgC/year/thousand PPP$/hab,0.038152,TgC/year/hab/km²
12605,2021,Zimbabwe,1.48515,TgC/year,LULUCF Net emissions,GCB,ZW,Zimbabwe,1724.387271,3184.784602,...,0.054520,TgC/year/billion USD,0.029520,TgC/year/billion PPP$,0.861262,TgC/year/thousand USD/hab,0.466327,TgC/year/thousand PPP$/hab,0.036369,TgC/year/hab/km²
12606,2022,Zimbabwe,1.51373,TgC/year,LULUCF Net emissions,GCB,ZW,Zimbabwe,2040.546587,3560.039403,...,0.046165,TgC/year/billion USD,0.026461,TgC/year/billion PPP$,0.741826,TgC/year/thousand USD/hab,0.425200,TgC/year/thousand PPP$/hab,0.036442,TgC/year/hab/km²
