## Importation des données

In [34]:
import sys
from pathlib import Path
import importlib
import ipynbname 
import pandas as pd
import geopandas as gpd
from datetime import datetime

# Directory two levels above this notebook file; the helper scripts live under it.
code_path = ipynbname.path().parent.parent
scripts_path = code_path / "scripts"
# Parent of the code directory; the Data/ folder is resolved from it in later cells.
base_path = code_path.parent

# Make the helper scripts importable. The membership check keeps the cell
# idempotent: re-running it no longer appends the same entry to sys.path again.
scripts_dir = str(scripts_path.resolve())
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

import data_utils  # first import of the helper module

# Reload so that edits made to data_utils.py after the first import are picked up
# without restarting the kernel.
importlib.reload(data_utils)

# Bind the (possibly refreshed) helpers used by the rest of the notebook.
from data_utils import (
    import_data_raw,
    import_data_sig,
    melt_long_format,
    clean_year_column,
    save_long_dataframe,
    concat_intermediate_files,
)


In [2]:
# Country data: read the consolidated CSV stored under Data/data_final.
filename = "data_final_all.csv"
filepath = base_path.joinpath("Data", "data_final", filename)

# NOTE(review): with pandas' default NA parsing, a literal "NA" in Country_code
# (the ISO 3166-1 alpha-2 code of Namibia) is read as NaN — confirm whether the
# file contains it and whether downstream joins on Country_code are affected.
df_data = pd.read_csv(filepath)
df_data.head()

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code
0,1960,Afghanistan,4.17891,TgC/year,LULUCF Net emissions,GCB,AF
1,1961,Afghanistan,3.59662,TgC/year,LULUCF Net emissions,GCB,AF
2,1962,Afghanistan,3.32416,TgC/year,LULUCF Net emissions,GCB,AF
3,1963,Afghanistan,3.23023,TgC/year,LULUCF Net emissions,GCB,AF
4,1964,Afghanistan,2.79609,TgC/year,LULUCF Net emissions,GCB,AF


In [3]:
# World GIS layer: loaded through the project helper import_data_sig.
# Presumably returns a GeoDataFrame (the rendered preview has a `geometry`
# column) — verify against data_utils.
gdf_world=import_data_sig('world.geojson',base_path)
gdf_world.head()

Unnamed: 0,geo_point_2d,iso3,status,color_code,name,continent,region,Country_code,french_short,Country,geometry
0,"{ ""lon"": -16.984917430414384, ""lat"": 32.747916...",,PT Territory,PRT,Madeira Islands,Europe,Southern Europe,,Madeira Islands,,"POLYGON ((-17.1025 32.82333, -17.05306 32.8094..."
1,"{ ""lon"": 33.743791080217562, ""lat"": 21.8927401...",,Adm. by EGY,EGY,Ma'tan al-Sarra,Africa,Northern Africa,,Ma'tan al-Sarra,,"POLYGON ((33.25104 21.99977, 34.15064 21.99603..."
2,"{ ""lon"": 9.5613358449883421, ""lat"": 34.1108585...",TUN,Member State,TUN,Tunisia,Africa,Northern Africa,TN,Tunisie,Tunisia,"MULTIPOLYGON (((10.99361 33.75, 10.93778 33.72..."
3,"{ ""lon"": 43.77213543247138, ""lat"": 33.04802449...",IRQ,Member State,IRQ,Iraq,Asia,Western Asia,IQ,Iraq,Iraq,"POLYGON ((44.78734 37.14971, 44.76617 37.11228..."
4,"{ ""lon"": -6.3178452255610269, ""lat"": 31.883624...",MAR,Member State,MAR,Morocco,Africa,Northern Africa,MA,Maroc,Morocco,"POLYGON ((-2.94694 35.32916, -2.96618 35.31663..."


In [35]:
# Fetch data from world bank
from pathlib import Path
import requests
import pandas as pd
import time

# --- Output location ---
filepath = base_path.joinpath("Data", "countries")
# Create the folder (and any missing parents) if it does not exist yet.
filepath.mkdir(parents=True, exist_ok=True)
output_file = filepath.joinpath("countries_wdi_1960_2024_raw.csv")

# --- WDI parameters ---
WDI_BASE = "https://api.worldbank.org/v2"
# World Bank indicator code -> column name used in the output table.
INDICATORS = {
    "SP.POP.TOTL": "population_total",
    "NY.GDP.MKTP.CD": "gdp_current_usd",
    "NY.GDP.PCAP.CD": "gdp_percapita_current_usd",
    "NY.GDP.MKTP.PP.CD": "gdp_ppp_current_intl",
    "NY.GDP.PCAP.PP.CD": "gdp_percapita_ppp_current_intl"
}
START_YEAR = 1960
END_YEAR = 2024
PER_PAGE = 20000       # value sent as the `per_page` query parameter
REQUEST_TIMEOUT = 30   # seconds before an unanswered HTTP request is aborted
_WDI_COLUMNS = ["country", "countryiso3", "Year", "value"]


# --- Fetch a single indicator ---
def fetch_indicator(indicator, start=START_YEAR, end=END_YEAR, timeout=REQUEST_TIMEOUT):
    """Download one World Bank indicator for all entities and return it as a table.

    Pages through ``{WDI_BASE}/country/all/indicator/{indicator}`` until the page
    count reported in the response metadata is reached.

    Parameters
    ----------
    indicator : str
        World Bank indicator code, e.g. ``"SP.POP.TOTL"``.
    start, end : int
        First and last year, sent as the ``date=start:end`` query parameter.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``countryiso3``, ``Year`` (int) and ``value``
        (float, or missing when the API reports null). The columns are present
        even when no record is returned.

    Raises
    ------
    RuntimeError
        On a non-200 HTTP status, or when the API answers with an error payload
        (a JSON list whose first element carries a ``message`` key).

    Notes
    -----
    ``countryiso3`` is filled from ``item["country"]["id"]``.
    NOTE(review): in the World Bank v2 API that field appears to be the 2-letter
    code (the 3-letter one is ``countryiso3code``), which is consistent with the
    later join against ``Country_code``. The column name is kept unchanged
    because the saved CSV and downstream cells rely on it.
    """
    url = f"{WDI_BASE}/country/all/indicator/{indicator}"
    rows = []
    page = 1
    while True:
        params = {"format": "json", "per_page": PER_PAGE, "page": page, "date": f"{start}:{end}"}
        r = requests.get(url, params=params, timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError(f"Erreur HTTP {r.status_code} pour {indicator} page {page}")
        data = r.json()

        # API-level errors (e.g. an unknown indicator) can come back with HTTP 200
        # and a one-element list holding a "message" object; fail loudly instead
        # of returning an empty table.
        if isinstance(data, list) and data and isinstance(data[0], dict) and "message" in data[0]:
            raise RuntimeError(
                f"Erreur API World Bank pour {indicator} page {page} : {data[0]['message']}"
            )
        if not isinstance(data, list) or len(data) < 2:
            break

        meta, records = data[0], data[1]
        # The records element can be null when the query matches no data.
        for item in records or []:
            raw_value = item["value"]
            rows.append({
                "country": item["country"]["value"],
                "countryiso3": item["country"]["id"],
                "Year": int(item["date"]),
                "value": None if raw_value is None else float(raw_value)
            })

        total_pages = int(meta.get("pages", 1))
        if page >= total_pages:
            break
        page += 1
        time.sleep(0.2)  # short pause between consecutive page requests

    # Explicit columns keep the schema stable even when no row was collected.
    return pd.DataFrame(rows, columns=_WDI_COLUMNS)

# --- Download every indicator ---
# One frame per indicator; the generic "value" column is renamed to the
# indicator's output column name.
dfs = {}
for code, name in INDICATORS.items():
    print(f"Téléchargement de {code} ...")
    dfs[name] = fetch_indicator(code).rename(columns={"value": name})

# --- Build the complete entity x year grid ---
# De-duplicate on the code alone: if the same code ever came back with two
# spellings of the name, de-duplicating on (name, code) pairs would duplicate
# grid rows and make the name lookup below fail on a non-unique index.
countries = (
    pd.concat([frame[['country', 'countryiso3']] for frame in dfs.values()])
    .drop_duplicates(subset='countryiso3')
    .reset_index(drop=True)
)
years = list(range(START_YEAR, END_YEAR + 1))
cart = pd.MultiIndex.from_product(
    [countries['countryiso3'], years], names=['countryiso3', 'Year']
).to_frame(index=False)
cart['country'] = cart['countryiso3'].map(countries.set_index('countryiso3')['country'])

# Left-join each indicator onto the grid so that every (entity, year) pair is
# present even when an indicator has no observation for it.
df_all = cart.copy()
for name, frame in dfs.items():
    df_all = df_all.merge(frame[['countryiso3', 'Year', name]], on=['countryiso3', 'Year'], how='left')

# --- Flag missing values, one boolean column per indicator ---
for name in INDICATORS.values():
    df_all[f"missing_{name}"] = df_all[name].isna()

# --- Save ---
df_all = df_all.sort_values(['countryiso3', 'Year']).reset_index(drop=True)
df_all.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Données enregistrées dans {output_file}")
print("Colonnes disponibles :", ", ".join(df_all.columns))
print(f"Nombre total de lignes : {len(df_all):,}")


Téléchargement de SP.POP.TOTL ...
Téléchargement de NY.GDP.MKTP.CD ...
Téléchargement de NY.GDP.PCAP.CD ...
Téléchargement de NY.GDP.MKTP.PP.CD ...
Téléchargement de NY.GDP.PCAP.PP.CD ...

✅ Données enregistrées dans C:\Users\Aubin\Documents\NetZero\Data\countries\countries_wdi_1960_2024_raw.csv
Colonnes disponibles : countryiso3, Year, country, population_total, gdp_current_usd, gdp_percapita_current_usd, gdp_ppp_current_intl, gdp_percapita_ppp_current_intl, missing_population_total, missing_gdp_current_usd, missing_gdp_percapita_current_usd, missing_gdp_ppp_current_intl, missing_gdp_percapita_ppp_current_intl
Nombre total de lignes : 17,290


In [14]:
# Compute the surface area of each country
import geopandas as gpd
from pathlib import Path

# --- Output location ---
filepath = base_path/ "Data" / "countries"
filepath.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing
output_file = filepath / "countries_area.csv"

# gdf_world must already have been loaded by an earlier cell.
# Keep only the identifying columns and the geometry.
cols_to_keep = ['iso3', 'Country_code', 'name', 'geometry']
gdf = gdf_world[cols_to_keep].copy()

# to_crs() cannot reproject geometries that carry no CRS. The layer comes from a
# GeoJSON file, whose coordinates are WGS 84 lon/lat by specification (RFC 7946),
# so declare that CRS when none is attached.
# NOTE(review): confirm import_data_sig does not reproject the layer itself.
if gdf.crs is None:
    gdf = gdf.set_crs("EPSG:4326")

# Areas are only meaningful in an equal-area projection. EPSG:6933
# (WGS 84 / NSIDC EASE-Grid 2.0 Global) is a cylindrical equal-area CRS in
# metres. Reprojecting unconditionally also covers layers delivered in a
# projected CRS that is not equal-area.
gdf = gdf.to_crs("EPSG:6933")

gdf['area_km2'] = gdf['geometry'].area / 10**6  # m^2 -> km^2

# Keep only the requested columns (the geometry is dropped) and write the CSV.
df_out = gdf[['iso3', 'Country_code', 'name', 'area_km2']]
df_out.to_csv(output_file, index=False, encoding="utf-8")

print(f"✅ CSV saved to {output_file}")


✅ CSV saved to C:\Users\Aubin\Documents\NetZero\Data\countries\countries_area.csv


In [54]:
import pandas as pd
from pathlib import Path

# Project folders, recomputed here with the same expressions as the setup cell.
code_path = ipynbname.path().parent.parent
base_path=code_path.parent
data_path= base_path/ "Data" / 'countries'

# --- Load country area ---
df_area = pd.read_csv(data_path / "countries_area.csv")  # Country_code, area_km2

# --- Load WDI data ---
# NOTE(review): the `countryiso3` column is joined against the 2-letter
# Country_code below, so it evidently holds 2-letter codes despite its name.
# NOTE(review): with pandas' default NA parsing a literal "NA" code (Namibia)
# is read as NaN in both CSVs — confirm and, if needed, read every input with
# keep_default_na=False, na_values=[""].
df_wdi = pd.read_csv(data_path / "countries_wdi_1960_2024_raw.csv")  # countryiso3, Year, population_total, gdp_current_usd, gdp_ppp_current_intl

# --- One area per country code ---
# The world layer contains territories without a Country_code. pandas matches
# NaN join keys to each other, so keeping those rows would multiply every data
# row whose own code is missing. groupby drops NaN keys by default, and summing
# guarantees a single row per code even if a code spans several polygons
# (min_count=1 keeps NaN when no area is available at all).
area_by_code = (
    df_area
    .groupby('Country_code', as_index=False)['area_km2']
    .sum(min_count=1)
)

# --- Merge area and WDI ---
# validate="many_to_one" makes pandas check that the right-hand keys are unique,
# i.e. that the left join cannot add rows to df_data.
df_norm = df_data.merge(
    area_by_code,
    on='Country_code',
    how='left',
    validate='many_to_one'
)
df_wdi_small = (
    df_wdi[['countryiso3','Year','population_total','gdp_current_usd','gdp_ppp_current_intl']]
    .drop_duplicates(subset=['countryiso3', 'Year'])
)
df_norm = df_norm.merge(
    df_wdi_small,
    left_on=['Country_code','Year'],
    right_on=['countryiso3','Year'],
    how='left',
    validate='many_to_one'
)

# --- Compute normalizations in readable units ---
# normalization key -> (reference column, divisor applied to it, unit suffix)
norm_specs = {
    'area': ('area_km2', 1e6, '/million km²'),        # divide by 1e6
    'person': ('population_total', 1e6, '/million inhabitants'),
    'gdp': ('gdp_current_usd', 1e9, '/billion GDP$'),
    'ppp': ('gdp_ppp_current_intl', 1e9, '/billion PPP$')
}

for norm, (col_ref, divisor, unit_suffix) in norm_specs.items():
    val_col = f"Value_norm_{norm}"
    unit_col = f"Unit_norm_{norm}"
    ratio = df_norm['Value'] / (df_norm[col_ref] / divisor)
    # A zero reference value yields +/-inf; store it as NaN. A float NaN (rather
    # than pd.NA) keeps the column a plain float column.
    df_norm[val_col] = ratio.replace([float('inf'), -float('inf')], float('nan'))
    df_norm[unit_col] = df_norm['Unit'] + unit_suffix

# --- Drop intermediate columns ---
df_final = df_norm.drop(columns=['area_km2','countryiso3','population_total','gdp_current_usd','gdp_ppp_current_intl'])

# --- Save CSV ---
output_dir = base_path / 'Data' / 'data_final'
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "data_final_all_norm.csv"
df_final.to_csv(output_file, index=False, encoding='utf-8')

print(f"✅ Normalized emissions saved to {output_file}")
print(f"Sample:\n{df_final.head()}")


✅ Normalized emissions saved to C:\Users\Aubin\Documents\NetZero\Data\data_final\data_final_all_norm.csv
Sample:
   Year      Country    Value      Unit             Indicator Source  \
0  1960  Afghanistan  4.17891  TgC/year  LULUCF Net emissions    GCB   
1  1961  Afghanistan  3.59662  TgC/year  LULUCF Net emissions    GCB   
2  1962  Afghanistan  3.32416  TgC/year  LULUCF Net emissions    GCB   
3  1963  Afghanistan  3.23023  TgC/year  LULUCF Net emissions    GCB   
4  1964  Afghanistan  2.79609  TgC/year  LULUCF Net emissions    GCB   

  Country_code  Value_norm_area        Unit_norm_area  Value_norm_person  \
0           AF         6.510146  TgC/year/million km²           0.462522   
1           AF         5.603021  TgC/year/million km²           0.390339   
2           AF         5.178568  TgC/year/million km²           0.353468   
3           AF         5.032238  TgC/year/million km²           0.336325   
4           AF         4.355910  TgC/year/million km²           0.284899  

In [51]:
# Show the normalized table (the rendered output elides the middle rows).
df_final

Unnamed: 0,Year,Country,Value,Unit,Indicator,Source,Country_code,Value_norm_area,Unit_norm_area,Value_norm_person,Unit_norm_person,Value_norm_gdp,Unit_norm_gdp,Value_norm_ppp,Unit_norm_ppp
0,1960,Afghanistan,4.17891,TgC/year,LULUCF Net emissions,GCB,AF,6.510146,TgC/year/million km²,0.462522,TgC/year/million inhabitants,,TgC/year/billion USD,,TgC/year/billion PPP$
1,1961,Afghanistan,3.59662,TgC/year,LULUCF Net emissions,GCB,AF,5.603021,TgC/year/million km²,0.390339,TgC/year/million inhabitants,,TgC/year/billion USD,,TgC/year/billion PPP$
2,1962,Afghanistan,3.32416,TgC/year,LULUCF Net emissions,GCB,AF,5.178568,TgC/year/million km²,0.353468,TgC/year/million inhabitants,,TgC/year/billion USD,,TgC/year/billion PPP$
3,1963,Afghanistan,3.23023,TgC/year,LULUCF Net emissions,GCB,AF,5.032238,TgC/year/million km²,0.336325,TgC/year/million inhabitants,,TgC/year/billion USD,,TgC/year/billion PPP$
4,1964,Afghanistan,2.79609,TgC/year,LULUCF Net emissions,GCB,AF,4.355910,TgC/year/million km²,0.284899,TgC/year/million inhabitants,,TgC/year/billion USD,,TgC/year/billion PPP$
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14203,2019,Zimbabwe,1.64642,TgC/year,LULUCF Net emissions,GCB,ZW,4.223092,TgC/year/million km²,0.107811,TgC/year/million inhabitants,0.064024,TgC/year/billion USD,0.033573,TgC/year/billion PPP$
14204,2020,Zimbabwe,1.53131,TgC/year,LULUCF Net emissions,GCB,ZW,3.927833,TgC/year/million km²,0.098623,TgC/year/million inhabitants,0.056993,TgC/year/billion USD,0.028092,TgC/year/billion PPP$
14205,2021,Zimbabwe,1.48515,TgC/year,LULUCF Net emissions,GCB,ZW,3.809432,TgC/year/million km²,0.094013,TgC/year/million inhabitants,0.054520,TgC/year/billion USD,0.029520,TgC/year/billion PPP$
14206,2022,Zimbabwe,1.51373,TgC/year,LULUCF Net emissions,GCB,ZW,3.882740,TgC/year/million km²,0.094202,TgC/year/million inhabitants,0.046165,TgC/year/billion USD,0.026461,TgC/year/billion PPP$
