In [25]:
import pandas as pd
import numpy as np

from google.cloud import bigquery
from google.auth import default
import polars as pl

import pycountry

In [26]:
# Get credentials from gcloud auth
creds, _ = default()

# Create BigQuery client
client = bigquery.Client(project="emb-prod-376511", credentials=creds)

# Get the table
table_emissions = client.get_table("sources.iea_coal_emissions_latest")

# Get rows
emissions = client.list_rows(table_emissions)

df_emissions = emissions.to_dataframe()

df_emissions



Unnamed: 0,REGION,COUNTRY,EMISSIONS__KT,SOURCE,TYPE,SEGMENT,REASON,BASE_YEAR,EMISSIONS_RANK,ENERGY_RANK,NOTES
0,Africa,Algeria,297.99,IEA,Agriculture,Total,All,2022-2023,21.0,11.0,Estimates for emissions from end-uses (includi...
1,Africa,Algeria,47.05,IEA,Energy,Abandoned facilities,All,2024,21.0,11.0,"Estimates from end-uses (Bioenergy, Other from..."
2,Africa,Algeria,0.23,IEA,Energy,Bioenergy,All,2022-2023,21.0,11.0,"Estimates from end-uses (Bioenergy, Other from..."
3,Africa,Algeria,75.83,IEA,Energy,Gas pipelines and LNG facilities,Fugitive,2024,21.0,11.0,"Estimates from end-uses (Bioenergy, Other from..."
4,Africa,Algeria,107.98,IEA,Energy,Gas pipelines and LNG facilities,Vented,2024,21.0,11.0,"Estimates from end-uses (Bioenergy, Other from..."
...,...,...,...,...,...,...,...,...,...,...,...
1581,World,,4588.80,IEA,Energy,Satellite-detected large oil and gas emissions,All,2024,,,"Estimates from end-uses (Bioenergy, Other from..."
1582,World,,25904.25,IEA,Energy,Steam coal,All,2024,,,"Estimates from end-uses (Bioenergy, Other from..."
1583,World,,144562.48,IEA,Energy,Total,All,2022-2024,,,"Estimates from end-uses (Bioenergy, Other from..."
1584,World,,7739.48,IEA,Other,Total,All,2022-2023,,,Estimates for emissions from end-uses (includi...


In [27]:
# Get the table
table_production = client.get_table("methane.mart_coal_production")

# Get rows
production = client.list_rows(table_production)

df_production = production.to_dataframe()

df_production

Unnamed: 0,COUNTRY_CODE,YEAR,PRODUCTION_MT,FORECAST_FLAG,SOURCE
0,AFG,1991,0.094,False,EIA
1,AFG,2000,0.001,False,EIA
2,AFG,1999,0.001,False,EIA
3,AFG,1998,0.002,False,EIA
4,AFG,1997,0.002,False,EIA
...,...,...,...,...,...
2892,TZA,2025,2.581,True,IEA
2893,USA,2025,443.050,True,IEA
2894,MNG,2025,97.000,True,IEA
2895,THA,2024,12.000,False,IEA


In [28]:
df_emissions = df_emissions[
    df_emissions["SEGMENT"].str.lower().str.contains("coal", na=False)
] # This is already in the pipeline



In [29]:
# --- Helper: convert country names to ISO3 codes ---
def to_iso3(name):
    try:
        return pycountry.countries.lookup(name.strip()).alpha_3
    except:
        return None


# --- Preprocess df_emissions ---
df_emissions['COUNTRY_CODE'] = df_emissions['COUNTRY'].apply(to_iso3)

# --- Handle BASE_YEAR ranges like "2022-2024" ---
def parse_base_year_range(x):
    if pd.isna(x):
        return (np.nan, np.nan)
    if isinstance(x, str):
        x = x.strip()
        if '-' in x:
            a, b = x.split('-')
            try:
                return int(a), int(b)
            except:
                return (np.nan, np.nan)
        try:
            y = int(x)
            return (y, y)
        except:
            return (np.nan, np.nan)
    # numeric
    return (int(x), int(x))

df_emissions[['BASE_YEAR_START', 'BASE_YEAR_END']] = df_emissions['BASE_YEAR'].apply(
    lambda x: pd.Series(parse_base_year_range(x))
)


# --- Preprocess df_production ---
df_production['YEAR'] = pd.to_numeric(df_production['YEAR'], errors='coerce')

prod = df_production[['COUNTRY_CODE', 'YEAR', 'PRODUCTION_MT']]


# --- Merge production for start year ---
df_merged = df_emissions.merge(
    prod.rename(columns={'PRODUCTION_MT': 'PROD_START'}),
    how='left',
    left_on=['COUNTRY_CODE', 'BASE_YEAR_START'],
    right_on=['COUNTRY_CODE', 'YEAR']
).drop(columns=['YEAR'])

# --- Merge production for end year ---
df_merged = df_merged.merge(
    prod.rename(columns={'PRODUCTION_MT': 'PROD_END'}),
    how='left',
    left_on=['COUNTRY_CODE', 'BASE_YEAR_END'],
    right_on=['COUNTRY_CODE', 'YEAR']
).drop(columns=['YEAR'])


# --- Keep ONLY if production exists for either the start year or end year ---
df_merged.loc[
    (df_merged['PROD_START'].isna()) & (df_merged['PROD_END'].isna()),
    'EMISSIONS__KT'
] = np.nan


# --- Final cleanup ---
df_final = df_merged.drop(columns=['BASE_YEAR_START','BASE_YEAR_END','PROD_START','PROD_END'])


# --- Save ---
df_final.to_csv("emissions_new.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emissions['COUNTRY_CODE'] = df_emissions['COUNTRY'].apply(to_iso3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emissions[['BASE_YEAR_START', 'BASE_YEAR_END']] = df_emissions['BASE_YEAR'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emissions[['BASE_YEAR_START', 'BASE_YEAR_END']