In [32]:
import pandas as pd
import numpy as np

from google.cloud import bigquery
from google.auth import default
import polars as pl

import pycountry

In [33]:
# Resolve Application Default Credentials (e.g. those set up via gcloud auth);
# the second element of the returned tuple is not needed here.
creds, _ = default()

# BigQuery client bound to the emb-prod-376511 project.
client = bigquery.Client(credentials=creds, project="emb-prod-376511")

# Look up the emissions table, then pull its rows into a pandas DataFrame.
table_emissions = client.get_table("sources.iea_coal_emissions_latest")
emissions = client.list_rows(table_emissions)
df_emissions = emissions.to_dataframe()



In [34]:
# Fetch the coal production mart and load its rows into a pandas DataFrame.
table_production = client.get_table("methane.mart_coal_production")
production = client.list_rows(table_production)
df_production = production.to_dataframe()

# Trailing expression: lets the notebook render the frame as the cell output.
df_production

Unnamed: 0,COUNTRY_CODE,YEAR,PRODUCTION_MT,FORECAST_FLAG,SOURCE
0,AFG,1991,0.094,False,EIA
1,AFG,2000,0.001,False,EIA
2,AFG,1999,0.001,False,EIA
3,AFG,1998,0.002,False,EIA
4,AFG,1997,0.002,False,EIA
...,...,...,...,...,...
2892,TZA,2025,2.581,True,IEA
2893,USA,2025,443.050,True,IEA
2894,MNG,2025,97.000,True,IEA
2895,THA,2024,12.000,False,IEA


In [35]:
# Keep only the rows whose SEGMENT mentions "coal" (case-insensitive; rows with
# a missing SEGMENT are dropped because na=False).
# The explicit .copy() makes the result an independent frame: later cells add
# columns to df_emissions, and doing that on a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
df_emissions = df_emissions[
    df_emissions["SEGMENT"].str.lower().str.contains("coal", na=False)
].copy() # This is already in the pipeline



In [36]:
# --- Helper: convert country names to ISO3 codes ---
def to_iso3(name):
    """Return the ISO 3166-1 alpha-3 code for a country name, or None.

    Args:
        name: Country identifier passed to ``pycountry.countries.lookup``.
            Non-string values (None, NaN, ...) are treated as unmatched.

    Returns:
        The ``alpha_3`` attribute of the matching record, or ``None`` when
        ``name`` is not a string or pycountry has no record for it.
    """
    # Missing cells arrive as None/NaN when this is used with Series.apply;
    # there is nothing to look up for them.
    if not isinstance(name, str):
        return None
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        # Raised by pycountry when no record matches. Anything else (e.g. a
        # NameError from a missing import, or KeyboardInterrupt) is a real
        # problem and is allowed to propagate instead of being silenced.
        return None

# --- Ensure consistent country codes ---
# Work on an independent copy: df_emissions may be a filtered slice, and adding
# columns to a slice triggers SettingWithCopyWarning.
# Dropping a stale PRODUCTION_MT column (left behind when this cell has already
# run once and overwritten df_emissions) keeps the cell re-runnable; without it
# the merge below would produce PRODUCTION_MT_x / PRODUCTION_MT_y and the rule
# step would fail with a KeyError.
df_emissions = df_emissions.drop(columns=['PRODUCTION_MT'], errors='ignore').copy()
df_emissions['COUNTRY_CODE'] = df_emissions['COUNTRY'].apply(to_iso3)

# --- Convert years to numeric ---
# errors='coerce' turns unparseable years into NaN instead of raising.
df_emissions['BASE_YEAR'] = pd.to_numeric(df_emissions['BASE_YEAR'], errors='coerce')
df_production['YEAR'] = pd.to_numeric(df_production['YEAR'], errors='coerce')

# --- Keep only required columns from production ---
prod = df_production[['COUNTRY_CODE', 'YEAR', 'PRODUCTION_MT']].copy()

# --- Merge emissions with production ---
# how='left' keeps every emissions row, so rows without a production match end
# up with PRODUCTION_MT = NaN and can be detected below.
# validate='many_to_one' makes pandas raise a MergeError if production holds
# more than one row per (COUNTRY_CODE, YEAR); without the check such duplicates
# would silently duplicate emissions rows in the exported file.
df_merged = df_emissions.merge(
    prod,
    how='left',
    left_on=['COUNTRY_CODE', 'BASE_YEAR'],
    right_on=['COUNTRY_CODE', 'YEAR'],
    validate='many_to_one',
)

# --- Apply both rules ---
# Blank out emissions when (1) production is zero, or (2) there is no
# production row / the production value is NaN.
no_production = df_merged['PRODUCTION_MT'].isna() | (df_merged['PRODUCTION_MT'] == 0)
df_merged.loc[no_production, 'EMISSIONS__KT'] = np.nan

# --- Cleanup ---
# YEAR duplicates BASE_YEAR after the join; PRODUCTION_MT is kept in the output.
df_emissions = df_merged.drop(columns=['YEAR'])

df_emissions.to_csv('emissions_new.csv', index=False)