In [6]:
import pandas as pd
import numpy as np

from google.cloud import bigquery
from google.auth import default
import polars as pl

In [8]:
# Load the coal methane emissions mart from BigQuery into a pandas DataFrame.
project_id = "emb-prod-376511"
table_id = "methane.mart_coal_emissions_all"

# Credentials are whatever google.auth.default() resolves in this environment.
creds, _ = default()
client = bigquery.Client(credentials=creds, project=project_id)

# Look the table up, list its rows (no row limit or column selection is
# passed), and materialise them as a DataFrame.
table_bq = client.get_table(table_id)
rows = client.list_rows(table_bq)
df = rows.to_dataframe()

# Show the loaded frame as the cell output.
df



Unnamed: 0,COUNTRY_CODE,YEAR,EMISSIONS_CH4_KT,EMISSIONS_TYPE,EMISSIONS_ESTIMATED_FLAG,SOURCE_EMISSIONS,SOURCE_PRODUCTION,SOURCE_INTENSITY,SOURCE_ALL,ANNEX_FLAG
0,AFG,2024,,,,,,,,False
1,ALB,2005,,,,,,,,
2,ALB,1990,,,,,,,,False
3,ALB,2024,,,,,,,,False
4,ARG,2024,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...
7036,ZWE,2018,0.41,Report,False,UNFCCC,,,UNFCCC,False
7037,ZWE,2019,0.41,Report,False,UNFCCC,,,UNFCCC,False
7038,ZWE,2020,0.43,Report,False,UNFCCC,,,UNFCCC,False
7039,ZWE,2021,0.48,Report,False,UNFCCC,,,UNFCCC,False


In [10]:
# Keep every row except those whose country code is exactly "EU".
df = df.loc[df["COUNTRY_CODE"].ne("EU")]

In [11]:
# Build the global emissions table that is exported to CSV: emissions summed
# per (year, source), with every year before the cutoff collapsed into a
# single row per year.

# Select and rename relevant columns
df_clean = df[["YEAR", "EMISSIONS_CH4_KT", "SOURCE_ALL"]].rename(
    columns={
        "YEAR": "year",
        "EMISSIONS_CH4_KT": "emissions",
        "SOURCE_ALL": "source",
    }
)

# Drop rows without emissions data
df_clean = df_clean.dropna(subset=["emissions"])

# Remove the sources we don't want to show on the data tool.
# The per-source totals displayed in this notebook list labels spelled
# "IEA-..." (e.g. "IEA-GEM", "IEA-IEA", "IEA-UNFCCC"), so the earlier
# comparison against "EIA-" / "EIA-GEM" presumably never matched a row and
# nothing was excluded.
# NOTE(review): "IEA-GEM" is taken from that displayed output; "IEA-" is the
# assumed correction of "EIA-" -- confirm the exact label against the table.
excluded_sources = ["IEA-", "IEA-GEM"]
df_clean = df_clean[~df_clean["source"].isin(excluded_sources)]

# Global totals per year and source
global_by_year_source = (
    df_clean.groupby(["year", "source"], as_index=False)["emissions"].sum()
)

# Years before this cutoff are merged into one row per year.
split_year = 2024

# Aggregate years < split_year: sum across all remaining sources and label the
# result "UNFCCC".
# NOTE(review): this relabels pre-cutoff emissions from every remaining source
# as "UNFCCC" -- confirm that is the intended presentation.
older = (
    global_by_year_source[global_by_year_source["year"] < split_year]
    .groupby("year", as_index=False)["emissions"]
    .sum()
    .assign(source="UNFCCC")
)

# Keep years >= split_year untouched (one row per year and source)
newer = global_by_year_source[global_by_year_source["year"] >= split_year]

# Combine them back together. Sorting on source as well as year gives rows
# that share a year a deterministic order in the exported file.
global_by_year_source_combine = (
    pd.concat([older, newer], ignore_index=True)
        .sort_values(["year", "source"])
)

# Save results
global_by_year_source_combine.to_csv("global_emissions_by_year_source.csv", index=False)

In [13]:
# Per-year, per-source emission totals computed straight from `df`.
# No source exclusions are applied in this cell, and it rebinds both
# `df_clean` and `global_by_year_source`.
column_names = {
    "YEAR": "year",
    "EMISSIONS_CH4_KT": "emissions",
    "SOURCE_ALL": "source",
}

# Pick the three columns, shorten their names, and discard rows that have no
# emissions value.
df_clean = (
    df.loc[:, list(column_names)]
    .rename(columns=column_names)
    .dropna(subset=["emissions"])
)

# Sum emissions within each (year, source) pair, keeping the keys as columns.
per_year_source = df_clean.groupby(["year", "source"], as_index=False)
global_by_year_source = per_year_source["emissions"].sum()

# Show the totals as the cell output.
global_by_year_source

Unnamed: 0,year,source,emissions
0,1985,UNFCCC,63.955126
1,1986,UNFCCC,17.098332
2,1988,UNFCCC,1025.576883
3,1989,UNFCCC,368.222628
4,1990,UNFCCC,18441.297257
...,...,...,...
144,2026,IEA-IEA,34266.900083
145,2026,IEA-UNFCCC,34675.479025
146,2027,IEA-GEM,53988.560842
147,2027,IEA-IEA,34109.405914


In [None]:
# Write the per-year, per-source totals to CSV without the DataFrame index.
# NOTE(review): an earlier cell in this notebook writes its combined table to
# this same file name, so running this cell replaces that file -- confirm
# which export is wanted.
csv_path = "global_emissions_by_year_source.csv"
global_by_year_source.to_csv(csv_path, index=False)

In [15]:
# # Global totals per year and source
# global_by_year_source = df_clean.groupby(
#     ["year", "source"],
#     as_index=False,
# ).agg(
#     EMISSIONS_CH4_KT=("emissions", "sum"),
#     N_COUNTRIES=("emissions", "count"),
# )

# global_by_year_source.to_csv("global_emissions_test.csv", index=False)
