In [3]:
'''
This code will update the asset data
'''

import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sqlalchemy import create_engine

load_dotenv()

# Validate the PostgreSQL connection settings up front. Without this check a
# missing user/password surfaces as a TypeError inside quote_plus(None), and a
# missing host/port/database silently puts the text "None" into the URL.
REQUIRED_ENV_VARS = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# Build the PostgreSQL connection URL that is passed to DuckDB's postgres_scan.
# User and password are percent-encoded so reserved characters survive in the URL.
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/asset_emissions_country_subsector.parquet"

# Make sure the directory the Parquet file is actually written to exists.
os.makedirs(os.path.dirname(parquet_path), exist_ok=True)
# NOTE(review): parquet_path points at data/, not data/asset_parquet/, while a
# later cell reads from data/asset_parquet/ - confirm which location is intended.
os.makedirs("data/asset_parquet", exist_ok=True)


print("Running query and writing to parquet file, this may take an hour...")
# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # The query aggregates asset_emissions by country, subsector, month, gas and
    # release, joins country attributes and the sector for each subsector, and
    # then copies the aggregated table to parquet_path.
    # NULLIF guards the emissions-factor division: a zero activity total yields
    # NULL regardless of DuckDB's floating-point division setting.
    con.execute(f"""
        INSTALL postgres;
        LOAD postgres;

        CREATE TABLE asset_emissions_parquet AS
        SELECT ae.iso3_country,
            ae.original_inventory_sector,
            ae.start_time,
            ae.gas,
            sch.sector,
            ca.name as country_name,
            ca.continent,
            ca.unfccc_annex,
            ca.em_finance,
            ca.eu,
            ca.oecd,
            ca.developed_un,
            ae.release,
            sum(emissions_quantity) emissions_quantity,
            sum(activity) activity,
            sum(emissions_quantity) / NULLIF(sum(activity), 0) weighted_average_emissions_factor
        FROM postgres_scan('{postgres_url}', 'public', 'asset_emissions') ae
        LEFT JOIN postgres_scan('{postgres_url}', 'public', 'country_analysis') ca
            ON CAST(ca.iso3_country AS VARCHAR) = CAST(ae.iso3_country AS VARCHAR)
        LEFT JOIN (
            SELECT DISTINCT sector, subsector FROM postgres_scan('{postgres_url}', 'public', 'asset_schema')
        ) sch
            ON CAST(sch.subsector AS VARCHAR) = CAST(ae.original_inventory_sector AS VARCHAR)
        WHERE ae.start_time >= DATE '2022-03-01'
          AND ae.gas = 'co2e_100yr'
          AND ae.most_granular = TRUE

        group by ae.iso3_country,
            ae.original_inventory_sector,
            ae.start_time,
            ae.gas,
            sch.sector,
            ca.name,
            ca.continent,
            ca.unfccc_annex,
            ca.em_finance,
            ca.eu,
            ca.oecd,
            ca.developed_un,
            ae.release;

        COPY asset_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
    """)
finally:
    # Always release the connection, even if the long-running query fails.
    con.close()

print("Parquet file exported!")

ModuleNotFoundError: No module named 'duckdb'

In [1]:
import sys

# Print the path of the Python interpreter this kernel is running on.
interpreter_path = sys.executable
print(interpreter_path)



/usr/local/bin/python3


In [None]:
'''
This is a script to take raw csvs in the data/raw_csvs folder and convert them
to parquets for manageable GitHub storage and limited memory usage (DuckDB).

Each CSV is deleted once its Parquet file has been written.
'''

import pandas as pd
from pathlib import Path

# Set input and output directories
input_dir = Path("data/raw_csvs")
output_dir = Path("data")

# Make sure the output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Snapshot (and sort) the listing before looping: the loop deletes files from
# the directory being scanned, so a lazy glob would be iterating a directory
# that changes underneath it. Sorting also makes the processing order stable.
csv_files = sorted(input_dir.glob("*.csv"))

# Loop through all CSV files in the input directory
for csv_file in csv_files:
    print(f"Converting {csv_file.name}...")

    # Read CSV into DataFrame
    df = pd.read_csv(csv_file)

    # Create output path by replacing .csv with .parquet
    parquet_file = output_dir / csv_file.with_suffix(".parquet").name

    # Write to Parquet
    df.to_parquet(parquet_file, engine="pyarrow", index=False)
    print(f"Saved to {parquet_file}")

    # Delete original CSV (only reached if the Parquet write did not raise)
    csv_file.unlink()
    print(f"Deleted original CSV: {csv_file.name}")

print("✅ CSV to Parquet conversion complete.")

In [None]:
# map_region_condition(region_selection)
# NOTE(review): map_region_condition is not defined in the cells visible here;
# it must already exist in the kernel for this cell to run.

test = map_region_condition('Asia')

# Show the whole result first, then the two fields looked up by key.
print(test)
for field in ('column_name', 'column_value'):
    print(test[field])

In [None]:
import duckdb

# Replace with your actual path if needed
# NOTE(review): the export cell writes to
# data/asset_emissions_country_subsector.parquet (no asset_parquet/ directory);
# confirm this path is the one that actually holds the current file.
parquet_path = "data/asset_parquet/asset_emissions_country_subsector.parquet"

# Sum emissions_quantity for start_time = 2025-02-01, leaving out the
# original_inventory_sector values listed in the NOT IN clause.
con = duckdb.connect()
try:
    result = con.execute(f"""
        SELECT sum(emissions_quantity)
        FROM '{parquet_path}'
        WHERE start_time = '2025-02-01'
          AND original_inventory_sector NOT IN ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')
    """).df()
finally:
    # The result is already materialized by .df(), so the connection can go.
    con.close()


print(result)

In [None]:
import duckdb

# parquet_path = "data/country_subsector_emissions_totals_202504.parquet"
parquet_path = "data/asset_emissions_country_subsector.parquet"

# List the distinct original_inventory_sector values present for the USA
# (gas = co2e_100yr only).
con = duckdb.connect()
try:
    result = con.execute(f"""
        SELECT DISTINCT original_inventory_sector
        FROM '{parquet_path}'
        WHERE gas = 'co2e_100yr'
          AND iso3_country = 'USA'
    """).df()
finally:
    # The result is already materialized by .df(), so the connection can go.
    con.close()

print(result)

In [None]:
import duckdb

# The connection is deliberately left open: `result` is the handle returned by
# execute(), and the following cells display it and call result.df() on it.
con = duckdb.connect()

# Monthly totals of activity and emissions_quantity for the coal-mining
# subsector (gas = co2e_100yr), ordered by month.
# The NOT IN list is redundant while the sector is pinned to 'coal-mining';
# it is kept so the exclusion still applies if that equality filter is edited.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
result = con.execute("""
    SELECT
        strftime(start_time, '%Y-%m') AS year_month,
        SUM(activity) AS activity,
        SUM(emissions_quantity) AS emissions_quantity
    FROM 'data/asset_emissions_country_subsector.parquet'
    WHERE gas = 'co2e_100yr' AND original_inventory_sector = 'coal-mining'
        and original_inventory_sector not in ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')
    GROUP BY year_month
    ORDER BY year_month
""")

In [None]:
# Display the object that the previous cell bound to `result`
# (the return value of con.execute(...)); requires that cell to have run.
result

In [None]:
# Fetch the pending query result as a DataFrame and display it.
# Relies on `result` (and its still-open connection) from the earlier cell.
result.df()