In [None]:
'''
This code updates the asset data

'''

import duckdb
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus

load_dotenv()

# Connection settings are read from the environment (a .env file is honoured
# through load_dotenv above). Fail early with a readable message when one is
# absent: quote_plus(None) would otherwise raise an opaque TypeError.
required_env_vars = (
    "CLIMATETRACE_USER",
    "CLIMATETRACE_PASS",
    "CLIMATETRACE_HOST",
    "CLIMATETRACE_PORT",
    "CLIMATETRACE_DB",
)
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

# Build the PostgreSQL connection URL (user and password are URL-escaped)
user = quote_plus(os.getenv("CLIMATETRACE_USER"))
password = quote_plus(os.getenv("CLIMATETRACE_PASS"))
host = os.getenv("CLIMATETRACE_HOST")
port = os.getenv("CLIMATETRACE_PORT")
database = os.getenv("CLIMATETRACE_DB")

postgres_url = f"postgresql://{user}:{password}@{host}:{port}/{database}"
parquet_path = "data/test/asset_emissions_country_subsector.parquet"

# Make sure the destination folder exists before running the long query, so the
# final COPY cannot fail on a missing directory.
parquet_dir = os.path.dirname(parquet_path)
if parquet_dir:
    os.makedirs(parquet_dir, exist_ok=True)

# Use DuckDB to write directly from PostgreSQL to Parquet
con = duckdb.connect()
try:
    # The postgres extension has to be installed and loaded BEFORE the first
    # postgres_scan call (the max-date query below), not after it.
    con.execute("INSTALL postgres;")
    con.execute("LOAD postgres;")

    print("Getting max month...")
    max_date = con.execute(f"""
        select max(start_time)
        from postgres_scan('{postgres_url}', 'public', 'asset_emissions')
    """).fetchone()[0]

    # max() over an empty table is NULL; interpolating None into the SQL below
    # would produce the invalid literal DATE 'None'.
    if max_date is None:
        raise RuntimeError(
            "asset_emissions returned no start_time; nothing to export"
        )

    print("Running asset-level query and writing to parquet file, this may take a while...")
    # Aggregate the last 36 months (relative to max_date) of co2e_100yr,
    # most-granular asset emissions by country / subsector / month, enrich with
    # country attributes and the sector lookup, then export the result to Parquet.
    con.execute(f"""
        CREATE TABLE asset_emissions_parquet AS
        SELECT ae.iso3_country,
            ae.original_inventory_sector,
            ae.start_time,
            ae.gas,
            sch.sector,
            ca.name as country_name,
            ca.continent,
            ca.unfccc_annex,
            ca.em_finance,
            ca.eu,
            ca.oecd,
            ca.developed_un,
            ae.release,
            sum(emissions_quantity) emissions_quantity,
            sum(activity) activity,
            sum(emissions_quantity) / sum(activity) weighted_average_emissions_factor

        FROM postgres_scan('{postgres_url}', 'public', 'asset_emissions') ae
        LEFT JOIN postgres_scan('{postgres_url}', 'public', 'country_analysis') ca
            ON CAST(ca.iso3_country AS VARCHAR) = CAST(ae.iso3_country AS VARCHAR)
        LEFT JOIN (
            SELECT DISTINCT sector, subsector FROM postgres_scan('{postgres_url}', 'public', 'asset_schema')
        ) sch
            ON CAST(sch.subsector AS VARCHAR) = CAST(ae.original_inventory_sector AS VARCHAR)

        WHERE ae.start_time >= (DATE '{max_date}' - INTERVAL '36 months')
          AND ae.gas = 'co2e_100yr'
          AND ae.most_granular = TRUE

        GROUP BY ae.iso3_country,
            ae.original_inventory_sector,
            ae.start_time,
            ae.gas,
            sch.sector,
            ca.name,
            ca.continent,
            ca.unfccc_annex,
            ca.em_finance,
            ca.eu,
            ca.oecd,
            ca.developed_un,
            ae.release;

        COPY asset_emissions_parquet TO '{parquet_path}' (FORMAT PARQUET);
    """)
finally:
    # Always release the connection, even when a query fails part-way through.
    con.close()

print("✅ Asset parquet file exported")

Getting max month...
Running asset-level query and writing to parquet file, this may take a while...
✅ Asset parquet file exported


In [None]:
import sys
# Print the path of the Python interpreter running this kernel, to check which
# environment the notebook is actually using.
print(sys.executable)



In [None]:
'''
Convert every raw CSV in the data/raw_csvs folder to a Parquet file in data/,
for manageable GitHub storage and limited memory usage (DuckDB).
Each CSV is deleted once its Parquet counterpart has been written.
'''

import pandas as pd
from pathlib import Path

# Folder holding the raw CSVs and folder receiving the Parquet files
input_dir = Path("data/raw_csvs")
output_dir = Path("data")

# Create the destination folder (and any missing parents) if needed
output_dir.mkdir(parents=True, exist_ok=True)

for csv_file in input_dir.glob("*.csv"):
    print(f"Converting {csv_file.name}...")

    # Same base name as the CSV, with a .parquet extension, inside output_dir
    parquet_file = output_dir / f"{csv_file.stem}.parquet"

    # Load the CSV and write it straight back out as Parquet (no index column)
    pd.read_csv(csv_file).to_parquet(parquet_file, engine="pyarrow", index=False)
    print(f"Saved to {parquet_file}")

    # Remove the source CSV now that the Parquet copy has been written
    csv_file.unlink()
    print(f"Deleted original CSV: {csv_file.name}")

print("✅ CSV to Parquet conversion complete.")

In [None]:
# Manual check of map_region_condition(region_selection) for a single region
test = map_region_condition('Asia')

# Show the whole returned object first, then the two entries of interest
print(test)
for field in ('column_name', 'column_value'):
    print(test[field])

In [None]:
import duckdb

con = duckdb.connect()

# Replace with your actual path if needed
parquet_path = "data/country_subsector_emissions_totals.parquet"

# Total emissions_quantity for start_time 2025-02-01, leaving out the sectors
# listed in the NOT IN clause
total_query = f"""SELECT sum(emissions_quantity) FROM '{parquet_path}' where start_time = '2025-02-01' and original_inventory_sector not in ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')"""

# Run the query and materialise the single-row result as a pandas DataFrame
result = con.execute(total_query).df()


print(result)

In [None]:
import duckdb

con = duckdb.connect()

# Alternative input: "data/country_subsector_emissions_totals_202504.parquet"
parquet_path = "data/asset_emissions_country_subsector.parquet"

# Distinct subsectors present in the file for USA rows with gas co2e_100yr
sectors_query = f""" 
                     SELECT DISTINCT original_inventory_sector
      FROM '{parquet_path}'
      WHERE gas = 'co2e_100yr'
         AND iso3_country = 'USA'
"""

# Run the query and materialise the result as a pandas DataFrame
result = con.execute(sectors_query).df()

print(result)

In [8]:
import duckdb

con = duckdb.connect()

# Monthly activity and emissions totals for gas co2e_100yr, leaving out the
# sectors listed in the NOT IN clause. The query has no placeholders, so a
# plain (non-f) string is used; its text is unchanged.
monthly_query = """
                         SELECT 
        strftime(start_time, '%Y-%m') AS year_month,
        SUM(activity) AS activity,
        SUM(emissions_quantity) AS emissions_quantity
    FROM 'data/test/asset_emissions_country_subsector.parquet'
    WHERE gas = 'co2e_100yr' -- AND original_inventory_sector = 'coal-mining'
        and original_inventory_sector not in ('forest-land-clearing',
                                                'forest-land-degradation',
                                                'forest-land-fires',
                                                'net-forest-land',
                                                'net-shrubgrass',
                                                'net-wetland',
                                                'removals',
                                                'shrubgrass-fires',
                                                'water-reservoirs',
                                                'wetland-fires')
    GROUP BY year_month
    ORDER BY year_month
                     """

# Run the query and materialise the result as a pandas DataFrame
result = con.execute(monthly_query).df()

print(result)

   year_month      activity  emissions_quantity
0     2022-03  7.878927e+12        3.868239e+09
1     2022-04  6.077973e+12        3.714841e+09
2     2022-05  6.149944e+12        3.732438e+09
3     2022-06  6.083853e+12        3.665544e+09
4     2022-07  5.792758e+12        3.753667e+09
5     2022-08  5.786821e+12        3.733130e+09
6     2022-09  5.732323e+12        3.675613e+09
7     2022-10  7.223399e+12        3.898317e+09
8     2022-11  7.156353e+12        3.774393e+09
9     2022-12  7.239493e+12        3.822891e+09
10    2023-01  7.961784e+12        3.896984e+09
11    2023-02  7.744182e+12        3.645399e+09
12    2023-03  7.979751e+12        3.914822e+09
13    2023-04  6.222939e+12        3.767996e+09
14    2023-05  6.294335e+12        3.786957e+09
15    2023-06  6.223382e+12        3.716462e+09
16    2023-07  5.875298e+12        3.799801e+09
17    2023-08  5.869286e+12        3.782257e+09
18    2023-09  5.812722e+12        3.729781e+09
19    2023-10  7.325844e+12        3.934

<duckdb.duckdb.DuckDBPyConnection object at 0x10d74ca30>


In [None]:
!pip install -r requirements.txt

In [None]:
# `result` is already a pandas DataFrame (the query cells assign it from
# `.df()`), so calling `.df()` on it again raises AttributeError.
# Display the frame directly instead.
result