- Annual brick production in Bangladesh in 2018: 33 billion for 7,859 kilns
- https://www.ccacoalition.org/sites/default/files/resources/2019_Report_Bangladesh%20Brick%20Sector%20Roadmap.pdf

In [2]:
import xarray as xr
import os
import pandas as pd
import numpy as np
import dask
import dask.array as da
import netCDF4
import zarr
import gcsfs
import esmpy
import xesmf as xe
import geopandas as gpd
import rioxarray
import matplotlib.pyplot as plt
from shapely.geometry import mapping
import cartopy.crs as ccrs
from shapely.ops import transform
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pycountry_convert as pc
import glob
import calendar
import datetime
from shapely import wkt
from shapely.geometry import Point

In [3]:
path = "/Users/akawano/Library/CloudStorage/GoogleDrive-akawano@stanford.edu/My Drive/MyProjects/04_brick_kiln_emissions/emission_data"
dist_path = "/Users/akawano/Library/CloudStorage/GoogleDrive-akawano@stanford.edu/My Drive/MyProjects/04_brick_kiln_emissions"

In [4]:
# Import kiln location data
kilns_gps = gpd.read_file(os.path.join(path, "gps_all_kilns.shp"))
kilns_gps.head()

Unnamed: 0,kiln_id,category,division,geometry
0,100000,PNAS,Chittagong,POINT (5241276.392 3739925.248)
1,100001,PNAS,Chittagong,POINT (5241189.938 3740082.634)
2,100002,PNAS,Chittagong,POINT (5240070.054 3740287.121)
3,100003,PNAS,Chittagong,POINT (5241021.955 3740369.603)
4,100004,PNAS,Chittagong,POINT (5239764.288 3751000.298)


In [8]:
# Read RCT emissions data
emi = pd.read_csv(os.path.join(path, "rct_emissions_data.csv"))
print(emi['kiln_district'].unique())
emi.head()

['jhenaidah' 'jashore' 'khulna' 'chuadanga' 'kushtia' 'narail']


Unnamed: 0,kiln_id,kiln_district,treatment,treatment_bundled,firing_end_month,firing_end_week,kiln_bricks_fired_lakhs,kiln_circuits_completed,production_cost_estimate_bdt_per_1k_bricks,annual_production_targeted_lakh_bricks,...,monitor_time_hr,n_bricks,n_bricks_24_hr,total_production_kpm,total_fuel_consumed_kpm,total_coal_consumed_kpm,specific_fuel_consumption_kpm,specific_coal_consumption_kpm,co2e_annual_production,pm25_annual_production
0,44053,jhenaidah,incentive,g,may,3,8.22,9.5,9200,90,...,24.0,53425,53425.0,0.106693,5.7106,5.472,10.689003,10.242396,1606.559399,4.241808
1,41091,jashore,control,control,july,4,8.4,10.5,9000,90,...,24.933333,33600,32342.245989,0.111686,5.27752,5.27752,15.706905,15.706905,2608.164201,6.886351
2,41046,jashore,incentive,g,july,2,8.1,4.5,9800,60,...,23.833333,39200,39474.125874,0.097995,5.91342,5.91342,15.085255,15.085255,985.608992,2.602309
3,41075,jashore,control,control,july,2,8.24,4.5,9000,60,...,24.4,40000,39344.262295,0.099609,6.02095,4.93236,15.052375,12.3309,953.25919,2.516896
4,47051,khulna,control,control,july,4,8.2,13.5,8500,80,...,23.983333,51000,51035.441279,0.169756,7.8,7.8,15.294118,15.294118,3355.43265,8.859368


In [33]:
# Import kiln locations for those which participated in RCT
df = pd.read_csv(os.path.join(path, "gps_khulna_kilns.csv"))
print(df.shape)
df.head()                            

(590, 6)


Unnamed: 0,kiln_id,kiln_district,project_enrollment,treatment,latitude,longitude
0,17,jashore,pilot,technical,22.90992,89.149059
1,16,jashore,pilot,control,23.01115,89.215458
2,3,jashore,pilot,control,23.054592,88.926128
3,7,jashore,pilot,incentive,23.097718,89.292937
4,23,jashore,pilot,technical,23.16353,89.361963


In [7]:
emi = pd.read_csv(os.path.join(path, "rct_emissions_data.csv"))
print(emi['kiln_district'].unique())
emi.head()

['jhenaidah' 'jashore' 'khulna' 'chuadanga' 'kushtia' 'narail']


Unnamed: 0,kiln_id,kiln_district,treatment,treatment_bundled,firing_end_month,firing_end_week,kiln_bricks_fired_lakhs,kiln_circuits_completed,production_cost_estimate_bdt_per_1k_bricks,annual_production_targeted_lakh_bricks,...,monitor_time_hr,n_bricks,n_bricks_24_hr,total_production_kpm,total_fuel_consumed_kpm,total_coal_consumed_kpm,specific_fuel_consumption_kpm,specific_coal_consumption_kpm,co2e_annual_production,pm25_annual_production
0,44053,jhenaidah,incentive,g,may,3,8.22,9.5,9200,90,...,24.0,53425,53425.0,0.106693,5.7106,5.472,10.689003,10.242396,1606.559399,4.241808
1,41091,jashore,control,control,july,4,8.4,10.5,9000,90,...,24.933333,33600,32342.245989,0.111686,5.27752,5.27752,15.706905,15.706905,2608.164201,6.886351
2,41046,jashore,incentive,g,july,2,8.1,4.5,9800,60,...,23.833333,39200,39474.125874,0.097995,5.91342,5.91342,15.085255,15.085255,985.608992,2.602309
3,41075,jashore,control,control,july,2,8.24,4.5,9000,60,...,24.4,40000,39344.262295,0.099609,6.02095,4.93236,15.052375,12.3309,953.25919,2.516896
4,47051,khulna,control,control,july,4,8.2,13.5,8500,80,...,23.983333,51000,51035.441279,0.169756,7.8,7.8,15.294118,15.294118,3355.43265,8.859368


In [11]:
for i in emi.columns:
    print(i)

kiln_id
kiln_district
treatment
treatment_bundled
firing_end_month
firing_end_week
kiln_bricks_fired_lakhs
kiln_circuits_completed
production_cost_estimate_bdt_per_1k_bricks
annual_production_targeted_lakh_bricks
annual_production_actual_lakh_bricks
annual_production_delta_lakh_bricks
fuel_type_2022_2023
endline_adoption
total_fuel_consumed_endline
total_coal_consumed_endline
specific_fuel_consumption_endline
specific_coal_consumption_endline
endline_data_finish
firing_month
firing_week
circuits_completed
lot_1_weight
lot_2_weight
lot_3_weight
lot_4_weight
class_1_count
class_1_5_count
class_2_count
class_3_count
broken_bricks_cubic_feet
other_type_1
other_type_1_name
other_type_1_count
other_type_2
other_type_2_name
other_type_2_count
bricks_1_chamber_count
chamber_completed_during_monitoring_count
fuel_feeding_start_datetime
fuel_feeding_end_datetime
flue_monitoring_location
single_brick_weight
monitor_time
fuel_feeding_time
cycle_1_time_min
cycle_2_time_min
cycle_3_time_min
cycle_4_

In [31]:
emi['co2_emission']

0      21.189124
1      31.556736
2      30.307780
3      28.498033
4      30.727405
         ...    
271    26.584524
272    22.997882
273    25.825248
274    20.806533
275    24.343184
Name: co2_emission, Length: 276, dtype: float64

In [1]:
# Emissions data for RCT kilns
# CO2 emissions are reported in tons/100,000 bricks
# PM2.5 emissions are kg/100,000 bricks

# CO2 and PM2.5 emissions multiplied by annual brick production (variable “annual_production_actual_lakh_bricks”), 
# a variable we only have for RCT kilns =  season-wide CO2 emissions in tons and PM2.5 emissions in kg.

emi = pd.read_csv(os.path.join(path, "rct_emissions_data.csv"))
emi['CO2_tons_season'] = emi['co2_emission'] * emi['annual_production_actual_lakh_bricks']
emi['pm25_kg_season'] = emi['pm_2_5_emissions'] * emi['annual_production_actual_lakh_bricks']

# get how many months each kiln operated in this firing season
# create firing start date 
month_to_num = {month.lower(): index for index, month in enumerate(calendar.month_name) if month}

# Define which months correspond to each year
months_2022 = ['october', 'november', 'december']
months_2023 = ['january', 'february', 'march', 'april', 'may', 'june']

def get_week_start_date(year, month, week):
    # Get the first day of the month
    first_day = datetime.date(year, month, 1)
    # Find the first Monday of the month (if the first day isn't already a Monday)
    first_monday = first_day + datetime.timedelta(days=(0 - first_day.weekday()) % 7)
    # Calculate the Monday of the desired week
    return first_monday + datetime.timedelta(weeks=week-1)

# Apply the function with a conditional for the year
emi['firing_start_date'] = emi.apply(
    lambda row: get_week_start_date(
        2022 if row['firing_month'].lower() in months_2022 else 2023,
        month_to_num[row['firing_month'].lower()],
        int(row['firing_week'])
    ),
    axis=1
)

# Convert firing_end_date similarly, and fix the misplaced parenthesis
emi['firing_end_date'] = emi.apply(
    lambda row: get_week_start_date(
        2023, 
        month_to_num[row['firing_end_month'].lower()],  # map month name to number
        int(row['firing_end_week'])
    ),
    axis=1
)
emi['firing_start_date'] = pd.to_datetime(emi['firing_start_date'])
emi['firing_end_date'] = pd.to_datetime(emi['firing_end_date'])

# Calculate season_days as an integer representing the number of days
emi['season_days'] = (emi['firing_end_date'] - emi['firing_start_date']).dt.days

emi['avg_co2_tons_per_day'] = emi['CO2_tons_season']/emi['season_days'] 
emi['avg_co2_tons_per_month'] = emi['avg_co2_tons_per_day']*30 

emi['avg_pm25_kg_per_day'] = emi['pm25_kg_season']/emi['season_days'] 
emi['avg_pm25_kg_per_month'] = emi['avg_pm25_kg_per_day']*30 

emi['category'] = 'RCT'

#get per day emission, then multiply
emi.head()

NameError: name 'pd' is not defined

In [35]:
# average season_days amongst RCT kilns
operation_avg_days = emi['season_days'].mean()

# average annual production brickls
brick_mean = emi['annual_production_actual_lakh_bricks'].mean()


In [36]:
brick_mean

np.float64(62.593492391304345)

In [37]:
#6.2 million

In [38]:
33000000000/7859

4199007.507316452

In [39]:
brick_mean = (42+63)/2
brick_mean

52.5

In [40]:
## Sub-district boundaries
subdist = gpd.read_file("/Users/akawano/Library/CloudStorage/GoogleDrive-akawano@stanford.edu/My Drive/MyProjects/04_brick_kiln_emissions/bgd_adm_bbs_20201113_SHP/bgd_admbnda_adm3_bbs_20201113.shp")
subdist = subdist[subdist['ADM1_EN']=='Dhaka'] #only dhaka
subdist = subdist[['ADM3_EN', 'ADM2_EN','geometry']].copy()
subdist['ADM3_EN'] = subdist['ADM3_EN'].str.lower()
subdist['ADM2_EN'] = subdist['ADM2_EN'].str.lower()
subdist['centroid'] = subdist['geometry'].centroid
subdist = subdist.drop(columns = 'geometry').rename(columns = {'centroid':'geometry'})
subdist["latitude"] = subdist.geometry.map(lambda p: p.y)
subdist["longitude"] = subdist.geometry.map(lambda p: p.x)
subdist = subdist.drop(columns = 'geometry')
subdist = subdist.rename(columns = {'ADM3_EN':'kiln_sub_district', 'ADM2_EN':'kiln_district'})
subdist.head()



  subdist['centroid'] = subdist['geometry'].centroid


Unnamed: 0,kiln_sub_district,kiln_district,latitude,longitude
1,adabor,dhaka,23.770167,90.353335
9,alfadanga,faridpur,23.296928,89.669834
13,araihazar,narayanganj,23.760913,90.668743
20,austagram,kishoreganj,24.323013,91.121613
24,badda,dhaka,23.798007,90.448937


In [41]:
emi2 = pd.read_csv(os.path.join(path, "scaling_emissions_data.csv"))

emi2['CO2_tons_season'] = emi2['co2_emission'] * brick_mean # use average annual production of bricks available in RCT
emi2['pm25_kg_season'] = emi2['pm_2_5_emissions'] * brick_mean

# Convert firing_start_date using the mapped month numbers
emi2['firing_start_date'] = emi2.apply(
    lambda row: get_week_start_date(
        2022, 
        month_to_num[row['firing_season_firing_month'].lower()],  # map month name to number
        int(row['firing_season_firing_week'])
    ),
    axis=1
)

emi2['firing_start_date'] = pd.to_datetime(emi2['firing_start_date'])
emi2['firing_end_date'] = emi2['firing_start_date'] + pd.to_timedelta(operation_avg_days, unit='D') # use data from RCT

# Calculate season_days as an integer representing the number of days
emi2['season_days'] = operation_avg_days

emi2['avg_co2_tons_per_day'] = emi2['CO2_tons_season']/emi2['season_days'] 
emi2['avg_co2_tons_per_month'] = emi2['avg_co2_tons_per_day']*30 

emi2['avg_pm25_kg_per_day'] = emi2['pm25_kg_season']/emi2['season_days'] 
emi2['avg_pm25_kg_per_month'] = emi2['avg_pm25_kg_per_day']*30 

emi2['category'] = 'Scaling'
emi2.head()

Unnamed: 0,kiln_id,collection_date,stage,kiln_division,kiln_district,kiln_sub_district,firing_season_firing_month,firing_season_firing_week,firing_season_circuits_completed,firing_season_circuits_current,...,CO2_tons_season,pm25_kg_season,firing_start_date,firing_end_date,season_days,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,category
0,33183,2024-05-09,stage 2 - No Info,dhaka,gazipur,kaliakair,november,2,9,10,...,1521.480843,4017.174775,2022-11-14,2023-04-29 15:07:49.565217391,166.630435,9.13087,273.926101,24.10829,723.248687,Scaling
1,33096,2024-05-12,stage 1,dhaka,gazipur,kaliakair,december,2,9,10,...,1248.641553,3296.795602,2022-12-12,2023-05-27 15:07:49.565217391,166.630435,7.493478,224.80435,19.785075,593.552241,Scaling
2,33051,2024-05-07,stage 2 - No Info,dhaka,gazipur,kaliakair,november,3,6,7,...,1570.380091,4146.283747,2022-11-21,2023-05-06 15:07:49.565217391,166.630435,9.424329,282.729879,24.883112,746.493356,Scaling
3,33032,2024-05-05,stage 2 - Info,dhaka,gazipur,kaliakair,november,1,9,10,...,1693.724993,4471.952014,2022-11-07,2023-04-22 15:07:49.565217391,166.630435,10.16456,304.936789,26.837546,805.12639,Scaling
4,33006,2024-03-28,stage 2 - Info,dhaka,gazipur,kapasia,december,1,6,7,...,1950.092208,5148.839871,2022-12-05,2023-05-20 15:07:49.565217391,166.630435,11.703097,351.092922,30.899757,926.992697,Scaling


In [42]:
emi2['kiln_district'].unique()

array(['gazipur', 'tangail', 'dhaka', 'manikganj'], dtype=object)

In [43]:
# get latitude and longitude from subdistrict centroid
emi_scale = pd.merge(emi2, subdist, on = ['kiln_sub_district','kiln_district'], how = 'left')
emi_scale



Unnamed: 0,kiln_id,collection_date,stage,kiln_division,kiln_district,kiln_sub_district,firing_season_firing_month,firing_season_firing_week,firing_season_circuits_completed,firing_season_circuits_current,...,firing_start_date,firing_end_date,season_days,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,category,latitude,longitude
0,33183,2024-05-09,stage 2 - No Info,dhaka,gazipur,kaliakair,november,2,9,10,...,2022-11-14,2023-04-29 15:07:49.565217391,166.630435,9.130870,273.926101,24.108290,723.248687,Scaling,24.103209,90.264598
1,33096,2024-05-12,stage 1,dhaka,gazipur,kaliakair,december,2,9,10,...,2022-12-12,2023-05-27 15:07:49.565217391,166.630435,7.493478,224.804350,19.785075,593.552241,Scaling,24.103209,90.264598
2,33051,2024-05-07,stage 2 - No Info,dhaka,gazipur,kaliakair,november,3,6,7,...,2022-11-21,2023-05-06 15:07:49.565217391,166.630435,9.424329,282.729879,24.883112,746.493356,Scaling,24.103209,90.264598
3,33032,2024-05-05,stage 2 - Info,dhaka,gazipur,kaliakair,november,1,9,10,...,2022-11-07,2023-04-22 15:07:49.565217391,166.630435,10.164560,304.936789,26.837546,805.126390,Scaling,24.103209,90.264598
4,33006,2024-03-28,stage 2 - Info,dhaka,gazipur,kapasia,december,1,6,7,...,2022-12-05,2023-05-20 15:07:49.565217391,166.630435,11.703097,351.092922,30.899757,926.992697,Scaling,24.147201,90.599857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,33122,2024-03-28,stage 2 - Info,dhaka,gazipur,gazipur_sadar,december,1,6,7,...,2022-12-05,2023-05-20 15:07:49.565217391,166.630435,7.643822,229.314647,20.182027,605.460806,Scaling,,
192,26190,2024-03-31,stage 1,dhaka,dhaka,savar,october,3,10,11,...,2022-10-17,2023-04-01 15:07:49.565217391,166.630435,7.664641,229.939233,20.236997,607.109903,Scaling,23.881056,90.280762
193,26215,2024-03-28,stage 1,dhaka,dhaka,dhamrai,november,1,9,10,...,2022-11-07,2023-04-22 15:07:49.565217391,166.630435,6.471570,194.147113,17.086925,512.607758,Scaling,23.937071,90.141472
194,26177,2024-03-30,stage 2 - No Info,dhaka,dhaka,dhamrai,october,3,7,8,...,2022-10-17,2023-04-01 15:07:49.565217391,166.630435,13.766451,412.993533,36.347641,1090.429241,Scaling,23.937071,90.141472


In [44]:
emissions = pd.concat([emi, emi_scale])
emissions = emissions[['kiln_id', 'category','firing_start_date','firing_end_date','season_days','avg_co2_tons_per_day','avg_co2_tons_per_month',
'avg_pm25_kg_per_day','avg_pm25_kg_per_month', 'latitude','longitude']].copy()

emissions['firing_end_date'] = emissions['firing_end_date'].dt.strftime('%Y-%m-%d')
emissions.head()

Unnamed: 0,kiln_id,category,firing_start_date,firing_end_date,season_days,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,latitude,longitude
0,44053,RCT,2022-11-14,2023-05-15,182.0,8.827249,264.817483,23.306638,699.199151,,
1,41091,RCT,2022-12-12,2023-07-24,224.0,11.64359,349.307705,30.742639,922.279179,,
2,41046,RCT,2022-11-28,2023-07-10,224.0,4.40004,132.001204,11.617452,348.523552,,
3,41075,RCT,2022-12-19,2023-07-10,203.0,4.695858,140.875742,12.398502,371.955047,,
4,47051,RCT,2022-11-14,2023-07-24,252.0,13.315209,399.456268,35.156224,1054.686722,,


In [45]:
emissions['category'].unique()

array(['RCT', 'Scaling'], dtype=object)

In [46]:
# Match with location datae

emissions_RCT = emissions[emissions['category']=='RCT']
emissions_else = emissions[emissions['category']!='RCT']
emissions_data = pd.merge(emissions_RCT.drop(columns = ['latitude','longitude']), df[['kiln_id','latitude','longitude']].copy(), on = ['kiln_id'], how = 'left')
emissions_data.head()


Unnamed: 0,kiln_id,category,firing_start_date,firing_end_date,season_days,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,latitude,longitude
0,44053,RCT,2022-11-14,2023-05-15,182.0,8.827249,264.817483,23.306638,699.199151,23.600572,89.179497
1,41091,RCT,2022-12-12,2023-07-24,224.0,11.64359,349.307705,30.742639,922.279179,23.204301,89.360513
2,41046,RCT,2022-11-28,2023-07-10,224.0,4.40004,132.001204,11.617452,348.523552,23.109009,89.307483
3,41075,RCT,2022-12-19,2023-07-10,203.0,4.695858,140.875742,12.398502,371.955047,23.003571,88.989972
4,47051,RCT,2022-11-14,2023-07-24,252.0,13.315209,399.456268,35.156224,1054.686722,22.782409,89.332652


In [47]:
emissions_df = pd.concat([emissions_data, emissions_else])

In [48]:
emissions_df['category'].unique()

array(['RCT', 'Scaling'], dtype=object)

In [49]:
emissions_df[emissions_df['category']=='Scaling']

Unnamed: 0,kiln_id,category,firing_start_date,firing_end_date,season_days,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,latitude,longitude
0,33183,Scaling,2022-11-14,2023-04-29,166.630435,9.130870,273.926101,24.108290,723.248687,24.103209,90.264598
1,33096,Scaling,2022-12-12,2023-05-27,166.630435,7.493478,224.804350,19.785075,593.552241,24.103209,90.264598
2,33051,Scaling,2022-11-21,2023-05-06,166.630435,9.424329,282.729879,24.883112,746.493356,24.103209,90.264598
3,33032,Scaling,2022-11-07,2023-04-22,166.630435,10.164560,304.936789,26.837546,805.126390,24.103209,90.264598
4,33006,Scaling,2022-12-05,2023-05-20,166.630435,11.703097,351.092922,30.899757,926.992697,24.147201,90.599857
...,...,...,...,...,...,...,...,...,...,...,...
191,33122,Scaling,2022-12-05,2023-05-20,166.630435,7.643822,229.314647,20.182027,605.460806,,
192,26190,Scaling,2022-10-17,2023-04-01,166.630435,7.664641,229.939233,20.236997,607.109903,23.881056,90.280762
193,26215,Scaling,2022-11-07,2023-04-22,166.630435,6.471570,194.147113,17.086925,512.607758,23.937071,90.141472
194,26177,Scaling,2022-10-17,2023-04-01,166.630435,13.766451,412.993533,36.347641,1090.429241,23.937071,90.141472


In [50]:
# Import all the kiln locations data
location = pd.read_csv(os.path.join(path, "model_kilns.csv")).drop(columns = ['label','prob','prediction'])

# Include those kiln locations only when probability > 0.8
print(location.shape)
geometry = [Point(xy) for xy in zip(location['long'], location['lat'])]

# Create a GeoDataFrame using the geometry column
location_gdf = gpd.GeoDataFrame(location, geometry=geometry, crs = 4326)
location_gdf = location_gdf.to_crs(7755)
location_gdf['id'] = location_gdf.index
location_gdf['category'] = 'Others'
location_gdf.head()


(6978, 2)


Unnamed: 0,lat,long,geometry,id,category
0,21.111399,92.184783,POINT (5241276.392 3739925.248),0,Others
1,21.11291,92.184072,POINT (5241189.938 3740082.634),1,Others
2,21.115673,92.173307,POINT (5240070.054 3740287.121),2,Others
3,21.115673,92.182674,POINT (5241021.955 3740369.603),3,Others
4,21.21411,92.179388,POINT (5239764.288 3751000.298),4,Others


In [51]:
# CO2 and PM2.5 emissions from nearby kilns in RCT & Scaling

geometry = [Point(xy) for xy in zip(emissions_df['longitude'], emissions_df['latitude'])]

# Create a GeoDataFrame using the geometry column
emissions_gdf = gpd.GeoDataFrame(emissions_df, geometry=geometry, crs = 4326)
emissions_gdf = emissions_gdf.to_crs(7755)

# 1. Ensure both GeoDataFrames have the same CRS
emissions_gdf = emissions_gdf.to_crs(location_gdf.crs)

# Create a buffer around each kiln location (if not already done)
neighbor_distance = 10000  # e.g., 10km
location_gdf['buffer'] = location_gdf.geometry.buffer(neighbor_distance)

# Set the 'buffer' column as the active geometry column
location_gdf_buffer = location_gdf.set_geometry('buffer')

# Now perform the spatial join using the active geometry from location_gdf_buffer.
neighbors = gpd.sjoin(
    emissions_gdf, 
   location_gdf_buffer[['id', 'category','buffer']], 
    how='inner', 
    predicate='within'
)

neighbors['firing_start_date'] = pd.to_datetime(neighbors['firing_start_date'])
neighbors['firing_end_date'] = pd.to_datetime(neighbors['firing_end_date'])

# Define the emission columns for which to compute the mean
aggregations = {
    'avg_co2_tons_per_day': 'mean',
    'avg_co2_tons_per_month': 'mean',
    'avg_pm25_kg_per_day': 'mean',
    'avg_pm25_kg_per_month': 'mean',
    'firing_start_date': 'median',
    'firing_end_date': 'median'
}

neighbors_mean = neighbors.groupby('id').agg(aggregations).reset_index()

# Optionally, convert back to a string format (year-month-day)
neighbors_mean['firing_start_date'] = neighbors_mean['firing_start_date'].dt.strftime('%Y-%m-%d')
neighbors_mean['firing_end_date'] = neighbors_mean['firing_end_date'].dt.strftime('%Y-%m-%d')
neighbors_mean.head()

Unnamed: 0,id,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,firing_start_date,firing_end_date
0,777,12.297206,368.91619,32.468386,974.051575,2022-11-21,2023-07-24
1,807,11.57049,347.114694,30.549632,916.488959,2022-11-24,2023-05-18
2,809,12.297206,368.91619,32.468386,974.051575,2022-11-21,2023-07-24
3,812,12.297206,368.91619,32.468386,974.051575,2022-11-21,2023-07-24
4,813,12.297206,368.91619,32.468386,974.051575,2022-11-21,2023-07-24


In [52]:
location_merged = pd.merge(location_gdf, neighbors_mean, on = 'id', how = 'left')

# For those kilns that do not have neighboring kilns in RCT/scaling data, impute missing emissions using whole average values
location_merged['avg_co2_tons_per_day'] = location_merged['avg_co2_tons_per_day'].fillna(
    emissions_df['avg_co2_tons_per_day'].mean()
)
location_merged['avg_co2_tons_per_month'] = location_merged['avg_co2_tons_per_month'].fillna(
    emissions_df['avg_co2_tons_per_month'].mean()
)
location_merged['avg_pm25_kg_per_day'] = location_merged['avg_pm25_kg_per_day'].fillna(
    emissions_df['avg_pm25_kg_per_day'].mean()
)
location_merged['avg_pm25_kg_per_month'] = location_merged['avg_pm25_kg_per_month'].fillna(
    emissions_df['avg_pm25_kg_per_month'].mean()
)

location_merged['firing_start_date'] = location_merged['firing_start_date'].fillna(
    emissions_df['firing_start_date'].median()
)

emissions_df['firing_end_date'] = pd.to_datetime(emissions_df['firing_end_date'])
location_merged['firing_end_date'] = pd.to_datetime(location_merged['firing_end_date'])

location_merged['firing_end_date'] = location_merged['firing_end_date'].fillna(
    emissions_df['firing_end_date'].median()
)

location_merged = location_merged.rename(columns = {'id':'kiln_id', 'lat':'latitude','long':'longitude'}).drop(columns = ['buffer'])
location_merged.head()





Unnamed: 0,latitude,longitude,geometry,kiln_id,category,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,firing_start_date,firing_end_date
0,21.111399,92.184783,POINT (5241276.392 3739925.248),0,Others,10.163055,304.891646,26.833573,805.007199,2022-11-21 00:00:00,2023-04-29
1,21.11291,92.184072,POINT (5241189.938 3740082.634),1,Others,10.163055,304.891646,26.833573,805.007199,2022-11-21 00:00:00,2023-04-29
2,21.115673,92.173307,POINT (5240070.054 3740287.121),2,Others,10.163055,304.891646,26.833573,805.007199,2022-11-21 00:00:00,2023-04-29
3,21.115673,92.182674,POINT (5241021.955 3740369.603),3,Others,10.163055,304.891646,26.833573,805.007199,2022-11-21 00:00:00,2023-04-29
4,21.21411,92.179388,POINT (5239764.288 3751000.298),4,Others,10.163055,304.891646,26.833573,805.007199,2022-11-21 00:00:00,2023-04-29


In [53]:
emissions_gdf[emissions_gdf['category']=='Scaling']

Unnamed: 0,kiln_id,category,firing_start_date,firing_end_date,season_days,avg_co2_tons_per_day,avg_co2_tons_per_month,avg_pm25_kg_per_day,avg_pm25_kg_per_month,latitude,longitude,geometry
0,33183,Scaling,2022-11-14,2023-04-29,166.630435,9.130870,273.926101,24.108290,723.248687,24.103209,90.264598,POINT (5022388.231 4048467.295)
1,33096,Scaling,2022-12-12,2023-05-27,166.630435,7.493478,224.804350,19.785075,593.552241,24.103209,90.264598,POINT (5022388.231 4048467.295)
2,33051,Scaling,2022-11-21,2023-05-06,166.630435,9.424329,282.729879,24.883112,746.493356,24.103209,90.264598,POINT (5022388.231 4048467.295)
3,33032,Scaling,2022-11-07,2023-04-22,166.630435,10.164560,304.936789,26.837546,805.126390,24.103209,90.264598,POINT (5022388.231 4048467.295)
4,33006,Scaling,2022-12-05,2023-05-20,166.630435,11.703097,351.092922,30.899757,926.992697,24.147201,90.599857,POINT (5055360.003 4055703.879)
...,...,...,...,...,...,...,...,...,...,...,...,...
191,33122,Scaling,2022-12-05,2023-05-20,166.630435,7.643822,229.314647,20.182027,605.460806,,,POINT (NaN NaN)
192,26190,Scaling,2022-10-17,2023-04-01,166.630435,7.664641,229.939233,20.236997,607.109903,23.881056,90.280762,POINT (5025754.234 4024522.555)
193,26215,Scaling,2022-11-07,2023-04-22,166.630435,6.471570,194.147113,17.086925,512.607758,23.937071,90.141472,POINT (5011443.34 4029582.865)
194,26177,Scaling,2022-10-17,2023-04-01,166.630435,13.766451,412.993533,36.347641,1090.429241,23.937071,90.141472,POINT (5011443.34 4029582.865)


In [54]:
fin_gdf = pd.concat([location_merged, emissions_gdf])
fin_df = fin_gdf.drop(columns = ['season_days', 'geometry','avg_co2_tons_per_day','avg_pm25_kg_per_day'])
fin_df['firing_start_date'] = pd.to_datetime(fin_df['firing_start_date'])
fin_df['firing_end_date'] = pd.to_datetime(fin_df['firing_end_date'])

fin_df = fin_df[fin_df['latitude'].notna()]
fin_df.head()

Unnamed: 0,latitude,longitude,kiln_id,category,avg_co2_tons_per_month,avg_pm25_kg_per_month,firing_start_date,firing_end_date
0,21.111399,92.184783,0,Others,304.891646,805.007199,2022-11-21,2023-04-29
1,21.11291,92.184072,1,Others,304.891646,805.007199,2022-11-21,2023-04-29
2,21.115673,92.173307,2,Others,304.891646,805.007199,2022-11-21,2023-04-29
3,21.115673,92.182674,3,Others,304.891646,805.007199,2022-11-21,2023-04-29
4,21.21411,92.179388,4,Others,304.891646,805.007199,2022-11-21,2023-04-29


In [55]:
fin_df['category'].unique()

array(['Others', 'RCT', 'Scaling'], dtype=object)

In [56]:
fin_df['firing_end_date'].max()

Timestamp('2023-07-24 00:00:00')

In [57]:
def month_active_fraction(row, current_month):
    # current_month is a Timestamp representing the first day of the month (e.g., 2022-11-01)
    start = row['firing_start_date']
    end = row['firing_end_date']
    
    # Define the month’s start and end dates.
    month_start = current_month
    last_day = calendar.monthrange(current_month.year, current_month.month)[1]
    month_end = current_month.replace(day=last_day)
    
    # Overlapping period:
    overlap_start = max(start, month_start)
    overlap_end = min(end, month_end)
    
    # If there's no overlap, return 0
    if overlap_end < overlap_start:
        return 0
    
    # +1 to count inclusive days.
    overlap_days = (overlap_end - overlap_start).days + 1
    total_days = last_day
    fraction = overlap_days / total_days
    return fraction
    

def expand_row(row):
    # Ensure firing dates are datetime objects (if not already)
    start_month = row['firing_start_date'].replace(day=1)
    end_month = row['firing_end_date'].replace(day=1)
    
    # Create a date range for the first day of each month in the firing period
    monthly_dates = pd.date_range(start=start_month, end=end_month, freq='MS')
    
    expanded = []
    for d in monthly_dates:
        new_row = row.copy()
        # Compute the fraction of the month that is active
        fraction = month_active_fraction(row, d)
        
        # Create a new column 'date' with year-month (formatted as 'YYYY-MM')
        new_row['date'] = d.strftime('%Y-%m')
        
        # Adjust emissions by the fraction of the month active.
        # For example, if only half the month was active, emissions are halved.
        new_row['avg_co2_tons_per_month'] = row['avg_co2_tons_per_month'] * fraction
        new_row['avg_pm25_kg_per_month'] = row['avg_pm25_kg_per_month'] * fraction
        expanded.append(new_row)
    return expanded

# Expand each row in fin_df
expanded_rows = []
for _, row in fin_df.iterrows():
    expanded_rows.extend(expand_row(row))

# Create the new DataFrame
expanded_df = pd.DataFrame(expanded_rows)

# Select only the desired columns
result_df = expanded_df[['latitude', 'longitude', 'kiln_id', 'category', 'date',
                           'avg_co2_tons_per_month', 'avg_pm25_kg_per_month']]

result_df.head()

Unnamed: 0,latitude,longitude,kiln_id,category,date,avg_co2_tons_per_month,avg_pm25_kg_per_month
0,21.111399,92.184783,0,Others,2022-11,101.630549,268.335733
0,21.111399,92.184783,0,Others,2022-12,304.891646,805.007199
0,21.111399,92.184783,0,Others,2023-01,304.891646,805.007199
0,21.111399,92.184783,0,Others,2023-02,304.891646,805.007199
0,21.111399,92.184783,0,Others,2023-03,304.891646,805.007199


In [58]:
result_df['category'].unique()

array(['Others', 'RCT', 'Scaling'], dtype=object)

In [59]:
result_df = result_df[['kiln_id','category','date','avg_co2_tons_per_month','avg_pm25_kg_per_month','latitude','longitude']].copy()
result_df

Unnamed: 0,kiln_id,category,date,avg_co2_tons_per_month,avg_pm25_kg_per_month,latitude,longitude
0,0,Others,2022-11,101.630549,268.335733,21.111399,92.184783
0,0,Others,2022-12,304.891646,805.007199,21.111399,92.184783
0,0,Others,2023-01,304.891646,805.007199,21.111399,92.184783
0,0,Others,2023-02,304.891646,805.007199,21.111399,92.184783
0,0,Others,2023-03,304.891646,805.007199,21.111399,92.184783
...,...,...,...,...,...,...,...
195,26202,Scaling,2022-12,165.077055,435.853915,23.937071,90.141472
195,26202,Scaling,2023-01,165.077055,435.853915,23.937071,90.141472
195,26202,Scaling,2023-02,165.077055,435.853915,23.937071,90.141472
195,26202,Scaling,2023-03,165.077055,435.853915,23.937071,90.141472


In [60]:
result_df.to_csv(os.path.join(path, "co2_pm25_emissions_brik_kiln.csv"), index = False)