<a href="https://colab.research.google.com/github/anujavenkatachalam04/chvi_vbd_rj/blob/main/notebooks/ncd_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NCD Preprocessing

In [None]:
import os
import pandas as pd
import geopandas as gpd
import requests
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')
import geopandas as gpd
from shapely.geometry import Point
import uuid
import re
import datetime

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the CHVI project folder on Drive the working directory so the relative
# paths used in the rest of the notebook resolve.
project_root = "/content/drive/MyDrive/CHVI"
os.chdir(project_root)

In [None]:
# Folder (relative to the working directory) that holds the raw NCD inputs.
ncd_raw = "1_Data/NCD/raw"

In [None]:
# Block polygons; used further down for the point-in-polygon join and the
# final merge on NAME / DISTRICT.
blocks_geojson_path = "5_Shapefiles/SUBDISTRICT_11/Rajasthan_Blocks.geojson"
blocks_shp = gpd.read_file(blocks_geojson_path)

In [None]:
# Collect the names of the CSV files in the raw diabetes folder.
diab = []
for entry in os.listdir(f"{ncd_raw}/diabetes"):
    if entry.endswith(".csv"):
        diab.append(entry)

In [None]:
# consolidate_diab
# Read every diabetes CSV once and concatenate them in a single call; calling
# pd.concat inside the loop re-copies all previously loaded rows each time.
diab_frames = [pd.read_csv(f"{ncd_raw}/diabetes/{file}") for file in tqdm(diab)]
# pd.concat raises on an empty list, so keep the empty frame the original loop
# produced when no CSV files are found.
diab_all = pd.concat(diab_frames) if diab_frames else pd.DataFrame()

100%|██████████| 21/21 [00:00<00:00, 104.54it/s]


In [None]:
# consolidate_ht
# Names of the CSV files in the raw hypertension folder.
ht = [file for file in os.listdir(f"{ncd_raw}/hypert") if file.endswith(".csv")]
# Read every file once and concatenate in a single call; calling pd.concat
# inside the loop re-copies all previously loaded rows each time.
ht_frames = [pd.read_csv(f"{ncd_raw}/hypert/{file}") for file in tqdm(ht)]
# pd.concat raises on an empty list, so keep the empty frame the original loop
# produced when no CSV files are found.
ht_all = pd.concat(ht_frames) if ht_frames else pd.DataFrame()

100%|██████████| 21/21 [00:00<00:00, 106.89it/s]


In [None]:
# Column names that receive a disease prefix ("ht_" / "diab_") before the
# hypertension and diabetes tables are merged.
rename_cols_list = [
    "enrolled_all",
    "enrolled_30",
    "screened",
    "rescreened",
    "referred_by_screening",
    "referred_for_via_screening",
    "diagnosed",
    "under_treatment",
    "examined",
    "via_examinations",
    "comorbidity",
    "control",
    "followup_adherence",
]

In [None]:
def rename_cols(disease, cols_to_rename, cols):
    """Prefix selected column names with a disease tag.

    Args:
        disease: Prefix to prepend, joined to the column name with "_".
        cols_to_rename: Collection of column names that should be prefixed.
        cols: Iterable of all column names, in their current order.

    Returns:
        A new list with one entry per item of ``cols``, in the same order:
        ``f"{disease}_{name}"`` when the name is in ``cols_to_rename``,
        otherwise the name unchanged.
    """
    prefixed = []
    for name in cols:
        if name in cols_to_rename:
            prefixed.append(f"{disease}_{name}")
        else:
            prefixed.append(name)
    return prefixed

In [None]:
# Prefix the hypertension metric columns with "ht_".
ht_all.columns = rename_cols(
    disease="ht",
    cols_to_rename=rename_cols_list,
    cols=ht_all.columns.tolist(),
)

In [None]:
# Prefix the diabetes metric columns with "diab_".
diab_all.columns = rename_cols(
    disease="diab",
    cols_to_rename=rename_cols_list,
    cols=diab_all.columns.tolist(),
)

In [None]:
# Row counts of the consolidated hypertension and diabetes tables.
ht_all.shape[0], diab_all.shape[0]

(8589, 8589)

In [None]:
# Combine the two disease tables. No `on=` is passed, so pandas joins on every
# column name the two frames still share after the prefixing; "outer" keeps
# rows that appear in only one of them.
ncd = pd.merge(ht_all, diab_all, how="outer")

In [None]:
# Keep only the identifiers, the period fields and the diagnosed / enrolled
# counts. .copy() makes this an independent frame, so the column assignments
# done on `ncd` afterwards do not trigger pandas' SettingWithCopyWarning.
ncd = ncd[['district_id',
       'district_name', 'taluka_id', 'taluka_name', 'Month', 'Year','ht_diagnosed','diab_diagnosed', 'diab_enrolled_all','ht_enrolled_all']].copy()

In [None]:
# Normalise the name columns: trim whitespace on both ends and upper-case.
for name_col in ("district_name", "taluka_name"):
    ncd[name_col] = ncd[name_col].str.strip().str.upper()

# Mapping NCD to Census 2011 using CHIP data (the lat-long of AWCs, i.e. anganwadi centres, is used to find the block)


In [None]:
# Load the district/taluka mapping workbook from the mapping_files folder.
taluka_map_path = f"{ncd_raw}/mapping_files/district_taluka_ncd.xlsx"
ncd_chip_map = pd.read_excel(taluka_map_path)

In [None]:
# Only the taluka -> CHIP block id pair is needed for the join.
ncd_chip_map = ncd_chip_map.loc[:, ["taluka_id", "chip_block_id"]]

In [None]:
# Count missing values per column of the mapping.
ncd_chip_map.isnull().sum()

Unnamed: 0,0
taluka_id,0
chip_block_id,11


In [None]:
# Row counts of the NCD table and of the taluka mapping.
ncd.shape[0], ncd_chip_map.shape[0]

(8589, 485)

In [None]:
# Attach the CHIP block id to every NCD row. The join key is stated explicitly
# so it cannot silently change if either frame gains another shared column.
# NOTE(review): the saved outputs show 8589 NCD rows before this merge and row
# labels up to 9952 after it, which suggests taluka_id repeats in the mapping
# and rows fan out here - confirm that is intended.
ncd_mapped = ncd.merge(ncd_chip_map, on="taluka_id", how="left")

In [None]:
# Taluka names that received no chip_block_id; these are treated as invalid
# entries and removed in the next step.
no_chip_block = ncd_mapped["chip_block_id"].isna()
ncd_mapped.loc[no_chip_block, "taluka_name"].unique()

array(['K', 'C', 'DELET BLOCK', 'WRONG DATA', 'UU DELETE', 'Z DELET',
       'BLANK 2', 'WRONG BLOCK'], dtype=object)

In [None]:
# Remove the rows that have no chip_block_id.
ncd_mapped = ncd_mapped.dropna(subset=["chip_block_id"])

In [None]:
# Display the mapped NCD table for a visual check.
ncd_mapped

Unnamed: 0,district_id,district_name,taluka_id,taluka_name,Month,Year,ht_diagnosed,diab_diagnosed,diab_enrolled_all,ht_enrolled_all,chip_block_id
0,10801,GANGANAGAR,108001,KARANPUR,April,2024,3199.0,2587.0,127355,127355,126.0
1,10801,GANGANAGAR,108001,KARANPUR,April,2025,6817.0,5212.0,142845,142845,126.0
2,10801,GANGANAGAR,108001,KARANPUR,August,2024,5138.0,4104.0,133544,133544,126.0
3,10801,GANGANAGAR,108001,KARANPUR,August,2025,7268.0,5469.0,143083,143083,126.0
4,10801,GANGANAGAR,108001,KARANPUR,December,2024,6010.0,4690.0,142363,142363,126.0
...,...,...,...,...,...,...,...,...,...,...,...
9949,10848,SALUMBER,108482,SEMARI,May,2025,1638.0,409.0,57875,57875,496.0
9950,10848,SALUMBER,108482,SEMARI,November,2024,1113.0,258.0,30305,30305,496.0
9951,10848,SALUMBER,108482,SEMARI,October,2024,938.0,234.0,28996,28996,496.0
9952,10848,SALUMBER,108482,SEMARI,September,2024,752.0,197.0,27659,27659,496.0


In [None]:
# map chip_block_id to lat, long using awc data

In [None]:
# get awc lat-long
awc_lat_long_path = f"{ncd_raw}/mapping_files/awc_lat_long.csv"
awc_data = pd.read_csv(awc_lat_long_path)

In [None]:
# get mapping of awc to block id
awc_block_path = f"{ncd_raw}/mapping_files/chip_awc_block.csv"
awc_block_id = pd.read_csv(awc_block_path)

In [None]:
# Keep only the block id / anganwadi id pair.
awc_block_id = awc_block_id.loc[:, ["block_id", "anganwadi_id"]]

In [None]:
# Keep one row per (block_id, anganwadi_id) pair.
pair_cols = ["block_id", "anganwadi_id"]
awc_block_id = awc_block_id.drop_duplicates(subset=pair_cols)

In [None]:
# Row counts of the AWC-to-block mapping and of the AWC coordinate table.
awc_block_id.shape[0], awc_data.shape[0]

(65127, 58994)

In [None]:
# Outer-join the AWC coordinates with the AWC -> block mapping so AWCs present
# in only one of the two tables are kept. No `on=` is passed, so pandas joins
# on the column names the two frames share.
awc_data = pd.merge(awc_data, awc_block_id, how="outer")

In [None]:
# Share of missing values per column (about 12% of AWCs have no lat/long).
awc_data.isna().mean()

Unnamed: 0,0
anganwadi_id,0.0
median_lat,0.123356
median_lng,0.123356
block_id,0.020867


In [None]:
# Compute centroid per block_id using mean lat/lon.
# AWCs missing either coordinate are left out before averaging.
awc_located = awc_data.dropna(subset=["median_lat", "median_lng"], how="any")
block_centroids = (
    awc_located.groupby(by="block_id", as_index=False)[["median_lat", "median_lng"]]
    .mean()
    .rename(columns={"median_lat": "centroid_lat", "median_lng": "centroid_lng"})
)

print(block_centroids.head())


   block_id  centroid_lat  centroid_lng
0       1.0     26.498218     74.737770
1       2.0     26.355787     75.037162
2       3.0     26.022645     74.780578
3       4.0     25.967486     74.228917
4       5.0     25.964676     75.173050


In [None]:
block_centroids.isna().sum()

Unnamed: 0,0
block_id,0
centroid_lat,0
centroid_lng,0


In [None]:
# save chip block centroids
centroids_csv_path = f"{ncd_raw}/mapping_files/chip_block_centroids.csv"
block_centroids.to_csv(centroids_csv_path, index=False)

In [None]:
# merge block centroids to ncd data
ncd_with_centroid = pd.merge(
    ncd_mapped,
    block_centroids,
    left_on="chip_block_id",
    right_on="block_id",
    how="left",
)
# block_id duplicates chip_block_id after the join, so it is dropped.
ncd_mapped_chip = ncd_with_centroid.drop(columns="block_id")

In [None]:
# check missing centroids - i.e., blocks where no AWC was surveyed / no AWC in the whole block has a lat-long
centroid_missing = ncd_mapped_chip["centroid_lat"].isna()
ncd_mapped_chip.loc[
    centroid_missing, ["chip_block_id", "district_name", "taluka_id", "taluka_name"]
].drop_duplicates()

Unnamed: 0,chip_block_id,district_name,taluka_id,taluka_name
2751,473.0,JAIPUR,108327,SIRSI
7308,202.0,KOTA,108195,KHAIRABAD


In [None]:
# Manual centroid for taluka 108327 (SIRSI), looked up on Google Maps from the
# tehsil polygon.
sirsi_rows = ncd_mapped_chip["taluka_id"] == 108327
ncd_mapped_chip.loc[sirsi_rows, "centroid_lat"] = 26.91072
ncd_mapped_chip.loc[sirsi_rows, "centroid_lng"] = 75.68197

In [None]:
# List the blocks of Kota district for a visual check.
# The original `blocks_shp[blocks_shp["NAME"]]` passed the block names as
# column labels, which raises KeyError and stops a Run-All at this cell.
# NOTE(review): presumably this was a lookup for the KHAIRABAD (KOTA) fix that
# follows - confirm the intended filter.
blocks_shp.loc[blocks_shp["DISTRICT"] == "Kota", ["NAME", "DISTRICT", "C_CODE11"]]

In [None]:
# Manual centroid for taluka 108195 (KHAIRABAD), looked up on Google Maps from
# the tehsil polygon.
khairabad_rows = ncd_mapped_chip["taluka_id"] == 108195
ncd_mapped_chip.loc[khairabad_rows, "centroid_lat"] = 24.65458
ncd_mapped_chip.loc[khairabad_rows, "centroid_lng"] = 75.92918

In [None]:
# Re-check for missing centroids after the manual fixes; the result should be empty.
still_missing = ncd_mapped_chip["centroid_lat"].isna()
ncd_mapped_chip.loc[
    still_missing, ["chip_block_id", "district_name", "taluka_id", "taluka_name"]
].drop_duplicates()

Unnamed: 0,chip_block_id,district_name,taluka_id,taluka_name


In [None]:
# Map to census
# Turn each row's centroid into a point geometry (x = longitude, y = latitude).
centroid_points = gpd.points_from_xy(
    x=ncd_mapped_chip["centroid_lng"],
    y=ncd_mapped_chip["centroid_lat"],
)
ncd_gdf = gpd.GeoDataFrame(ncd_mapped_chip, geometry=centroid_points, crs="EPSG:4326")  # WGS84 lat-long

In [None]:
# Left spatial join: each centroid point picks up NAME / DISTRICT / C_CODE11 of
# the block polygon it lies within; every NCD row is kept.
# NOTE(review): assumes blocks_shp is also in EPSG:4326 - confirm.
census_cols = ["NAME", "DISTRICT", "C_CODE11", "geometry"]
ncd_joined = gpd.sjoin(ncd_gdf, blocks_shp[census_cols], how="left", predicate="within")

In [None]:
# re-mapping manually
# (NCD district_name, NCD taluka_name) -> block NAME, overriding whatever the
# spatial join assigned.
manual_block_names = [
    ("KARAULI", "MANDRAYAL", "Mandrail"),
    ("BHILWARA", "HURDA", "Hurda"),
    ("KOTA", "SULTANPUR", "Digod"),
    ("BUNDI", "KHAPREN", "Indragarh"),
    ("BIKANER", "BLOCK PANCHU", "Chhatargarh"),
    ("BUNDI", "TALERA", "Keshoraipatan"),
    ("BIKANER", "BLOCK PUGAL", "Poogal"),
]
for ncd_district, ncd_taluka, block_name in manual_block_names:
    taluka_rows = (ncd_joined["district_name"] == ncd_district) & (ncd_joined["taluka_name"] == ncd_taluka)
    ncd_joined.loc[taluka_rows, "NAME"] = block_name

# SULTANPUR is the only override that also sets DISTRICT.
# NOTE(review): the other overrides leave DISTRICT as the spatial join set it,
# and the later merge keys on NAME + DISTRICT - confirm that is sufficient.
sultanpur_rows = (ncd_joined["district_name"] == "KOTA") & (ncd_joined["taluka_name"] == "SULTANPUR")
ncd_joined.loc[sultanpur_rows, "DISTRICT"] = "Kota"

In [None]:
# Drop the helper columns left over from the centroid lookup and the spatial
# join. C_CODE11 and geometry are attached again from blocks_shp by the merge
# that follows.
# NOTE: with inplace=True this cell raises KeyError if it is run a second time,
# because the columns are already gone.
ncd_joined.drop(columns=['chip_block_id', 'centroid_lat', 'centroid_lng',
       'geometry', 'index_right', 'C_CODE11'], inplace=True)

In [None]:
# Right join onto the block table: every block in blocks_shp is kept (with NaN
# NCD values when it has no NCD rows), and NCD rows whose NAME / DISTRICT pair
# is not in blocks_shp are dropped.
census_blocks = blocks_shp[["NAME", "DISTRICT", "C_CODE11", "geometry"]]
merged = ncd_joined.merge(census_blocks, how="right", on=["NAME", "DISTRICT"])

In [None]:
# Mark the NCD-side identifiers as such.
ncd_id_names = {"district_id": "ncd_district_id", "taluka_id": "ncd_taluka_id"}
merged = merged.rename(columns=ncd_id_names)

In [None]:
# Diagnosed cases as a fraction of all enrolled, per disease.
merged['ht_proportion'] = merged['ht_diagnosed'].div(merged['ht_enrolled_all'])
merged['diab_proportion'] = merged['diab_diagnosed'].div(merged['diab_enrolled_all'])

In [None]:
# Build a monthly Period from the Year and Month columns.
# Month holds month names (e.g. "April"), so the string is parsed with an
# explicit "%Y-%B" format instead of letting pandas guess the format (the
# original call, whose zfill(2) has no effect on month names, ran with a
# warning in the saved output).
# Year goes through the nullable Int64 dtype so that blocks without NCD data
# (NaN after the right merge) do not turn years into "2024.0"/"nan" strings;
# those rows end up as NaT.
year_text = merged["Year"].astype("Int64").astype("string")
month_text = merged["Month"].astype("string").str.strip()
merged["Year_Month"] = pd.to_datetime(
    year_text + "-" + month_text, format="%Y-%B"
).dt.to_period("M")

  merged["Year_Month"] = pd.to_datetime(


In [None]:
# Columns written to the clean file, in output order (geometry is left out).
output_columns = [
    'NAME', 'DISTRICT', 'C_CODE11', 'Year_Month',
    'ncd_district_id', 'ncd_taluka_id',
    'ht_diagnosed', 'diab_diagnosed',
    'diab_enrolled_all', 'ht_enrolled_all',
    'ht_proportion', 'diab_proportion',
]
merged = merged[output_columns]

In [None]:
# Write the cleaned NCD table, without the row index.
clean_csv_path = "1_Data/NCD/clean/ncd_data_2024_2025.csv"
merged.to_csv(clean_csv_path, index=False)

In [None]:
# The End!