 # Data Processing Pipeline Overview
 
This notebook processes raw data from multiple sources to produce a final, clean, and usable dataset for model building.

The steps include:
- Loading and cleaning pincode, district, and state information
- Merging population and GDP data at the pincode and state levels
- Handling missing values and ensuring data consistency
- Generating features required for modeling
 
 The final output will be a dataset with one row per pincode, containing all relevant features for downstream machine learning tasks.

# Part 1: Pincode

## Level 1

In [1]:
import pandas as pd

# Read the pincode directory CSV
df = pd.read_csv('raw_data/pincode_directory.csv')

# Print debug info
print(f"Total rows in original file: {len(df)}")
print(f"Unique pincodes in original file: {df['pincode'].nunique()}")

# For each pincode, keep only the first occurrence, but keep all columns
df = df.drop_duplicates(subset=['pincode'], keep='first')

print(f"\nRows after keeping first occurrence per pincode: {len(df)}")
print(f"Unique pincodes after this operation: {df['pincode'].nunique()}")
print(f"Columns in the file: {list(df.columns)}")

Total rows in original file: 165631
Unique pincodes in original file: 19584

Rows after keeping first occurrence per pincode: 19584
Unique pincodes after this operation: 19584
Columns in the file: ['circlename', 'regionname', 'divisionname', 'officename', 'pincode', 'officetype', 'delivery', 'district', 'statename', 'latitude', 'longitude']


In [2]:
df = df[['pincode', 'district', 'statename', 'latitude', 'longitude']]

df

Unnamed: 0,pincode,district,statename,latitude,longitude
0,507204,KHAMMAM,TELANGANA,16.9980961,80.3087675
2,507169,KHAMMAM,TELANGANA,17.0202400,80.0717300
3,507208,KHAMMAM,TELANGANA,17.155349999999999,80.216399999999993
7,507002,KHAMMAM,TELANGANA,17.2734239,80.1830938
9,507168,KHAMMAM,TELANGANA,17.596,81.056700000000006
...,...,...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL,23.2001716,88.0833558
165564,712222,HOOGHLY,WEST BENGAL,22.7921667,88.3344722
165579,711113,HOWRAH,WEST BENGAL,22.6062000,88.2940000
165594,721303,MEDINIPUR WEST,WEST BENGAL,22.3396944,87.2127500


In [3]:
# Remove rows with latitude or longitude outside India's bounding box
# Ensure latitude and longitude are numeric before filtering
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df = df[
    (df['latitude'] >= 6.75) & (df['latitude'] <= 37.10) &
    (df['longitude'] >= 68.70) & (df['longitude'] <= 97.40)
]

df

Unnamed: 0,pincode,district,statename,latitude,longitude
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768
2,507169,KHAMMAM,TELANGANA,17.020240,80.071730
3,507208,KHAMMAM,TELANGANA,17.155350,80.216400
7,507002,KHAMMAM,TELANGANA,17.273424,80.183094
9,507168,KHAMMAM,TELANGANA,17.596000,81.056700
...,...,...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356
165564,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472
165579,711113,HOWRAH,WEST BENGAL,22.606200,88.294000
165594,721303,MEDINIPUR WEST,WEST BENGAL,22.339694,87.212750


In [4]:
# Check for NaN values in any column
num_nan_any = df.isna().any(axis=1).sum()

print(f"Number of pincodes with NaN in any column: {num_nan_any}")

# Show some examples if any exist
if num_nan_any > 0:
    print("Rows with NaN in any column:")
    print(df[df.isna().any(axis=1)])

# Remove all rows where any column is NaN
df = df.dropna()

Number of pincodes with NaN in any column: 110
Rows with NaN in any column:
        pincode district statename   latitude  longitude
380      523261      NaN       NaN  15.808005  80.039520
960      494446      NaN       NaN  18.848901  80.414201
985      494111      NaN       NaN  18.397601  81.185815
5303     811315      NaN       NaN  25.167361  86.098825
6275     506313      NaN       NaN  17.786184  79.521437
...         ...      ...       ...        ...        ...
163288   411077      NaN       NaN  18.639300  73.796300
163300   411075      NaN       NaN  18.527388  73.850553
163380   431025      NaN       NaN  19.890768  75.361271
163560   794116      NaN       NaN  25.730736  89.896855
165437   736209      NaN       NaN  26.387458  89.593697

[110 rows x 5 columns]


In [5]:
df

Unnamed: 0,pincode,district,statename,latitude,longitude
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768
2,507169,KHAMMAM,TELANGANA,17.020240,80.071730
3,507208,KHAMMAM,TELANGANA,17.155350,80.216400
7,507002,KHAMMAM,TELANGANA,17.273424,80.183094
9,507168,KHAMMAM,TELANGANA,17.596000,81.056700
...,...,...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356
165564,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472
165579,711113,HOWRAH,WEST BENGAL,22.606200,88.294000
165594,721303,MEDINIPUR WEST,WEST BENGAL,22.339694,87.212750


In [6]:
# Add a column 'urban' to df: 1 if pincode is in raw_data/urban.csv, else 0

# Read the urban pincodes
urban_df = pd.read_csv('raw_data/urban.csv')

# Assume the urban csv has a column named 'pincode'
urban_pincodes = set(urban_df['pincode'].astype(str))

# Ensure df['pincode'] is string for comparison
df['pincode'] = df['pincode'].astype(str)

# Add the 'urban' column: 1 if pincode in urban_pincodes, else 0
df['urban'] = df['pincode'].apply(lambda x: 1 if x in urban_pincodes else 0)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pincode'] = df['pincode'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['urban'] = df['pincode'].apply(lambda x: 1 if x in urban_pincodes else 0)


Unnamed: 0,pincode,district,statename,latitude,longitude,urban
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0
2,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0
3,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0
7,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0
9,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0
...,...,...,...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0
165564,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1
165579,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1
165594,721303,MEDINIPUR WEST,WEST BENGAL,22.339694,87.212750,0


In [7]:
import numpy as np

# Read airport data
airport_df = pd.read_csv('raw_data/airport.csv')
airport_df

Unnamed: 0,name,lat,lon
0,Aamby Valley Airport,18.609617,73.377586
1,Abhinav Nagar AFS,19.216776,72.920158
2,Abu Road Airport,24.494200,72.781502
3,Adampur AFS (VIAX),31.433178,75.758156
4,Agartala Airport (IXA/VEAT),23.886999,91.240402
...,...,...,...
463,Walong Advanced Landing Ground,28.129673,97.019661
464,Warangal Airport (WGC/VOWA),17.914400,79.602203
465,Yelahanka AFS (VOYK),13.135500,77.606003
466,Zakhama Helipad,25.597464,94.122490


In [8]:
# Ensure airport_df has columns: 'lat', 'lon', 'name'
airport_coords = airport_df[['lat', 'lon']].astype(float).to_numpy()
airport_names = airport_df['name'].tolist()

# Function to compute haversine distance (in km) between two points or arrays of points
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    # Convert all inputs to float64 for safety
    lat1 = np.asarray(lat1, dtype=np.float64)
    lon1 = np.asarray(lon1, dtype=np.float64)
    lat2 = np.asarray(lat2, dtype=np.float64)
    lon2 = np.asarray(lon2, dtype=np.float64)
    # Convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# For each pincode, compute distance to all airports and take the minimum and the airport name
def nearest_airport_info(row):
    lat = float(row['latitude'])
    lon = float(row['longitude'])
    dists = haversine(lat, lon, airport_coords[:,0], airport_coords[:,1])
    min_idx = np.argmin(dists)
    return pd.Series({'nearest_airport_km': dists[min_idx], 'nearest_airport_name': airport_names[min_idx]})

df[['nearest_airport_km', 'nearest_airport_name']] = df.apply(nearest_airport_info, axis=1)

# Save for next cell
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['nearest_airport_km', 'nearest_airport_name']] = df.apply(nearest_airport_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['nearest_airport_km', 'nearest_airport_name']] = df.apply(nearest_airport_info, axis=1)


Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ)
2,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ)
3,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ)
7,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA)
9,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY)
...,...,...,...,...,...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield
165564,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI)
165579,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA)
165594,721303,MEDINIPUR WEST,WEST BENGAL,22.339694,87.212750,0,0.181318,Kalaikunda AFS (VEDX)


In [9]:
import geopandas as gpd

# Read the railway.geojson file
railway_gdf = gpd.read_file('raw_data/railway.geojson')

# Extract only the station name, latitude, and longitude
# Assume the station name is in a column called 'name' (adjust if needed)
railway_df = railway_gdf.copy()
if 'geometry' in railway_df.columns:
    railway_df['longitude'] = railway_df.geometry.x
    railway_df['latitude'] = railway_df.geometry.y

# Keep only the relevant columns: name, latitude, longitude
railway_df = railway_df[['name', 'latitude', 'longitude']]

railway_df

Unnamed: 0,name,latitude,longitude
0,Badhal,27.252059,75.451645
1,KICHHA,28.913427,79.519746
2,Sherekan,29.555198,74.434991
3,Bhukarka,29.238227,74.751031
4,Nohar,29.192563,74.773628
...,...,...,...
8692,SALEM MARKET,11.654084,78.142621
8693,OMALUR,11.738554,78.046780
8694,TOLASAMPATTI,11.761619,77.979547
8695,MECHERI ROAD,11.804211,77.921857


In [10]:
# Compute nearest railway station distance and name for each row in df

# Prepare railway coordinates and names
railway_coords = railway_df[['latitude', 'longitude']].astype(float).to_numpy()
railway_names = railway_df['name'].tolist()

def nearest_railway_info(row):
    lat = float(row['latitude'])
    lon = float(row['longitude'])
    dists = haversine(lat, lon, railway_coords[:,0], railway_coords[:,1])
    min_idx = np.argmin(dists)
    return pd.Series({'nearest_railway_km': dists[min_idx], 'nearest_railway_name': railway_names[min_idx]})

df[['nearest_railway_km', 'nearest_railway_name']] = df.apply(nearest_railway_info, axis=1)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['nearest_railway_km', 'nearest_railway_name']] = df.apply(nearest_railway_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['nearest_railway_km', 'nearest_railway_name']] = df.apply(nearest_railway_info, axis=1)


Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI
2,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA
3,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI
7,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM
9,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura
...,...,...,...,...,...,...,...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO
165564,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI
165579,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR
165594,721303,MEDINIPUR WEST,WEST BENGAL,22.339694,87.212750,0,0.181318,Kalaikunda AFS (VEDX),2.876168,KHEMASULI


In [11]:
import pandas as pd

# Load the hospital CSV (assume file is named 'raw_data/hospital.csv')
# The CSV should have at least a 'pincode' column
hospital_df = pd.read_csv('raw_data/hospital.csv')
hospital_df

  hospital_df = pd.read_csv('raw_data/hospital.csv')


Unnamed: 0,Sr_No,Location_Coordinates,Location,Hospital_Name,Hospital_Category,Hospital_Care_Type,Discipline_Systems_of_Medicine,Address_Original_First_Line,State,District,...,Number_Doctor,Num_Mediconsultant_or_Expert,Total_Num_Beds,Number_Private_Wards,Num_Bed_for_Eco_Weaker_Sec,Empanelment_or_Collaboration_with,Emergency_Services,Tariff_Range,State_ID,District_ID
0,1,"11.6357989, 92.7120575",Near Dollygunj Junction,Chakraborty Multi Speciality Hospital,0,Hospital,Allopathic,Near Dollygunj Junction,Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
1,2,"11.8311681, 92.6586401",Medical Board Office,Inhs Dhanvantri,0,0,Allopathic,Medical Board Office,Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
2,3,"11.8311681, 92.6586401",Near Masjid,Maricar Hospital,0,Hospital,Allopathic,Near Masjid,Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
3,4,"11.6498468, 92.7294624","Lamba Line, P.B. No. 526",Pillar Health Centre,0,0,Allopathic,"Lamba Line, P.B. No. 526",Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
4,5,"11.6233774, 92.7264828","GB Pant Road, City Centre, Port Blair","G B Pant Hospital, Port Blair",Public/ Government,Hospital,Allopathic,"GB Pant Road, City Centre, Port Blair",Andaman and Nicobar Islands,South Andaman,...,,,0,,,,,,35,640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30268,32044,,"Po& Vill Maniktala,P S Ashoknagar,Manir Tala,S...",Banani Nursing Home,0,0,0,"Po& Vill Maniktala,P S Ashoknagar,Manir Tala",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343
30269,32045,,"Raidigi South 24 Pgs, Po.Raidigi",Suraksha Nursing Home And Diagnostic Centre,0,0,0,"Raidigi South 24 Pgs, Po.Raidigi",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343
30270,32046,,"Amtala, D. H. Road, P.O. Kanyanagar, Ps. Bishn...",The Amtala Nursing Home,0,0,0,"Amtala, D. H. Road, P.O. Kanyanagar, Ps. Bishn...",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343
30271,32047,,"Ghatakpukur P.O-B, Gobindopur, Bhangar",Maa Anowara General Hospital,0,0,0,"Ghatakpukur P.O-B, Gobindopur, Bhangar",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343


In [12]:
# Count and print number of rows with NaN in 'Location' and 'Pincode'
num_nan_location = hospital_df['Location'].isna().sum()
num_nan_pincode = hospital_df['Pincode'].isna().sum()
print(f"Number of rows with NaN in Location column: {num_nan_location}")
print(f"Number of rows with NaN in Pincode column: {num_nan_pincode}")

# Remove rows with NaN in either 'Location' or 'Pincode', and also where Pincode is 0 or '0'
hospital_df = hospital_df.dropna(subset=['Location', 'Pincode'])
# Remove rows where Pincode is 0 (as int or str)
hospital_df = hospital_df[~hospital_df['Pincode'].astype(str).isin(['0', '0.0'])]
hospital_df

Number of rows with NaN in Location column: 3
Number of rows with NaN in Pincode column: 1


Unnamed: 0,Sr_No,Location_Coordinates,Location,Hospital_Name,Hospital_Category,Hospital_Care_Type,Discipline_Systems_of_Medicine,Address_Original_First_Line,State,District,...,Number_Doctor,Num_Mediconsultant_or_Expert,Total_Num_Beds,Number_Private_Wards,Num_Bed_for_Eco_Weaker_Sec,Empanelment_or_Collaboration_with,Emergency_Services,Tariff_Range,State_ID,District_ID
0,1,"11.6357989, 92.7120575",Near Dollygunj Junction,Chakraborty Multi Speciality Hospital,0,Hospital,Allopathic,Near Dollygunj Junction,Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
1,2,"11.8311681, 92.6586401",Medical Board Office,Inhs Dhanvantri,0,0,Allopathic,Medical Board Office,Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
2,3,"11.8311681, 92.6586401",Near Masjid,Maricar Hospital,0,Hospital,Allopathic,Near Masjid,Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
3,4,"11.6498468, 92.7294624","Lamba Line, P.B. No. 526",Pillar Health Centre,0,0,Allopathic,"Lamba Line, P.B. No. 526",Andaman and Nicobar Islands,South Andaman,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,35,640
4,5,"11.6233774, 92.7264828","GB Pant Road, City Centre, Port Blair","G B Pant Hospital, Port Blair",Public/ Government,Hospital,Allopathic,"GB Pant Road, City Centre, Port Blair",Andaman and Nicobar Islands,South Andaman,...,,,0,,,,,,35,640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30268,32044,,"Po& Vill Maniktala,P S Ashoknagar,Manir Tala,S...",Banani Nursing Home,0,0,0,"Po& Vill Maniktala,P S Ashoknagar,Manir Tala",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343
30269,32045,,"Raidigi South 24 Pgs, Po.Raidigi",Suraksha Nursing Home And Diagnostic Centre,0,0,0,"Raidigi South 24 Pgs, Po.Raidigi",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343
30270,32046,,"Amtala, D. H. Road, P.O. Kanyanagar, Ps. Bishn...",The Amtala Nursing Home,0,0,0,"Amtala, D. H. Road, P.O. Kanyanagar, Ps. Bishn...",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343
30271,32047,,"Ghatakpukur P.O-B, Gobindopur, Bhangar",Maa Anowara General Hospital,0,0,0,"Ghatakpukur P.O-B, Gobindopur, Bhangar",West Bengal,South 24 Parganas,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,19,343


In [13]:
# Count number of hospitals per pincode and sort by num_hospitals
hosp_count = hospital_df.groupby('Pincode').size().reset_index(name='num_hospitals')
hosp_count = hosp_count.sort_values(by='num_hospitals', ascending=True)
hosp_count

Unnamed: 0,Pincode,num_hospitals
5407,855114,1
4005,689595,1
1767,403002,1
4003,689547,1
1769,403103,1
...,...,...
0,121001,128
2198,452001,132
2349,492001,133
142,143001,149


In [14]:
# Merge with main df (hospital_df uses 'Pincode', df uses 'pincode')
df = df.merge(hosp_count, left_on='pincode', right_on='Pincode', how='left')

# Fill NaN with 0 (no hospitals found for that pincode)
df['num_hospitals'] = df['num_hospitals'].fillna(0).astype(int)

# Drop the extra 'Pincode' column if it was added by the merge
if 'Pincode' in df.columns:
    df = df.drop(columns=['Pincode'])

df

Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name,num_hospitals
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI,0
1,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA,0
2,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI,0
3,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM,3
4,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura,0
...,...,...,...,...,...,...,...,...,...,...,...
17960,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO,0
17961,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI,0
17962,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR,2
17963,721303,MEDINIPUR WEST,WEST BENGAL,22.339694,87.212750,0,0.181318,Kalaikunda AFS (VEDX),2.876168,KHEMASULI,0


In [15]:
# Save the result to processed_data folder
df.to_csv('processed_data/pincode_lev1.csv', index=False)

## Level 2

In [16]:
import pandas as pd

geoiq_csv_path = 'processed_data/geoiq_pincode_data.csv'
geoiq_df = pd.read_csv(geoiq_csv_path)

# Find rows with any NaN value in any field
geoiq_with_nan = geoiq_df[geoiq_df.isna().any(axis=1)]

print("Rows in GeoIQ CSV with at least one NaN value:")
geoiq_with_nan

Rows in GeoIQ CSV with at least one NaN value:


Unnamed: 0,url,pincode,place_name,population,area_km2,male_population,female_population


In [17]:
# Remove rows with any NaN value in any field from the original DataFrame and save to CSV
geoiq_df = geoiq_df.dropna()
geoiq_df.to_csv('processed_data/geoiq_pincode_data.csv', index=False)
geoiq_df

Unnamed: 0,url,pincode,place_name,population,area_km2,male_population,female_population
0,https://geoiq.io/places/110001---Sansad-Marg/C...,110001,110001 - Sansad Marg,250430.0,18.74,135652.0,114778.0
1,https://geoiq.io/places/110002---Indraprastha/...,110002,110002 - Indraprastha,180479.0,15.16,95754.0,84725.0
2,https://geoiq.io/places/110003---Lodi-Road/vkn...,110003,110003 - Lodi Road,187073.0,17.29,100220.0,86853.0
3,https://geoiq.io/places/110004---Rashtrapati-B...,110004,110004 - Rashtrapati Bhawan,8846.0,1.66,4851.0,3995.0
4,https://geoiq.io/places/110005---Karol-Bagh/rS...,110005,110005 - Karol Bagh,251105.0,8.61,133452.0,117653.0
...,...,...,...,...,...,...,...
18959,https://geoiq.io/places/855113---Salmari/nup9w...,855113,855113 - Salmari,385307.0,336.60,200649.0,184658.0
18960,https://geoiq.io/places/855114---Sonali/zubD8M...,855114,855114 - Sonali,256769.0,301.98,133370.0,123399.0
18961,https://geoiq.io/places/855115---Sontha/fOiVVq...,855115,855115 - Sontha,502071.0,573.34,260168.0,241903.0
18962,https://geoiq.io/places/855116---Thakurganj/at...,855116,855116 - Thakurganj,221842.0,293.80,114131.0,107711.0


In [18]:
# Add a column for population density (population per area_km2) in geoiq_df
# Only add if both population and area_km2 are present and area_km2 > 0 to avoid division by zero
geoiq_df['population_density'] = geoiq_df.apply(
    lambda row: row['population'] / row['area_km2'] if pd.notnull(row['population']) and pd.notnull(row['area_km2']) and row['area_km2'] > 0 else None,
    axis=1
)
geoiq_df

Unnamed: 0,url,pincode,place_name,population,area_km2,male_population,female_population,population_density
0,https://geoiq.io/places/110001---Sansad-Marg/C...,110001,110001 - Sansad Marg,250430.0,18.74,135652.0,114778.0,13363.393810
1,https://geoiq.io/places/110002---Indraprastha/...,110002,110002 - Indraprastha,180479.0,15.16,95754.0,84725.0,11904.947230
2,https://geoiq.io/places/110003---Lodi-Road/vkn...,110003,110003 - Lodi Road,187073.0,17.29,100220.0,86853.0,10819.722383
3,https://geoiq.io/places/110004---Rashtrapati-B...,110004,110004 - Rashtrapati Bhawan,8846.0,1.66,4851.0,3995.0,5328.915663
4,https://geoiq.io/places/110005---Karol-Bagh/rS...,110005,110005 - Karol Bagh,251105.0,8.61,133452.0,117653.0,29164.343786
...,...,...,...,...,...,...,...,...
18959,https://geoiq.io/places/855113---Salmari/nup9w...,855113,855113 - Salmari,385307.0,336.60,200649.0,184658.0,1144.702911
18960,https://geoiq.io/places/855114---Sonali/zubD8M...,855114,855114 - Sonali,256769.0,301.98,133370.0,123399.0,850.284787
18961,https://geoiq.io/places/855115---Sontha/fOiVVq...,855115,855115 - Sontha,502071.0,573.34,260168.0,241903.0,875.695050
18962,https://geoiq.io/places/855116---Thakurganj/at...,855116,855116 - Thakurganj,221842.0,293.80,114131.0,107711.0,755.078285


In [19]:
# Merge pincode_directory with geoiq_df to get pincode, state, and population
# Use the saved CSV to ensure 'pincode' column exists
pincode_dir_df = pd.read_csv('processed_data/pincode_lev1.csv')

df_demographic = pd.merge(
    pincode_dir_df,
    geoiq_df[['pincode', 'population', 'area_km2', 'population_density', 'male_population', 'female_population']],
    on='pincode',
    how='inner'
)

df_demographic

Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name,num_hospitals,population,area_km2,population_density,male_population,female_population
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI,0,26076.0,112.65,231.478029,13034.0,13042.0
1,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA,0,23567.0,125.51,187.769899,11710.0,11857.0
2,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI,0,54915.0,206.42,266.035268,27189.0,27726.0
3,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM,3,209801.0,77.87,2694.246822,104113.0,105688.0
4,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura,0,36654.0,228.09,160.699724,18600.0,18054.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17525,713217,PASCHIM BARDHAMAN,WEST BENGAL,23.572935,87.233220,0,2.536097,Raniganj AFS,2.591735,PINJRAPOL,0,5939.0,5.93,1001.517707,3112.0,2827.0
17526,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO,0,10872.0,20.58,528.279883,5514.0,5358.0
17527,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI,0,25902.0,15.56,1664.652956,13401.0,12501.0
17528,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR,2,25701.0,1.98,12980.303030,13364.0,12337.0


In [20]:
# Save the result to processed_data folder
df_demographic.to_csv('processed_data/pincode_lev2.csv', index=False)

## Level 3

In [21]:
import pandas as pd

# Read state-wise GDP data
state_gdp_df = pd.read_csv('raw_data/state_wise_gdp.csv')

# Rename 'gdp' column to 'state_gdp'
state_gdp_df = state_gdp_df.rename(columns={'gdp': 'state_gdp'})

# Find states with NaN state_gdp values
states_with_nan_gdp = state_gdp_df[state_gdp_df['state_gdp'].isna()]['state']
print("States with NaN GDP values:")
print(states_with_nan_gdp.tolist())
print(f"Number of states with NaN GDP: {len(states_with_nan_gdp)}")

# Remove states with NaN state_gdp values
state_gdp_df = state_gdp_df.dropna(subset=['state_gdp'])
state_gdp_df

States with NaN GDP values:
['GOA', 'CHANDIGARH', 'MANIPUR', 'NAGALAND', 'ARUNACHAL PRADESH', 'MIZORAM', 'ANDAMAN AND NICOBAR ISLANDS']
Number of states with NaN GDP: 7


Unnamed: 0,state,state_gdp
0,MAHARASHTRA,40443.0
1,TAMIL NADU,27216.0
2,UTTAR PRADESH,25479.0
3,KARNATAKA,25007.0
4,GUJARAT,24258.0
5,WEST BENGAL,17009.0
6,RAJASTHAN,15284.0
7,TELANGANA,15020.0
8,ANDHRA PRADESH,14397.0
9,MADHYA PRADESH,13633.0


In [22]:
# Read pincode-directory to get pincode and state
df = pd.read_csv('processed_data/pincode_lev2.csv')

# Rename 'population' to 'pincode_population'
df = df.rename(columns={'population': 'pincode_population'})

# Merge to get GDP for each pincode's state, and drop the duplicate 'state' column
df = pd.merge(
    df,
    state_gdp_df[['state', 'state_gdp']],
    left_on='statename',
    right_on='state',
    how='inner'
).drop(columns=['state'])

df

Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name,num_hospitals,pincode_population,area_km2,population_density,male_population,female_population,state_gdp
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI,0,26076.0,112.65,231.478029,13034.0,13042.0,15020.0
1,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA,0,23567.0,125.51,187.769899,11710.0,11857.0,15020.0
2,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI,0,54915.0,206.42,266.035268,27189.0,27726.0,15020.0
3,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM,3,209801.0,77.87,2694.246822,104113.0,105688.0,15020.0
4,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura,0,36654.0,228.09,160.699724,18600.0,18054.0,15020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17231,713217,PASCHIM BARDHAMAN,WEST BENGAL,23.572935,87.233220,0,2.536097,Raniganj AFS,2.591735,PINJRAPOL,0,5939.0,5.93,1001.517707,3112.0,2827.0,17009.0
17232,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO,0,10872.0,20.58,528.279883,5514.0,5358.0,17009.0
17233,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI,0,25902.0,15.56,1664.652956,13401.0,12501.0,17009.0
17234,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR,2,25701.0,1.98,12980.303030,13364.0,12337.0,17009.0


In [23]:
# Add a new column to df with the total population for each pincode's state
df['state_population'] = df['statename'].map(
    df.groupby('statename')['pincode_population'].sum()
)

df

Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name,num_hospitals,pincode_population,area_km2,population_density,male_population,female_population,state_gdp,state_population
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI,0,26076.0,112.65,231.478029,13034.0,13042.0,15020.0,35102892.0
1,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA,0,23567.0,125.51,187.769899,11710.0,11857.0,15020.0,35102892.0
2,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI,0,54915.0,206.42,266.035268,27189.0,27726.0,15020.0,35102892.0
3,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM,3,209801.0,77.87,2694.246822,104113.0,105688.0,15020.0,35102892.0
4,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura,0,36654.0,228.09,160.699724,18600.0,18054.0,15020.0,35102892.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17231,713217,PASCHIM BARDHAMAN,WEST BENGAL,23.572935,87.233220,0,2.536097,Raniganj AFS,2.591735,PINJRAPOL,0,5939.0,5.93,1001.517707,3112.0,2827.0,17009.0,98428148.0
17232,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO,0,10872.0,20.58,528.279883,5514.0,5358.0,17009.0,98428148.0
17233,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI,0,25902.0,15.56,1664.652956,13401.0,12501.0,17009.0,98428148.0
17234,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR,2,25701.0,1.98,12980.303030,13364.0,12337.0,17009.0,98428148.0


In [24]:
# Calculate pincode GDP: (pincode population / state total population) * state GDP
df['pincode_gdp'] = (
    df['pincode_population'] / df['state_population']
) * df['state_gdp']

df

Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name,num_hospitals,pincode_population,area_km2,population_density,male_population,female_population,state_gdp,state_population,pincode_gdp
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI,0,26076.0,112.65,231.478029,13034.0,13042.0,15020.0,35102892.0,11.157529
1,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA,0,23567.0,125.51,187.769899,11710.0,11857.0,15020.0,35102892.0,10.083965
2,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI,0,54915.0,206.42,266.035268,27189.0,27726.0,15020.0,35102892.0,23.497303
3,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM,3,209801.0,77.87,2694.246822,104113.0,105688.0,15020.0,35102892.0,89.770695
4,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura,0,36654.0,228.09,160.699724,18600.0,18054.0,15020.0,35102892.0,15.683696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17231,713217,PASCHIM BARDHAMAN,WEST BENGAL,23.572935,87.233220,0,2.536097,Raniganj AFS,2.591735,PINJRAPOL,0,5939.0,5.93,1001.517707,3112.0,2827.0,17009.0,98428148.0,1.026296
17232,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO,0,10872.0,20.58,528.279883,5514.0,5358.0,17009.0,98428148.0,1.878750
17233,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI,0,25902.0,15.56,1664.652956,13401.0,12501.0,17009.0,98428148.0,4.476028
17234,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR,2,25701.0,1.98,12980.303030,13364.0,12337.0,17009.0,98428148.0,4.441294


In [25]:
# Read the state-wise cases CSV from the raw_data folder
state_cases_df = pd.read_csv('raw_data/state_wise_cases.csv')
print("State-wise cases DataFrame:")
state_cases_df

State-wise cases DataFrame:


Unnamed: 0,Sl. No.,State/UT,Cases pending for more than 40 years - Civil Cases,Cases pending for more than 40 years - Criminal Cases,Cases pending for more than 30 years - Civil Cases,Cases pending for more than 30 years - Criminal Cases,Cases pending for more than 20 years - Civil Cases,Cases pending for more than 20 years - Criminal Cases
0,1,Andaman and Nicobar,,,0.0,5.0,16.0,69.0
1,2,Andhra Pradesh,2.0,18.0,26.0,38.0,288.0,277.0
2,3,Arunachal Pradesh,,,1.0,6.0,1.0,298.0
3,4,Assam,7.0,0.0,19.0,4.0,113.0,90.0
4,5,Bihar,679.0,701.0,3749.0,5603.0,17120.0,82115.0
5,6,Chandigarh,,,1.0,1.0,6.0,3.0
6,7,Chhattisgarh,2.0,0.0,5.0,1.0,41.0,21.0
7,8,Dadra and Nagar Haveli and Daman and Diu,,,0.0,2.0,1.0,13.0
8,9,Delhi,14.0,0.0,87.0,10.0,430.0,317.0
9,10,Goa,43.0,0.0,224.0,24.0,717.0,35.0


In [26]:
# Keep only 'State/UT', 'Cases pending for more than 20 years - Civil Cases', and 'Cases pending for more than 20 years - Criminal Cases'
state_cases_df = state_cases_df[['State/UT', 'Cases pending for more than 20 years - Civil Cases', 'Cases pending for more than 20 years - Criminal Cases']]

# Optionally, rename columns for easier access
state_cases_df = state_cases_df.rename(columns={
    'State/UT': 'state',
    'Cases pending for more than 20 years - Civil Cases': 'state_civil_cases',
    'Cases pending for more than 20 years - Criminal Cases': 'state_criminal_cases'
})

# Print the cleaned DataFrame
print("Filtered State-wise cases DataFrame:")
state_cases_df

Filtered State-wise cases DataFrame:


Unnamed: 0,state,state_civil_cases,state_criminal_cases
0,Andaman and Nicobar,16.0,69.0
1,Andhra Pradesh,288.0,277.0
2,Arunachal Pradesh,1.0,298.0
3,Assam,113.0,90.0
4,Bihar,17120.0,82115.0
5,Chandigarh,6.0,3.0
6,Chhattisgarh,41.0,21.0
7,Dadra and Nagar Haveli and Daman and Diu,1.0,13.0
8,Delhi,430.0,317.0
9,Goa,717.0,35.0


In [27]:
# Drop rows where 'civil' or 'criminal' is NaN and print the number of such rows dropped
num_rows_before = len(state_cases_df)
state_cases_df = state_cases_df.dropna(subset=['state_civil_cases', 'state_criminal_cases'])
num_rows_after = len(state_cases_df)
print(f"Number of rows dropped due to NaN in 'civil' or 'criminal': {num_rows_before - num_rows_after}")
state_cases_df

Number of rows dropped due to NaN in 'civil' or 'criminal': 3


Unnamed: 0,state,state_civil_cases,state_criminal_cases
0,Andaman and Nicobar,16.0,69.0
1,Andhra Pradesh,288.0,277.0
2,Arunachal Pradesh,1.0,298.0
3,Assam,113.0,90.0
4,Bihar,17120.0,82115.0
5,Chandigarh,6.0,3.0
6,Chhattisgarh,41.0,21.0
7,Dadra and Nagar Haveli and Daman and Diu,1.0,13.0
8,Delhi,430.0,317.0
9,Goa,717.0,35.0


In [28]:
# Merge civil and criminal cases from state_cases_df into df based on state name
# First, ensure the state names match in both DataFrames
# We'll use upper-case for matching, as in the state_gdp merge

# Create a mapping from state_cases_df for fast lookup
state_cases_df['state_upper'] = state_cases_df['state'].str.upper().str.strip()
df['state_upper'] = df['statename'].str.upper().str.strip()

# Merge civil and criminal columns into df
df = df.merge(
    state_cases_df[['state_upper', 'state_civil_cases', 'state_criminal_cases']],
    on='state_upper',
    how='left'
)

# Drop the helper column
df = df.drop(columns=['state_upper'])

# Optionally, print to check merge
print("df with civil and criminal cases columns:")
df

df with civil and criminal cases columns:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_cases_df['state_upper'] = state_cases_df['state'].str.upper().str.strip()


Unnamed: 0,pincode,district,statename,latitude,longitude,urban,nearest_airport_km,nearest_airport_name,nearest_railway_km,nearest_railway_name,...,pincode_population,area_km2,population_density,male_population,female_population,state_gdp,state_population,pincode_gdp,state_civil_cases,state_criminal_cases
0,507204,KHAMMAM,TELANGANA,16.998096,80.308768,0,73.514759,Vijayawada Airport (VGA/VOBZ),4.790836,MOTIMARI,...,26076.0,112.65,231.478029,13034.0,13042.0,15020.0,35102892.0,11.157529,771.0,350.0
1,507169,KHAMMAM,TELANGANA,17.020240,80.071730,0,94.474522,Vijayawada Airport (VGA/VOBZ),17.719886,NAGALWANCHA,...,23567.0,125.51,187.769899,11710.0,11857.0,15020.0,35102892.0,10.083965,771.0,350.0
2,507208,KHAMMAM,TELANGANA,17.155350,80.216400,0,92.975247,Vijayawada Airport (VGA/VOBZ),3.589161,CHINTA KANI,...,54915.0,206.42,266.035268,27189.0,27726.0,15020.0,35102892.0,23.497303,771.0,350.0
3,507002,KHAMMAM,TELANGANA,17.273424,80.183094,0,94.184775,Warangal Airport (WGC/VOWA),5.440535,KHAMMAM,...,209801.0,77.87,2694.246822,104113.0,105688.0,15020.0,35102892.0,89.770695,771.0,350.0
4,507168,KHAMMAM,TELANGANA,17.596000,81.056700,0,97.279182,Rajahmundry Airport (RJA/VORY),33.803966,Panduranga Pura,...,36654.0,228.09,160.699724,18600.0,18054.0,15020.0,35102892.0,15.683696,771.0,350.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17231,713217,PASCHIM BARDHAMAN,WEST BENGAL,23.572935,87.233220,0,2.536097,Raniganj AFS,2.591735,PINJRAPOL,...,5939.0,5.93,1001.517707,3112.0,2827.0,17009.0,98428148.0,1.026296,13844.0,69922.0
17232,713154,PURBA BARDHAMAN,WEST BENGAL,23.200172,88.083356,0,42.227284,Guskhara Airfield,2.633228,NIMO,...,10872.0,20.58,528.279883,5514.0,5358.0,17009.0,98428148.0,1.878750,13844.0,69922.0
17233,712222,HOOGHLY,WEST BENGAL,22.792167,88.334472,1,2.776867,Barrackpore AFS (VEPI),0.446098,BAIDYABATI,...,25902.0,15.56,1664.652956,13401.0,12501.0,17009.0,98428148.0,4.476028,13844.0,69922.0
17234,711113,HOWRAH,WEST BENGAL,22.606200,88.294000,1,11.338644,Behala Airport (VEBA),1.647900,DASHNAGAR,...,25701.0,1.98,12980.303030,13364.0,12337.0,17009.0,98428148.0,4.441294,13844.0,69922.0


In [29]:
# Save the pincode-wise GDP data to the processed_data folder
df.to_csv('processed_data/pincode_lev3.csv', index=False)

# Part 2: Rental

In [30]:
import pandas as pd

# Read the rental CSV from the raw_data folder
rental_df = pd.read_csv('raw_data/rental.csv')

# Print the dataframe
print("Rental DataFrame:")
rental_df

Rental DataFrame:


Unnamed: 0,code,type,sqft,rent,rent_per_sqft
0,A01,Hub,47592.0,665637.00,13.99
1,A01,Hub,62641.0,1103550.00,17.62
2,A02,Branch,3272.0,75200.00,22.98
3,A03,Branch,4300.0,136500.00,31.74
4,A04,Hub,30376.0,713836.00,23.50
...,...,...,...,...,...
452,E08,Branch,5620.0,229336.00,40.81
453,Q34,Branch,500.0,14333.00,28.67
454,U27,Branch,2400.0,30000.00,12.50
455,Q10,Branch,150.0,5775.00,38.50


In [31]:
# Add a new column 'type_encode' to rental_df: 'Hub' -> 0, 'Branch' -> 1, and place it after 'type'
rental_df['type_encode'] = rental_df['type'].map({'Hub': 0, 'Branch': 1})
# Move 'type_encode' to be immediately after 'type'
cols = list(rental_df.columns)
if 'type' in cols and 'type_encode' in cols:
    type_idx = cols.index('type')
    cols.insert(type_idx + 1, cols.pop(cols.index('type_encode')))
    rental_df = rental_df[cols]

# Optionally, print the head of the updated DataFrame
print("Rental DataFrame with type_encode after type:")
rental_df

Rental DataFrame with type_encode after type:


Unnamed: 0,code,type,type_encode,sqft,rent,rent_per_sqft
0,A01,Hub,0,47592.0,665637.00,13.99
1,A01,Hub,0,62641.0,1103550.00,17.62
2,A02,Branch,1,3272.0,75200.00,22.98
3,A03,Branch,1,4300.0,136500.00,31.74
4,A04,Hub,0,30376.0,713836.00,23.50
...,...,...,...,...,...,...
452,E08,Branch,1,5620.0,229336.00,40.81
453,Q34,Branch,1,500.0,14333.00,28.67
454,U27,Branch,1,2400.0,30000.00,12.50
455,Q10,Branch,1,150.0,5775.00,38.50


In [32]:
# Read the DTDC pincode mapping CSV
pincode_df = pd.read_csv('raw_data/dtdc_pincode_mapping.csv')

# Merge rental_df with pincode_df on the 'code' column to get the pincode
rental_df = rental_df.merge(pincode_df[['office_code', 'pincode']], left_on='code', right_on='office_code', how='left')

# Ensure the pincode column is of integer type (not float)
rental_df['pincode'] = rental_df['pincode'].astype('Int64')  # Use 'Int64' to allow for NA values if any

# Optionally, drop the 'office_code' column if not needed
rental_df = rental_df.drop(columns=['office_code'])

# Reorder columns to make 'pincode' the first column
cols = list(rental_df.columns)
if 'pincode' in cols:
    cols.insert(0, cols.pop(cols.index('pincode')))
    rental_df = rental_df[cols]

# Show the updated dataframe
print("Rental DataFrame with Pincode:")
rental_df

Rental DataFrame with Pincode:


Unnamed: 0,pincode,code,type,type_encode,sqft,rent,rent_per_sqft
0,382330,A01,Hub,0,47592.0,665637.00,13.99
1,382330,A01,Hub,0,62641.0,1103550.00,17.62
2,390016,A02,Branch,1,3272.0,75200.00,22.98
3,360002,A03,Branch,1,4300.0,136500.00,31.74
4,394325,A04,Hub,0,30376.0,713836.00,23.50
...,...,...,...,...,...,...,...
452,625010,E08,Branch,1,5620.0,229336.00,40.81
453,123501,Q34,Branch,1,500.0,14333.00,28.67
454,244713,U27,Branch,1,2400.0,30000.00,12.50
455,306401,Q10,Branch,1,150.0,5775.00,38.50


In [33]:
# For duplicate pincodes with the same type, take the mean rent_per_sqft
# Also, print their code and type. Check if pincode/type_encode always matches code/type.

# Save the original code/type for each (pincode, type_encode) before grouping
code_type_lookup = rental_df.groupby(['pincode', 'type_encode'])[['code', 'type']].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else x.iloc[0]).reset_index()

# Group by pincode and type_encode, take mean rent_per_sqft
rental_df_grouped = rental_df.groupby(['pincode', 'type_encode'], as_index=False)['rent_per_sqft'].mean()

# Merge back code and type for display
rental_df = pd.merge(rental_df_grouped, code_type_lookup, on=['pincode', 'type_encode'], how='left')

# Reorder columns to have code and type before type_encode
cols = list(rental_df.columns)
for col in ['code', 'type']:
    if col in cols:
        cols.insert(cols.index('type_encode'), cols.pop(cols.index(col)))
rental_df = rental_df[cols]

# Check for mismatches: for each row, does type_encode match type ('Hub'->0, 'Branch'->1), and does code match original code for that pincode/type_encode?
type_map = {'Hub': 0, 'Branch': 1}
mismatch_mask = rental_df.apply(lambda row: type_map.get(row['type'], -1) != row['type_encode'], axis=1)
num_mismatches = mismatch_mask.sum()

if num_mismatches > 0:
    print(f"\nNumber of rows where type_encode does NOT match type: {num_mismatches}")
    print(rental_df[mismatch_mask][['pincode', 'type_encode', 'type', 'code']])
else:
    print("\nAll rows have matching type_encode and type.")

rental_df


All rows have matching type_encode and type.


Unnamed: 0,pincode,code,type,type_encode,rent_per_sqft
0,110002,N33,Branch,1,83.130
1,110017,S01,Branch,1,149.410
2,110020,S13,Branch,1,40.482
3,110025,S17,Branch,1,98.290
4,110028,N03,Branch,1,72.930
...,...,...,...,...,...
376,834010,T02,Hub,0,21.060
377,842002,T05,Branch,1,29.420
378,845401,T36,Branch,1,25.890
379,846004,T26,Branch,1,32.870


In [34]:
# Find and print rows with the same pincode but different type
# This means: for each pincode, if there is more than one type (0 and 1), print those rows

# Find pincodes that have both types (0 and 1)
pincode_type_counts = rental_df.groupby('pincode')['type_encode'].nunique()
pincodes_with_both_types = pincode_type_counts[pincode_type_counts > 1].index

# Filter rental_df for these pincodes
rows_with_both_types = rental_df[rental_df['pincode'].isin(pincodes_with_both_types)]

print("Rows with same pincode but different type:")
print(rows_with_both_types)

print(f"\nNumber of such rows: {len(rows_with_both_types)}")

Rows with same pincode but different type:
     pincode  code    type  type_encode  rent_per_sqft
5     110037   N05     Hub            0         36.400
6     110037   S03  Branch            1         95.195
17    122001   L04     Hub            0         44.912
18    122001   L14  Branch            1         33.800
71    226002   U43     Hub            0         19.950
72    226002  U139  Branch            1         19.000
104   302020   Q05     Hub            0         19.720
105   302020   Q17  Branch            1         22.900
146   401208   M81     Hub            0         50.000
147   401208   M46  Branch            1         71.700
234   560064   B10     Hub            0         15.620
235   560064   B26  Branch            1         32.080
323   711409   K95     Hub            0         25.470
324   711409   K72  Branch            1         24.260
335   734010   K66     Hub            0         14.930
336   734010  K129  Branch            1         15.440

Number of such rows: 

In [35]:
rental_df.to_csv('processed_data/rental.csv', index=False)

## Part 3: Final Data

In [36]:
import pandas as pd

# Read the rental and data CSV files
rental_df = pd.read_csv('processed_data/rental.csv')
pincode_df = pd.read_csv('processed_data/pincode_lev3.csv')

# Merge: keep only rental rows, append all columns from data.csv
final_df = pd.merge(rental_df, pincode_df, on='pincode', how='left')

# Show the resulting DataFrame
final_df

Unnamed: 0,pincode,code,type,type_encode,rent_per_sqft,district,statename,latitude,longitude,urban,...,pincode_population,area_km2,population_density,male_population,female_population,state_gdp,state_population,pincode_gdp,state_civil_cases,state_criminal_cases
0,110002,N33,Branch,1,83.130,CENTRAL,DELHI,28.644500,77.223139,1.0,...,180479.0,15.16,11904.947230,95754.0,84725.0,11077.0,18941837.0,105.542344,430.0,317.0
1,110017,S01,Branch,1,149.410,SOUTH,DELHI,28.528842,77.222557,1.0,...,236041.0,10.41,22674.447646,126212.0,109829.0,11077.0,18941837.0,138.034456,430.0,317.0
2,110020,S13,Branch,1,40.482,SOUTH,DELHI,28.520000,77.290000,1.0,...,202963.0,8.44,24047.748815,110392.0,92571.0,11077.0,18941837.0,118.690766,430.0,317.0
3,110025,S17,Branch,1,98.290,BUDAUN,UTTAR PRADESH,28.542556,77.293167,1.0,...,269837.0,17.03,15844.803288,145551.0,124286.0,25479.0,221560649.0,31.030677,66062.0,203726.0
4,110028,N03,Branch,1,72.930,NEW DELHI,DELHI,28.632141,77.138852,0.0,...,119198.0,4.80,24832.916667,66506.0,52692.0,11077.0,18941837.0,69.705818,430.0,317.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,834010,T02,Hub,0,21.060,RANCHI,JHARKHAND,23.321460,85.364005,0.0,...,141654.0,450.35,314.542023,71790.0,69864.0,4610.0,36373747.0,17.953194,524.0,477.0
377,842002,T05,Branch,1,29.420,MUZAFFARPUR,BIHAR,26.120888,85.364720,1.0,...,151334.0,21.02,7199.524263,79950.0,71384.0,8544.0,120082503.0,10.767578,17120.0,82115.0
378,845401,T36,Branch,1,25.890,PURBI CHAMPARAN,BIHAR,26.565600,85.137560,1.0,...,277607.0,188.01,1476.554439,147856.0,129751.0,8544.0,120082503.0,19.752038,17120.0,82115.0
379,846004,T26,Branch,1,32.870,DARBHANGA,BIHAR,26.156167,85.892889,1.0,...,47077.0,3.95,11918.227848,24806.0,22271.0,8544.0,120082503.0,3.349579,17120.0,82115.0


In [37]:
# Print rows that have any NaN values in any columns
nan_rows = final_df[final_df.isna().any(axis=1)]
print("Rows with NaN values in any column:")
print(nan_rows)
print(f"\nNumber of rows with NaN values: {len(nan_rows)}")

Rows with NaN values in any column:
     pincode  code    type  type_encode  rent_per_sqft       district  \
21    122050   L24     Hub            0         17.530            NaN   
22    122107   L28  Branch            1         11.030            NaN   
23    123106   L05  Branch            1         12.410            NaN   
24    123501   Q34  Branch            1         17.905            NaN   
34    135001   J50  Branch            1         76.750            NaN   
46    160002   J01  Branch            1         78.750            NaN   
59    201308   N35  Branch            1         26.250            NaN   
65    209726  U141  Branch            1         51.450            NaN   
130   395007   A46  Branch            1         48.000            NaN   
150   403005   B32  Branch            1         36.750            NaN   
151   403601   B51  Branch            1         39.190            NaN   
152   403722   B65  Branch            1         20.300            NaN   
157   411015   

In [38]:
# Remove all rows with any NaN values
final_df = final_df.dropna()

final_df

Unnamed: 0,pincode,code,type,type_encode,rent_per_sqft,district,statename,latitude,longitude,urban,...,pincode_population,area_km2,population_density,male_population,female_population,state_gdp,state_population,pincode_gdp,state_civil_cases,state_criminal_cases
0,110002,N33,Branch,1,83.130,CENTRAL,DELHI,28.644500,77.223139,1.0,...,180479.0,15.16,11904.947230,95754.0,84725.0,11077.0,18941837.0,105.542344,430.0,317.0
1,110017,S01,Branch,1,149.410,SOUTH,DELHI,28.528842,77.222557,1.0,...,236041.0,10.41,22674.447646,126212.0,109829.0,11077.0,18941837.0,138.034456,430.0,317.0
2,110020,S13,Branch,1,40.482,SOUTH,DELHI,28.520000,77.290000,1.0,...,202963.0,8.44,24047.748815,110392.0,92571.0,11077.0,18941837.0,118.690766,430.0,317.0
3,110025,S17,Branch,1,98.290,BUDAUN,UTTAR PRADESH,28.542556,77.293167,1.0,...,269837.0,17.03,15844.803288,145551.0,124286.0,25479.0,221560649.0,31.030677,66062.0,203726.0
4,110028,N03,Branch,1,72.930,NEW DELHI,DELHI,28.632141,77.138852,0.0,...,119198.0,4.80,24832.916667,66506.0,52692.0,11077.0,18941837.0,69.705818,430.0,317.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,834010,T02,Hub,0,21.060,RANCHI,JHARKHAND,23.321460,85.364005,0.0,...,141654.0,450.35,314.542023,71790.0,69864.0,4610.0,36373747.0,17.953194,524.0,477.0
377,842002,T05,Branch,1,29.420,MUZAFFARPUR,BIHAR,26.120888,85.364720,1.0,...,151334.0,21.02,7199.524263,79950.0,71384.0,8544.0,120082503.0,10.767578,17120.0,82115.0
378,845401,T36,Branch,1,25.890,PURBI CHAMPARAN,BIHAR,26.565600,85.137560,1.0,...,277607.0,188.01,1476.554439,147856.0,129751.0,8544.0,120082503.0,19.752038,17120.0,82115.0
379,846004,T26,Branch,1,32.870,DARBHANGA,BIHAR,26.156167,85.892889,1.0,...,47077.0,3.95,11918.227848,24806.0,22271.0,8544.0,120082503.0,3.349579,17120.0,82115.0


In [39]:
final_df.to_csv('data.csv', index=False)