 # Data Processing Pipeline Overview
 
This notebook processes raw data from multiple sources to produce a final, clean, and usable dataset for model building.

The steps include:
- Loading and cleaning pincode, district, and state information
- Merging population and GDP data at the pincode and state levels
- Handling missing values and ensuring data consistency
- Generating the features required for modeling (population density and a population-weighted GDP estimate per pincode)
 
 The final output will be a dataset with one row per pincode, containing all relevant features for downstream machine learning tasks.

## Part 1: Pincode List

In [6]:
import pandas as pd

# Read the pincode directory CSV (one row per directory entry; a pincode can appear many times)
pincode_dir_df = pd.read_csv('raw_data/pincode_directory.csv')

# Select relevant columns: pincode, district, statename
pincode_district_state_df = pincode_dir_df[['pincode', 'district', 'statename']]

# For each pincode, keep a single (district, statename) row.
# A plain keep='first' would retain a row with missing district/statename whenever
# that row happens to come first, and the pincode would then be lost when NaN rows
# are dropped later -- even if a later row for the same pincode is complete.
# So: stably order complete rows ahead of incomplete ones (original order is kept
# within each group), take the first row per pincode, then restore file order.
row_incomplete = pincode_district_state_df[['district', 'statename']].isna().any(axis=1)
first_district_state_per_pincode = (
    pincode_district_state_df
    .assign(_row_incomplete=row_incomplete)
    .sort_values('_row_incomplete', kind='stable')
    .drop_duplicates(subset=['pincode'], keep='first')
    .drop(columns='_row_incomplete')
    .sort_index()
)

# Print debug info
print(f"Total rows in original file: {len(pincode_dir_df)}")
print(f"Unique pincodes in original file: {pincode_dir_df['pincode'].nunique()}")
print(f"\nRows after keeping first (district, statename) per pincode: {len(first_district_state_per_pincode)}")
print(f"Unique pincodes after this operation: {first_district_state_per_pincode['pincode'].nunique()}")

Total rows in original file: 165631
Unique pincodes in original file: 19584

Rows after keeping first (district, statename) per pincode: 19584
Unique pincodes after this operation: 19584


In [7]:
# Build the missing-value masks once; they drive the counts, the preview and the filter
district_missing = first_district_state_per_pincode['district'].isna()
statename_missing = first_district_state_per_pincode['statename'].isna()
either_missing = district_missing | statename_missing

num_nan_district = district_missing.sum()
num_nan_statename = statename_missing.sum()
num_nan_both = int((district_missing & statename_missing).sum())

print(f"Number of pincodes with NaN in 'district': {num_nan_district}")
print(f"Number of pincodes with NaN in 'statename': {num_nan_statename}")
print(f"Number of pincodes with NaN in BOTH 'district' and 'statename': {num_nan_both}")

# Preview the affected rows, if there are any
if either_missing.any():
    print("Rows with NaN in 'district' or 'statename':")
    print(first_district_state_per_pincode.loc[either_missing])

# Keep only the rows where both 'district' and 'statename' are present
first_district_state_per_pincode = first_district_state_per_pincode.loc[~either_missing]

Number of pincodes with NaN in 'district': 129
Number of pincodes with NaN in 'statename': 129
Number of pincodes with NaN in BOTH 'district' and 'statename': 129
Rows with NaN in 'district' or 'statename':
        pincode district statename
380      523261      NaN       NaN
557      533240      NaN       NaN
894      844125      NaN       NaN
960      494446      NaN       NaN
985      494111      NaN       NaN
...         ...      ...       ...
163288   411077      NaN       NaN
163300   411075      NaN       NaN
163380   431025      NaN       NaN
163560   794116      NaN       NaN
165437   736209      NaN       NaN

[129 rows x 3 columns]


In [8]:
# Display the pincode -> (district, statename) table after rows with missing values were dropped
first_district_state_per_pincode

Unnamed: 0,pincode,district,statename
0,507204,KHAMMAM,TELANGANA
2,507169,KHAMMAM,TELANGANA
3,507208,KHAMMAM,TELANGANA
7,507002,KHAMMAM,TELANGANA
9,507168,KHAMMAM,TELANGANA
...,...,...,...
165522,713154,PURBA BARDHAMAN,WEST BENGAL
165564,712222,HOOGHLY,WEST BENGAL
165579,711113,HOWRAH,WEST BENGAL
165594,721303,MEDINIPUR WEST,WEST BENGAL


In [9]:
# Persist the cleaned pincode lookup (without the DataFrame index) into processed_data
first_district_state_per_pincode.to_csv(
    path_or_buf='processed_data/pincode_directory.csv',
    index=False,
)

## Part 2: Pincode-wise Area and Demographic Data

In [14]:
import pandas as pd

# Load the GeoIQ pincode-level table
geoiq_csv_path = 'processed_data/geoiq_pincode_data.csv'
geoiq_df = pd.read_csv(geoiq_csv_path)

# Select every record that is missing a value in at least one column
geoiq_with_nan = geoiq_df.loc[geoiq_df.isnull().any(axis='columns')]

print("Rows in GeoIQ CSV with at least one NaN value:")
geoiq_with_nan

Rows in GeoIQ CSV with at least one NaN value:


Unnamed: 0,url,pincode,place_name,population,area_km2,male_population,female_population
21,,110022,,,,,
95,,110099,,,,,
96,,110102,,,,,
97,,110106,,,,,
98,,110110,,,,,
...,...,...,...,...,...,...,...
19335,,851206,,,,,
19371,,852134,,,,,
19402,,854110,,,,,
19453,,900056,,,,,


In [16]:
# Keep only fully populated records, then write them back over the same CSV
# (note: this replaces the file that geoiq_df was loaded from)
geoiq_df = geoiq_df.loc[geoiq_df.notna().all(axis='columns')]
geoiq_df.to_csv(path_or_buf='processed_data/geoiq_pincode_data.csv', index=False)
geoiq_df

Unnamed: 0,url,pincode,place_name,population,area_km2,male_population,female_population
0,https://geoiq.io/places/110001---Sansad-Marg/C...,110001,110001 - Sansad Marg,250430.0,18.74,135652.0,114778.0
1,https://geoiq.io/places/110002---Indraprastha/...,110002,110002 - Indraprastha,180479.0,15.16,95754.0,84725.0
2,https://geoiq.io/places/110003---Lodi-Road/vkn...,110003,110003 - Lodi Road,187073.0,17.29,100220.0,86853.0
3,https://geoiq.io/places/110004---Rashtrapati-B...,110004,110004 - Rashtrapati Bhawan,8846.0,1.66,4851.0,3995.0
4,https://geoiq.io/places/110005---Karol-Bagh/rS...,110005,110005 - Karol Bagh,251105.0,8.61,133452.0,117653.0
...,...,...,...,...,...,...,...
19448,https://geoiq.io/places/855113---Salmari/nup9w...,855113,855113 - Salmari,385307.0,336.60,200649.0,184658.0
19449,https://geoiq.io/places/855114---Sonali/zubD8M...,855114,855114 - Sonali,256769.0,301.98,133370.0,123399.0
19450,https://geoiq.io/places/855115---Sontha/fOiVVq...,855115,855115 - Sontha,502071.0,573.34,260168.0,241903.0
19451,https://geoiq.io/places/855116---Thakurganj/at...,855116,855116 - Thakurganj,221842.0,293.80,114131.0,107711.0


## Part 3: Pincode-wise GDP Data

In [11]:
import pandas as pd

# Load the state-level GDP table and keep only the states that report a GDP value
state_gdp_df = (
    pd.read_csv('raw_data/state_wise_gdp.csv')
    .loc[lambda gdp_table: gdp_table['gdp'].notna()]
)
state_gdp_df

Unnamed: 0,state,gdp
0,MAHARASHTRA,40443.0
1,TAMIL NADU,27216.0
2,UTTAR PRADESH,25479.0
3,KARNATAKA,25007.0
4,GUJARAT,24258.0
5,WEST BENGAL,17009.0
6,RAJASTHAN,15284.0
7,TELANGANA,15020.0
8,ANDHRA PRADESH,14397.0
9,MADHYA PRADESH,13633.0


In [12]:
# Load the two processed inputs: pincode -> state lookup and pincode -> population
pincode_dir_df = pd.read_csv('processed_data/pincode_directory.csv')
geoiq_df = pd.read_csv('processed_data/geoiq_pincode_data.csv')

# Inner-join on pincode so only pincodes present in both sources survive;
# the result carries pincode, statename and population
pincode_state_pop = pincode_dir_df.loc[:, ['pincode', 'statename']].merge(
    geoiq_df.loc[:, ['pincode', 'population']],
    how='inner',
    on='pincode',
)

pincode_state_pop

Unnamed: 0,pincode,statename,population
0,507204,TELANGANA,26076.0
1,507169,TELANGANA,23567.0
2,507208,TELANGANA,54915.0
3,507002,TELANGANA,209801.0
4,507168,TELANGANA,36654.0
...,...,...,...
18959,713217,WEST BENGAL,5939.0
18960,713154,WEST BENGAL,10872.0
18961,712222,WEST BENGAL,25902.0
18962,711113,WEST BENGAL,25701.0


In [13]:
# Attach each state's GDP to its pincodes (inner join: pincodes whose statename has
# no match in the GDP table are dropped), then discard the redundant 'state' key
pincode_state_pop = (
    pincode_state_pop
    .merge(
        state_gdp_df.loc[:, ['state', 'gdp']],
        how='inner',
        left_on='statename',
        right_on='state',
    )
    .drop(columns='state')
)

pincode_state_pop

Unnamed: 0,pincode,statename,population,gdp
0,507204,TELANGANA,26076.0,15020.0
1,507169,TELANGANA,23567.0,15020.0
2,507208,TELANGANA,54915.0,15020.0
3,507002,TELANGANA,209801.0,15020.0
4,507168,TELANGANA,36654.0,15020.0
...,...,...,...,...
18618,713217,WEST BENGAL,5939.0,17009.0
18619,713154,WEST BENGAL,10872.0,17009.0
18620,712222,WEST BENGAL,25902.0,17009.0
18621,711113,WEST BENGAL,25701.0,17009.0


In [14]:
# Sum the pincode populations within each state
state_total_pop = (
    pincode_state_pop
    .groupby('statename')
    .agg(state_total_population=('population', 'sum'))
    .reset_index()
)

# Broadcast each state's total back onto its pincode rows
pincode_state_pop = pincode_state_pop.merge(state_total_pop, how='left', on='statename')

pincode_state_pop

Unnamed: 0,pincode,statename,population,gdp,state_total_population
0,507204,TELANGANA,26076.0,15020.0,38624735.0
1,507169,TELANGANA,23567.0,15020.0,38624735.0
2,507208,TELANGANA,54915.0,15020.0,38624735.0
3,507002,TELANGANA,209801.0,15020.0,38624735.0
4,507168,TELANGANA,36654.0,15020.0,38624735.0
...,...,...,...,...,...
18618,713217,WEST BENGAL,5939.0,17009.0,99145679.0
18619,713154,WEST BENGAL,10872.0,17009.0,99145679.0
18620,712222,WEST BENGAL,25902.0,17009.0,99145679.0
18621,711113,WEST BENGAL,25701.0,17009.0,99145679.0


In [15]:
# Apportion state GDP to each pincode by its share of the state's total population:
# pincode_gdp = (population / state_total_population) * gdp
pincode_state_pop['pincode_gdp'] = (
    pincode_state_pop['population']
    .div(pincode_state_pop['state_total_population'])
    .mul(pincode_state_pop['gdp'])
)

# Keep just the key and the derived value as an independent frame
pincode_gdp_df = pincode_state_pop.loc[:, ['pincode', 'pincode_gdp']].copy()

# Write the pincode-level GDP estimates to disk
pincode_gdp_df.to_csv(path_or_buf='processed_data/pincode_gdp.csv', index=False)

# Display the result
pincode_gdp_df

Unnamed: 0,pincode,pincode_gdp
0,507204,10.140174
1,507169,9.164499
2,507208,21.354795
3,507002,81.585311
4,507168,14.253640
...,...,...
18618,713217,1.018869
18619,713154,1.865153
18620,712222,4.443634
18621,711113,4.409151


In [None]:
# Write the pincode-wise GDP table (without the index) to the processed_data folder
pincode_gdp_df.to_csv(
    path_or_buf='processed_data/pincode_gdp.csv',
    index=False,
)

## Part 4: Final Clean Data

In [19]:
import pandas as pd

# Load the cleaned GeoIQ table and derive population density (population / area_km2)
geoiq_df = pd.read_csv('processed_data/geoiq_pincode_data.csv')
geoiq_df['population_density'] = geoiq_df['population'].div(geoiq_df['area_km2'])

# Load the pincode-level GDP estimates
pincode_gdp_df = pd.read_csv('processed_data/pincode_gdp.csv')

# Combine both features; the inner join keeps only pincodes present in both tables
final_df = geoiq_df.loc[:, ['pincode', 'population_density']].merge(
    pincode_gdp_df,
    how='inner',
    on='pincode',
)

# Write the final dataset to 'data.csv'
final_df.to_csv(path_or_buf='data.csv', index=False)

# Display the result
final_df

Unnamed: 0,pincode,population_density,pincode_gdp
0,110001,13363.393810,146.449001
1,110002,11904.947230,105.542344
2,110003,10819.722383,109.398451
3,110004,5328.915663,5.173054
4,110005,29164.343786,146.843735
...,...,...,...
18618,855113,1144.702911,26.810854
18619,855114,850.284787,17.866782
18620,855115,875.695050,34.935655
18621,855116,755.078285,15.436453
