## Part 1: Pincode Data

In [1]:
from pathlib import Path

import pandas as pd

# Input/output locations, relative to the notebook's working directory.
RAW_PINCODE_CSV = Path('raw_data/pincode_directory.csv')
PROCESSED_PINCODE_CSV = Path('processed_data/pincode_directory.csv')

# Read the full pincode directory CSV (the file has many rows per pincode).
pincode_dir_df = pd.read_csv(RAW_PINCODE_CSV)

# Keep only the columns needed for the pincode -> (district, statename) lookup.
pincode_district_state_df = pincode_dir_df[['pincode', 'district', 'statename']]

# Collapse to exactly one row per pincode. keep='first' means that if a pincode
# appears with more than one (district, statename) pair, the pair from the row
# that comes first in the file wins and the others are silently dropped.
# The original row index is retained here; it is discarded on save (index=False).
first_district_state_per_pincode = pincode_district_state_df.drop_duplicates(
    subset=['pincode'], keep='first'
)

# Computed once and reused in the report lines below.
num_distinct_pincodes = first_district_state_per_pincode['pincode'].nunique()

# Sanity report: row/pincode counts before and after de-duplication.
print(f"Total rows in original file: {len(pincode_dir_df)}")
print(f"Unique pincodes in original file: {pincode_dir_df['pincode'].nunique()}")
print(f"\nRows after keeping first (district, statename) per pincode: {len(first_district_state_per_pincode)}")
print(f"Unique pincodes after this operation: {num_distinct_pincodes}")

# Number of distinct pincodes followed by a (truncated) view of the result.
print(f"\nNumber of distinct pincodes: {num_distinct_pincodes}")
print(first_district_state_per_pincode)

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
PROCESSED_PINCODE_CSV.parent.mkdir(parents=True, exist_ok=True)

# Save the one-row-per-pincode lookup to the processed_data folder.
first_district_state_per_pincode.to_csv(PROCESSED_PINCODE_CSV, index=False)

Total rows in original file: 165631
Unique pincodes in original file: 19584

Rows after keeping first (district, statename) per pincode: 19584
Unique pincodes after this operation: 19584

Number of distinct pincodes: 19584
        pincode         district    statename
0        507204          KHAMMAM    TELANGANA
2        507169          KHAMMAM    TELANGANA
3        507208          KHAMMAM    TELANGANA
7        507002          KHAMMAM    TELANGANA
9        507168          KHAMMAM    TELANGANA
...         ...              ...          ...
165522   713154  PURBA BARDHAMAN  WEST BENGAL
165564   712222          HOOGHLY  WEST BENGAL
165579   711113           HOWRAH  WEST BENGAL
165594   721303   MEDINIPUR WEST  WEST BENGAL
165629   721638   MEDINIPUR EAST  WEST BENGAL

[19584 rows x 3 columns]
