In [5]:
import pandas as pd
import geopandas as gpd

# --- 1. Load Census Tract Shapefile ---
print("Loading Census Tract shapefile...")
# Read the tract polygons, keep only the identifier and geometry columns,
# rename the identifier to the key used by the other datasets, and reproject
# to EPSG:4326 so the polygons share the GPS points' coordinate system.
tracts = (
    gpd.read_file(r"D:\DEPI_Project\Datasets\ShapeFiles\cb_2023_us_tract_500k\cb_2023_us_tract_500k.shp")
    [['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'district_fips_id'})
    .to_crs("EPSG:4326")
)
print(f"Loaded {len(tracts)} census tracts.\n")

# --- Helper Function to Assign Tracts ---
def assign_census_tract(df, lat_col, lon_col, name, tracts_gdf=None):
    """Attach the containing census tract ID to every row of a point dataset.

    Each row of ``df`` is turned into a point from its longitude/latitude
    columns and spatially joined (left join, ``within`` predicate) against
    the tract polygons. Rows that fall inside no tract keep a missing
    ``district_fips_id``.

    Args:
        df: DataFrame holding one record per row with coordinate columns.
        lat_col: Name of the latitude column in ``df``.
        lon_col: Name of the longitude column in ``df``.
        name: Human-readable dataset name, used only in progress messages.
        tracts_gdf: Optional GeoDataFrame of tract polygons with a
            ``district_fips_id`` column, expected in EPSG:4326. Defaults to
            the module-level ``tracts``.

    Returns:
        A plain pandas DataFrame with the columns of ``df`` plus
        ``district_fips_id``; the helper ``geometry`` and ``index_right``
        columns are removed.

    Raises:
        KeyError: If ``lat_col`` or ``lon_col`` is not a column of ``df``.
    """
    print(f"Processing {name} dataset...")

    if tracts_gdf is None:
        tracts_gdf = tracts

    # A tract column left over from an earlier run would collide with the one
    # coming from the polygons: sjoin would suffix both and the lookup of
    # 'district_fips_id' below would fail. Recompute it from scratch instead.
    if 'district_fips_id' in df.columns:
        df = df.drop(columns='district_fips_id')

    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs="EPSG:4326"
    )

    # Spatial join: find which tract each point belongs to
    joined = gpd.sjoin(gdf, tracts_gdf, how="left", predicate="within")

    # A left join yields one row per (point, matching tract) pair, so a point
    # covered by more than one polygon would be duplicated. Keep the first
    # match so the output has exactly one row per input record. This relies
    # on the join result carrying the input index, hence the uniqueness check.
    if len(joined) > len(gdf) and gdf.index.is_unique:
        joined = joined[~joined.index.duplicated(keep='first')]

    # Remove geometry (optional, for CSV export)
    joined = pd.DataFrame(joined.drop(columns=['geometry', 'index_right'], errors='ignore'))
    print(f"{name}: Assigned census tracts to {joined['district_fips_id'].notna().sum()} records.\n")

    return joined

# --- 2. Real Estate Dataset ---
# Load the cleaned real-estate CSV, tag every row with its census tract,
# and write the result (without the index) to the "with tracts" folder.
real_estate = pd.read_csv(
    r"D:\DEPI_Project\Datasets\Cleaned\Real Estate\cleaned_real_estate.csv"
)
real_estate_with_tract = assign_census_tract(
    df=real_estate,
    lat_col='Latitude',
    lon_col='Longitude',
    name='Real Estate',
)
real_estate_with_tract.to_csv(
    r"D:\DEPI_Project\Datasets\Cleaned Data with Tracts\Real Estate\real_estate.csv",
    index=False,
)

# --- 3. Crimes Dataset ---
# Load the cleaned crimes CSV, tag every row with its census tract,
# and write the result (without the index) to the "with tracts" folder.
crimes = pd.read_csv(
    r"D:\DEPI_Project\Datasets\Cleaned\Crimes\cleaned_crimes.csv"
)
crimes_with_tract = assign_census_tract(
    df=crimes,
    lat_col='Latitude',
    lon_col='Longitude',
    name='Crimes',
)
crimes_with_tract.to_csv(
    r"D:\DEPI_Project\Datasets\Cleaned Data with Tracts\Crimes\crimes.csv",
    index=False,
)

# --- 4. Congestion Dataset ---
congestion = pd.read_csv(
    r"D:\DEPI_Project\Datasets\Cleaned\Congestion\cleaned_congestion.csv"
)

# Prefer the Start_Lat / Start_Lng columns when present; each coordinate
# falls back independently to the generic Latitude / Longitude name.
if 'Start_Lat' in congestion.columns:
    lat_col = 'Start_Lat'
else:
    lat_col = 'Latitude'

if 'Start_Lng' in congestion.columns:
    lon_col = 'Start_Lng'
else:
    lon_col = 'Longitude'

congestion_with_tract = assign_census_tract(
    df=congestion,
    lat_col=lat_col,
    lon_col=lon_col,
    name='Congestion',
)
congestion_with_tract.to_csv(
    r"D:\DEPI_Project\Datasets\Cleaned Data with Tracts\Congestion\congestion.csv",
    index=False,
)

# --- 5. Census Dataset (already has FIPS; normalise or build the tract ID to match others) ---
def _zero_pad_code(codes, width):
    """Return ``codes`` as zero-padded text of at least ``width`` characters.

    read_csv parses all-digit codes as numbers, which drops leading zeros and
    may add a trailing ".0" when the column was read as float. Missing values
    stay missing instead of turning into the text "nan".
    """
    missing = codes.isna()
    text = (
        codes.astype(str)
        .str.strip()
        .str.replace(r"\.0+$", "", regex=True)  # undo float formatting such as "201.0"
        .str.zfill(width)
    )
    return text.where(~missing)


census = pd.read_csv(r"D:\DEPI_Project\Datasets\Cleaned\Population and Anuual Income\cleaned_population_and_annual_income.csv")
if 'district_fips_id' in census.columns:
    # The ID survives the CSV round trip only as a number; restore the
    # 11-character form (2 state + 3 county + 6 tract digits) so it matches
    # the string IDs written for the other datasets.
    census['district_fips_id'] = _zero_pad_code(census['district_fips_id'], 11)
elif {'state_fips', 'county_fips', 'tract_fips'}.issubset(census.columns):
    census['district_fips_id'] = (
        _zero_pad_code(census['state_fips'], 2)
        + _zero_pad_code(census['county_fips'], 3)
        + _zero_pad_code(census['tract_fips'], 6)
    )
else:
    # Best effort: still save the file, but do not claim it has tract IDs.
    print(
        "Warning: census data has neither 'district_fips_id' nor "
        "state_fips/county_fips/tract_fips columns; saved without a tract ID."
    )
census.to_csv(r"D:\DEPI_Project\Datasets\Cleaned Data with Tracts\Population And Annual Income\population_and_annual_income.csv", index=False)

print("All datasets processed successfully and saved with census tract IDs!")


Loading Census Tract shapefile...
Loaded 85186 census tracts.

Processing Real Estate dataset...
Real Estate: Assigned census tracts to 389344 records.

Processing Crimes dataset...
Crimes: Assigned census tracts to 163296 records.

Processing Congestion dataset...
Congestion: Assigned census tracts to 1044976 records.

All datasets processed successfully and saved with census tract IDs!
