Creating the GeoID file


In [None]:
# Notebook 1: Generate crimes_with_geoids.csv

# Purpose: Loads raw crime data with coordinates and census tract centroids,
#          performs a spatial join to match each crime to the nearest census tract,
#          and saves the result (including crime details and the matched geoid)
#          to './data/crimes_with_geoids.csv'.

import pandas as pd
import geopandas as gpd
import json
import warnings
import os # For creating directory and joining paths

print("--- Starting Notebook 1: Generating crimes_with_geoids.csv ---")

# --- Configuration ---
# Single directory that holds the inputs and receives the output.
DATA_DIR = "./data"

# Paths inside DATA_DIR, in order:
#   RAW_CRIME_JSON_PATH   - raw crime records (JSON document or JSON Lines)
#   CENTROIDS_CSV_PATH    - census tract centroids (read as CSV)
#   OUTPUT_GEOID_CSV_PATH - target CSV written at the end of the run
RAW_CRIME_JSON_PATH, CENTROIDS_CSV_PATH, OUTPUT_GEOID_CSV_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "crime_data.json",
        "CA_tract_centroids_2020.csv.txt",
        "crimes_with_geoids.csv",
    )
)

# Alternate crime file name, relative to the working directory, tried when
# RAW_CRIME_JSON_PATH does not exist.
ORIGINAL_CRIME_JSON_PATH = "data_notebook-notebook-1_dataset2.json"

# Create the directory up front so the output location is known to exist.
os.makedirs(DATA_DIR, exist_ok=True)
print(f"Ensured data directory '{DATA_DIR}' exists.")

# --- STEP 0: Load Data ---
# Produces `crime_df` (one row per raw crime record) and `centroids_df`
# (one row per centroid record). Every failure path prints a message and then
# raises SystemExit(1): `exit()` is a helper meant for interactive shells, and
# it would end the run with a success status even though loading failed.
print("Loading input data...")
try:
    # Prefer the configured crime file; fall back to the original file name.
    actual_crime_path = RAW_CRIME_JSON_PATH
    if not os.path.exists(actual_crime_path):
        print(f"Warning: '{actual_crime_path}' not found. Trying fallback '{ORIGINAL_CRIME_JSON_PATH}'...")
        if not os.path.exists(ORIGINAL_CRIME_JSON_PATH):
            raise FileNotFoundError(f"Cannot find raw crime JSON file at '{RAW_CRIME_JSON_PATH}' or '{ORIGINAL_CRIME_JSON_PATH}'")
        actual_crime_path = ORIGINAL_CRIME_JSON_PATH

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(actual_crime_path, 'r', encoding='utf-8') as f:
        try:
            # First attempt: the whole file is a single JSON document.
            crime_data_list = json.load(f)
            print(f"Loaded JSON data from {actual_crime_path}")
        except json.JSONDecodeError:
            # Second attempt: one JSON object per non-blank line (JSON Lines).
            print(f"Reading {actual_crime_path} as JSON Lines format.")
            f.seek(0)
            crime_data_list = [json.loads(line.strip()) for line in f if line.strip()]
            print(f"Loaded JSON Lines data from {actual_crime_path}")

    crime_df = pd.DataFrame(crime_data_list)
    print(f"Loaded {len(crime_df)} crime records.")

    # 'dr_no' is the record identifier the later merge depends on.
    if 'dr_no' not in crime_df.columns:
        print("ERROR: 'dr_no' column is missing from the raw crime data. This ID is required for merging later.")
        raise SystemExit(1)

    # Load centroids data
    if not os.path.exists(CENTROIDS_CSV_PATH):
        raise FileNotFoundError(f"Cannot find centroids file at '{CENTROIDS_CSV_PATH}'")
    centroids_df = pd.read_csv(CENTROIDS_CSV_PATH)
    print(f"Loaded {len(centroids_df)} centroid records from {CENTROIDS_CSV_PATH}.")

except FileNotFoundError as e:
    print(f"Error loading file: {e}. Make sure input files exist at the specified paths.")
    raise SystemExit(1)
except json.JSONDecodeError as e:
    # Reached when the JSON Lines fallback also fails to parse.
    print(f"Error decoding JSON file: {e}. Check the JSON format.")
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report anything unexpected and stop the run.
    print(f"An error occurred during data loading: {e}")
    raise SystemExit(1)

# --- STEP 1: Prepare Centroids Data ---
# Adds a 'geoid' column to `centroids_df` and produces `centroids_prep_df`
# holding only geoid, LATITUDE and LONGITUDE. Failures print a message and
# raise SystemExit(1); `exit()` is an interactive-shell helper and would end
# the run with a success status.
print("Preparing centroids data...")
try:
    # 11-character GEOID = state (2) + county (3) + tract (6), each left-padded
    # with zeros so codes that were parsed as integers regain their leading zeros.
    centroids_df['geoid'] = (
        centroids_df['STATEFP'].astype(str).str.zfill(2) +
        centroids_df['COUNTYFP'].astype(str).str.zfill(3) +
        centroids_df['TRACTCE'].astype(str).str.zfill(6)
    )
    # Copy so later changes do not write through to centroids_df.
    centroids_prep_df = centroids_df[['geoid', 'LATITUDE', 'LONGITUDE']].copy()
    print("Generated GEOIDs for centroids.")
except KeyError as e:
    # One of the expected columns is absent; show what the file actually has.
    print(f"Error preparing centroids: Missing expected column - {e}")
    print(f"Available columns in centroids file: {centroids_df.columns.tolist()}")
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report anything unexpected and stop the run.
    print(f"An error occurred during centroid preparation: {e}")
    raise SystemExit(1)

# --- STEP 2: Prepare Crime Data (Coordinates) ---
# Adds cleaned numeric coordinate columns ('lat_numeric', 'lon_numeric') to
# `crime_df` and drops rows whose coordinates cannot be parsed. Failures print
# a message and raise SystemExit(1); `exit()` is an interactive-shell helper
# and would end the run with a success status.
print("Preparing crime data coordinates...")

# Pick the first matching column name for each coordinate; the short name wins
# when both spellings are present.
lat_col = next((name for name in ('lat', 'latitude') if name in crime_df.columns), None)
lon_col = next((name for name in ('lon', 'longitude') if name in crime_df.columns), None)

if not lat_col or not lon_col:
    print("Error: Crime data is missing latitude (checked 'lat', 'latitude') or longitude (checked 'lon', 'longitude') columns.")
    print(f"Available columns: {crime_df.columns.tolist()}")
    raise SystemExit(1)
print(f"Using '{lat_col}' for latitude and '{lon_col}' for longitude.")

# Strip embedded double quotes, then convert to numbers. regex=False makes the
# replacement a literal one regardless of the installed pandas version.
crime_df['lat_str'] = crime_df[lat_col].astype(str).str.replace('"', '', regex=False)
crime_df['lon_str'] = crime_df[lon_col].astype(str).str.replace('"', '', regex=False)
# errors='coerce' turns unparseable values into NaN so they can be dropped below.
crime_df['lat_numeric'] = pd.to_numeric(crime_df['lat_str'], errors='coerce')
crime_df['lon_numeric'] = pd.to_numeric(crime_df['lon_str'], errors='coerce')

# NOTE(review): rows with coordinates (0, 0) parse as valid numbers and are kept;
# if the source uses (0, 0) as a "location unknown" marker they will be matched
# to a distant centroid - confirm whether they should be filtered here.
original_crime_count = len(crime_df)
crime_df.dropna(subset=['lat_numeric', 'lon_numeric'], inplace=True)
dropped_count = original_crime_count - len(crime_df)
if dropped_count > 0:
    print(f"Dropped {dropped_count} crime records due to invalid/missing coordinates.")

if crime_df.empty:
    print("Error: No valid crime records remaining after cleaning coordinates. Cannot proceed.")
    raise SystemExit(1)
print("Cleaned crime coordinates.")

# --- STEP 3: Convert to GeoDataFrames (Initial CRS) ---
# Produces `crime_gdf` and `centroids_gdf`, each with a point geometry built
# from longitude (x) and latitude (y) and tagged as EPSG:4326. A failure prints
# a message and raises SystemExit(1); `exit()` is an interactive-shell helper
# and would end the run with a success status.
print("Converting to GeoDataFrames with initial CRS (EPSG:4326)...")
try:
    crime_gdf = gpd.GeoDataFrame(
        crime_df,
        geometry=gpd.points_from_xy(crime_df.lon_numeric, crime_df.lat_numeric),
        crs="EPSG:4326"  # WGS84 - standard for lat/lon
    )
    print("Created crime GeoDataFrame.")

    centroids_gdf = gpd.GeoDataFrame(
        centroids_prep_df,
        geometry=gpd.points_from_xy(centroids_prep_df.LONGITUDE, centroids_prep_df.LATITUDE),
        crs="EPSG:4326"  # WGS84
    )
    print("Created centroids GeoDataFrame.")
except Exception as e:
    # Top-level boundary: report the problem and stop the run.
    print(f"Error creating GeoDataFrames: {e}")
    raise SystemExit(1)

# --- STEP 4: Reproject to a Suitable Projected CRS ---
# Produces `crime_gdf_proj` and `centroids_gdf_proj` in EPSG:3310
# (NAD83 / California Albers) so the nearest-neighbour search works on planar
# distances rather than on degrees. A failure prints a message and raises
# SystemExit(1); `exit()` is an interactive-shell helper and would end the run
# with a success status.
projected_crs = "EPSG:3310"
print(f"Reprojecting GeoDataFrames to {projected_crs} for accurate distance calculation...")
try:
    crime_gdf_proj = crime_gdf.to_crs(projected_crs)
    centroids_gdf_proj = centroids_gdf.to_crs(projected_crs)
    print("Reprojection complete.")
except Exception as e:
    # Top-level boundary: report the problem and stop the run.
    print(f"Error during reprojection: {e}")
    raise SystemExit(1)

# --- STEP 5: Perform Spatial Join (Nearest Neighbor) on Projected Data ---
# Produces `crimes_with_geoids_final`: exactly one row per crime record, with
# the nearest centroid's 'geoid' and the distance to it in 'distance_meters'.
# A failure prints a message and raises SystemExit(1); `exit()` is an
# interactive-shell helper and would end the run with a success status.
print("Performing spatial join (finding nearest centroid for each crime using projected data)...")
try:
    # Carry 'geoid' through the join itself. Re-attaching it afterwards by
    # position only works while the centroid index is exactly 0..n-1, and
    # silently assigns the wrong geoid otherwise.
    centroids_join_data = centroids_gdf_proj[['geoid', 'geometry']]

    # Perform the nearest neighbor join on the *projected* data
    crimes_joined_proj = gpd.sjoin_nearest(
        crime_gdf_proj,
        centroids_join_data,
        how='left',
        distance_col="distance_meters"  # Distance in the projected CRS's units
    )
    print("sjoin_nearest completed.")

    # sjoin_nearest emits one row per equidistant nearest neighbour, so a crime
    # tied between several centroids appears several times under the same index
    # label. Keep the first match so each crime (and its dr_no) occurs once.
    # This relies on the crime index being unique, which holds for a frame
    # built from a list of records and then filtered.
    tie_rows = crimes_joined_proj.index.duplicated(keep='first')
    tie_count = int(tie_rows.sum())
    if tie_count > 0:
        print(f"Dropped {tie_count} duplicate rows caused by equidistant nearest centroids.")
    crimes_with_geoids_final = crimes_joined_proj[~tie_rows]

    print(f"Spatial join and geoid merge complete. Result has {len(crimes_with_geoids_final)} records.")

except Exception as e:
    print(f"Error during spatial join or merging geoid: {e}")
    # Provide more context if the join itself succeeded before the failure.
    if 'crimes_joined_proj' in locals():
        print(f"Columns in crimes_joined_proj before final merge: {crimes_joined_proj.columns.tolist()}")
    raise SystemExit(1)

# --- STEP 6: Clean Up and Save Output ---
# Writes the original crime columns plus 'geoid' and 'distance_meters' to
# OUTPUT_GEOID_CSV_PATH, leaving out geometry and the intermediate coordinate
# columns. Stops with SystemExit(1) if a required column is missing or the
# file cannot be written, so a failed run is never reported as finished.
print("Cleaning up columns and saving results...")

# The output is unusable without the crime identifier and the matched geoid;
# fail with a clear message instead of a bare KeyError further down.
missing_required = [col for col in ('dr_no', 'geoid') if col not in crimes_with_geoids_final.columns]
if missing_required:
    print(f"Error: required column(s) {missing_required} are missing from the joined data. Cannot write output.")
    print(f"Available columns: {crimes_with_geoids_final.columns.tolist()}")
    raise SystemExit(1)

# Original crime columns, minus the helper columns created while cleaning
# coordinates and the geometry column.
original_cols = list(crime_df.columns)
cols_to_remove_from_original = ['lat_str', 'lon_str', 'lat_numeric', 'lon_numeric', 'geometry']
final_original_cols = [col for col in original_cols if col not in cols_to_remove_from_original]

# Columns produced by the spatial join.
added_cols_to_keep = ['geoid', 'distance_meters']

# dict.fromkeys removes duplicate names while preserving first-seen order;
# names absent from the joined frame are skipped.
cols_for_output = final_original_cols + added_cols_to_keep
final_output_columns = [
    col for col in dict.fromkeys(cols_for_output)
    if col in crimes_with_geoids_final.columns
]

# Create the final dataframe with selected columns
output_df = crimes_with_geoids_final[final_output_columns].copy()

# Report crimes that ended up without a matched geoid.
null_geoids = output_df['geoid'].isnull().sum()
if null_geoids > 0:
    print(f"Warning: {null_geoids} records have a null 'geoid' after the join/merge process.")

# Save the result to the CSV file
try:
    output_df.to_csv(OUTPUT_GEOID_CSV_PATH, index=False)
except Exception as e:
    # Writing the file is the whole purpose of this script, so a failure here
    # must end the run with a failing status.
    print(f"Error saving the output file '{OUTPUT_GEOID_CSV_PATH}': {e}")
    raise SystemExit(1)
else:
    print(f"✅ Successfully saved {len(output_df)} records to {OUTPUT_GEOID_CSV_PATH}")
    # Display first few rows of the output
    print("\nFirst 5 rows of the generated file:")
    print(output_df.head())
    print("\nColumns in the generated file:")
    print(output_df.columns.tolist())

print("\n--- Finished Notebook 1 ---")