In [7]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, box

RAW_PATH = "../data/raw/Crime_Data_from_2020_to_Present.csv"  # adjust if needed
PROCESSED_DIR = "../data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [8]:
# Load
df = pd.read_csv(RAW_PATH)

# Peek
print(df.shape)
print(df.columns.tolist()[:30])

# Try to detect lat/lon columns among common names
lat_candidates = ["Latitude", "LAT", "lat", "Y", "Lat"]
lon_candidates = ["Longitude", "LON", "lon", "X", "Lng", "Long"]

def find_col(cands):
    for c in cands:
        if c in df.columns:
            return c
    return None

LAT_COL = find_col(lat_candidates)
LON_COL = find_col(lon_candidates)

if LAT_COL is None or LON_COL is None:
    raise ValueError(f"Couldn't find latitude/longitude columns. Found LAT={LAT_COL}, LON={LON_COL}")

# Try to detect a date column
date_candidates = ["Date", "DATE OCC", "DATE_OCC", "Occurred_Date", "Reported_Date", "date", "DATE"]
DATE_COL = find_col(date_candidates)
if DATE_COL is None:
    # As fallback, show columns so you can pick manually
    raise ValueError("Couldn't detect a date column. Print df.columns and choose one.")

# Standardize columns we’ll use later
df = df.rename(columns={LAT_COL: "Latitude", LON_COL: "Longitude", DATE_COL: "Date"})

(1004991, 28)
['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT', 'LON']


In [9]:
# Drop rows without coordinates
df = df.dropna(subset=["Latitude", "Longitude"]).copy()

# Coerce to numeric just in case
df["Latitude"] = pd.to_numeric(df["Latitude"], errors="coerce")
df["Longitude"] = pd.to_numeric(df["Longitude"], errors="coerce")
df = df.dropna(subset=["Latitude", "Longitude"])

# Parse date; coerce bad rows to NaT then drop
df["Date"] = pd.to_datetime(df["Date"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce", utc=True)
df = df.dropna(subset=["Date"]).copy()

# Keep 2020+ if not already filtered
df = df[df["Date"].dt.year >= 2020].copy()

# Save the cleaned points (optional)
df.to_csv(f"{PROCESSED_DIR}/crime_cleaned.csv", index=False)
print("Saved cleaned points:", f"{PROCESSED_DIR}/crime_cleaned.csv", df.shape)

Saved cleaned points: ../data/processed/crime_cleaned.csv (1004991, 28)


In [10]:
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]),
    crs="EPSG:4326"
)

# Pick a good local UTM automatically (so grid sizes are real meters)
utm_crs = gdf.estimate_utm_crs()
gdf_utm = gdf.to_crs(utm_crs)
utm_crs

<Projected CRS: EPSG:32621>
Name: WGS 84 / UTM zone 21N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 60°W and 54°W, northern hemisphere between equator and 84°N, onshore and offshore. Barbados. Brazil. Canada - Newfoundland and Labrador, Quebec. French Guiana. Greenland. Guyana. St Pierre and Miquelon. Suriname.
- bounds: (-60.0, 0.0, -54.0, 84.0)
Coordinate Operation:
- name: UTM zone 21N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich