In [1]:
import pandas as pd
import numpy as np

# === Load data ===
df = pd.read_csv("/Users/jennifercasavantes/Downloads/NYPD_Complaint_Data_Historic_20250529.csv", skipinitialspace=True, low_memory=False)

# === Keep only Manhattan ===
df = df[df["PATROL_BORO"].str.upper().str.contains("MAN", na=False)]

# === Select relevant columns ===
df = df[["CMPLNT_FR_DT", "OFNS_DESC", "LAW_CAT_CD", "Latitude", "Longitude"]]
df = df.dropna(subset=["Latitude", "Longitude"])

# === Assign severity weight ===
def assign_severity_weight(law_cat):
    if pd.isna(law_cat): return 0
    law_cat = str(law_cat).upper()
    return {"FELONY": 3, "MISDEMEANOR": 2, "VIOLATION": 1}.get(law_cat, 0)

df["severity_weight"] = df["LAW_CAT_CD"].apply(assign_severity_weight)

# === Rename columns ===
df = df.rename(columns={
    "CMPLNT_FR_DT": "Date",
    "OFNS_DESC": "Crime_Type",
    "LAW_CAT_CD": "Severity"
})

# === Clean and convert dates ===
df["Date"] = pd.to_datetime(df["Date"], errors='coerce')
df = df.dropna(subset=["Date", "Crime_Type", "Severity"])
df = df[df["Date"] >= '2000-01-01']

# === Manhattan coordinate bounds (coarse cleanup) ===
df = df[(df["Latitude"] >= 40.70) & (df["Latitude"] <= 40.88) &
        (df["Longitude"] >= -74.02) & (df["Longitude"] <= -73.93)]

# === Remove duplicates ===
df = df.drop_duplicates(subset=["Date", "Crime_Type", "Severity", "Latitude", "Longitude"])

# === Add year column ===
df["Year"] = df["Date"].dt.year

# === Add recency weighting (linear from 0.04 to 1.0) ===
df["recency_weight"] = (df["Year"] - 1999) / 25  # 2000 = 0.04, 2025 = 1.0

# === Final weighted severity = severity × recency ===
df["weighted_severity"] = df["severity_weight"] * df["recency_weight"]

print("✅ Part 1 complete: Data cleaned and recency-weighted")
print(df[["Year", "Severity", "severity_weight", "recency_weight", "weighted_severity"]].head())


✅ Part 1 complete: Data cleaned and recency-weighted
   Year     Severity  severity_weight  recency_weight  weighted_severity
4  2024       FELONY                3             1.0                3.0
5  2024       FELONY                3             1.0                3.0
6  2024  MISDEMEANOR                2             1.0                2.0
7  2024  MISDEMEANOR                2             1.0                2.0
8  2024  MISDEMEANOR                2             1.0                2.0


In [2]:
import geopandas as gpd
from shapely.geometry import Point

# === Convert crimes to GeoDataFrame ===
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]),
    crs="EPSG:4326"  # standard lat/lon projection
)

# === Load census tract geometries ===
tracts = gpd.read_file("../census tract geofiles/manhattan_census_tracts.geojson").to_crs(epsg=4326)

# === Spatial join: assign GEOID to each crime ===
joined = gpd.sjoin(
    gdf,
    tracts[["GEOID", "geometry"]],
    how="inner",
    predicate="within"
)

print("✅ Part 2 complete: Crimes are now tagged with GEOIDs.")
print(joined[["Date", "Year", "Severity", "weighted_severity", "GEOID"]].head())


ModuleNotFoundError: No module named 'geopandas'

In [3]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (5.3 kB)
Collecting pyproj>=3.5.0 (from geopandas)
  Downloading pyproj-3.7.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.1.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.8 kB)
Downloading geopandas-1.1.0-py3-none-any.whl (338 kB)
Downloading pyogrio-0.11.0-cp311-cp311-macosx_12_0_arm64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pyproj-3.7.1-cp311-cp311-macosx_14_0_arm64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shapely-2.1.1-cp311-cp311-macosx_11_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━

In [4]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [5]:
conda install geopandas

Retrieving notices: done
Channels:
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [6]:
import geopandas as gpd
from shapely.geometry import Point

In [19]:
import geopandas as gpd
from shapely.geometry import Point

# === Convert crimes to GeoDataFrame ===
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]),
    crs="EPSG:4326"  # standard lat/lon projection
)

# === Load census tract geometries ===
tracts = gpd.read_file("/Users/jennifercasavantes/Downloads/manhattan_census_tracts-1.geojson").to_crs(epsg=4326)

# === Spatial join: assign GEOID to each crime ===
joined = gpd.sjoin(
    gdf,
    tracts[["GEOID", "geometry"]],
    how="inner",
    predicate="within"
)

print("✅ Part 2 complete: Crimes are now tagged with GEOIDs.")
print(joined[["Date", "Year", "Severity", "weighted_severity", "GEOID"]].head())


✅ Part 2 complete: Crimes are now tagged with GEOIDs.
        Date  Year     Severity  weighted_severity        GEOID
4 2024-12-01  2024       FELONY                3.0  36061023501
5 2024-12-31  2024       FELONY                3.0  36061021000
6 2024-12-31  2024  MISDEMEANOR                2.0  36061020600
7 2024-12-31  2024  MISDEMEANOR                2.0  36061027900
8 2024-12-31  2024  MISDEMEANOR                2.0  36061024302


In [8]:
"../census tract geofiles/manhattan_census_tracts.geojson"

'../census tract geofiles/manhattan_census_tracts.geojson'

In [17]:
try:
    import geopandas as gpd
    from shapely.geometry import Point

    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]),
        crs="EPSG:4326"
    )

    # Load census tracts
    tracts = gpd.read_file("/Users/jennifercasavantes/Downloads/manhattan_census_tracts-1.geojson").to_crs(epsg=4326)

    # Spatial join
    joined = gpd.sjoin(
        gdf,
        tracts[["GEOID", "geometry"]],
        how="inner",
        predicate="within"
    )

    print("✅ Spatial join succeeded.")
    print(joined[["Date", "Year", "Severity", "weighted_severity", "GEOID"]].head())

except Exception as e:
    print(f"❌ Something went wrong:\n{e}")

✅ Spatial join succeeded.
        Date  Year     Severity  weighted_severity        GEOID
4 2024-12-01  2024       FELONY                3.0  36061023501
5 2024-12-31  2024       FELONY                3.0  36061021000
6 2024-12-31  2024  MISDEMEANOR                2.0  36061020600
7 2024-12-31  2024  MISDEMEANOR                2.0  36061027900
8 2024-12-31  2024  MISDEMEANOR                2.0  36061024302


In [20]:
# === Group by GEOID and Year, sum weighted severity ===
grouped = joined.groupby(["GEOID", "Year"])["weighted_severity"].sum().reset_index()

# === Create full table with all years and tracts ===
years = range(2000, 2025)  # Exclude 2025 — last year with real data is 2024
all_tracts = tracts["GEOID"].unique()
full_index = pd.MultiIndex.from_product([all_tracts, years], names=["GEOID", "Year"])

# === Fill in missing years with 0 ===
grouped_full = grouped.set_index(["GEOID", "Year"]).reindex(full_index, fill_value=0).reset_index()

# === Pivot into wide format (columns: year_2000, ..., year_2024) ===
historical_wide = grouped_full.pivot(index="GEOID", columns="Year", values="weighted_severity").reset_index()

# === Rename year columns ===
historical_wide.columns = ["GEOID"] + [f"year_{col}" for col in historical_wide.columns[1:]]

# === DEBUGGING: Check for missing or uniform values in each year ===
year_cols = [col for col in historical_wide.columns if col.startswith("year_")]

for col in year_cols:
    print(f"--- {col} ---")
    print(historical_wide[col].describe())
    print("Most common values:")
    print(historical_wide[col].value_counts().head(3))
    print("\n")

print("✅ Part 3 complete: Historical scores 2000–2024 in wide format.")
print(historical_wide.head())


--- year_2000 ---
count    310.000000
mean       0.056387
std        0.098742
min        0.000000
25%        0.000000
50%        0.000000
75%        0.080000
max        0.600000
Name: year_2000, dtype: float64
Most common values:
year_2000
0.00    193
0.12     42
0.08     34
Name: count, dtype: int64


--- year_2001 ---
count    310.000000
mean       0.150194
std        0.256775
min        0.000000
25%        0.000000
50%        0.000000
75%        0.240000
max        2.080000
Name: year_2001, dtype: float64
Most common values:
year_2001
0.00    183
0.24     47
0.16     21
Name: count, dtype: int64


--- year_2002 ---
count    310.000000
mean       0.229935
std        0.391670
min        0.000000
25%        0.000000
50%        0.000000
75%        0.360000
max        2.520000
Name: year_2002, dtype: float64
Most common values:
year_2002
0.00    180
0.36     40
0.24     34
Name: count, dtype: int64


--- year_2003 ---
count    310.000000
mean       0.420645
std        0.669132
min       

In [21]:
from sklearn.linear_model import LinearRegression

# === Prepare data for modeling ===
future_years = [2025, 2026, 2027]
future_preds = []

# Loop through each tract and fit a simple linear regression to predict future years
for _, row in historical_wide.iterrows():
    geoid = row["GEOID"]
    
    # X: years 2000 to 2024
    years = np.arange(2000, 2025).reshape(-1, 1)  # up to 2024 inclusive
    
    # y: values from year_2000 to year_2024
    values = row[[f"year_{y}" for y in range(2000, 2025)]].values.reshape(-1, 1)

    # Fit linear regression model
    model = LinearRegression()
    model.fit(years, values)

    # Predict future years
    future_scores = model.predict(np.array(future_years).reshape(-1, 1)).flatten()
    future_preds.append({
        "GEOID": geoid,
        "pred_2025": round(float(future_scores[0]), 2),
        "pred_2026": round(float(future_scores[1]), 2),
        "pred_2027": round(float(future_scores[2]), 2)
    })

# Convert predictions to DataFrame
predicted_df = pd.DataFrame(future_preds)

print("✅ Part 4 complete: Future crime scores for 2025, 2026, and 2027 predicted.")
print(predicted_df.head())


✅ Part 4 complete: Future crime scores for 2025, 2026, and 2027 predicted.
          GEOID  pred_2025  pred_2026  pred_2027
0  3.606100e+10       0.00       0.00       0.00
1  3.606100e+10     446.38     465.57     484.76
2  3.606100e+10     745.04     777.00     808.97
3  3.606100e+10       0.00       0.00       0.00
4  3.606100e+10    1248.74    1302.73    1356.73


In [22]:
# === Merge historical and predicted data ===
full_df = historical_wide.merge(predicted_df, on="GEOID", how="left")

# === Normalize all year columns to 1–10 safety score ===
from sklearn.preprocessing import MinMaxScaler

# Identify year and prediction columns
score_cols = [col for col in full_df.columns if col.startswith("year_") or col.startswith("pred_")]

# Optional cleanup: remove any columns with all identical values (e.g., all 0s)
score_cols = [col for col in score_cols if full_df[col].nunique() > 1]

# Normalize (flip so higher = safer)
scaler = MinMaxScaler(feature_range=(1, 10))
norm_df = full_df.copy()
norm_scores = scaler.fit_transform(norm_df[score_cols])
norm_scores = 11 - norm_scores  # flip crime → safety score

# Update normalized columns
norm_df[score_cols] = np.round(norm_scores, 1)

# === Save to CSV ===
norm_df.to_csv("full_crime_score_by_tract_2000_2027.csv", index=False)
print("✅ Part 5 complete: File saved as 'full_crime_score_by_tract_2000_2027.csv'")
print(norm_df.head())


✅ Part 5 complete: File saved as 'full_crime_score_by_tract_2000_2027.csv'
         GEOID  year_2000  year_2001  year_2002  year_2003  year_2004  \
0  36061000100       10.0       10.0       10.0       10.0       10.0   
1  36061000201       10.0       10.0        8.7        9.0       10.0   
2  36061000202        8.8       10.0       10.0       10.0        8.1   
3  36061000500       10.0       10.0       10.0       10.0       10.0   
4  36061000600        2.2        9.0       10.0        8.4        6.1   

   year_2005  year_2006  year_2007  year_2008  ...  year_2018  year_2019  \
0       10.0       10.0       10.0       10.0  ...       10.0       10.0   
1        9.2        9.2        9.1        9.3  ...        9.1        8.9   
2        7.4        8.6        8.4        8.3  ...        8.5        8.4   
3       10.0       10.0       10.0       10.0  ...       10.0       10.0   
4        7.9        7.8        7.7        7.7  ...        7.4        6.5   

   year_2020  year_2021  year