Trying more current data from sfgov.org

In [None]:
!pip install sodapy

Collecting sodapy
  Downloading sodapy-2.2.0-py2.py3-none-any.whl.metadata (15 kB)
Downloading sodapy-2.2.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.2.0


Loading the data

In [None]:
from google.colab import userdata
from sodapy import Socrata
import pandas as pd

# Load credentials
DATASFAPI = userdata.get('apptoken')
username = userdata.get('username')  # Optional, needed if app token access is restricted
password = userdata.get('password')

# Initialize client
client = Socrata("data.sfgov.org", DATASFAPI, username=username, password=password)

# Parameters
dataset_id = "wg3w-h783"  # Use the correct dataset ID from your provided URL
chunk_size = 10000
offset = 0
all_results = []

# Paginate through results.
# An explicit sort order is requested because limit/offset paging is only
# reliable when every page is cut from the same ordering; without it pages
# may overlap or skip rows.
while True:
    chunk = client.get(dataset_id, limit=chunk_size, offset=offset, order=":id")
    if not chunk:
        break
    all_results.extend(chunk)
    # Advance by what was actually received so the progress message is exact.
    offset += len(chunk)
    print(f"Retrieved {len(all_results)} records...")
    if len(chunk) < chunk_size:
        # A short page means the dataset is exhausted; skip the extra empty request.
        break

# Convert to DataFrame
crime = pd.DataFrame.from_records(all_results)
print(f"Total records retrieved: {len(crime)}")


Retrieved 10000 records...
Retrieved 20000 records...
Retrieved 30000 records...
Retrieved 40000 records...
Retrieved 50000 records...
Retrieved 60000 records...
Retrieved 70000 records...
Retrieved 80000 records...
Retrieved 90000 records...
Retrieved 100000 records...
Retrieved 110000 records...
Retrieved 120000 records...
Retrieved 130000 records...
Retrieved 140000 records...
Retrieved 150000 records...
Retrieved 160000 records...
Retrieved 170000 records...
Retrieved 180000 records...
Retrieved 190000 records...
Retrieved 200000 records...
Retrieved 210000 records...
Retrieved 220000 records...
Retrieved 230000 records...
Retrieved 240000 records...
Retrieved 250000 records...
Retrieved 260000 records...
Retrieved 270000 records...
Retrieved 280000 records...
Retrieved 290000 records...
Retrieved 300000 records...
Retrieved 310000 records...
Retrieved 320000 records...
Retrieved 330000 records...
Retrieved 340000 records...
Retrieved 350000 records...
Retrieved 360000 records...
R

In [None]:
#data types of crime columns

crime.dtypes

Unnamed: 0,0
incident_datetime,object
incident_date,object
incident_time,object
incident_year,object
incident_day_of_week,object
report_datetime,object
row_id,object
incident_id,object
incident_number,object
report_type_code,object


In [None]:
crime.iloc[1, :]

# Cleaning the Data

In [None]:
# Keep only incidents dated 2022-01-01 or later. incident_datetime is still a string
# column at this point; its ISO-8601 layout makes string order match chronological order.
crime = crime[crime['incident_datetime'] >= '2022-01-01T00:00:00.000']

In [None]:
# since not useful we're going to remove the rows with NAN longitude and latitude
crime = crime.dropna(subset=['latitude', 'longitude'])

In [None]:
crime = crime.dropna(subset=['incident_datetime', 'latitude', 'longitude'])

In [None]:
# count how many rows are missing a neighborhood: first the literal string 'null' ...
print((crime['analysis_neighborhood'] == 'null').sum())
# ... then genuinely missing values, via Series.isna().sum()
print(crime['analysis_neighborhood'].isna().sum())

22
76


In [None]:
# remove the na's and 'null's from crime data set column analysis_neighborhood

crime = crime[(crime['analysis_neighborhood'] != 'null') & (crime['analysis_neighborhood'].notna())]
print("Length after removing 'null' and NA:", len(crime))


Length after removing 'null' and NA: 392696


In [None]:
# remove last 7 columns from data and row_id, point, and supervisor
crime = crime.drop(columns=['row_id', 'point', 'supervisor_district_2012', 'cnn', ':@computed_region_jwn9_ihcz', ':@computed_region_26cr_cadq', ':@computed_region_qgnn_b9vv', ':@computed_region_nqbw_i6c3', ':@computed_region_h4ep_8xdi', ':@computed_region_n4xg_c4py', ':@computed_region_jg9y_a9du'])


In [None]:
print("new length of data", len(crime))

new length of data 390985


In [None]:
crime.iloc[1, :]

Unnamed: 0,18
incident_datetime,2024-06-25T17:45:00.000
incident_date,2024-06-25T00:00:00.000
incident_time,17:45
incident_year,2024
incident_day_of_week,Tuesday
report_datetime,2024-06-26T13:08:00.000
incident_id,1401726
incident_number,240398241
report_type_code,II
report_type_description,Initial


# EDA

In [None]:
# prompt: create a heatmap using crime dataset latitude and longiitude where I can filter the data for timeframe through the ['incident_datetime'] column, the incident_category column

!pip install folium
import folium
from folium.plugins import HeatMap

# Convert latitude and longitude to numeric
crime['latitude'] = pd.to_numeric(crime['latitude'])
crime['longitude'] = pd.to_numeric(crime['longitude'])

# Convert incident_datetime to datetime objects
crime['incident_datetime'] = pd.to_datetime(crime['incident_datetime'])

def create_crime_heatmap(dataframe, start_date=None, end_date=None, incident_category=None):
    """
    Creates a heatmap of crime incidents with optional filtering.

    Args:
        dataframe (pd.DataFrame): The crime data DataFrame. It is only read, never
            modified. Columns used: 'incident_datetime', 'incident_category',
            'latitude', 'longitude'.
        start_date (str, optional): Start date for filtering (YYYY-MM-DD). Defaults to None.
        end_date (str, optional): End date for filtering (YYYY-MM-DD). Defaults to None.
            NOTE(review): the bound is compared directly against 'incident_datetime',
            so a date-only value presumably means midnight at the start of that day.
        incident_category (str or iterable of str, optional): A single incident
            category, or a collection (list/set/tuple) of categories; rows matching
            any of them are kept. Defaults to None (no category filter).

    Returns:
        folium.Map: The Folium map object with the heatmap layer. When no rows
        survive the filters, a plain map centred on San Francisco is returned
        and a message is printed.
    """
    # Boolean indexing below already yields new frames, so no up-front copy is needed.
    filtered_df = dataframe
    if start_date:
        filtered_df = filtered_df[filtered_df['incident_datetime'] >= start_date]
    if end_date:
        filtered_df = filtered_df[filtered_df['incident_datetime'] <= end_date]

    if incident_category is not None:
        # Accept one category name or several. Comparing the column to a set with
        # `==` matches nothing, so membership is tested with isin() instead.
        if isinstance(incident_category, str):
            wanted = [incident_category] if incident_category else []
        else:
            wanted = list(incident_category)
        if wanted:
            filtered_df = filtered_df[filtered_df['incident_category'].isin(wanted)]

    # Ensure there are data points after filtering
    if filtered_df.empty:
        print("No data points found for the selected filters.")
        # Create a base map even if empty
        return folium.Map(location=[37.7749, -122.4194], zoom_start=12)

    # Latitude/longitude pairs for the heatmap layer
    heat_data = filtered_df[['latitude', 'longitude']].values.tolist()

    # Centre the map on the mean position of the selected incidents
    m = folium.Map(
        location=[filtered_df['latitude'].mean(), filtered_df['longitude'].mean()],
        zoom_start=12,
    )

    # Add the heatmap layer
    HeatMap(heat_data).add_to(m)

    return m

# Example Usage:

# Create a heatmap for all data
#all_crime_heatmap = create_crime_heatmap(crime)
#all_crime_heatmap

# Create a heatmap for a specific time frame
#timeframe_heatmap = create_crime_heatmap(crime, start_date='2023-01-01', end_date='2023-12-31')
#timeframe_heatmap

# Create a heatmap for a specific incident category
#category_heatmap = create_crime_heatmap(crime, incident_category='Larceny Theft')
#category_heatmap



['Lost Property' 'Other Miscellaneous' 'Other' 'Assault' 'Non-Criminal'
 'Disorderly Conduct' 'Missing Person' 'Fire Report' 'Suspicious Occ'
 'Arson' 'Larceny Theft' 'Robbery' 'Burglary' 'Traffic Violation Arrest'
 'Warrant' 'Offences Against The Family And Children' 'Recovered Vehicle'
 'Weapons Carrying Etc' 'Drug Offense' 'Malicious Mischief' 'Fraud'
 'Miscellaneous Investigation' 'Stolen Property' 'Motor Vehicle Theft'
 'Vandalism' 'Case Closure' 'Courtesy Report' 'Weapons Offense'
 'Other Offenses' 'Traffic Collision' 'Forgery And Counterfeiting'
 'Homicide' 'Suicide' 'Embezzlement' 'Drug Violation' 'Vehicle Impounded'
 nan 'Prostitution' 'Sex Offense' 'Vehicle Misplaced' 'Rape'
 'Motor Vehicle Theft?' 'Liquor Laws'
 'Human Trafficking (A), Commercial Sex Acts' 'Civil Sidewalks'
 'Suspicious' 'Gambling' 'Human Trafficking, Commercial Sex Acts'
 'Weapons Offence' 'Human Trafficking (B), Involuntary Servitude']

In [None]:
# print unique report_type_description
print(crime['report_type_description'].unique())

['Initial' 'Coplogic Initial' 'Initial Supplement' 'Vehicle Initial'
 'Vehicle Supplement' 'Coplogic Supplement']


In [None]:
# print unique incident category
print(crime['incident_category'].unique())

['Other Miscellaneous' 'Assault' 'Larceny Theft' 'Fraud'
 'Malicious Mischief' 'Motor Vehicle Theft' 'Suspicious Occ' 'Warrant'
 'Recovered Vehicle' 'Burglary' 'Missing Person' 'Drug Offense'
 'Non-Criminal' 'Traffic Collision' 'Other Offenses' 'Lost Property'
 'Robbery' 'Stolen Property' 'Disorderly Conduct' 'Weapons Offense'
 'Offences Against The Family And Children' 'Forgery And Counterfeiting'
 'Vehicle Misplaced' 'Other' 'Miscellaneous Investigation'
 'Traffic Violation Arrest' nan 'Courtesy Report' 'Weapons Carrying Etc'
 'Arson' 'Fire Report' 'Prostitution' 'Case Closure' 'Homicide' 'Suicide'
 'Vehicle Impounded' 'Embezzlement' 'Sex Offense' 'Rape' 'Vandalism'
 'Drug Violation' 'Motor Vehicle Theft?' 'Liquor Laws'
 'Human Trafficking (A), Commercial Sex Acts'
 'Human Trafficking, Commercial Sex Acts' 'Suspicious' 'Civil Sidewalks'
 'Gambling' 'Weapons Offence'
 'Human Trafficking (B), Involuntary Servitude']


In [None]:
# print unique incident sub category
print(crime['incident_subcategory'].unique())

['Lost Property' 'Other' 'Simple Assault' 'Trespass' 'Non-Criminal'
 'Intimidation' 'Missing Adult' 'Aggravated Assault' 'Missing Person'
 'Fire Report' 'Suspicious Occ' 'Arson' 'Larceny Theft - Other'
 'Robbery - Other' 'Burglary - Other' 'Larceny Theft - Shoplifting'
 'Traffic Violation Arrest' 'Larceny Theft - From Building'
 'Theft From Vehicle' 'Stalking' 'Recovered Vehicle'
 'Larceny - From Vehicle' 'Weapons Offense' 'Drug Violation' 'Drunkenness'
 'Robbery - Commercial' 'Vandalism' 'Fraud' 'Miscellaneous Investigation'
 'Stolen Property' 'Motor Vehicle Theft' 'Warrant' 'Larceny - Auto Parts'
 'Case Closure' 'Courtesy Report' 'Traffic Collision'
 'Burglary - Residential' 'Larceny Theft - Pickpocket' 'Kidnapping'
 'Loitering' 'Forgery And Counterfeiting' 'Robbery - Street'
 'Burglary - Hot Prowl' 'Disorderly Conduct' 'Homicide' 'Suicide'
 'Liquor Law Violation' 'Embezzlement' 'Burglary - Commercial'
 'Other Offenses' 'Vehicle Impounded' nan 'Prostitution'
 'Traffic Collision - Hit

In [None]:
# print unique incident description
print(crime['incident_description'].unique())
# no nlp yet

['Lost Property' 'Conspiracy' 'Death Report, Cause Unknown' 'Battery'
 'Trespassing' 'Found  Property' 'Terrorist Threats' 'Missing Adult'
 'Assault, Aggravated, W/ Other Weapon' 'Found Person' 'Fire Report'
 'Aided Case' 'Suspicious Occurrence' 'Mental Health Detention' 'Arson'
 'Theft, Other Property, >$950' 'Robbery, W/ Gun'
 'Burglary, Other Bldg., Unlawful Entry' 'Theft, Shoplifting, <$50'
 'Traffic Violation Arrest' 'Theft, From Building, $200-$950'
 'Warrant Arrest, Local SF Warrant' 'License Plate, Stolen'
 'Battery, Of A Police Officer' 'Burglary, Other Bldg., Forcible Entry'
 'Stalking' 'Vehicle, Recovered, Motorcycle'
 'Theft, From Locked Vehicle, >$950' 'Investigative Detention'
 'Firearm, Possession By Prohibited Person'
 'Theft, From Unlocked Vehicle, >$950' 'Search Warrant Service'
 'Methamphetamine Offense' 'Driving, No License Issued'
 'Alcohol, Under Influence Of In Public Place'
 'Shoplifting, Force against Agent'
 'Malicious Mischief, Vandalism to Vehicle'
 'Aided c

In [None]:
# show min max date
print(crime['incident_datetime'].min())
print(crime['incident_datetime'].max())

2022-01-01 00:00:00
2025-05-31 23:20:00


In [None]:
print(crime['analysis_neighborhood'].unique())

['Marina' 'Western Addition' 'Bernal Heights' 'Oceanview/Merced/Ingleside'
 'Chinatown' 'Hayes Valley' 'Outer Richmond' 'Tenderloin' 'Mission Bay'
 'Sunset/Parkside' 'Mission' 'Financial District/South Beach'
 'Pacific Heights' 'Nob Hill' 'South of Market' 'West of Twin Peaks'
 'Bayview Hunters Point' 'Haight Ashbury' 'Excelsior' 'Russian Hill'
 'Portola' 'Castro/Upper Market' 'Lakeshore' 'Outer Mission'
 'Lincoln Park' 'Golden Gate Park' 'North Beach' 'Potrero Hill'
 'Inner Richmond' 'Lone Mountain/USF' 'Glen Park' 'Twin Peaks'
 'Inner Sunset' 'Noe Valley' 'Presidio Heights' 'Visitacion Valley'
 'Japantown' 'Seacliff' 'Presidio' 'Treasure Island' 'McLaren Park']


# Transforming the data

Encoding day of the week

In [None]:
# encode
from sklearn.preprocessing import LabelEncoder

# Encode the weekday as Monday=0 ... Sunday=6.
# LabelEncoder would number the names alphabetically (Friday=0, Monday=1, Saturday=2, ...),
# which disagrees with the Monday-first day_index() lookup used when routes are scored:
# a "Saturday" query (index 5) would then be one-hot encoded as the training-time "Tuesday".
DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_codes = crime['incident_day_of_week'].map({day: code for code, day in enumerate(DAY_ORDER)})

# Fail loudly on anything that is not a known weekday name instead of encoding NaN.
if day_codes.isna().any():
    unexpected = crime.loc[day_codes.isna(), 'incident_day_of_week'].unique().tolist()
    raise ValueError(f"Unexpected incident_day_of_week values: {unexpected}")

crime['day_of_week_encoded'] = day_codes.astype(int)


In [None]:
#print unique day of week
print(crime['day_of_week_encoded'].unique())

[3 5 2 1 0 6 4]


Categorizing dangerous crimes

In [None]:
# Dangerous crimes: incident categories treated as high-risk
high_risk_categories = {
    'Assault', 'Arson', 'Burglary', 'Larceny Theft',
    'Robbery', 'Weapons Offense', 'Homicide', 'Sex Offense', 'Rape',
    'Human Trafficking (A), Commercial Sex Acts',
    'Human Trafficking, Commercial Sex Acts', 'Weapons Offence',
    'Human Trafficking (B), Involuntary Servitude',
}

# Incident subcategories treated as high-risk.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated into a single (never-matching) entry.
high_risk_subcategories = {
    'Simple Assault', 'Aggravated Assault', 'Arson',
    'Burglary - Other', 'Larceny - From Vehicle', 'Weapons Offense',
    'Robbery - Commercial', 'Robbery - Other', 'Burglary - Commercial',
    'Burglary - Residential', 'Larceny Theft - Pickpocket',
    'Burglary - Hot Prowl', 'Robbery - Street', 'Homicide', 'Sex Offense',
    'Rape', 'Rape - Attempted', 'Larceny Theft - Purse Snatch',
    'Robbery - Residential', 'Human Trafficking, Commercial Sex Acts',
    'Manslaughter', 'Homicide - Excusable',
}
# A row is high-risk when its subcategory OR its category is on a high-risk list
def assign_risk(row):
    """Return 1 if the incident row is high-risk by subcategory or category, else 0."""
    flagged = (
        row['incident_subcategory'] in high_risk_subcategories
        or row['incident_category'] in high_risk_categories
    )
    return 1 if flagged else 0

# Label every incident, one row at a time
crime['risk_level'] = crime.apply(assign_risk, axis=1)


In [None]:
# prompt: create a heatmap for start_date='2025-01-01', end_date='2025-05-28' for incident_category in  high_risk_categories

def create_filtered_heatmap(dataframe, start_date, end_date, incident_categories):
    """
    Build a Folium heatmap of incidents inside a date window for a set of categories.

    Args:
        dataframe (pd.DataFrame): Crime data with 'incident_datetime',
            'incident_category', 'latitude' and 'longitude' columns.
        start_date (str): Lower bound of the window (YYYY-MM-DD), inclusive.
        end_date (str): Upper bound of the window (YYYY-MM-DD), inclusive.
        incident_categories (list): Incident categories to keep.

    Returns:
        folium.Map: Map carrying the heatmap layer, or a plain map centred on
        San Francisco when nothing matches the filters.
    """
    # Date window first, then category membership
    timestamps = dataframe['incident_datetime']
    in_window = (timestamps >= start_date) & (timestamps <= end_date)
    selection = dataframe[in_window].copy()
    selection = selection[selection['incident_category'].isin(incident_categories)]

    # Nothing left to draw: say so and hand back an empty base map
    if selection.empty:
        print("No data points found for the selected filters.")
        return folium.Map(location=[37.7749, -122.4194], zoom_start=12)

    # Centre the map on the mean position of the selected incidents
    center = [selection['latitude'].mean(), selection['longitude'].mean()]
    heat_map = folium.Map(location=center, zoom_start=12)

    # One [lat, lon] pair per incident feeds the heat layer
    points = selection[['latitude', 'longitude']].values.tolist()
    HeatMap(points).add_to(heat_map)

    return heat_map

# Convert the set to a list for the function
high_risk_categories_list = list(high_risk_categories)

# Create the heatmap
high_risk_heatmap = create_filtered_heatmap(
    crime,
    start_date='2025-05-01',
    end_date='2025-05-28',
    incident_categories=high_risk_categories_list
)

# Display the heatmap (in Jupyter/Colab, just having the variable at the end displays it)
high_risk_heatmap

In [None]:

# Create a heatmap for a specific timeframe across all high-risk categories.
# create_filtered_heatmap takes a collection of categories, so it is the right
# entry point for a multi-category filter.
filtered_heatmap = create_filtered_heatmap(
    crime,
    start_date='2025-01-01',
    end_date='2025-05-28',
    incident_categories=list(high_risk_categories),
)
filtered_heatmap


No data points found for the selected filters.


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# --- Clean time ---
crime['incident_time'] = crime['incident_time'].astype(str)
crime['incident_time_dt'] = pd.to_datetime(crime['incident_time'], format='%H:%M', errors='coerce')
crime['incident_hour'] = crime['incident_time_dt'].dt.hour
crime['incident_minute'] = crime['incident_time_dt'].dt.minute
crime = crime.dropna(subset=['incident_hour', 'incident_minute'])

# Convert string date to datetime
crime['incident_datetime'] = pd.to_datetime(crime['incident_datetime'])

# 1. Recency weight: more recent = higher weight
# Age is measured in whole days back from the newest incident in the frame.
most_recent = crime['incident_datetime'].max()
crime['days_ago'] = (most_recent - crime['incident_datetime']).dt.days
crime['recency_weight'] = 1/(1+((crime['days_ago'])/ 180))  # hyperbolic decay: 1.0 for the newest incident, 0.5 at 180 days old

# 2. Severity weight: the more serious the subcategory, the larger the weight
def compute_severity(subcat):
    """Map an incident subcategory to a severity weight by keyword.

    The value is stringified and lower-cased, then checked against the keyword
    groups below from most to least severe; the first group with a keyword
    contained in the text decides the weight. Anything unmatched gets 1.
    """
    label = str(subcat).lower()  # case-insensitive substring matching

    # (keywords, weight) in priority order -- earlier rows win
    severity_rules = (
        (('homicide', 'manslaughter'), 3),
        (('rape', 'sex'), 2.5),
        (('robbery',), 2.25),
        (('assault',), 2),
        (('arson', 'weapons'), 1.75),
        (('burglary',), 1.5),
        (('larceny', 'theft'), 1.5),
        (('vandalism', 'property'), 1),
    )
    for keywords, weight in severity_rules:
        if any(keyword in label for keyword in keywords):
            return weight

    return 1  # default weight


crime['severity_weight'] = crime['incident_subcategory'].apply(compute_severity)

# 3. Final weight = product of the recency and severity weights
crime['weight'] = crime['recency_weight'] * crime['severity_weight']

# cap at 5 as a safety bound; recency_weight is at most 1 and compute_severity returns
# at most 3, so with the current scoring the product never reaches the cap
crime['weight'] = np.minimum(crime['weight'], 5)



Tuning the decision threshold

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# --- Encode day of week ---
# One-hot encode the integer day code. `ohe` and `day_labels` are reused later by the
# route-scoring helpers to rebuild the same feature columns.
crime['day_of_week_encoded'] = crime['day_of_week_encoded'].astype(int)
ohe = OneHotEncoder(sparse_output=False, categories='auto')
day_encoded = ohe.fit_transform(crime[['day_of_week_encoded']])
day_labels = ohe.get_feature_names_out(['day_of_week_encoded'])
day_df = pd.DataFrame(day_encoded, columns=day_labels, index=crime.index)

# --- Combine features ---
# Column order (hour, minute, latitude, longitude, then the one-hot day columns)
# is the order the scoring helpers reproduce by hand.
model_df = pd.concat([
    crime[['incident_hour', 'incident_minute', 'latitude', 'longitude', 'risk_level', 'weight']],
    day_df
], axis=1)

# --- Train/test split ---
# risk_level is the target; weight is carried only as a per-sample weight, not a feature.
X = model_df.drop(['risk_level', 'weight'], axis=1)
y = model_df['risk_level']
weights = model_df['weight']

# Stratified 80/20 split; the weights are split alongside so they stay aligned with the rows.
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, weights, stratify=y, test_size=0.2, random_state=42
)

# --- Fit classifier using weights ---
clf = RandomForestClassifier(class_weight=None, random_state=42)
clf.fit(X_train, y_train, sample_weight=w_train)

# --- Score & threshold tuning ---
# Probability of the positive (high-risk) class for each held-out row.
y_probs = clf.predict_proba(X_test)[:, 1]

# Sweep candidate cut-offs; the metrics below are unweighted (w_test is not used).
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
for t in thresholds:
    y_pred_thresh = (y_probs > t).astype(int)
    precision = precision_score(y_test, y_pred_thresh)
    recall = recall_score(y_test, y_pred_thresh)
    f1 = f1_score(y_test, y_pred_thresh)

    print(f"Threshold: {t}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall:    {recall:.2f}")
    print(f"  F1 Score:  {f1:.2f}\n")


Threshold: 0.3
  Precision: 0.52
  Recall:    0.78
  F1 Score:  0.63

Threshold: 0.4
  Precision: 0.55
  Recall:    0.67
  F1 Score:  0.60

Threshold: 0.5
  Precision: 0.57
  Recall:    0.54
  F1 Score:  0.56

Threshold: 0.6
  Precision: 0.61
  Recall:    0.40
  F1 Score:  0.48

Threshold: 0.7
  Precision: 0.63
  Recall:    0.28
  F1 Score:  0.39



In [None]:
# ors routing
!pip install openrouteservice folium


Collecting openrouteservice
  Downloading openrouteservice-2.3.3-py3-none-any.whl.metadata (9.2 kB)
Downloading openrouteservice-2.3.3-py3-none-any.whl (33 kB)
Installing collected packages: openrouteservice
Successfully installed openrouteservice-2.3.3


In [None]:
# from secrets
import openrouteservice
from google.colab import userdata
key = userdata.get('ors_key')
ors_client = openrouteservice.Client(key=key)


In [None]:
# Walking-route geometry from OpenRouteService (ORS)
def get_route_coords(start, end):
    """Return the walking route between two (lon, lat) points as a list of [lon, lat]."""
    response = ors_client.directions(
        [start, end],  # (lon, lat) pairs
        profile='foot-walking',
        format='geojson',
    )
    first_feature = response['features'][0]
    return first_feature['geometry']['coordinates']


Testing

In [None]:
# Sample coordinates in San Francisco: (longitude, latitude)
start = (-122.4194, 37.7749)  # SF downtown
end = (-122.446, 37.8017)     # Marina

try:
    route = ors_client.directions(
        coordinates=[start, end],
        profile='foot-walking',
        format='geojson'
    )

    coords = route['features'][0]['geometry']['coordinates']
    print(f"✅ ORS request successful. Route has {len(coords)} points.")
except openrouteservice.exceptions.ApiError as e:
    print(f"❌ ORS API error: {e}")
except Exception as e:
    print(f"❌ General error: {e}")


✅ ORS request successful. Route has 118 points.


Creating address to coordinates function

In [None]:
import requests

def geocode_address(address, timeout=10):
    """Look up an address with the OpenRouteService geocoder.

    Args:
        address (str): Free-text address to search for (restricted to the US).
        timeout (float, optional): Seconds to wait for the HTTP request before
            giving up. Defaults to 10.

    Returns:
        list | None: [lon, lat] of the best match, or None when the service
        answers with an error status, a non-JSON body, or no match. In those
        cases a message is printed.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    url = "https://api.openrouteservice.org/geocode/search"
    params = {
        "api_key": key,
        "text": address,
        "boundary.country": "US",
        "size": 1
    }
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    if not response.ok:
        print(f"❌ Geocoding request failed (HTTP {response.status_code}) for: {address}")
        return None

    try:
        data = response.json()
        return data['features'][0]['geometry']['coordinates']  # [lon, lat]
    except (ValueError, IndexError, KeyError, TypeError):
        # ValueError: body was not JSON; the others: no usable feature in the reply.
        print(f"❌ Could not geocode: {address}")
        return None


Map plotting function with Folium library

In [None]:
# map
import folium
def plot_route_on_map(coords, start_coords, end_coords, risk_score, risk_per_point, rerouted=False):
    """Draw a route and its per-point risk on a Folium map.

    Args:
        coords: Route points as (lon, lat) pairs.
        start_coords: (lon, lat) used to centre the map.
        end_coords: Accepted for symmetry with start_coords; not used for drawing.
        risk_score: Average route risk, shown in the midpoint marker popup.
        risk_per_point: One risk value per route point, in route order.
        rerouted: When True the route is drawn in red, otherwise in blue.

    Returns:
        folium.Map: The populated map.
    """
    # Route points arrive as (lon, lat); folium expects (lat, lon)
    path = [(lat, lon) for lon, lat in coords]
    route_color = "red" if rerouted else "blue"

    # Map centred on the start position, with the full route as one line
    route_map = folium.Map(location=[start_coords[1], start_coords[0]], zoom_start=14)
    folium.PolyLine(path, color=route_color, weight=5, opacity=0.8).add_to(route_map)

    # Start and end markers sit on the first and last route points
    endpoints = ((path[0], "Start", "green"), (path[-1], "End", "orange"))
    for position, label, icon_color in endpoints:
        folium.Marker(position, popup=label, icon=folium.Icon(color=icon_color)).add_to(route_map)

    # Midpoint marker carries the average risk
    midpoint = path[len(path) // 2]
    folium.Marker(
        location=midpoint,
        popup=f"Avg Risk: {risk_score:.2f}",
        icon=folium.Icon(color=route_color)
    ).add_to(route_map)

    # Small circle per route point: crimson above 0.5, gray otherwise
    for position, risk in zip(path, risk_per_point):
        folium.CircleMarker(
            location=position,
            radius=4,
            fill=True,
            fill_opacity=0.6,
            color="crimson" if risk > 0.5 else "gray",
            tooltip=f"Risk: {risk:.2f}"
        ).add_to(route_map)

    return route_map


Rerouting function for danger > 0.5

In [None]:
# Re-import required dependencies after environment reset
import pandas as pd
import numpy as np
from shapely.geometry import Polygon, mapping
from shapely.ops import unary_union

# Multi-attempt rerouting: ask ORS for alternative walking routes that avoid the
# riskiest stretch of the original route, trying several avoidance-square sizes,
# and keep whichever route has the lowest average predicted risk.
def _score_route_points(route, hour, minute, day_values, feature_cols, clf):
    """Return the model's high-risk probability for every (lon, lat) point of route."""
    rows = [[hour, minute, lat, lon] + day_values for lon, lat in route]
    frame = pd.DataFrame(rows, columns=feature_cols)
    # One batched call instead of one model call per point.
    return clf.predict_proba(frame)[:, 1].tolist()


def iterative_reroute_min_risk(
    coords, start, end, hour, minute, day_str,
    clf, ohe, day_labels, buffer_sizes=(0.001, 0.0015, 0.002), risk_threshold=0.5
):
    """Search for a lower-risk walking route by avoiding the riskiest points.

    Args:
        coords: Original route as a sequence of (lon, lat) pairs.
        start, end: (lon, lat) endpoints passed to ORS for each reroute request.
        hour, minute: Time-of-day features fed to the classifier as given.
        day_str: English weekday name ('Monday' ... 'Sunday'), mapped to a
            Monday-first index before one-hot encoding.
        clf: Fitted classifier exposing predict_proba.
        ohe: Fitted one-hot encoder for the 'day_of_week_encoded' column.
        day_labels: One-hot column names matching the encoder output.
        buffer_sizes: Half-widths (in coordinate degrees) of the avoidance square
            drawn around each risky point; one ORS request is made per size.
        risk_threshold: Accepted for interface compatibility; the search does
            not consult it.

    Returns:
        dict: 'coords', 'avg_risk' and 'risk_per_point' of the best route found,
        'was_rerouted' (whether an alternative beat the original), 'buffer_used'
        (the winning buffer size or None) and 'original_risk'.

    Raises:
        ValueError: If coords is empty or day_str is not a weekday name.
    """
    if len(coords) == 0:
        raise ValueError("coords must contain at least one route point")

    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    idx = day_names.index(day_str)
    day_values = ohe.transform(pd.DataFrame({'day_of_week_encoded': [idx]})).flatten().tolist()
    feature_cols = ['incident_hour', 'incident_minute', 'latitude', 'longitude'] + list(day_labels)

    # Score the original route once; the same scores also pick the points to avoid.
    original_scores = _score_route_points(coords, hour, minute, day_values, feature_cols, clf)
    original_risk = sum(original_scores) / len(original_scores)

    # Riskiest 20% of points, but always at least one: a slice of [-0:] would
    # select the whole route on short inputs.
    top_k = max(1, int(len(original_scores) * 0.2))
    top_idxs = np.argsort(original_scores)[-top_k:]
    avoid_coords = [(coords[i][0], coords[i][1]) for i in top_idxs]

    best_risk = original_risk
    best_coords = coords
    best_scores = original_scores
    best_buffer = None

    for buffer_size in buffer_sizes:
        try:
            # Square around each risky point, merged into one avoidance geometry
            polygons = [
                Polygon([
                    (lon + buffer_size, lat + buffer_size),
                    (lon - buffer_size, lat + buffer_size),
                    (lon - buffer_size, lat - buffer_size),
                    (lon + buffer_size, lat - buffer_size),
                    (lon + buffer_size, lat + buffer_size)
                ])
                for lon, lat in avoid_coords
            ]
            avoid_geojson = mapping(unary_union(polygons))

            # ORS call
            route = ors_client.directions(
                coordinates=[start, end],
                profile='foot-walking',
                format='geojson',
                options={"avoid_polygons": avoid_geojson}
            )
            new_coords = route['features'][0]['geometry']['coordinates']

            # Score new route
            new_scores = _score_route_points(new_coords, hour, minute, day_values, feature_cols, clf)
            avg_risk = sum(new_scores) / len(new_scores)
        except Exception as exc:
            # Best effort: a failed attempt must not abort the search, but say why it failed.
            print(f"⚠️ Reroute attempt with buffer {buffer_size} failed: {exc}")
            continue

        if avg_risk < best_risk:
            best_risk = avg_risk
            best_coords = new_coords
            best_scores = new_scores
            best_buffer = buffer_size

    return {
        "coords": best_coords,
        "avg_risk": best_risk,
        "risk_per_point": best_scores,
        "was_rerouted": best_buffer is not None,
        "buffer_used": best_buffer,
        "original_risk": original_risk
    }


In [None]:
# Weekday lookup needed by the route-scoring code
def day_index(day_str):
    """Return the Monday-first position (0-6) of an English weekday name."""
    week = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
    return week.index(day_str)

Assessing Route function using score_route

In [None]:
def score_route(coords, hour, minute, day_str):
    """Score a route with the trained classifier.

    Args:
        coords: Route points as (lon, lat) pairs.
        hour: Hour of day (0-23) of the planned walk.
        minute: Minute of the hour; rounded to the nearest quarter hour first.
        day_str: English weekday name ('Monday' ... 'Sunday').

    Returns:
        tuple: (average risk, list of per-point risks). Returns (0, []) when
        the route is empty or the feature vector does not line up with the
        expected columns (a message is printed in the latter case).
    """
    # Round user-input minute
    minute_rounded = round_to_nearest_15(minute)
    idx = day_index(day_str)

    # Minutes late in the hour round up to :00 of the NEXT hour, so carry into
    # the hour (and into the next day past midnight) instead of falling back to
    # the start of the current hour.
    if minute_rounded == 0 and minute > 45:
        hour = (hour + 1) % 24
        if hour == 0:
            idx = (idx + 1) % 7

    # One-hot vector for the weekday, built once for the whole route
    day_encoded_array = ohe.transform(pd.DataFrame({'day_of_week_encoded': [idx]}))
    day_vector = day_encoded_array.flatten().tolist()
    columns = ['incident_hour', 'incident_minute', 'latitude', 'longitude'] + list(day_labels)

    # Every row has the same width, so the consistency check is done once.
    n_features = 4 + len(day_vector)
    if n_features != len(columns):
        print(f"Feature mismatch: {n_features} features vs {len(columns)} columns")
        return 0, []

    if len(coords) == 0:
        return 0, []

    # Score all points in a single batched model call
    rows = [[hour, minute_rounded, lat, lon] + day_vector for lon, lat in coords]
    risks = clf.predict_proba(pd.DataFrame(rows, columns=columns))[:, 1].tolist()

    avg_risk = sum(risks) / len(risks)
    return avg_risk, risks


In [None]:
def assess_route(start, end, hour, minute, day_str, threshold=0.5):
    """Fetch the walking route between two points and report its risk.

    The route is scored with score_route and a one-line verdict is printed,
    depending on whether the average risk exceeds threshold. No rerouting is
    performed here; the caller decides what to do with the result.

    Returns:
        tuple: (route coordinates, average risk, per-point risks).
    """
    route = get_route_coords(start, end)
    route_risk, point_risks = score_route(route, hour, minute, day_str)

    verdict = (
        f"⚠️ Route risk ({route_risk:.2f}) exceeds threshold {threshold} — rerouting..."
        if route_risk > threshold
        else f"✅ Route is safe with risk score: {route_risk:.2f}"
    )
    print(verdict)

    return route, route_risk, point_risks

In [None]:
# add binning into 15-30 minute intervals to avoid per-minute noise

In [None]:
# Re-import necessary libraries after kernel reset
import pandas as pd
import numpy as np
from shapely.geometry import Polygon, mapping
from shapely.ops import unary_union

# Time rounding helper
def round_to_nearest_15(minute):
    """Round a minute value to the nearest quarter hour, wrapping 60 back to 0."""
    quarter_hours = round(minute / 15.0)
    return int(quarter_hours * 15) % 60

# Weekday name -> Monday-first index
def day_index(d):
    """Return the position (0-6, Monday first) of the weekday name d."""
    weekdays = "Monday Tuesday Wednesday Thursday Friday Saturday Sunday".split()
    return weekdays.index(d)


# Here we test the geocode and map output

In [None]:
start_address = "1342 Irving Street, San Francisco"
end_address = "327 Lincoln Way, San Francisco"
hour = 2
minute = 15
day_str = "Saturday"

start_coords = geocode_address(start_address)
end_coords = geocode_address(end_address)

# Check the unique values in crime['incident_day_of_week'] before fitting the encoder
# Make sure you use the string day name here, as assess_route now handles the encoding
coords, avg_risk, risk_per_point = assess_route(start_coords, end_coords, hour, minute, day_str)



⚠️ Route risk (0.74) exceeds threshold 0.5 — rerouting...


rerouting with risk minimization

In [None]:
# Search for a lower-risk alternative. Run this once only: every call sends one
# ORS directions request per buffer size, so repeating it just burns API quota.
result = iterative_reroute_min_risk(
    coords, start_coords, end_coords, hour, minute, day_str,
    clf=clf, ohe=ohe, day_labels=day_labels
)

# Report the outcome against the 0.5 risk cut-off
print(f"🧭 Best route risk: {result['avg_risk']:.2f}")
if result["was_rerouted"] and result["avg_risk"] <= 0.5:
    print(f"✅ Rerouted with buffer: {result['buffer_used']}")
else:
    if result["avg_risk"] > 0.5:
        print("🚨 All paths are risky — consider waiting to walk.")
    else:
        print("👍 Original path is within safe limits.")

# Visualize
map_obj = plot_route_on_map(
    result["coords"],
    start_coords,
    end_coords,
    risk_score=result["avg_risk"],
    risk_per_point=result["risk_per_point"],
    rerouted=result["was_rerouted"]
)
map_obj


🧭 Best route risk: 0.73
🚨 All paths are risky — consider waiting to walk.


save in joblib

In [None]:
# at end of your notebook
import os, joblib
os.makedirs("models", exist_ok=True)

joblib.dump(clf, "models/risk_model.joblib")
joblib.dump(ohe, "models/encoder.joblib")

['models/encoder.joblib']

In [None]:
# download folder
from google.colab import drive
drive.mount('/content/drive')
#Code For going to Drive you want to download :


Mounted at /content/drive


In [None]:
# file
%cd /content/models

#Code to turn Folder into zip :
!zip -r models.zip /content/models
# Code To download zip folder :
from google.colab import files
files.download('models.zip')

/content/models
  adding: content/models/ (stored 0%)
  adding: content/models/risk_model.joblib (deflated 66%)
  adding: content/models/encoder.joblib (deflated 39%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>