# Updating 16th St. BART Plaza Crime Stats

In [68]:
import requests
import requests_cache
from playwright.async_api import async_playwright
from io import StringIO
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import os


# Raise pandas' display limits so large tables and long cell text are rendered in full.
display_overrides = {
    'display.max_rows': 1000,
    'display.max_columns': 1000,
    "display.max_colwidth": None,  # None removes the column-width limit
}
for option_name, option_value in display_overrides.items():
    pd.set_option(option_name, option_value)
# Install requests_cache under the cache name 'api_cache'; entries expire after
# one day (24 * 60 * 60 = 86400 seconds).
# NOTE(review): no cell visible in this notebook issues a `requests` call, so this
# cache may be unused — confirm before relying on it.
requests_cache.install_cache('api_cache', expire_after=24 * 60 * 60)

This is the rough notebook that was used to scrape all of the archival crime data available for the 16th St. BART plaza, which covers the last 180 days.

- Last run on Nov. 4, 2025
- Data is from May 9, 2025 - Nov. 3, 2025

## Workflow


- Go to https://www.crimemapping.com/map/location/2000%20Mission%20St%2C%20San%20Francisco%2C%20CA%2C%2094110%2C%20USA?id=#

- Click on "Where"
- Fill "2000 Mission St, San Francisco, CA, 94110, USA"
- Press enter
- Click on the drop-down menu at "Search distance"
- Click "500 feet"
- Click Apply
- Click "When"
- Click "Custom Time Range"
- Here is where it gets complicated: for now we need all the records from May 8th, 2025 to Nov. 3, 2025, but after that we have to do it weekly.
- Fill the "from" column as 05/08/2025
- Fill the "to" column as 11/03/2025
- Click "Apply"
- Click "Report"
- Wait for it to load
- Get the html.

# 1. Scrape HTML Crime Data

In [69]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)

In [70]:
# Open a new page in the launched browser. `page` is the handle that the
# following cells use to navigate, click, fill and read the site.
page = await browser.new_page()

In [71]:
# Load the CrimeMapping map for the "2000 Mission St, San Francisco" location URL.
# The Response returned by goto() is the last expression, so it is shown as the cell output.
await page.goto("https://www.crimemapping.com/map/location/2000%20Mission%20St%2C%20San%20Francisco%2C%20CA%2C%2094110%2C%20USA?id=")

<Response url='https://www.crimemapping.com/map/location/2000%20Mission%20St%2C%20San%20Francisco%2C%20CA%2C%2094110%2C%20USA?id=' request=<Request url='https://www.crimemapping.com/map/location/2000%20Mission%20St%2C%20San%20Francisco%2C%20CA%2C%2094110%2C%20USA?id=' method='GET'>>

In [72]:
# Click the WHERE filter button
await page.click('#filtersWhere')

# Wait for the Where panel to open
await page.wait_for_selector('#wherePanel', state='visible')

# Fill the location search input in the Where panel
await page.fill('#locationSearchWherePanel', '2000 Mission St, San Francisco, CA, 94110, USA')

# Wait for autocomplete suggestions to appear
await page.wait_for_timeout(1000)

# Press Enter to select the first suggestion
await page.press('#locationSearchWherePanel', 'Enter')

In [73]:
# Click the dropdown to open it (optional, but good practice)
await page.click('#whereBufferDistances')

# Select "500 feet" option by its value
await page.select_option('#whereBufferDistances', value='152.4')

# Wait a moment for selection to register
await page.wait_for_timeout(500)

In [74]:
# Click the Apply button in the Where panel
await page.click('#wherePanel .btnApply')

# Wait for the filter to apply and map to update
await page.wait_for_timeout(1000)

In [75]:
# Click the WHEN filter button
await page.click('#filtersWhen')

# Wait for the When panel to open
await page.wait_for_selector('#whenPanel', state='visible')

<JSHandle preview=JSHandle@node>

In [76]:
# Click on "Custom Time Range" option
await page.click('text=Custom Time Range')

# Wait for the date pickers to appear
await page.wait_for_selector('#customDate', state='visible')

# Wait a moment for the date fields to be ready
await page.wait_for_timeout(200)

In [77]:
# Click on the "From" date input field
await page.click('#dateFrom')

# Clear any existing value and type the new date
await page.fill('#dateFrom', '05/09/2025')

# Wait a moment for the date to register
await page.wait_for_timeout(200)

In [78]:
# Click on the "To" date input field
await page.click('#dateTo')

# Clear any existing value and type the new date
await page.fill('#dateTo', '11/03/2025')

# Wait a moment for the date to register
await page.wait_for_timeout(200)

In [79]:
# Click the Apply button in the When panel
await page.click('#whenPanel .btnApply')

# Wait for the filter to apply and data to load
await page.wait_for_timeout(200)

In [80]:
# Click the REPORT button on the left sidebar
await page.click('#displayReports')

# Wait for the report page to load
await page.wait_for_selector('#divReportPage', state='visible')

# Wait a bit longer for the data table to fully load
await page.wait_for_timeout(3000)

In [81]:
records_text = await page.inner_text('.itemsCount')
total_items = int(records_text.split()[0])
items_per_page = 15
total_pages = (total_items + items_per_page - 1) // items_per_page

for page_num in range(1, total_pages + 1):
    await page.click(f'a[data-page="{page_num}"]')
    await page.wait_for_timeout(2000)
    
    html = await page.content()
    filename = f'raw_html/crime_report_2025_05_08_to_11_03_page_{page_num}.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

In [67]:
# Close the browser
await browser.close()
await playwright.stop()

Found 3 page buttons
Scraping page 1...
Saved raw_html/crime_report_2025_05_08_to_11_03_page_1.html
Scraping page 2...
Saved raw_html/crime_report_2025_05_08_to_11_03_page_2.html
Scraping page 3...
Saved raw_html/crime_report_2025_05_08_to_11_03_page_3.html
Total pages scraped: 3
Done!


## Formatting from HTML

### formatting one

In [82]:
# Load the first saved report page from disk.
first_page_path = 'raw_html/crime_report_2025_05_08_to_11_03_page_1.html'
with open(first_page_path, encoding='utf-8') as html_file:
    local_html = html_file.read()

# Build a BeautifulSoup tree from it using the built-in 'html.parser' backend.
soup_doc = BeautifulSoup(local_html, 'html.parser')

In [87]:
# Take the first <tbody> in the document and collect all of its <tr> rows.
first_tbody = soup_doc.find_all("tbody")[0]
table_rows = first_tbody.find_all("tr")

In [90]:
# description: third cell (index 2) of the first row
first_row_cells = table_rows[0].find_all("td")
first_row_cells[2].string

'SIMPLE ASSAULT'

In [91]:
# incident number: fourth cell (index 3) of the first row
first_row_cells = table_rows[0].find_all("td")
first_row_cells[3].string

'2510-1447'

In [92]:
# location: fifth cell (index 4) of the first row
first_row_cells = table_rows[0].find_all("td")
first_row_cells[4].string

'2000 BLOCK MISSION ST'

In [93]:
# agency name: sixth cell (index 5) of the first row
first_row_cells = table_rows[0].find_all("td")
first_row_cells[5].string

'BART Police'

In [97]:
# date and time: seventh cell (index 6) of the first row, split at the first
# space into [date, time]
timestamp_text = table_rows[0].find_all("td")[6].string
timestamp_text.split(' ', 1)

['10-23-2025', '6:14 PM']

### Creating the csv

### make one csv first

In [106]:
# Read the saved HTML of report page 1 back from disk.
page_one_path = 'raw_html/crime_report_2025_05_08_to_11_03_page_1.html'
with open(page_one_path, encoding='utf-8') as saved_page:
    local_html = saved_page.read()

# Parsed HTML tree for this single page.
soup_doc = BeautifulSoup(local_html, 'html.parser')

In [107]:
# Every <tr> inside the first <tbody> is one crime record.
report_body = soup_doc.find_all("tbody")[0]
table_rows = report_body.find_all("tr")

In [112]:
# Turn each table row into a dict keyed by column name.
all_rows = []
for record in table_rows:
    fields = record.find_all("td")

    # Cell 6 holds "<date> <time>"; split at the first space only so the
    # AM/PM suffix stays attached to the time.
    date_and_time = fields[6].string.split(' ', 1)

    all_rows.append({
        "description": fields[2].string,
        "incident_num": fields[3].string,
        "location": fields[4].string,
        "agency": fields[5].string,
        "date": date_and_time[0],
        "time": date_and_time[1],
    })

In [113]:
# Sanity check: number of incident rows parsed from the first saved page.
len(all_rows)

15

In [114]:
# Preview the parsed rows as a DataFrame (one column per dict key).
pd.DataFrame(all_rows)

Unnamed: 0,description,incident_num,location,agency,date,time
0,SIMPLE ASSAULT,2510-1447,2000 BLOCK MISSION ST,BART Police,10-23-2025,6:14 PM
1,INTIMIDATION,2510-1179,2000 BLOCK Mission St,BART Police,10-19-2025,10:57 PM
2,DRUG EQUIPMENT VIOLATIONS,2510-1028,2000 BLOCK Mission St,BART Police,10-17-2025,8:03 AM
3,DRUG EQUIPMENT VIOLATIONS,2510-0988,2000 BLOCK MISSION ST,BART Police,10-16-2025,4:20 PM
4,SIMPLE ASSAULT,2510-0983,2000 BLOCK Mission St,BART Police,10-16-2025,3:34 PM
5,SIMPLE ASSAULT,2510-0740,2000 BLOCK MISSION ST,BART Police,10-13-2025,6:57 AM
6,DRUG EQUIPMENT VIOLATIONS,2510-0561,2000 BLOCK Mission St,BART Police,10-9-2025,4:30 PM
7,DRUG EQUIPMENT VIOLATIONS,2510-0416,2000 BLOCK Mission St,BART Police,10-7-2025,2:08 PM
8,DRUG/NARCOTIC VIOLATIONS,2510-0174,2000 BLOCK Mission St,BART Police,10-3-2025,12:35 PM
9,ALL OTHER LARCENY,2510-0208,2000 BLOCK Mission St,BART Police,10-3-2025,9:40 AM


### Alternate without saving raw html

In [None]:
records_text = await page.inner_text('.itemsCount')
total_items = int(records_text.split()[0])
items_per_page = 15
total_pages = (total_items + items_per_page - 1) // items_per_page
all_pages = []

for page_num in range(1, total_pages + 1):
    await page.click(f'a[data-page="{page_num}"]')
    await page.wait_for_timeout(2000)
    
    #get the html --> put it in a list
    html = await page.content()
    
    all_pages.append(html)

### close browser ###

In [None]:
# Parse every in-memory report page (all_pages) into one flat list of crime dicts.
all_crimes = []

# The loop variable is deliberately NOT called `page`: that name is the live
# Playwright page handle used by the scraping cells, and rebinding it to an HTML
# string would break any of those cells if they are re-run after this one.
for page_html in all_pages:

    # HTML for one page
    soup_doc = BeautifulSoup(page_html, 'html.parser')

    # Rows for one page: every <tr> inside the first <tbody>
    table_rows = soup_doc.find_all("tbody")[0].find_all("tr")

    for row in table_rows:
        cells = row.find_all("td")

        # Cell 6 holds "<date> <time>"; split once at the first space so the
        # AM/PM suffix stays attached to the time.
        date_and_time = cells[6].string.split(' ', 1)

        one_row = {
            "description": cells[2].string,
            "incident_num": cells[3].string,
            "location": cells[4].string,
            "agency": cells[5].string,
            "date": date_and_time[0],
            "time": date_and_time[1],
        }

        all_crimes.append(one_row)

In [115]:
def _page_sort_key(file_name):
    """Sort key for saved report pages: (name before '_page_', numeric page number).

    Sorting on the integer page number keeps page_2 ahead of page_10, which a
    plain string sort would not. Names that do not follow the
    '<prefix>_page_<n>.html' pattern sort by their full name.
    """
    match = re.match(r'(.*)_page_(\d+)\.html$', file_name)
    if match:
        return (match.group(1), int(match.group(2)))
    return (file_name, 0)


# os.listdir() returns entries in arbitrary order and also lists non-HTML entries
# (e.g. .DS_Store or .ipynb_checkpoints), which would either crash the parser or
# make the CSV row order differ between runs. Keep only .html files, in page order.
pages = sorted(
    (name for name in os.listdir('raw_html') if name.endswith('.html')),
    key=_page_sort_key,
)
all_crimes = []

# The loop variable is deliberately NOT called `page`: that name is the live
# Playwright page handle used by the scraping cells, and rebinding it to a file
# name would break any of those cells if they are re-run after this one.
for page_file in pages:

    # Open and read the HTML file
    with open(f'raw_html/{page_file}', 'r', encoding='utf-8') as f:
        local_html = f.read()

    # HTML for one page
    soup_doc = BeautifulSoup(local_html, 'html.parser')

    # Rows for one page: every <tr> inside the first <tbody>
    table_rows = soup_doc.find_all("tbody")[0].find_all("tr")

    for row in table_rows:
        cells = row.find_all("td")

        # Cell 6 holds "<date> <time>"; split once at the first space so the
        # AM/PM suffix stays attached to the time.
        date_and_time = cells[6].string.split(' ', 1)

        one_row = {
            "description": cells[2].string,
            "incident_num": cells[3].string,
            "location": cells[4].string,
            "agency": cells[5].string,
            "date": date_and_time[0],
            "time": date_and_time[1],
        }

        all_crimes.append(one_row)

In [119]:
# Collect every parsed crime record into a DataFrame and write it to
# crimes_archival.csv, leaving out the pandas row index.
df = pd.DataFrame(data=all_crimes)
df.to_csv(
    "crimes_archival.csv",
    index=False,
)