# NYC Arrests - Scraping Police Precincts Data

<hr>

This notebook scrapes the NYC.gov website for data about NYPD police precincts. It downloads the pages with `requests`, parses them with BeautifulSoup, and then exports the data to a CSV file.

https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts-landing.page

## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import datetime

import requests
import urllib
from bs4 import BeautifulSoup
from lxml import etree

# pd.set_option('display.max_rows', 200)

## Helper Functions

In [2]:
def get_soup_data(url: str, timeout: float = 30.0):
    """
    Fetch a web page and return it as a BeautifulSoup
    object parsed with the lxml parser.

    Failures are reported with a printed message and signalled
    by returning None, so callers must check the result.

    Parameters
    ----------
    url : str
      URL to be parsed.
    timeout : float, optional
      Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup object or None
      The page parsed as lxml, or None when the request fails,
      the HTTP status code is not 200, or parsing fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only request-level failures (bad URL, connection error, timeout)
        # are handled here; KeyboardInterrupt is deliberately not caught so
        # a long scrape can still be interrupted.
        print('Something went wrong with requests.get (possible bad URL)')
        return None

    if response.status_code != 200:
        print("HTTP error", response.status_code)
        return None

    try:
        return BeautifulSoup(response.content, 'lxml')
    except Exception:
        print("Something went wrong with BeautifulSoup parsing")
        return None
        
def get_precinct_info(url: str) -> tuple:
    """
    https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts/1st-precinct.page

    Given a url to a Police Precinct page, returns the text
    for the commanding officer and the precinct description.

    Both values are read from the first two <p> tags inside the
    <div class="about-description"> element of the page.

    Parameters
    ----------
    url : str
      URL of the Police Precinct information

    Returns
    -------
    Tuple(str, str) or None
      A tuple of the commanding officer and a description
      of the police precinct. None is returned (after printing
      a message) when the page could not be loaded or does not
      have the expected structure, so callers must check for it.
    """
    page_data_soup = get_soup_data(url)

    try:
        soup_div = page_data_soup.find('div', attrs={'class': 'about-description'})
        paragraphs = soup_div.find_all('p')
        return (paragraphs[0].get_text(), paragraphs[1].get_text())
    except (AttributeError, IndexError):
        # AttributeError: the page failed to load (soup is None) or the
        #   about-description div is missing (find returned None).
        # IndexError: the div holds fewer than two paragraphs.
        print("Error parsing tag within BeautifulSoup object")
        return None
        
def get_precinct_number(precinct_name: str) -> int:
    """
    Given a precinct name, returns the integer
    representation of that precinct.

    Surrounding whitespace is ignored and runs of internal whitespace
    (including non-breaking spaces) are collapsed to a single space,
    because the names come from scraped HTML text.

    Note: Some precincts are named rather than numbered.
    These are manually mapped.

    Parameters
    ----------
    precinct_name : str
      The precinct name.
      Example: 1st Precinct, 123rd Precinct

    Returns
    -------
    int
      The precinct name as an integer only.

    Raises
    ------
    KeyError
      If the name neither starts with a 1-3 digit number
      nor is one of the manually mapped names.
    """
    MAPPING_PRECINCTS = {
        'Midtown South Precinct': 14,
        'Midtown North Precinct': 18,
        'Central Park Precinct': 22
    }
    # str.split() with no argument splits on any Unicode whitespace, so this
    # both trims the name and normalises separators such as '\xa0' or '\n'.
    normalized_name = ' '.join(precinct_name.split())

    match = re.search(r'^(\d{1,3})\D+', normalized_name)
    if match:
        return int(match.group(1))
    return MAPPING_PRECINCTS[normalized_name]
    
def clean_officer_text(string: str) -> str:
    """
    Cleans the commanding officer text from the web
    and just returns the commanding officer.

    Parameters
    ----------
    string : str
      The commanding officer name with a leading label.
      Example: Commanding Officer: Captain Joel Rosenthal

    Returns
    -------
    str
      The commanding officer's name only (includes title), with
      surrounding whitespace removed. If the text does not start with
      the "Commanding Officer:" label, the stripped text is returned.
      A non-string value (e.g. None for a missing officer) is
      returned unchanged.
    """
    if not isinstance(string, str):
        # Missing value: nothing to clean, and re.search would raise TypeError.
        return string

    # Scraped text often carries stray whitespace; strip it first so the
    # anchored pattern still matches and the result has no trailing blanks.
    text = string.strip()
    match = re.search(r'^Commanding Officer:\s*(.+)$', text)
    return match.group(1) if match else text

def get_precinct_data(
    url : str
    ):
    """
    Returns a Pandas DataFrame of all the precincts
    in NYC with some metadata.

    The landing page lists the precincts in one table whose rows are
    either a borough sub-header (<th class="subhead">) or a precinct
    (<td data-label="Precinct">). Each precinct row links to a detail
    page, which is fetched to get the commanding officer and description.

    Parameters
    ----------
    url : str
      URL of the Police Precinct landing page
      https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts-landing.page

    Returns
    -------
    Pandas DataFrame
      One row per precinct with the columns Precinct Name, Borough,
      Address, Telephone, URL, Commanding Officer, Description,
      Precinct Number and Scraped on. Commanding Officer and
      Description are None when the detail page could not be scraped.

    Raises
    ------
    RuntimeError
      If the landing page cannot be loaded or contains no table.
    """
    # Local import: the notebook only does `import urllib`, which does not
    # by itself guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import urljoin

    scraped_columns = [
        "Precinct Name", "Borough", "Address", "Telephone",
        "URL", "Commanding Officer", "Description",
    ]

    page_data_soup = get_soup_data(url)
    soup_table = page_data_soup.find('table') if page_data_soup is not None else None
    if soup_table is None:
        raise RuntimeError(f"Could not find the precinct table at {url}")

    list_of_precincts = []
    borough = None  # updated each time a borough sub-header row is seen

    for row in soup_table.find_all('tr'):

        # Find the rows of the boroughs
        soup_th = row.find('th', class_='subhead')
        if soup_th:
            borough = soup_th.get_text()

        # Find all rows of precinct now
        soup_td = row.find('td', attrs={'data-label': 'Precinct'})
        if not soup_td:
            continue

        cells = row.find_all('td')
        # Positional lookup: the website didn't set data-label consistently
        # on the telephone/address cells.
        telephone = cells[1].get_text()
        address = cells[2].get_text()

        # Resolve the detail-page link against the landing page URL.
        link = soup_td.find('a')
        href = link.get('href') if link else None
        precinct_url = urljoin(url, href) if href else None

        # get_precinct_info returns None when the detail page can't be
        # scraped; keep the row and leave the two fields empty.
        info = get_precinct_info(precinct_url) if precinct_url else None
        officer, description = info if info else (None, None)

        list_of_precincts.append({
            "Precinct Name": soup_td.get_text(),
            "Borough": borough,
            "Address": address,
            "Telephone": telephone,
            "URL": precinct_url,
            'Commanding Officer': officer,
            'Description': description
        })

    # Passing the columns keeps the frame well-formed even with zero rows.
    df = pd.DataFrame(list_of_precincts, columns=scraped_columns)

    # Get the precinct numbers
    df['Precinct Number'] = df['Precinct Name'].map(get_precinct_number)

    # Clean Commanding Officer name (missing values are left untouched)
    df['Commanding Officer'] = df['Commanding Officer'].map(
        clean_officer_text, na_action='ignore'
    )

    # Add today's date to indicate when data was scraped
    df['Scraped on'] = datetime.date.today()

    return df

## Get Data

In [3]:
# Scrape the landing page (plus every linked precinct page) into a DataFrame,
# then show its dimensions followed by the frame itself.
df = get_precinct_data(
    url='https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts-landing.page',
)
print(df.shape)
df

(77, 9)


Unnamed: 0,Precinct Name,Borough,Address,Telephone,URL,Commanding Officer,Description,Precinct Number,Scraped on
0,1st Precinct,Manhattan,16 Ericsson Place,212-334-0611,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Captain Joel Rosenthal,The 1st Precinct serves an area that consists ...,1,2024-05-11
1,5th Precinct,Manhattan,19 Elizabeth Street,212-334-0711,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Deputy Inspector Tao Chen,The 5th Precinct serves the southeastern edge ...,5,2024-05-11
2,6th Precinct,Manhattan,233 West 10 Street,212-741-4811,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Captain Jason S. Zeikel,The 6th Precinct serves the southwestern Manha...,6,2024-05-11
3,7th Precinct,Manhattan,19 1/2 Pitt Street,212-477-7311,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Deputy Inspector Christopher M. Treubig,The 7th Precinct serves Manhattan's Lower East...,7,2024-05-11
4,9th Precinct,Manhattan,321 East 5 Street,212-477-7811,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Captain Pamela A. Jeronimo,The 9th Precinct serves the area from East Hou...,9,2024-05-11
...,...,...,...,...,...,...,...,...,...
72,115th Precinct,Queens,92-15 Northern Boulevard,718-533-2002,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Deputy Inspector Eileen T. Downing,The 115th Precinct serves a northern portion o...,115,2024-05-11
73,120th Precinct,Staten Island,78 Richmond Terrace,718-876-8500,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Deputy Inspector Stephen Spataro,The 120th Precinct serves the North Shore of S...,120,2024-05-11
74,121st Precinct,Staten Island,970 Richmond Avenue,718-697-8700,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Deputy Inspector Eric J. Waldhelm,The 121st Precinct serves the northwestern sho...,121,2024-05-11
75,122nd Precinct,Staten Island,2320 Hylan Boulevard,718-667-2211,https://www1.nyc.gov/site/nypd/bureaus/patrol/...,Captain Luigi Carrubba,The 122nd Precinct serves a portion of the Sou...,122,2024-05-11


## Some checks for data quality

In [4]:
# Show every row that has no precinct number.
display(df.loc[df['Precinct Number'].isna()])  # Should be empty

Unnamed: 0,Precinct Name,Borough,Address,Telephone,URL,Commanding Officer,Description,Precinct Number,Scraped on


## Export data to CSV

In [5]:
# Write the scraped precincts to the shared Data folder (path is relative to
# the notebook's working directory).
# NOTE(review): the default index is written as an unnamed first column --
# confirm downstream readers expect it before switching to index=False.
df.to_csv(path_or_buf='../Data/police_precincts.csv')