# NYC Arrests - Crime

<hr>

This notebook extracts NYC arrest data from NYC OpenData. Data is then processed and exported to a CSV file, which is then imported into Tableau for data visualization.

Before running this notebook, ensure you have the following shape files:
<ul>
   <li>Neighborhood Tabulation Areas (NTA)</li>
   <li>Police Precincts</li> 
</ul>
    
You will need an application token from NYC OpenData. Save it in a text file named `app-token.txt` in the parent directory of this notebook (the notebook reads `../app-token.txt`).

Uncomment various sections for a more detailed explanation.

## Imports

In [1]:
# Make sure to install necessary packages before running
# GeoPandas might be a bit difficult to install

import os
import datetime
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sodapy import Socrata

import geopandas
from shapely.geometry import Point

## Constants

In [2]:
# Socrata domain that hosts the NYC OpenData datasets
SOCRATA_DOMAIN_NYC = 'data.cityofnewyork.us'
# Relative path of the text file that holds the application token
APPLICATION_TOKEN_FILE_LOCATION = '../app-token.txt'

# NYPD Arrest Data (Year to Date)
# https://data.cityofnewyork.us/Public-Safety/NYPD-Arrest-Data-Year-to-Date-/uip8-fykc
SOCRATA_DATASET_IDENTIFIER_NY_DATA_YTD = 'uip8-fykc'
# NYPD Arrest Data (Historic)
# https://data.cityofnewyork.us/Public-Safety/NYPD-Arrests-Data-Historic-/8h9b-rp9u/about_data
SOCRATA_DATASET_IDENTIFIER_NY_DATA_HIS = '8h9b-rp9u'

## Helper Functions

In [3]:
def connect_to_data(
    socrata_domain : str,
    application_token_file : str
    ):
    """
    Connects to the data API.

    Note: The sodapy Python package must be installed.

    Parameters
    ----------
    socrata_domain : str
      Domain to connect to.
      Example: 'data.cityofnewyork.us'
    application_token_file : str
      The file location where you have stored the application token.
      Example: '../app-token.txt'

    Returns
    -------
    Socrata Client
      A Socrata client for `socrata_domain`, created with the token
      read from `application_token_file`.

    Raises
    ------
    OSError
      If the token file cannot be opened.
    """
    # Read the token and drop surrounding whitespace: text editors usually
    # leave a trailing newline, which must not become part of the token.
    with open(application_token_file, 'r') as fp:
        socrata_token = fp.read().strip()

    # Connect to data
    return Socrata(socrata_domain, socrata_token)

In [4]:
def _fetch_all_pages(
    socrata_client,
    dataset_identifier : str,
    where : str,
    page_size : int = 10000,
    ) -> list:
    """
    Downloads every row of one dataset that matches `where`, page by page.

    Parameters
    ----------
    socrata_client : Socrata Client
      Client whose `get` method is used to query the dataset.
    dataset_identifier : str
      The unique identifier of the dataset to query.
    where : str
      Filter expression passed to the API unchanged.
    page_size : int
      Number of rows requested per call.

    Returns
    -------
    list
      All rows returned by the API, in the order they were received.
    """
    rows = []
    page = 0
    while True:
        results = socrata_client.get(
            dataset_identifier,
            where=where,
            # limit/offset paging is only reliable with a fixed sort order;
            # ':id' is the row identifier Socrata maintains internally
            order=':id',
            limit=page_size,
            offset=page_size * page)
        # an empty page means the previous one was the last
        if not results:
            break
        rows.extend(results)
        page += 1
    return rows


def get_arrest_data(
    socrata_client,
    start_date : str,
    dataset_identifier_historical : str,
    dataset_identifier_ytd : str,
    ):
    """
    Gets arrest data from start_date onward by combining the
    historical dataset with the year-to-date dataset.

    Parameters
    ----------
    socrata_client : Socrata Client
      Socrata Client to NYC data
    start_date : str
      The start date to start getting arrest data, as 'YYYY-MM-DD'.
      Example: '2019-01-01'
    dataset_identifier_historical : str
      The unique identifier for the historical arrest data.
      Example: '8h9b-rp9u'
      https://data.cityofnewyork.us/Public-Safety/NYPD-Arrests-Data-Historic-/8h9b-rp9u/about_data
    dataset_identifier_ytd : str
      The unique identifier for the ytd arrest data.

    Returns
    -------
    Pandas DataFrame
      A DataFrame of the arrest data: the historical rows followed by the
      year-to-date rows. Rows are not de-duplicated across the two datasets.
    """
    # request the data in pages so a single response never gets too large
    loop_size = 10000
    start_date = str(start_date)

    # Historical dataset: everything on or after start_date
    data = _fetch_all_pages(
        socrata_client,
        dataset_identifier_historical,
        f"ARREST_DATE >= '{start_date}T00:00:00.000'",
        loop_size)

    # Year-to-date dataset: start at January 1st of the current year, or at
    # start_date when that is later, so rows before start_date are never
    # returned (ISO 'YYYY-MM-DD' strings sort chronologically).
    current_year = datetime.datetime.now().year
    ytd_start = max(start_date, f"{current_year}-01-01")
    data.extend(_fetch_all_pages(
        socrata_client,
        dataset_identifier_ytd,
        f"ARREST_DATE >= '{ytd_start}T00:00:00.000'",
        loop_size))

    return pd.DataFrame(data)

In [5]:
def clean_arrest_data(
    df
    ):
    """
    Cleans the arrest data and returns
    a new DataFrame; the input is left unchanged.

    Parameters
    ----------
    df : Pandas DataFrame
      Data of the arrest data. Must contain the columns arrest_boro,
      perp_sex, law_cat_cd, perp_race, ofns_desc, latitude and longitude.

    Returns
    -------
    Pandas DataFrame
      A DataFrame of the cleaned arrest data.

    Raises
    ------
    ValueError
      If a latitude or longitude value cannot be parsed as a number.
    """
    boro_dict = {
        'B': 'Bronx',
        'S': 'Staten Island',
        'K': 'Brooklyn',
        'M': 'Manhattan',
        'Q': 'Queens',
    }
    perp_sex_dict = {
        'F': 'Female',
        'M': 'Male',
    }
    law_cat_cd_dict = {
        'F': 'Felony',
        'M': 'Misdemeanor',
        'V': 'Violation',
        'I': 'Traffic Infraction',
    }

    def _title_case(column):
        # na_action='ignore' passes missing values through untouched, so a
        # missing entry neither raises nor becomes the literal text "Nan"
        return column.map(lambda value: str(value).title(), na_action='ignore')

    # work on a copy so the caller's frame is not modified
    cleaned = df.copy()

    # change abbreviations to long form; codes not in a dict are kept as-is
    cleaned['arrest_boro'] = cleaned['arrest_boro'].replace(boro_dict)
    cleaned['perp_sex'] = cleaned['perp_sex'].replace(perp_sex_dict)
    cleaned['law_cat_cd'] = cleaned['law_cat_cd'].replace(law_cat_cd_dict)

    # change perp_race and ofns_desc from all caps to title case
    cleaned['perp_race'] = _title_case(cleaned['perp_race'])
    cleaned['ofns_desc'] = _title_case(cleaned['ofns_desc'])

    # change latitude and longitude from string to numeric
    cleaned['latitude'] = pd.to_numeric(cleaned['latitude'])
    cleaned['longitude'] = pd.to_numeric(cleaned['longitude'])

    return cleaned

In [6]:
def _get_ntaname(
    shapely_point,
    gdf_locations=None,
    ) -> str:
    """
    Returns the neighborhood name in NYC given a Shapely Point

    Parameters
    ----------
    shapely_point : Shapely Point
      Point to look up.
    gdf_locations : GeoDataFrame
      Neighborhood areas to search. Must provide a 'geometry' column and
      an 'ntaname' column.

    Returns
    -------
    str
      The 'ntaname' of the first area whose geometry contains the point,
      or "Unknown" when no area contains it.

    Raises
    ------
    ValueError
      If gdf_locations is not supplied.
    """
    if gdf_locations is None:
        raise ValueError("gdf_locations is required to look up a neighborhood")

    contains_point = gdf_locations['geometry'].contains(shapely_point)
    matches = gdf_locations.loc[contains_point, 'ntaname']
    if len(matches) == 0:
        return "Unknown"
    # take the value itself rather than parsing the Series' printed form
    return str(matches.iloc[0]).strip()


def reverse_geocoding(
    df,
    nta_file_loc : str,
    ):
    """
    Adds a column of the neighborhood name within NYC (for Tableau)

    Parameters
    ----------
    df : Pandas DataFrame
      Cleaned data of the arrest data. Must contain numeric 'longitude'
      and 'latitude' columns.
    nta_file_loc : str
      shp file for the Neighborhood Tabulation Areas (NTA)

    Returns
    -------
    GeoDataFrame
      The arrest data with two added columns: 'geometry' (a point built
      from longitude/latitude) and 'neighborhood' (the NTA name, or
      "Unknown" when the point lies in no area).
    """
    gdf_locations = geopandas.read_file(nta_file_loc)

    # convert to GeoDataFrame and create a column of shapely points
    gdf = geopandas.GeoDataFrame(
        df, geometry=geopandas.points_from_xy(x=df.longitude, y=df.latitude)
    )

    # create a new column with the nyc neighborhood
    # NOTE(review): every point is tested against every area, which is slow
    # for large frames; a spatial join (geopandas.sjoin) would vectorize this
    # but is left out here to keep the per-point results unchanged.
    gdf['neighborhood'] = gdf['geometry'].map(
        lambda point: _get_ntaname(point, gdf_locations)
    )

    return gdf

## Connect to client

In [7]:
# Open the API connection with the configured domain and token file
client = connect_to_data(SOCRATA_DOMAIN_NYC, APPLICATION_TOKEN_FILE_LOCATION)

### Get data from NYC OpenData

In [8]:
# Download every arrest record from start_date onward
# (historical dataset followed by the year-to-date dataset)
df = get_arrest_data(
    socrata_client=client,
    start_date='2019-01-01',
    dataset_identifier_historical=SOCRATA_DATASET_IDENTIFIER_NY_DATA_HIS,
    dataset_identifier_ytd=SOCRATA_DATASET_IDENTIFIER_NY_DATA_YTD
)

# Row/column count and a preview of the raw records
print(df.shape)
df.head()

(990804, 20)


Unnamed: 0,arrest_key,arrest_date,pd_cd,pd_desc,ky_cd,ofns_desc,law_code,law_cat_cd,arrest_boro,arrest_precinct,jurisdiction_code,age_group,perp_sex,perp_race,x_coord_cd,y_coord_cd,latitude,longitude,lon_lat,geocoded_column
0,191717184,2019-01-01T00:00:00.000,105,STRANGULATION 1ST,106,FELONY ASSAULT,PL 1211200,F,K,68,0,45-64,F,WHITE,982473,168770,40.62991922600003,-74.00640337899993,"{'type': 'Point', 'coordinates': [-74.00640337...",
1,191727066,2019-01-01T00:00:00.000,729,"FORGERY,ETC.,UNCLASSIFIED-FELO",113,FORGERY,PL 1657200,F,M,18,0,45-64,M,BLACK,989151,216116,40.75987183500007,-73.98231033699993,"{'type': 'Point', 'coordinates': [-73.98231033...",
2,191719214,2019-01-01T00:00:00.000,114,OBSTR BREATH/CIRCUL,344,ASSAULT 3 & RELATED OFFENSES,PL 1211100,M,M,18,0,25-44,M,BLACK,988048,216131,40.75991354900003,-73.98629182099995,"{'type': 'Point', 'coordinates': [-73.98629182...",
3,191735321,2019-01-01T00:00:00.000,339,"LARCENY,PETIT FROM OPEN AREAS,",341,PETIT LARCENY,PL 1552500,M,Q,100,0,<18,M,BLACK,1031046,150775,40.58040328600004,-73.83153598499996,"{'type': 'Point', 'coordinates': [-73.83153598...",
4,191706674,2019-01-01T00:00:00.000,101,ASSAULT 3,344,ASSAULT 3 & RELATED OFFENSES,PL 1200000,M,B,44,0,25-44,M,WHITE HISPANIC,1005834,245441,40.84033594400006,-73.92199556399999,"{'type': 'Point', 'coordinates': [-73.92199556...",


In [9]:
# Expand the coded columns to readable labels and make the coordinates numeric
df = clean_arrest_data(df)
print(df.shape)
df.head()

(990804, 20)


Unnamed: 0,arrest_key,arrest_date,pd_cd,pd_desc,ky_cd,ofns_desc,law_code,law_cat_cd,arrest_boro,arrest_precinct,jurisdiction_code,age_group,perp_sex,perp_race,x_coord_cd,y_coord_cd,latitude,longitude,lon_lat,geocoded_column
0,191717184,2019-01-01T00:00:00.000,105,STRANGULATION 1ST,106,Felony Assault,PL 1211200,Felony,Brooklyn,68,0,45-64,Female,White,982473,168770,40.629919,-74.006403,"{'type': 'Point', 'coordinates': [-74.00640337...",
1,191727066,2019-01-01T00:00:00.000,729,"FORGERY,ETC.,UNCLASSIFIED-FELO",113,Forgery,PL 1657200,Felony,Manhattan,18,0,45-64,Male,Black,989151,216116,40.759872,-73.98231,"{'type': 'Point', 'coordinates': [-73.98231033...",
2,191719214,2019-01-01T00:00:00.000,114,OBSTR BREATH/CIRCUL,344,Assault 3 & Related Offenses,PL 1211100,Misdemeanor,Manhattan,18,0,25-44,Male,Black,988048,216131,40.759914,-73.986292,"{'type': 'Point', 'coordinates': [-73.98629182...",
3,191735321,2019-01-01T00:00:00.000,339,"LARCENY,PETIT FROM OPEN AREAS,",341,Petit Larceny,PL 1552500,Misdemeanor,Queens,100,0,<18,Male,Black,1031046,150775,40.580403,-73.831536,"{'type': 'Point', 'coordinates': [-73.83153598...",
4,191706674,2019-01-01T00:00:00.000,101,ASSAULT 3,344,Assault 3 & Related Offenses,PL 1200000,Misdemeanor,Bronx,44,0,25-44,Male,White Hispanic,1005834,245441,40.840336,-73.921996,"{'type': 'Point', 'coordinates': [-73.92199556...",


## Quick exploration of data

In [10]:
# Number of rows and columns after cleaning
print(df.shape)

(990804, 20)


In [11]:
# Column names available after cleaning
df.columns

Index(['arrest_key', 'arrest_date', 'pd_cd', 'pd_desc', 'ky_cd', 'ofns_desc',
       'law_code', 'law_cat_cd', 'arrest_boro', 'arrest_precinct',
       'jurisdiction_code', 'age_group', 'perp_sex', 'perp_race', 'x_coord_cd',
       'y_coord_cd', 'latitude', 'longitude', 'lon_lat', 'geocoded_column'],
      dtype='object')

In [12]:
# Arrests per calendar year (the year is the first four characters of arrest_date)
df['arrest_date'].str.slice(stop=4).value_counts()

2023    226872
2019    214617
2022    189774
2021    155507
2020    140413
2024     63621
Name: arrest_date, dtype: int64

### Prepare data for spatial analysis
##### Note: this step may take a long time

In [13]:
# Build the shapefile path with os.path.join so the cell also runs on
# macOS/Linux, where backslashes are not path separators.
df = reverse_geocoding(
    df=df,
    nta_file_loc=os.path.join(
        '..',
        'Data',
        'Spatial Files',
        'Neighborhood Tabulation Areas (NTA)',
        'geo_export_0c82a76e-3045-414c-9b45-6c529ffc990f.shp',
    )
)

In [14]:
# Shape and preview after reverse geocoding
print(df.shape)
df.head()

(990804, 22)


Unnamed: 0,arrest_key,arrest_date,pd_cd,pd_desc,ky_cd,ofns_desc,law_code,law_cat_cd,arrest_boro,arrest_precinct,...,perp_sex,perp_race,x_coord_cd,y_coord_cd,latitude,longitude,lon_lat,geocoded_column,geometry,neighborhood
0,191717184,2019-01-01T00:00:00.000,105,STRANGULATION 1ST,106,Felony Assault,PL 1211200,Felony,Brooklyn,68,...,Female,White,982473,168770,40.629919,-74.006403,"{'type': 'Point', 'coordinates': [-74.00640337...",,POINT (-74.00640 40.62992),Dyker Heights
1,191727066,2019-01-01T00:00:00.000,729,"FORGERY,ETC.,UNCLASSIFIED-FELO",113,Forgery,PL 1657200,Felony,Manhattan,18,...,Male,Black,989151,216116,40.759872,-73.98231,"{'type': 'Point', 'coordinates': [-73.98231033...",,POINT (-73.98231 40.75987),Midtown-Midtown South
2,191719214,2019-01-01T00:00:00.000,114,OBSTR BREATH/CIRCUL,344,Assault 3 & Related Offenses,PL 1211100,Misdemeanor,Manhattan,18,...,Male,Black,988048,216131,40.759914,-73.986292,"{'type': 'Point', 'coordinates': [-73.98629182...",,POINT (-73.98629 40.75991),Midtown-Midtown South
3,191735321,2019-01-01T00:00:00.000,339,"LARCENY,PETIT FROM OPEN AREAS,",341,Petit Larceny,PL 1552500,Misdemeanor,Queens,100,...,Male,Black,1031046,150775,40.580403,-73.831536,"{'type': 'Point', 'coordinates': [-73.83153598...",,POINT (-73.83154 40.58040),Breezy Point-Belle Harbor-Rockaway Park-Broad ...
4,191706674,2019-01-01T00:00:00.000,101,ASSAULT 3,344,Assault 3 & Related Offenses,PL 1200000,Misdemeanor,Bronx,44,...,Male,White Hispanic,1005834,245441,40.840336,-73.921996,"{'type': 'Point', 'coordinates': [-73.92199556...",,POINT (-73.92200 40.84034),Highbridge


In [15]:
# Column names after reverse geocoding
df.columns

Index(['arrest_key', 'arrest_date', 'pd_cd', 'pd_desc', 'ky_cd', 'ofns_desc',
       'law_code', 'law_cat_cd', 'arrest_boro', 'arrest_precinct',
       'jurisdiction_code', 'age_group', 'perp_sex', 'perp_race', 'x_coord_cd',
       'y_coord_cd', 'latitude', 'longitude', 'lon_lat', 'geocoded_column',
       'geometry', 'neighborhood'],
      dtype='object')

In [16]:
# unknown neighborhoods (remove in Tableau)
# Rows whose point fell inside none of the neighborhood areas
unknown_neighborhoods_df = df[df['neighborhood'] == "Unknown"]

# NOTE(review): this prints the shape of the full frame, not of
# unknown_neighborhoods_df — confirm which one was intended
print(df.shape)
# print(unknown_neighborhoods_df)

(990804, 22)


In [17]:
### Drop unnecessary columns
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(
    columns=[
        'arrest_key',
        'x_coord_cd', 
        'y_coord_cd', 
        'geometry',
#         ':@computed_region_efsh_h5xi',
#         ':@computed_region_f5dn_yrer',
#         ':@computed_region_yeji_bk3q',
#         ':@computed_region_92fq_4b7q',
#         ':@computed_region_sbqj_enih',
    ], errors='ignore')

In [18]:
# Export Pandas DataFrame to csv
today = datetime.date.today()
# Create the archive folder on first run; to_csv does not create missing folders
os.makedirs("../Archived Files", exist_ok=True)
# The file name starts with today's date, so each day's export is kept separately
df.to_csv(f"../Archived Files/{today}_nypd-arrests.csv", index_label="unique_id")

## Other

In [19]:
# # Unauthenticated client only works with public data sets. Note 'None'
# # in place of application token, and no username or password:
# client = Socrata("data.cityofnewyork.us", None)

# # Example authenticated client (needed for non-public datasets):
# client = Socrata('data.cityofnewyork.us',
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# # First 1000 results, returned as JSON from API / converted to Python list of
# # dictionaries by sodapy.
# results = client.get(socrata_dataset_identifier, limit=1000)

# Convert to pandas DataFrame
# results_df = pd.DataFrame.from_records(results)

In [20]:
# results = client.get('8h9b-rp9u', limit=1000)
# example_df = pd.DataFrame.from_records(results)
# example_df.head()

In [21]:
# # Metadata
# metadata = client.get_metadata(SOCRATA_DATASET_IDENTIFIER_NY_DATA_HIS)
# metadata_columns = [x['name'] for x in metadata['columns']]
# metadata_columns

In [22]:
# # Metadata for a particular column
# meta_arrest_col_date = [x for x in metadata['columns'] if x['name'] == 'ARREST_DATE'][0]
# meta_arrest_col_date

In [23]:
# get date range of data
# note: string is returned
# most_recent_date = meta_arrest_col_date['cachedContents']['largest']
# oldest_date = meta_arrest_col_date['cachedContents']['smallest']
# num_of_results = int(meta_arrest_col_date['cachedContents']['not_null'])

# print("Most recent date: " + most_recent_date)
# print("Oldest date: " + oldest_date)
# print("Total number of not null results: "+ str(num_of_results))

In [24]:
# df = pd.read_csv(r'C:\Users\alexc\Documents\Side Projects\nyc-arrests\Archived Files\2021-05-20_nypd-arrests.csv')

In [25]:
# df.head()