In [9]:
# import libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pickle

In [10]:
# Load the raw NYPD stop-and-frisk workbooks for 2023 and 2022.
# The literal text "(null)" in a cell is read as a missing value.
yr2023 = pd.read_excel(
    "https://github.com/annejscott/602_final-project/raw/main/data/raw/nypd-stop-frisk-2023.xlsx",
    na_values=["(null)"],
)
yr2022 = pd.read_excel(
    "https://github.com/annejscott/602_final-project/raw/main/data/raw/nypd-stop-frisk-2022.xlsx",
    na_values=["(null)"],
)

# Stack both years into a single frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each year's own row labels are kept and the
# combined frame ends up with duplicate index labels.
nypd = pd.concat([yr2023, yr2022], ignore_index=True)

In [11]:
# Parse the stop date and stop time columns into pandas datetimes.
# Both use format="mixed" (the format is inferred per value); only the time
# column passes errors="coerce", so unparseable times become NaT while an
# unparseable date still raises.
for column, extra_kwargs in (
    ("STOP_FRISK_DATE", {}),
    ("STOP_FRISK_TIME", {"errors": "coerce"}),
):
    nypd[column] = pd.to_datetime(nypd[column], format="mixed", **extra_kwargs)

In [12]:
 # fix boolean values
# Map the single-letter flag codes to booleans.
# NOTE(review): this replacement runs over every column of the frame, not only
# the flag columns -- confirm no other column holds a bare "Y"/"S"/"V"/"N".
flag_values = {"Y": True, "S": True, "V": True, "N": False}
nypd = nypd.replace(flag_values)

# This one column additionally uses "I" as its "true" code.
id_card_flag = "ID_CARD_IDENTIFIES_OFFICER_FLAG"
nypd[id_card_flag] = nypd[id_card_flag].replace({"I": True})

In [13]:
# Free-text columns that should be stored as strings.
str_columns = [
    "OFFICER_NOT_EXPLAINED_STOP_DESCRIPTION",
    "SUSPECT_ARREST_OFFENSE",
    "DEMEANOR_OF_PERSON_STOPPED",
    "SUSPECT_HEIGHT",
    "SUSPECT_OTHER_DESCRIPTION",
    "STOP_LOCATION_APARTMENT",
    "STOP_LOCATION_FULL_ADDRESS",
    "STOP_LOCATION_STREET_NAME",
    "STOP_LOCATION_PATROL_BORO_NAME",
]

# Cast every listed column to "str"; all other columns keep their dtype.
# NOTE(review): depending on the pandas version, missing values may come out
# as the literal text "nan" after this cast -- confirm that is intended.
nypd = nypd.astype(dict.fromkeys(str_columns, "str"))

In [14]:
# Helper: convert a column to a categorical dtype
def make_categories(column_name, frame=None):
    """Convert one column to a categorical dtype, modifying the frame in place.

    The categories are the column's distinct non-missing values, in order of
    first appearance. Missing values stay missing.

    Parameters
    ----------
    column_name : str
        Label of the column to convert.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``nypd`` frame is
        used, which is what existing one-argument calls rely on.

    Returns
    -------
    None
        The column is replaced on the frame itself.
    """
    # Fall back to the global frame only when the caller did not supply one.
    target = nypd if frame is None else frame
    observed = target[column_name].dropna().unique()
    target[column_name] = pd.Categorical(target[column_name],
                                         categories=observed)

In [15]:
# Columns to convert to categorical dtype, processed in this order.
categorical_columns = [
    'RECORD_STATUS_CODE',
    'ISSUING_OFFICER_RANK',
    'SUPERVISING_OFFICER_RANK',
    'LOCATION_IN_OUT_CODE',
    'JURISDICTION_CODE',
    'JURISDICTION_DESCRIPTION',
    'SUSPECTED_CRIME_DESCRIPTION',
    'SUMMONS_OFFENSE_DESCRIPTION',
    'SUSPECT_SEX',
    'SUSPECT_RACE_DESCRIPTION',
    'SUSPECT_BODY_BUILD_TYPE',
    'SUSPECT_EYE_COLOR',
    'SUSPECT_HAIR_COLOR',
    'STOP_LOCATION_SECTOR_CODE',
    'STOP_LOCATION_BORO_NAME',
]

# One make_categories call per column, instead of a hand-written line each.
for categorical_column in categorical_columns:
    make_categories(categorical_column)

In [16]:
# Raw race description -> label used in the rest of the analysis.
# Two raw values ("WHITE HISPANIC" and "WHITE") map to the same label.
race_mapping = {
    "BLACK": "Black",
    "WHITE HISPANIC": "white",
    "BLACK HISPANIC": "Mixed Race",
    "ASIAN / PACIFIC ISLANDER": "Asian",
    "WHITE": "white",
    "AMERICAN INDIAN/ALASKAN NATIVE": "American Indigenous",
    "MIDDLE EASTERN/SOUTHWEST ASIAN": "Other",
}

# Apply the mapping; values that are not keys of race_mapping are left as-is.
race_column = "SUSPECT_RACE_DESCRIPTION"
nypd[race_column] = nypd[race_column].replace(race_mapping)

In [17]:
# Title-case the borough names.
nypd = nypd.assign(
    STOP_LOCATION_BORO_NAME=lambda frame: frame['STOP_LOCATION_BORO_NAME'].str.title()
)

In [18]:
# Drop rows without a usable stop location: a coordinate of 0 is treated the
# same as a missing coordinate.
coordinate_columns = ['STOP_LOCATION_X', 'STOP_LOCATION_Y']

# Turn zeros into NaN so that one dropna call removes both kinds of bad row.
for coordinate_column in coordinate_columns:
    nypd[coordinate_column] = nypd[coordinate_column].replace(0, np.nan)

# A row is removed when either coordinate is missing.
nypd = nypd.dropna(subset=coordinate_columns)

In [19]:
# Write the cleaned table to CSV, leaving out the index column.
# NOTE(review): absolute path under one user's home directory; this cell only
# runs on a machine where that directory exists.
numeric_csv_path = '/Users/opportunity/Documents/MSDS/2024-spring/DATA602/602_final-project/data/processed/numeric/nypd.csv'
nypd.to_csv(numeric_csv_path, index=False)

In [20]:
# Destination of the pickled copy of the cleaned table.
output_filepath = '/Users/opportunity/Documents/MSDS/2024-spring/DATA602/602_final-project/data/processed/numeric/nypd.pkl'

# Serialize the DataFrame with pickle. The file is opened in binary write
# mode and closed automatically when the with-block ends.
with open(output_filepath, 'wb') as pickle_out:
    pickle.dump(nypd, pickle_out)

In [21]:
# Build point geometries from the stop-location X/Y columns and wrap the table
# in a GeoDataFrame. The CRS is declared as EPSG:2263, so the X/Y values are
# interpreted in that projected system rather than as latitude/longitude.
# points_from_xy builds all points in one vectorized call instead of creating
# one shapely Point per row in a Python loop.
geometry = gpd.points_from_xy(nypd["STOP_LOCATION_X"], nypd["STOP_LOCATION_Y"])
nypd_geo = gpd.GeoDataFrame(nypd, geometry=geometry, crs='EPSG:2263')

In [22]:
# Keep only rows whose point has a non-zero x AND a non-zero y; a point with
# a zero in either coordinate is removed.
points = nypd_geo.geometry
x_is_nonzero = points.x != 0.0
y_is_nonzero = points.y != 0
nypd_geo = nypd_geo[x_is_nonzero & y_is_nonzero]


In [23]:
# Write the geo-enabled table to CSV, leaving out the index column.
# NOTE(review): absolute path under one user's home directory; this cell only
# runs on a machine where that directory exists.
geo_csv_path = '/Users/opportunity/Documents/MSDS/2024-spring/DATA602/602_final-project/data/interim/nypd_geo.csv'
nypd_geo.to_csv(geo_csv_path, index=False)

In [24]:
# Destination of the pickled GeoDataFrame.
output_filepath = '/Users/opportunity/Documents/MSDS/2024-spring/DATA602/602_final-project/data/interim/nypd_geo.pkl'

# Serialize the GeoDataFrame with pickle. The file is opened in binary write
# mode and closed automatically when the with-block ends.
with open(output_filepath, 'wb') as pickle_out:
    pickle.dump(nypd_geo, pickle_out)