# Exploratory Data Analysis of Crime/Arrest Data

Source:
https://data.cityofnewyork.us/Public-Safety/NYC-crime/qb7u-rbmr/data (crime)  
https://data.cityofnewyork.us/Public-Safety/NYPD-Arrest-Data-Year-to-Date-/uip8-fykc/about_data (arrest)  

EDA Rundown
- Filter out what dataset to use
- Filter out uncessary columns
- Address invalid/missing values
- Manipulate values
- Analyze trends between certain features
- Visualize these trends

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [89]:
crime_unclean = pd.read_csv("Datasets/CrimeData/crime_shortened_data.csv")
print(crime_unclean.columns, crime_unclean.shape)

Index(['CMPLNT_NUM', 'ADDR_PCT_CD', 'BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM',
       'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD',
       'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PD_CD', 'PD_DESC', 'PREM_TYP_DESC',
       'RPT_DT', 'STATION_NAME', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX',
       'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'X_COORD_CD', 'Y_COORD_CD',
       'Latitude', 'Longitude', 'Lat_Lon', 'New Georeferenced Column'],
      dtype='object') (182810, 28)


# columns to remove

- haddevelop
- housing
- jurisdiciton
- ky_cd
- parks_nm
- rpt_dt
- station
- transit_district
- patrol_boro
- pd, pd_desc
- transit district


In [91]:
# remove_col = ['HADEVELOPT','HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'KY_CD','PARKS_NM', 'PATROL_BORO','TRANSIT_DISTRICT']
# crime_unclean.drop(remove_col, axis=1, inplace=True)
# crime_unclean.shape

In [92]:
crime_unclean["ADDR_PCT_CD"].unique()

array([123, 121, 122, 120,  72,  68,   1,  62,  76,  10,   6,  60,  66,
         5,  84,  13,  14,  18,  78,   9,   7,  20,  61,  63,  75,  17,
        88,  22,  24,  70,  19,  26,  77,  90,  71,  79,  94, 108,  28,
        67,  30,  23, 114,  32,  33,  25,  34,  83,  81, 109,  44,  40,
       113,  46,  73, 104, 100,  50,  69,  45,  42,  52,  48,  41, 115,
       110, 112,  43,  47,  49, 102, 106, 107, 111, 103, 101, 105])

In [93]:
print(crime_unclean.shape)


(182810, 28)


In [94]:
crime_unclean.dtypes

CMPLNT_NUM                    int64
ADDR_PCT_CD                   int64
BORO_NM                      object
CMPLNT_FR_DT                 object
CMPLNT_FR_TM                 object
CMPLNT_TO_DT                 object
CMPLNT_TO_TM                 object
CRM_ATPT_CPTD_CD             object
LAW_CAT_CD                   object
LOC_OF_OCCUR_DESC            object
OFNS_DESC                    object
PD_CD                       float64
PD_DESC                      object
PREM_TYP_DESC                object
RPT_DT                       object
STATION_NAME                 object
SUSP_AGE_GROUP               object
SUSP_RACE                    object
SUSP_SEX                     object
VIC_AGE_GROUP                object
VIC_RACE                     object
VIC_SEX                      object
X_COORD_CD                  float64
Y_COORD_CD                  float64
Latitude                    float64
Longitude                   float64
Lat_Lon                      object
New Georeferenced Column    

In [95]:
crime_unclean.isna().sum()
crime_unclean = crime_unclean.dropna()



In [96]:
crime_unclean

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,LAW_CAT_CD,LOC_OF_OCCUR_DESC,...,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,New Georeferenced Column
0,288019777,123,STATEN ISLAND,2024-06-05,15:00:00,06/05/2024,15:30:00,COMPLETED,MISDEMEANOR,FRONT OF,...,U,25-44,WHITE,F,924768.0,134938.0,40.536852,-74.213994,"(40.536852, -74.213994)",POINT (-74.213994 40.536852)
1,291485031,123,STATEN ISLAND,2024-08-09,12:00:00,08/10/2024,13:30:00,COMPLETED,FELONY,INSIDE,...,U,25-44,ASIAN / PACIFIC ISLANDER,M,924010.0,137020.0,40.542560,-74.216738,"(40.54256, -74.216738)",POINT (-74.216738 40.54256)
2,289124020,121,STATEN ISLAND,2024-06-26,09:25:00,06/26/2024,09:34:00,COMPLETED,MISDEMEANOR,INSIDE,...,F,UNKNOWN,UNKNOWN,D,947007.0,171330.0,40.636860,-74.134187,"(40.63686, -74.134187)",POINT (-74.134187 40.63686)
3,289932543,122,STATEN ISLAND,2024-07-11,17:45:00,07/11/2024,17:56:00,COMPLETED,MISDEMEANOR,(null),...,M,UNKNOWN,UNKNOWN,E,952994.0,151611.0,40.582758,-74.112525,"(40.58275811372917, -74.1125254092425)",POINT (-74.1125254092425 40.58275811372917)
4,293061371,120,STATEN ISLAND,2024-09-10,14:15:00,09/10/2024,14:20:00,COMPLETED,MISDEMEANOR,INSIDE,...,(null),UNKNOWN,UNKNOWN,D,948498.0,162493.0,40.612610,-74.128769,"(40.61261, -74.128769)",POINT (-74.128769 40.61261)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182805,291958084,101,QUEENS,2024-08-16,13:00:00,08/16/2024,13:15:00,COMPLETED,VIOLATION,INSIDE,...,F,25-44,BLACK,F,1053146.0,160736.0,40.607594,-73.751871,"(40.607594, -73.751871)",POINT (-73.751871 40.607594)
182806,292459204,105,QUEENS,2024-08-26,14:30:00,08/26/2024,14:47:00,COMPLETED,VIOLATION,(null),...,U,45-64,BLACK,F,1060282.0,188319.0,40.683242,-73.725863,"(40.68324233051407, -73.72586270028577)",POINT (-73.72586270028577 40.68324233051407)
182807,288333679,105,QUEENS,2024-06-08,10:30:00,06/08/2024,11:00:00,COMPLETED,VIOLATION,FRONT OF,...,M,45-64,BLACK,F,1057738.0,204079.0,40.726529,-73.734865,"(40.7265293769835, -73.7348645378882)",POINT (-73.7348645378882 40.7265293769835)
182808,291946590,105,QUEENS,2024-08-19,20:42:00,08/19/2024,20:50:00,COMPLETED,MISDEMEANOR,(null),...,M,45-64,BLACK,M,1057637.0,201157.0,40.718502,-73.735259,"(40.718501800070065, -73.73525943158317)",POINT (-73.73525943158317 40.718501800070065)


In [97]:
crime_unclean['CMPLNT_FR_DT'].dtypes

dtype('O')

In [98]:
crime_unclean['CMPLNT_FR_DT'] = pd.to_datetime(crime_unclean['CMPLNT_FR_DT'], format='%m/%d/%Y', errors='coerce')
# Filter rows with dates from 2020 onwards
# crime_unclean['CMPLNT_FR_DT'].dtypes
filtered_df = crime_unclean[crime_unclean['CMPLNT_FR_DT'] >= '2024-06-01']


In [99]:
filtered_df.sort_values(by='CMPLNT_FR_DT')


Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,LAW_CAT_CD,LOC_OF_OCCUR_DESC,...,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,New Georeferenced Column
