# Data clean
data:
1. NYPD Complaint Data Historic
https://data.cityofnewyork.us/resource/qgea-i56i.csv
2. Local Area Unemployment Statistics
https://data.ny.gov/resource/5hyu-bdh8.csv
3. Shapefiles of NYC zip codes 
https://data.cityofnewyork.us/Business/Zip-Code-Boundaries/i8iw-xf4u/data?no_mobile=true .

- Download the NYPD Complaint Data Historic and Local Area Unemployment Statistics files first, using the links given above.


In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from matplotlib.pylab import plt
import numpy as np
%matplotlib inline
import warnings
import geopandas as gpd
from urllib.request import urlopen 
import json
import urllib
warnings.filterwarnings('ignore')
from zipfile import ZipFile

# Local Area Unemployment Statistics

In [2]:
# Load the Local Area Unemployment Statistics export (one row per area and month).
# If only the zipped download is on disk, read it through ZipFile instead:
#   zip_file = ZipFile('Local_Area_Unemployment_Statistics__Beginning_1976.csv.zip')
#   Unemployment_data = pd.read_csv(zip_file.open('Local_Area_Unemployment_Statistics__Beginning_1976.csv'))
unemployment_csv = 'Local_Area_Unemployment_Statistics__Beginning_1976.csv'
Unemployment_data = pd.read_csv(unemployment_csv)

In [3]:
Unemployment_data.shape

(89078, 7)

In [4]:
Unemployment_data.head()

Unnamed: 0,Area,Year,Month,Labor Force,Employed,Unemployed,Unemployment Rate
0,New York State,2023,1,9594400,9155000,439400,4.6
1,New York State,2023,2,9664000,9232800,431200,4.5
2,New York State,2023,3,9687400,9295600,391900,4.0
3,New York State,2023,4,9652700,9296800,355900,3.7
4,New York State,2023,5,9720600,9349700,370900,3.8


## Data Cleaning Tasks
1. Data year from 2012 to 2022
2. area within New York City
The boroughs are the Bronx, Brooklyn, Manhattan, Queens, and Staten Island. 
Each borough is coextensive with a respective county of the State of New York: The Bronx is Bronx County, Brooklyn is Kings County, Manhattan is New York County, Queens is Queens County, and Staten Island is Richmond County.
3. Convert the Year and Month columns into a single datetime field

In [5]:
# Restrict the table to the study window, 2012 through 2022 (both inclusive)
year_col = Unemployment_data['Year']
in_study_window = (year_col >= 2012) & (year_col <= 2022)
Unemployment_data = Unemployment_data[in_study_window]
Unemployment_data.shape

(29172, 7)

In [6]:
# Unique Area 
Unemployment_data.Area.unique()

array(['New York State', 'Albany County', 'Allegany County',
       'Bronx County', 'Broome County', 'Cattaraugus County',
       'Cayuga County', 'Chautauqua County', 'Chemung County',
       'Chenango County', 'Clinton County', 'Columbia County',
       'Cortland County', 'Delaware County', 'Dutchess County',
       'Erie County', 'Essex County', 'Franklin County', 'Fulton County',
       'Genesee County', 'Greene County', 'Hamilton County',
       'Herkimer County', 'Jefferson County', 'Kings County',
       'Lewis County', 'Livingston County', 'Madison County',
       'Monroe County', 'Montgomery County', 'Nassau County',
       'New York County', 'Niagara County', 'Oneida County',
       'Onondaga County', 'Ontario County', 'Orange County',
       'Orleans County', 'Oswego County', 'Otsego County',
       'Putnam County', 'Queens County', 'Rensselaer County',
       'Richmond County', 'Rockland County', 'Saratoga County',
       'Schenectady County', 'Schoharie County', 'Schuyler 

In [7]:
# Each borough is coextensive with one county:
# Bronx, Kings (Brooklyn), New York (Manhattan), Queens, Richmond (Staten Island)
NYC_counties = [
    'Bronx County',
    'Kings County',
    'New York County',
    'Queens County',
    'Richmond County',
]
is_nyc_county = Unemployment_data['Area'].isin(NYC_counties)
Unemployment_data = Unemployment_data.loc[is_nyc_county]
Unemployment_data.shape

(660, 7)

In [8]:
# Build a 'YYYY-MM' string per row (month zero-padded to two digits) and
# parse it into a datetime column
year_text = Unemployment_data['Year'].astype(str)
month_text = Unemployment_data['Month'].astype(str).str.zfill(2)
Unemployment_data['Date'] = pd.to_datetime(year_text + '-' + month_text)

In [9]:
# Renumber the rows from 0 after filtering; the old index labels are discarded
Unemployment_data = Unemployment_data.reset_index(drop=True)

In [10]:
Unemployment_data.head()

Unnamed: 0,Area,Year,Month,Labor Force,Employed,Unemployed,Unemployment Rate,Date
0,Bronx County,2022,1,612200,546100,66100,10.8,2022-01-01
1,Bronx County,2022,2,612400,551600,60800,9.9,2022-02-01
2,Bronx County,2022,3,607800,555000,52800,8.7,2022-03-01
3,Bronx County,2022,4,600700,552800,47900,8.0,2022-04-01
4,Bronx County,2022,5,599000,554700,44300,7.4,2022-05-01


In [11]:
# Write the cleaned unemployment table to disk.
# The row index is written too, as the first (unnamed) column of the file.
cleaned_unemployment_csv = "Unemployment_data_cleaned.csv"
Unemployment_data.to_csv(cleaned_unemployment_csv)

## NYPD Complaint Data Historic

The data dictionary is here: https://data.cityofnewyork.us/api/views/qgea-i56i/files/b21ec89f-4d7b-494e-b2e9-f69ae7f4c228?download=true&filename=NYPD_Complaint_Incident_Level_Data_Footnotes.pdf

The focus of this notebook would be on following columns - 'CMPLNT_NUM','BORO_NM','CMPLNT_FR_DT','CMPLNT_FR_TM','OFNS_DESC','PARKS_NM','Latitude', 'Longitude'

'CMPLNT_NUM' is a unique id for each complaint, 
'BORO_NM' is the name of the borough where the complaint was reported,
'CMPLNT_FR_DT' and 'CMPLNT_FR_TM' are date and time of complaint respectively, 
'OFNS_DESC' is the type of offence reported, 
'PARKS_NM' is name of park where complaint recorded (if any) and 
'Latitude', 'Longitude' are location of complaint.

In [None]:
# Load the NYPD Complaint Data Historic export.
# If only the zipped download is on disk, read it through ZipFile instead:
#   zip_file = ZipFile('NYPD_Complaint_Data_Historic_20231206.csv.zip')
#   Complaint_data = pd.read_csv(zip_file.open('NYPD_Complaint_Data_Historic_20231206.csv'))
complaint_csv = 'NYPD_Complaint_Data_Historic_20231206.csv'
Complaint_data = pd.read_csv(complaint_csv)

In [None]:
Complaint_data.shape

In [None]:
Complaint_data.columns

## Select the interested columns

In [None]:
# Columns this notebook works with; everything else is dropped
selectedCol = [
    'CMPLNT_NUM',
    'BORO_NM',
    'CMPLNT_FR_DT',
    'CMPLNT_FR_TM',
    'LAW_CAT_CD',
    'OFNS_DESC',
    'PARKS_NM',
    'Latitude',
    'Longitude',
]
Complaint_data = Complaint_data.loc[:, selectedCol]
Complaint_data.shape

In [None]:
Complaint_data.head()

## Data Cleaning task

### 1. Filter out missing/wrong date and times, missing borough name and duplicate complaints from the data

In [None]:
# Parse the complaint date; values that cannot be parsed become NaT
# (errors='coerce') instead of raising
raw_complaint_dates = Complaint_data['CMPLNT_FR_DT']
Complaint_data['CMPLNT_FR_DT'] = pd.to_datetime(raw_complaint_dates, errors='coerce')

In [None]:
# List the distinct borough labels, to see how missing values are encoded
Complaint_data.BORO_NM.unique()

In [None]:
# Preview rows whose 'CMPLNT_FR_DT' is NaT, i.e. the date is missing
missing_date = Complaint_data['CMPLNT_FR_DT'].isna()
Complaint_data[missing_date].head()

In [None]:
# Preview rows whose 'CMPLNT_FR_TM' holds the '(null)' placeholder for a missing time
null_time = Complaint_data['CMPLNT_FR_TM'] == '(null)'
Complaint_data[null_time].head()

In [None]:
# Preview rows whose 'BORO_NM' holds the '(null)' placeholder for a missing borough
null_borough = Complaint_data['BORO_NM'] == '(null)'
Complaint_data[null_borough].head()

In [None]:
# Filter out complaints with a missing/wrong date, a missing time, or a
# missing borough name.
# - 'CMPLNT_FR_DT' was converted with pd.to_datetime(..., errors='coerce')
#   earlier in the notebook, so a missing or unparseable date is NaT. Comparing
#   that datetime column with the string '(null)' never matches anything, so
#   the NaT test alone is used.
# - The text columns may mark a missing value either with NaN or with the
#   literal '(null)'; both are rejected. 'BORO_NM' used to be checked against
#   '(null)' only, which let NaN borough names through.
complaint_time = Complaint_data['CMPLNT_FR_TM']
borough_name = Complaint_data['BORO_NM']

has_date = ~Complaint_data['CMPLNT_FR_DT'].isna()
has_time = ~complaint_time.isna() & (complaint_time != '(null)')
has_borough = ~borough_name.isna() & (borough_name != '(null)')

sanityindex = has_date & has_time & has_borough
Complaint_data = Complaint_data.loc[sanityindex]
Complaint_data.shape

In [None]:
# Filter duplicate complaint numbers: 'CMPLNT_NUM' identifies a complaint,
# so only the first row per number is kept
complaint_key = ['CMPLNT_NUM']
Complaint_data = Complaint_data.drop_duplicates(subset=complaint_key)
Complaint_data.shape

### 2. Remove rows where location is parks or greenspace, Keep rows for 2012 - 2022

In [None]:
# Keep only complaints recorded outside a park or greenspace, i.e. rows with
# no park name. This notebook's other text columns mark a missing value with
# the literal '(null)', and that is what was tested here; a real NaN is
# accepted as well so that rows whose park name was parsed as missing on load
# are not silently discarded.
park_name = Complaint_data['PARKS_NM']
sanityindex = park_name.isna() | (park_name == '(null)')
Complaint_data = Complaint_data.loc[sanityindex]
Complaint_data.shape

In [None]:
# Check the starting and ending date and time of the data
dateStart = Complaint_data['CMPLNT_FR_DT'].min()
dateEnd = Complaint_data['CMPLNT_FR_DT'].max()
# Earliest time on the first day and latest time on the last day
timeStart = Complaint_data.loc[Complaint_data['CMPLNT_FR_DT'] == dateStart, 'CMPLNT_FR_TM'].min()
timeEnd = Complaint_data.loc[Complaint_data['CMPLNT_FR_DT'] == dateEnd, 'CMPLNT_FR_TM'].max()
# Combine. dateStart/dateEnd are Timestamps, so str() renders them with a
# '00:00:00' time part; appending the complaint time to that would give a
# string with two time fields. Use the calendar date only.
dtStart = pd.to_datetime(str(dateStart.date()) + ' ' + str(timeStart))
dtEnd = pd.to_datetime(str(dateEnd.date()) + ' ' + str(timeEnd))

In [None]:
## check the timeline of data
(dtStart, dtEnd)

In [None]:
# Keep complaints dated 2012-01-01 through 2022-12-31, the same window used
# for the unemployment data. Previously only the lower bound was applied, so
# any complaint dated after 2022 stayed in the data.
# ISO date strings are used so the bounds cannot be read as day/month swapped.
period_start = pd.to_datetime('2012-01-01')
period_end = pd.to_datetime('2023-01-01')  # exclusive upper bound
complaint_date = Complaint_data['CMPLNT_FR_DT']
in_period = (complaint_date >= period_start) & (complaint_date < period_end)
Complaint_data = Complaint_data[in_period]
Complaint_data.shape

In [None]:
# Check the starting and ending date and time of the filtered data
dateStart = Complaint_data['CMPLNT_FR_DT'].min()
dateEnd = Complaint_data['CMPLNT_FR_DT'].max()
# Earliest time on the first day and latest time on the last day
timeStart = Complaint_data.loc[Complaint_data['CMPLNT_FR_DT'] == dateStart, 'CMPLNT_FR_TM'].min()
timeEnd = Complaint_data.loc[Complaint_data['CMPLNT_FR_DT'] == dateEnd, 'CMPLNT_FR_TM'].max()
# Combine. dateStart/dateEnd are Timestamps, so str() renders them with a
# '00:00:00' time part; appending the complaint time to that would give a
# string with two time fields. Use the calendar date only.
dtStart = pd.to_datetime(str(dateStart.date()) + ' ' + str(timeStart))
dtEnd = pd.to_datetime(str(dateEnd.date()) + ' ' + str(timeEnd))

In [None]:
## check the timeline of cleaned data
(dtStart, dtEnd)

### 3. Keep only Type 1 crimes, as defined by the FBI

The crime type is present in the 'OFNS_DESC' column. 
Keep the following categories: 'ARSON', 'BURGLARY', 'FELONY ASSAULT', 'GRAND LARCENY' ,'GRAND LARCENY OF MOTOR VEHICLE', 'MURDER & NON-NEGL. MANSLAUGHTER', 'RAPE', 'ROBBERY'

In [None]:
# Offence descriptions to keep; every other 'OFNS_DESC' value is dropped
crimetypes = [
    'ARSON',
    'BURGLARY',
    'FELONY ASSAULT',
    'GRAND LARCENY',
    'GRAND LARCENY OF MOTOR VEHICLE',
    'MURDER & NON-NEGL. MANSLAUGHTER',
    'RAPE',
    'ROBBERY',
]
sanityindex = Complaint_data['OFNS_DESC'].isin(crimetypes)
Complaint_data = Complaint_data[sanityindex]
Complaint_data.shape

In [None]:
# Renumber the rows from 0 after filtering, then display the table
Complaint_data = Complaint_data.reset_index(drop=True)
Complaint_data

### 4. Keep rows with location within NYC


In [None]:
# Load the NYC ZIP code boundary polygons from the shapefile
zip_shapefile = 'ZIP_CODE_040114/ZIP_CODE_040114.shp'
zips = gpd.read_file(zip_shapefile)
zips.head()

In [None]:
# Check the projection (coordinate reference system) of the ZIP code polygons
zips.crs

In [None]:
# Make point geometries from the given longitudes (x) and latitudes (y).
# points_from_xy builds them in one vectorised call instead of constructing a
# shapely Point per row in a Python loop, which matters for a table this size.
from shapely.geometry import Point  # import kept so the name stays available to other cells
geometry = gpd.points_from_xy(Complaint_data.Longitude, Complaint_data.Latitude)

# The CRS is passed as an authority string; the {'init': ...} dict form is
# deprecated.
geoComplaintLatLon = gpd.GeoDataFrame(Complaint_data, geometry=geometry, crs='EPSG:4326')
geoComplaintLatLon.shape

In [None]:
# Remove rows with a location outside NYC.
# Spatial join: an inner join keeps only the complaint points that lie within
# one of the ZIP code polygons.
# - The polygons are reprojected to the CRS of the points, so both layers are
#   guaranteed to share one coordinate system.
# - `predicate` replaces the `op` keyword, which geopandas deprecated in 0.10
#   and later removed.
zips_in_point_crs = zips.to_crs(geoComplaintLatLon.crs)
ComplaintDatawithin = gpd.sjoin(geoComplaintLatLon, zips_in_point_crs, predicate='within', how='inner')
ComplaintDatawithin.shape

In [None]:
# Drop duplicates: the join emits one row per matching polygon, so keep a
# single row per complaint number
complaint_key = 'CMPLNT_NUM'
ComplaintDatawithin = ComplaintDatawithin.drop_duplicates(subset=complaint_key)
ComplaintDatawithin.shape

In [None]:
ComplaintDatawithin.head()

In [None]:
# Keep only the columns of interest; the geometry and the columns added by the
# spatial join are dropped
selectedCol = [
    'CMPLNT_NUM',
    'BORO_NM',
    'CMPLNT_FR_DT',
    'CMPLNT_FR_TM',
    'LAW_CAT_CD',
    'OFNS_DESC',
    'PARKS_NM',
    'Latitude',
    'Longitude',
]
Complaint_data = ComplaintDatawithin[selectedCol]
Complaint_data.shape

In [None]:
# Renumber the rows from 0; the index labels carried over from the join are discarded
Complaint_data = Complaint_data.reset_index(drop=True)

In [None]:
Complaint_data.head()

In [None]:
# Write the cleaned complaint table to disk.
# The row index is written too, as the first (unnamed) column of the file.
cleaned_complaint_csv = "Complaint_data_cleaned.csv"
Complaint_data.to_csv(cleaned_complaint_csv)