In [1]:
import pandas as pd
import requests
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from shapely.geometry import Point
from shapely import wkt
from geopy.distance import geodesic, lonlat
import datetime
from tqdm import tqdm
import re
tqdm.pandas()
%matplotlib inline

In [2]:
# Locations of the two NYPD complaint CSV exports (year-to-date and historic).
file_path1 = '/Users/yokij/Desktop/NYPD_Complaint_Data_Current__Year_To_Date.csv'
file_path2 = '/Users/yokij/Desktop/NYPD_Complaint_Data_Historic.csv'

# Year-to-date export: read it and discard its 'New Georeferenced Column'
# before it is stacked with the historic export.
data1 = pd.read_csv(file_path1, low_memory=False).drop(
    columns=['New Georeferenced Column']
)
data2 = pd.read_csv(file_path2, low_memory=False)

# Stack both exports row-wise; each frame keeps its own index labels.
df = pd.concat((data1, data2), axis=0)

In [3]:
#Keep records from January 2018 to May 2023
# Values that cannot be parsed as dates become NaT (errors='coerce'); NaT never
# satisfies the range test, so those rows are dropped by the filter as well.
df['CMPLNT_FR_DT'] = pd.to_datetime(df['CMPLNT_FR_DT'], errors='coerce')
df = (
    df.sort_values('CMPLNT_FR_DT')
    .loc[
        # between() is inclusive on both ends, matching >= start and <= end.
        lambda frame: frame['CMPLNT_FR_DT'].between(
            pd.Timestamp('2018-01-01'), pd.Timestamp('2023-05-31')
        )
    ]
)

In [4]:
#delete data if its category is NaN
# KY_CD      - offense key code; more general than PD_CD
# LAW_CAT_CD - records three categories (felony, misdemeanor and violation)
# Lat_Lon    - coordinate text used later to build the point geometry
df = df.dropna(subset=['KY_CD', 'LAW_CAT_CD', 'Lat_Lon'])

In [5]:
# Preview the first rows of the filtered complaint frame.
df.head()

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,SUSP_SEX,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
6053750,177774631,23.0,MANHATTAN,2018-01-01,16:00:00,04/03/2018,11:50:00,COMPLETED,(null),(null),...,U,,UNKNOWN,UNKNOWN,E,998751.0,226901.0,40.789463,-73.947634,"(40.7894632995555, -73.9476340039424)"
6058680,178119610,67.0,BROOKLYN,2018-01-01,00:01:00,04/11/2018,14:20:00,COMPLETED,(null),(null),...,(null),,<18,BLACK,F,997843.0,175671.0,40.648851,-73.951017,"(40.6488507469884, -73.951016510623)"
6055205,173133785,43.0,BRONX,2018-01-01,01:58:00,01/01/2018,03:05:00,COMPLETED,(null),(null),...,(null),,<18,WHITE HISPANIC,M,1020219.0,239110.0,40.822912,-73.870041,"(40.8229123084767, -73.8700413043181)"
6054120,178674915,24.0,MANHATTAN,2018-01-01,00:01:00,04/27/2018,12:45:00,COMPLETED,(null),(null),...,U,,UNKNOWN,UNKNOWN,D,993369.0,229307.0,40.796074,-73.967067,"(40.7960743128304, -73.9670667458109)"
6060528,173154147,50.0,BRONX,2018-01-01,19:00:00,01/01/2018,19:45:00,COMPLETED,(null),(null),...,F,,25-44,BLACK HISPANIC,M,1010914.0,260940.0,40.882862,-73.903574,"(40.8828621313214, -73.9035744897024)"


In [6]:
#To be consistent with modzcta's crs
def swap_coordinates(point):
    """Return ``point`` with its x and y coordinates exchanged.

    Only shapely ``Point`` instances are rebuilt (as ``Point(y, x)``); any
    other kind of value is handed back exactly as it was received.
    """
    if not isinstance(point, Point):
        return point
    return Point(point.y, point.x)

# NOTE(review): Lat_Lon appears to hold "(lat, lon)" text at this point (a later
# cell parses it with a regex), in which case swap_coordinates returns every
# value unchanged and this step has no effect - confirm whether it is needed.
df['Lat_Lon'] = df['Lat_Lon'].apply(swap_coordinates)

In [7]:
# Preview the frame again after applying swap_coordinates to Lat_Lon.
df.head()

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,SUSP_SEX,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
6053750,177774631,23.0,MANHATTAN,2018-01-01,16:00:00,04/03/2018,11:50:00,COMPLETED,(null),(null),...,U,,UNKNOWN,UNKNOWN,E,998751.0,226901.0,40.789463,-73.947634,"(40.7894632995555, -73.9476340039424)"
6058680,178119610,67.0,BROOKLYN,2018-01-01,00:01:00,04/11/2018,14:20:00,COMPLETED,(null),(null),...,(null),,<18,BLACK,F,997843.0,175671.0,40.648851,-73.951017,"(40.6488507469884, -73.951016510623)"
6055205,173133785,43.0,BRONX,2018-01-01,01:58:00,01/01/2018,03:05:00,COMPLETED,(null),(null),...,(null),,<18,WHITE HISPANIC,M,1020219.0,239110.0,40.822912,-73.870041,"(40.8229123084767, -73.8700413043181)"
6054120,178674915,24.0,MANHATTAN,2018-01-01,00:01:00,04/27/2018,12:45:00,COMPLETED,(null),(null),...,U,,UNKNOWN,UNKNOWN,D,993369.0,229307.0,40.796074,-73.967067,"(40.7960743128304, -73.9670667458109)"
6060528,173154147,50.0,BRONX,2018-01-01,19:00:00,01/01/2018,19:45:00,COMPLETED,(null),(null),...,F,,25-44,BLACK HISPANIC,M,1010914.0,260940.0,40.882862,-73.903574,"(40.8828621313214, -73.9035744897024)"


In [8]:
# GeoJSON endpoint for the polygon layer that is later joined to the complaints;
# the loaded frame carries a `modzcta` column and a `geometry` column.
url = 'https://data.cityofnewyork.us/resource/pri4-ifjk.geojson'
# Fetched over the network on every run.
zipcode = gpd.read_file(url)
zipcode.head()

Unnamed: 0,modzcta,label,zcta,pop_est,geometry
0,10001,"10001, 10118","10001, 10119, 10199",23072,"MULTIPOLYGON (((-73.98774 40.74407, -73.98819 ..."
1,10002,10002,10002,74993,"MULTIPOLYGON (((-73.99750 40.71407, -73.99709 ..."
2,10003,10003,10003,54682,"MULTIPOLYGON (((-73.98864 40.72293, -73.98876 ..."
3,10026,10026,10026,39363,"MULTIPOLYGON (((-73.96201 40.80551, -73.96007 ..."
4,10004,10004,10004,3028,"MULTIPOLYGON (((-74.00827 40.70772, -74.00937 ..."


In [9]:
# Function to convert string coordinates to Point geometry
def create_point_from_string(coord_str):
    """Parse a ``"(lat, lon)"`` string into a shapely ``Point(lon, lat)``.

    Parameters
    ----------
    coord_str : str
        Text such as ``"(40.7894, -73.9476)"``: the first number is read as
        latitude and the second as longitude.

    Returns
    -------
    shapely.geometry.Point or None
        ``Point(lon, lat)`` (x = longitude, y = latitude) on success.
        ``None`` when the value is not a string (e.g. NaN/None), does not
        look like ``"(a, b)"``, or its parts are not valid numbers.
    """
    # Missing values arrive as float NaN or None; re.match would raise
    # TypeError on them, so treat anything that is not text as "no point".
    if not isinstance(coord_str, str):
        return None

    # Allow any amount of whitespace after the comma, not just one space.
    match = re.match(r'\(([^,]+),\s*([^)]+)\)', coord_str)
    if match is None:
        return None

    try:
        lat, lon = map(float, match.groups())
    except ValueError:
        # Shape matched but the contents are not numeric, e.g. "(abc, def)".
        return None

    # shapely expects (x, y) == (lon, lat), hence the swapped order.
    return Point(lon, lat)

# Turn the "(lat, lon)" text into point geometries and wrap the frame as a
# GeoDataFrame so it can take part in a spatial join.
df['geometry'] = df['Lat_Lon'].apply(create_point_from_string)
gdf = gpd.GeoDataFrame(df, geometry='geometry')

# Declare both layers as EPSG:4326 so the join compares coordinates in the
# same reference system.
gdf.set_crs(epsg=4326, inplace=True)
zipcode.set_crs(epsg=4326, inplace=True)

# Attach to each complaint the attributes of the polygon it lies within.
# how="left" keeps complaints that fall in no polygon.
# `op=` was renamed to `predicate=` in geopandas 0.10 and later removed, so
# prefer the new keyword and fall back only for old installations.
try:
    result = gpd.sjoin(gdf, zipcode, how="left", predicate='within')
except TypeError:
    # geopandas < 0.10 does not know `predicate`.
    result = gpd.sjoin(gdf, zipcode, how="left", op='within')

  for i, (idx, item) in enumerate(geometry.iteritems())


In [10]:
# Index the joined frame by complaint date so it can be resampled by time.
# Not re-runnable as-is: once CMPLNT_FR_DT is the index it is no longer a column.
result = result.set_index('CMPLNT_FR_DT')

In [11]:
# Weekly complaint counts per modzcta area: group by area, bucket the
# CMPLNT_FR_DT index into 'W' (weekly) bins, count the rows in each bin, then
# flatten to one row per (modzcta, week) with the count in `total_count`.
weekly_data = (
    result
    .groupby('modzcta')
    .resample('W')
    .size()
    .reset_index(name='total_count')
)

In [12]:
# Preview the first rows of the weekly per-modzcta counts.
weekly_data.head()

Unnamed: 0,modzcta,CMPLNT_FR_DT,total_count
0,10001,2018-01-07,4
1,10001,2018-01-14,3
2,10001,2018-01-21,1
3,10001,2018-01-28,4
4,10001,2018-02-04,3


In [14]:
# Write the weekly counts to weekly_data.csv in the current working directory;
# index=False leaves the row index out of the file.
weekly_data.to_csv('weekly_data.csv', index=False)