In [1]:
import pandas as pd
import pyproj
import numpy as np
import datetime

In [2]:
complaints = pd.read_csv('NYPD_Complaint_Map_Historic_.csv', usecols=(0, 1, 2, 7, 10, 19, 20, 21, 22), thousands=',' )
subways = pd.read_csv('NYC_Subway_Entrances.csv')

In [3]:
print(complaints.info())
print('-----------------------------')
print(complaints.head(1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5580035 entries, 0 to 5580034
Data columns (total 9 columns):
CMPLNT_NUM          int64
CMPLNT_FR_DT        object
CMPLNT_FR_TM        object
OFNS_DESC           object
CRM_ATPT_CPTD_CD    object
X_COORD_CD          float64
Y_COORD_CD          float64
Latitude            float64
Longitude           float64
dtypes: float64(4), int64(1), object(4)
memory usage: 383.2+ MB
None
-----------------------------
   CMPLNT_NUM CMPLNT_FR_DT CMPLNT_FR_TM OFNS_DESC CRM_ATPT_CPTD_CD  \
0   101109527   12/31/2015     23:45:00   FORGERY        COMPLETED   

   X_COORD_CD  Y_COORD_CD   Latitude  Longitude  
0   1007314.0    241257.0  40.828848 -73.916661  


In [4]:
complaints['CMPLNT_FR_DT'] = pd.to_datetime(complaints['CMPLNT_FR_DT'], format = '%m/%d/%Y', errors='coerce' )
complaints['CMPLNT_FR_TM'] = pd.to_timedelta(complaints['CMPLNT_FR_TM'], errors='coerce' )
print(complaints.info())
print(complaints.head(1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5580035 entries, 0 to 5580034
Data columns (total 9 columns):
CMPLNT_NUM          int64
CMPLNT_FR_DT        datetime64[ns]
CMPLNT_FR_TM        timedelta64[ns]
OFNS_DESC           object
CRM_ATPT_CPTD_CD    object
X_COORD_CD          float64
Y_COORD_CD          float64
Latitude            float64
Longitude           float64
dtypes: datetime64[ns](1), float64(4), int64(1), object(2), timedelta64[ns](1)
memory usage: 383.2+ MB
None
   CMPLNT_NUM CMPLNT_FR_DT CMPLNT_FR_TM OFNS_DESC CRM_ATPT_CPTD_CD  \
0   101109527   2015-12-31     23:45:00   FORGERY        COMPLETED   

   X_COORD_CD  Y_COORD_CD   Latitude  Longitude  
0   1007314.0    241257.0  40.828848 -73.916661  


In [5]:
for col in complaints:
    print(complaints[col].isnull().values.sum())

0
684
48
18892
7
195868
195868
195868
195868


In [6]:
complaints = complaints.dropna(subset=[['X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude']])
for col in complaints:
    print(complaints[col].isnull().values.sum())
print('--------------------')
print(complaints.info())
print(complaints[['X_COORD_CD', 'Y_COORD_CD']].head(5))

0
661
46
18152
6
0
0
0
0
--------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5384167 entries, 0 to 5580034
Data columns (total 9 columns):
CMPLNT_NUM          int64
CMPLNT_FR_DT        datetime64[ns]
CMPLNT_FR_TM        timedelta64[ns]
OFNS_DESC           object
CRM_ATPT_CPTD_CD    object
X_COORD_CD          float64
Y_COORD_CD          float64
Latitude            float64
Longitude           float64
dtypes: datetime64[ns](1), float64(4), int64(1), object(2), timedelta64[ns](1)
memory usage: 410.8+ MB
None
   X_COORD_CD  Y_COORD_CD
0   1007314.0    241257.0
1   1043991.0    193406.0
2    999463.0    231690.0
3   1060183.0    177862.0
4    987606.0    208148.0


In [7]:
#Now to reduce entries even further so there are not roughly 5.4 million. Let's focus on the top 10 crime types from 2016.
complaints_2016 = complaints[(complaints['CMPLNT_FR_DT'] > datetime.date(2016,1,1)) & (complaints['CMPLNT_FR_DT'] 
                             < datetime.date(2016,12,31))]
print(complaints_2016.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458831 entries, 5101461 to 5578367
Data columns (total 9 columns):
CMPLNT_NUM          458831 non-null int64
CMPLNT_FR_DT        458831 non-null datetime64[ns]
CMPLNT_FR_TM        458831 non-null timedelta64[ns]
OFNS_DESC           458782 non-null object
CRM_ATPT_CPTD_CD    458831 non-null object
X_COORD_CD          458831 non-null float64
Y_COORD_CD          458831 non-null float64
Latitude            458831 non-null float64
Longitude           458831 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(1), object(2), timedelta64[ns](1)
memory usage: 35.0+ MB
None


In [8]:
print(complaints_2016['OFNS_DESC'].value_counts())

PETIT LARCENY                           79479
HARRASSMENT 2                           64618
ASSAULT 3 & RELATED OFFENSES            51774
CRIMINAL MISCHIEF & RELATED OF          47310
GRAND LARCENY                           41872
DANGEROUS DRUGS                         22529
OFF. AGNST PUB ORD SENSBLTY &           21895
FELONY ASSAULT                          20511
ROBBERY                                 15354
MISCELLANEOUS PENAL LAW                 13573
BURGLARY                                12791
DANGEROUS WEAPONS                       10384
OFFENSES AGAINST PUBLIC ADMINI           8782
VEHICLE AND TRAFFIC LAWS                 6533
GRAND LARCENY OF MOTOR VEHICLE           6222
FORGERY                                  5997
INTOXICATED & IMPAIRED DRIVING           5862
THEFT-FRAUD                              3913
CRIMINAL TRESPASS                        3899
FRAUDS                                   3044
OTHER OFFENSES RELATED TO THEF           1700
UNAUTHORIZED USE OF A VEHICLE     

In [9]:
#Let's only use the top 9 offenses for this study since 10 is a non-specific crime.
list_9 = ['PETIT LARCENY', 'HARRASSMENT 2', 'ASSAULT 3 & RELATED OFFENSES', 'CRIMINAL MISCHIEF & RELATED OF', 
           'GRAND LARCENY', 'DANGEROUS DRUGS', 'OFF. AGNST PUB ORD SENSBLTY &', 'FELONY ASSAULT', 'ROBBERY']
complaints_9 = complaints_2016[complaints_2016.OFNS_DESC.isin(list_9)]

print(complaints_9.info())

#Let's reset the index of the crimes so it starts at 0
complaints_9 = complaints_9.reset_index()

print(complaints_9.head(2))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365342 entries, 5101461 to 5578367
Data columns (total 9 columns):
CMPLNT_NUM          365342 non-null int64
CMPLNT_FR_DT        365342 non-null datetime64[ns]
CMPLNT_FR_TM        365342 non-null timedelta64[ns]
OFNS_DESC           365342 non-null object
CRM_ATPT_CPTD_CD    365342 non-null object
X_COORD_CD          365342 non-null float64
Y_COORD_CD          365342 non-null float64
Latitude            365342 non-null float64
Longitude           365342 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(1), object(2), timedelta64[ns](1)
memory usage: 27.9+ MB
None
     index  CMPLNT_NUM CMPLNT_FR_DT CMPLNT_FR_TM      OFNS_DESC  \
0  5101461   252907137   2016-12-30     18:45:00  HARRASSMENT 2   
1  5101607   777218175   2016-12-30     18:50:00        ROBBERY   

  CRM_ATPT_CPTD_CD  X_COORD_CD  Y_COORD_CD   Latitude  Longitude  
0        COMPLETED   1026014.0    255772.0  40.868619 -73.849000  
1        COMPLETED   1009305.0   

In [10]:
print(subways.info())
print('-----------------------------')
print(subways.head(1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928 entries, 0 to 1927
Data columns (total 5 columns):
OBJECTID    1928 non-null int64
URL         1928 non-null object
NAME        1899 non-null object
the_geom    1928 non-null object
LINE        1928 non-null object
dtypes: int64(1), object(4)
memory usage: 75.4+ KB
None
-----------------------------
   OBJECTID                                URL  \
0      1734  http://web.mta.info/nyct/service/   

                                      NAME  \
0  Birchall Ave & Sagamore St at NW corner   

                                       the_geom LINE  
0  POINT (-73.86835600032798 40.84916900104506)  2-5  


In [11]:
subways_loc = pd.DataFrame(columns=['ID', 'geom'])
subways_loc[['ID', 'geom']] = subways[['OBJECTID', 'the_geom']]
print (subways_loc.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928 entries, 0 to 1927
Data columns (total 2 columns):
ID      1928 non-null int64
geom    1928 non-null object
dtypes: int64(1), object(1)
memory usage: 30.2+ KB
None


In [12]:
subways_loc = pd.DataFrame(subways_loc.geom.str.split(' ').tolist(),
                                   columns = ['point','Longitude', 'Latitude'])
print (subways_loc.head(1))

   point            Longitude            Latitude
0  POINT  (-73.86835600032798  40.84916900104506)


In [13]:
subways_loc['Longitude'] = subways_loc['Longitude'].map(lambda x: str(x)[1:])
subways_loc['Latitude'] = subways_loc['Latitude'].map(lambda x: str(x)[:-1])
print (subways_loc.head(1))

   point           Longitude           Latitude
0  POINT  -73.86835600032798  40.84916900104506


In [14]:
subways_loc = subways_loc[['Latitude', 'Longitude']]
subways_loc[['Latitude', 'Longitude']] = subways_loc[['Latitude', 'Longitude']].astype(float)
print (subways_loc.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928 entries, 0 to 1927
Data columns (total 2 columns):
Latitude     1928 non-null float64
Longitude    1928 non-null float64
dtypes: float64(2)
memory usage: 30.2 KB
None


In [15]:
WGS84_Z18 = pyproj.Proj(init = "EPSG:4326", preserve_units=True)
NYS_LIZ = pyproj.Proj(init = "ESRI:102718", preserve_units=True)

In [16]:
def coord_transform(df):
    x = pd.Series()
    y = pd.Series()
    for idx, val in enumerate(df['Latitude']):
        x1, y1 = pyproj.transform(WGS84_Z18, NYS_LIZ, df['Longitude'][idx], df['Latitude'][idx])
        x.set_value(idx, x1)
        y.set_value(idx, y1)
    df['x'] = x
    df['y'] = y
    return df
    
subways_loc_trans = coord_transform(subways_loc)

#Check to make sure all entries look correct
print(subways_loc_trans.head(5))
print(subways_loc_trans.info())

    Latitude  Longitude             x              y
0  40.849169 -73.868356  1.020671e+06  248679.990550
1  40.849128 -73.868213  1.020710e+06  248665.112299
2  40.841223 -73.873499  1.019252e+06  245782.869704
3  40.841453 -73.872892  1.019420e+06  245866.910219
4  40.840815 -73.879623  1.017558e+06  245631.832277
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928 entries, 0 to 1927
Data columns (total 4 columns):
Latitude     1928 non-null float64
Longitude    1928 non-null float64
x            1928 non-null float64
y            1928 non-null float64
dtypes: float64(4)
memory usage: 60.3 KB
None


In [26]:
split = np.array_split(subways_loc_trans, 20)



df1 = pd.DataFrame(index=complaints_9.index.copy(), columns=subways_loc_trans.index.copy())

for idx_crime, x_crime in enumerate(complaints_9['X_COORD_CD']):
    y_crime = complaints_9['Y_COORD_CD'].iloc[idx_crime]
    for idx_subway, x_subway in enumerate(subways_loc_trans['x']):
        y_subway = subways_loc_trans['y'].iloc[idx_subway]
        dist = np.sqrt((x_crime - x_subway)**2 + (y_crime - y_subway)**2)
        df1.iloc[idx_crime, idx_subway] = dist
return df1