#### Initial Set Up

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Remove dataFrame display size restrictions
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create path
path = "resources/crimes.csv"

# Read in csv
df = pd.read_csv(path)

# Grab original dimensions before clean
original_dimensions = df.shape
print(f'The original dimensions of the crime dataset (rows/columns): {original_dimensions}') 

The original dimensions of the crime dataset (rows/columns): (313385, 24)


In [2]:
# Display first 5 records
df.head()

Unnamed: 0,Complaint_ID,Boro_Name,Start_Date,End_Date,Complaint_Code,Complaint_Cat,Complaint_Desc,Suspect_Age,Suspect_Race,Suspect_Gender,...,Longitude_Crime,Latitude,Longitude,Lat2,Lat2.1,Long2,Long2.1,Zip,City,State
0,144739528,STATEN ISLAND,8/17/2020,8/17/2020,578,VIOLATION,HARRASSMENT 2,45-64,UNKNOWN,M,...,-74.235092,40.508274,-74.24387,40.51,40.51,-74.24,-74.24,10307,Staten Island,NY
1,269149414,STATEN ISLAND,11/16/2020,11/16/2020,344,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,45-64,WHITE,M,...,-74.236141,40.508274,-74.24387,40.51,40.51,-74.24,-74.24,10307,Staten Island,NY
2,815686392,STATEN ISLAND,11/18/2020,11/28/2020,109,FELONY,GRAND LARCENY,UNKNOWN,UNKNOWN,F,...,-74.242663,40.508274,-74.24387,40.51,40.51,-74.24,-74.24,10307,Staten Island,NY
3,134612657,STATEN ISLAND,11/12/2020,11/12/2020,344,MISDEMEANOR,ASSAULT 3 & RELATED OFFENSES,UNKNOWN,UNKNOWN,U,...,-74.239979,40.508274,-74.24387,40.51,40.51,-74.24,-74.24,10307,Staten Island,NY
4,500683857,STATEN ISLAND,4/7/2020,4/7/2020,351,MISDEMEANOR,CRIMINAL MISCHIEF & RELATED OF,UNKNOWN,UNKNOWN,M,...,-74.243459,40.508274,-74.24387,40.51,40.51,-74.24,-74.24,10307,Staten Island,NY


#### Clean Up

In [3]:
# Remove irrelevant columns
df = df[df.columns.difference(['End_Date', 'Suspect_Gender', 'Latitude', 'Longitude', 'Long2.1', 'Lat2.1', 'Suspect_Age', 'Suspect_Race', 'Victim_Age', 'Victim_Race', 'Victim_Gender', 
                              'City', 'State'])]

df.columns

columns_removed = df.shape
print(f'The dimensions of the crime dataset after removing irrelevant columns: {columns_removed}') 

The dimensions of the crime dataset after removing irrelevant columns: (313385, 11)


In [4]:
# Remove all complaints that were recorded before 2020
df = df[df['Start_Date'].str.contains("2020", na=False)]

years_removed = df.shape
print(f'The dimensions of the crime dataset after excluding all years that do not represent 2020 (rows/columns): {years_removed}') 

The dimensions of the crime dataset after excluding all years that do not represent 2020 (rows/columns): (309865, 11)


In [5]:
# Rename column names
df = df.rename(columns={"Complaint_ID": "complaint_id", "Boro_Name": "borough", "Start_Date": "complaint_date","Complaint_Cat": "category", "Complaint_Code": "complaint_code", "Complaint_Desc": "complaint_desc", "Zip": "zipcode", "Lat2": "lat2", "Long2": "long2", "Latitude_Crime": "lat_crime", "Longitude_Crime": "long_crime"}) 

# Convert all strings to lowercase
df["borough"] = df["borough"].str.lower()
df["category"] = df["category"].str.lower()
df["complaint_desc"] = df["complaint_desc"].str.lower()


In [6]:
# Drop all rows where zipcode column = NaN
df = df.dropna(subset=['zipcode'])

rows_after_zipcode = len(df)
print(f'After removing all rows with a NaN value under zipcode column, there were {rows_after_zipcode} rows remaining')

After removing all rows with a NaN value under zipcode column, there were 309865 rows remaining


In [7]:
# Drop all duplicates on complaint id column since that should be unique
df = df.drop_duplicates(subset=['complaint_id'])


select_distinct = len(df)
print(f'After removing all duplicate rows {select_distinct}, there were remaining')

After removing all duplicate rows 118257, there were remaining


In [8]:
# Convert complaint_date to datetime
df.complaint_date = pd.to_datetime(df.complaint_date, format='%m/%d/%Y')

In [9]:
# Create a column to denote if the danger level is low or high

df["danger_level"] = ""
df.loc[df["complaint_desc"] == 'sex crimes', "danger_level"] = 'high'
df.loc[df["complaint_desc"] == 'rape', "danger_level"] = 'high'
df.loc[df["complaint_desc"] == 'dangerous weapons', "danger_level"] = 'high'
df.loc[df["complaint_desc"] == 'felony sex crimes', "danger_level"] = 'high'
df.loc[df["complaint_desc"] == 'burglary', "danger_level"] = 'high'
df.loc[df["complaint_desc"] == 'robbery', "danger_level"] = 'high'
df.loc[df["complaint_desc"] == 'arson', "danger_level"] = 'high'

df['danger_level'] = df['danger_level'].replace('', np.nan, regex=True)
df['danger_level'] = df['danger_level'].fillna('low')



#### Add Categorical Encoding & Binary Values

In [10]:
# # Convert type of columns to 'category'
df['category'] = df['category'].astype('category')

# Assigning numerical values and store in another column
df['category_tier'] = df['category'].cat.codes
df

Unnamed: 0,borough,category,complaint_code,complaint_desc,complaint_id,lat2,lat_crime,long2,long_crime,complaint_date,zipcode,danger_level,category_tier
0,staten island,violation,578,harrassment 2,144739528,40.51,40.506788,-74.24,-74.235092,2020-08-17,10307,low,2
1,staten island,misdemeanor,344,assault 3 & related offenses,269149414,40.51,40.507428,-74.24,-74.236141,2020-11-16,10307,low,1
2,staten island,felony,109,grand larceny,815686392,40.51,40.514922,-74.24,-74.242663,2020-11-18,10307,low,0
3,staten island,misdemeanor,344,assault 3 & related offenses,134612657,40.51,40.511812,-74.24,-74.239979,2020-11-12,10307,low,1
4,staten island,misdemeanor,351,criminal mischief & related of,500683857,40.51,40.511437,-74.24,-74.243459,2020-04-07,10307,low,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
313380,bronx,violation,578,harrassment 2,601861845,40.90,40.899198,-73.86,-73.858230,2020-05-08,10470,low,2
313381,bronx,violation,578,harrassment 2,883034470,40.90,40.897260,-73.86,-73.855250,2020-05-22,10470,low,2
313382,bronx,violation,578,harrassment 2,464564363,40.90,40.898259,-73.86,-73.855816,2020-05-23,10470,low,2
313383,bronx,violation,578,harrassment 2,411266231,40.90,40.900696,-73.86,-73.857113,2020-05-08,10470,low,2


In [11]:
# Generate binary values using get_dummies for crime category
dum_df = pd.get_dummies(df, columns=["category"], prefix=["type_is"] )

# Merge with main df
crime_df = df.merge(dum_df)
crime_df


Unnamed: 0,borough,category,complaint_code,complaint_desc,complaint_id,lat2,lat_crime,long2,long_crime,complaint_date,zipcode,danger_level,category_tier,type_is_felony,type_is_misdemeanor,type_is_violation
0,staten island,violation,578,harrassment 2,144739528,40.51,40.506788,-74.24,-74.235092,2020-08-17,10307,low,2,0,0,1
1,staten island,misdemeanor,344,assault 3 & related offenses,269149414,40.51,40.507428,-74.24,-74.236141,2020-11-16,10307,low,1,0,1,0
2,staten island,felony,109,grand larceny,815686392,40.51,40.514922,-74.24,-74.242663,2020-11-18,10307,low,0,1,0,0
3,staten island,misdemeanor,344,assault 3 & related offenses,134612657,40.51,40.511812,-74.24,-74.239979,2020-11-12,10307,low,1,0,1,0
4,staten island,misdemeanor,351,criminal mischief & related of,500683857,40.51,40.511437,-74.24,-74.243459,2020-04-07,10307,low,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118252,bronx,violation,578,harrassment 2,601861845,40.90,40.899198,-73.86,-73.858230,2020-05-08,10470,low,2,0,0,1
118253,bronx,violation,578,harrassment 2,883034470,40.90,40.897260,-73.86,-73.855250,2020-05-22,10470,low,2,0,0,1
118254,bronx,violation,578,harrassment 2,464564363,40.90,40.898259,-73.86,-73.855816,2020-05-23,10470,low,2,0,0,1
118255,bronx,violation,578,harrassment 2,411266231,40.90,40.900696,-73.86,-73.857113,2020-05-08,10470,low,2,0,0,1


In [12]:
# Generate binary values using get_dummies for danger level
dum_df1 = pd.get_dummies(df, columns=["danger_level"], prefix=["danger_level_is"] )

# Merge with main df
crime_df = crime_df.merge(dum_df1)
crime_df

Unnamed: 0,borough,category,complaint_code,complaint_desc,complaint_id,lat2,lat_crime,long2,long_crime,complaint_date,zipcode,danger_level,category_tier,type_is_felony,type_is_misdemeanor,type_is_violation,danger_level_is_high,danger_level_is_low
0,staten island,violation,578,harrassment 2,144739528,40.51,40.506788,-74.24,-74.235092,2020-08-17,10307,low,2,0,0,1,0,1
1,staten island,misdemeanor,344,assault 3 & related offenses,269149414,40.51,40.507428,-74.24,-74.236141,2020-11-16,10307,low,1,0,1,0,0,1
2,staten island,felony,109,grand larceny,815686392,40.51,40.514922,-74.24,-74.242663,2020-11-18,10307,low,0,1,0,0,0,1
3,staten island,misdemeanor,344,assault 3 & related offenses,134612657,40.51,40.511812,-74.24,-74.239979,2020-11-12,10307,low,1,0,1,0,0,1
4,staten island,misdemeanor,351,criminal mischief & related of,500683857,40.51,40.511437,-74.24,-74.243459,2020-04-07,10307,low,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118252,bronx,violation,578,harrassment 2,601861845,40.90,40.899198,-73.86,-73.858230,2020-05-08,10470,low,2,0,0,1,0,1
118253,bronx,violation,578,harrassment 2,883034470,40.90,40.897260,-73.86,-73.855250,2020-05-22,10470,low,2,0,0,1,0,1
118254,bronx,violation,578,harrassment 2,464564363,40.90,40.898259,-73.86,-73.855816,2020-05-23,10470,low,2,0,0,1,0,1
118255,bronx,violation,578,harrassment 2,411266231,40.90,40.900696,-73.86,-73.857113,2020-05-08,10470,low,2,0,0,1,0,1


In [13]:
# Reorder columns
crime_df = crime_df[['complaint_id', 'borough', 'category', 'complaint_code', 'complaint_desc', 'lat2', 'lat_crime', 'long2', 'long_crime', 'complaint_date', 'zipcode', 'category', 'category_tier', 'type_is_felony', 'type_is_misdemeanor', 'type_is_violation', 'danger_level', 'danger_level_is_high', 'danger_level_is_low']]

In [14]:
# Export Clean DataFrame to CSV
crime_df.to_csv("output/crime_data_clean.csv", index=False)

In [15]:
%pprint
crime_df.columns.tolist()
crime_df

Pretty printing has been turned OFF


Unnamed: 0,complaint_id,borough,category,complaint_code,complaint_desc,lat2,lat_crime,long2,long_crime,complaint_date,zipcode,category.1,category_tier,type_is_felony,type_is_misdemeanor,type_is_violation,danger_level,danger_level_is_high,danger_level_is_low
0,144739528,staten island,violation,578,harrassment 2,40.51,40.506788,-74.24,-74.235092,2020-08-17,10307,violation,2,0,0,1,low,0,1
1,269149414,staten island,misdemeanor,344,assault 3 & related offenses,40.51,40.507428,-74.24,-74.236141,2020-11-16,10307,misdemeanor,1,0,1,0,low,0,1
2,815686392,staten island,felony,109,grand larceny,40.51,40.514922,-74.24,-74.242663,2020-11-18,10307,felony,0,1,0,0,low,0,1
3,134612657,staten island,misdemeanor,344,assault 3 & related offenses,40.51,40.511812,-74.24,-74.239979,2020-11-12,10307,misdemeanor,1,0,1,0,low,0,1
4,500683857,staten island,misdemeanor,351,criminal mischief & related of,40.51,40.511437,-74.24,-74.243459,2020-04-07,10307,misdemeanor,1,0,1,0,low,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118252,601861845,bronx,violation,578,harrassment 2,40.90,40.899198,-73.86,-73.858230,2020-05-08,10470,violation,2,0,0,1,low,0,1
118253,883034470,bronx,violation,578,harrassment 2,40.90,40.897260,-73.86,-73.855250,2020-05-22,10470,violation,2,0,0,1,low,0,1
118254,464564363,bronx,violation,578,harrassment 2,40.90,40.898259,-73.86,-73.855816,2020-05-23,10470,violation,2,0,0,1,low,0,1
118255,411266231,bronx,violation,578,harrassment 2,40.90,40.900696,-73.86,-73.857113,2020-05-08,10470,violation,2,0,0,1,low,0,1


In [16]:
crime_df.dtypes

complaint_id                     int64
borough                         object
category                      category
complaint_code                   int64
complaint_desc                  object
lat2                           float64
lat_crime                      float64
long2                          float64
long_crime                     float64
complaint_date          datetime64[ns]
zipcode                          int64
category                      category
category_tier                     int8
type_is_felony                   uint8
type_is_misdemeanor              uint8
type_is_violation                uint8
danger_level                    object
danger_level_is_high             uint8
danger_level_is_low              uint8
dtype: object

In [17]:
crime_df

Unnamed: 0,complaint_id,borough,category,complaint_code,complaint_desc,lat2,lat_crime,long2,long_crime,complaint_date,zipcode,category.1,category_tier,type_is_felony,type_is_misdemeanor,type_is_violation,danger_level,danger_level_is_high,danger_level_is_low
0,144739528,staten island,violation,578,harrassment 2,40.51,40.506788,-74.24,-74.235092,2020-08-17,10307,violation,2,0,0,1,low,0,1
1,269149414,staten island,misdemeanor,344,assault 3 & related offenses,40.51,40.507428,-74.24,-74.236141,2020-11-16,10307,misdemeanor,1,0,1,0,low,0,1
2,815686392,staten island,felony,109,grand larceny,40.51,40.514922,-74.24,-74.242663,2020-11-18,10307,felony,0,1,0,0,low,0,1
3,134612657,staten island,misdemeanor,344,assault 3 & related offenses,40.51,40.511812,-74.24,-74.239979,2020-11-12,10307,misdemeanor,1,0,1,0,low,0,1
4,500683857,staten island,misdemeanor,351,criminal mischief & related of,40.51,40.511437,-74.24,-74.243459,2020-04-07,10307,misdemeanor,1,0,1,0,low,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118252,601861845,bronx,violation,578,harrassment 2,40.90,40.899198,-73.86,-73.858230,2020-05-08,10470,violation,2,0,0,1,low,0,1
118253,883034470,bronx,violation,578,harrassment 2,40.90,40.897260,-73.86,-73.855250,2020-05-22,10470,violation,2,0,0,1,low,0,1
118254,464564363,bronx,violation,578,harrassment 2,40.90,40.898259,-73.86,-73.855816,2020-05-23,10470,violation,2,0,0,1,low,0,1
118255,411266231,bronx,violation,578,harrassment 2,40.90,40.900696,-73.86,-73.857113,2020-05-08,10470,violation,2,0,0,1,low,0,1
