#### Initial Set Up

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Remove dataFrame display size restrictions
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# Location of the raw crime-complaint CSV (relative path)
path = "resources/crimes.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(path)

# Record the (rows, columns) shape before any cleaning is applied
original_dimensions = df.shape
print(f'The original dimensions of the crime dataset (rows/columns): {original_dimensions}')

# Preview the first 5 records. This must be the cell's final expression:
# Jupyter renders only the value of the last expression, so a bare df.head()
# placed mid-cell is evaluated and then discarded without being shown.
df.head()

The original dimensions of the crime dataset (rows/columns): (313385, 24)


#### Clean Up

In [2]:
# Columns that are not needed for the rest of the notebook
irrelevant_columns = [
    'End_Date', 'Suspect_Gender', 'Long2.1', 'Lat2.1', 'Suspect_Age', 'Suspect_Race',
    'Victim_Age', 'Victim_Race', 'Victim_Gender', 'City', 'State',
]

# Index.difference yields the remaining labels in sorted order, so the kept
# columns come out alphabetically; listed names that are absent from the
# frame are simply ignored rather than raising.
kept_columns = df.columns.difference(irrelevant_columns)
df = df[kept_columns]

# NOTE: not the cell's last expression, so this listing is not rendered
df.columns

columns_removed = df.shape
print(f'The dimensions of the crime dataset after removing irrelevant columns: {columns_removed}')

The dimensions of the crime dataset after removing irrelevant columns: (313385, 13)


In [3]:
# Keep only the complaints whose Start_Date text contains "2020";
# rows with a missing Start_Date are treated as non-matching (na=False) and dropped
started_in_2020 = df['Start_Date'].str.contains("2020", na=False)
df = df.loc[started_in_2020]

years_removed = df.shape
print(f'The dimensions of the crime dataset after excluding all years that do not represent 2020 (rows/columns): {years_removed}')

The dimensions of the crime dataset after excluding all years that do not represent 2020 (rows/columns): (309865, 13)


In [4]:
# Map the raw CSV headers to the snake_case names used from here on
column_renames = {
    "Complaint_ID": "complaint_id",
    "Boro_Name": "borough",
    "Start_Date": "complaint_date",
    "Complaint_Cat": "category",
    "Complaint_Code": "complaint_code",
    "Complaint_Desc": "description",
    "Zip": "zip",
}
df = df.rename(columns=column_renames)

# Normalise the text columns to lowercase
for text_column in ("borough", "category", "description"):
    df[text_column] = df[text_column].str.lower()

#### Add Classifications

In [5]:
# Add classification for category column
    # felony: tier 1
    # misdemeanor: tier 2
    # violation: tier 3

# create a list of our conditions

# conditions = [
#     (df['category'] == 'felony'),
#     (df['category'] == 'misdemeanor'),
#     (df['category'] == 'violation'),
#     ]

# # create a list of the values we want to assign for each condition
# values = ['1', '2', '3']

# # create a new column and use np.select to assign values to it using our lists as arguments
# df['category_tier'] = np.select(conditions, values)

# # display updated DataFrame
# df.head()

In [11]:
# Store `category` with pandas' categorical dtype
category_as_categorical = df['category'].astype('category')
df['category'] = category_as_categorical

# Integer code of each row's category. Codes are 0-based positions in the
# categorical's category list (sorted labels by default), and -1 marks a
# missing category -- they are not the 1/2/3 tiers of the disabled np.select version.
df['category_tier'] = category_as_categorical.cat.codes
# df

In [12]:
# One-hot encode `category` into Type_is_<value> indicator columns
dum_df = pd.get_dummies(df, columns=["category"], prefix=["Type_is"])

# The indicator columns are whatever get_dummies added beyond df's own columns
# (sort=False keeps them in the order get_dummies produced them)
indicator_columns = dum_df.columns.difference(df.columns, sort=False)

# Attach the indicators to df row-for-row. dum_df carries the same index as df,
# so an axis=1 concat yields exactly one output row per input row. A key-less
# df.merge(dum_df) would instead join on every shared column (floats included),
# which is far more expensive and multiplies rows whenever two rows agree on
# all of those columns.
# reset_index(drop=True) gives the result a fresh 0..n-1 index, as merge does.
bridge_df = pd.concat([df, dum_df[indicator_columns]], axis=1).reset_index(drop=True)
bridge_df

Unnamed: 0,borough,category,complaint_code,description,complaint_id,Lat2,Latitude,Latitude_Crime,Long2,Longitude,Longitude_Crime,complaint_date,zip,category_tier,Type_is_felony,Type_is_misdemeanor,Type_is_violation
0,staten island,violation,578,harrassment 2,144739528,40.51,40.508274,40.506788,-74.24,-74.24387,-74.235092,8/17/2020,10307,2,0,0,1
1,staten island,misdemeanor,344,assault 3 & related offenses,269149414,40.51,40.508274,40.507428,-74.24,-74.24387,-74.236141,11/16/2020,10307,1,0,1,0
2,staten island,felony,109,grand larceny,815686392,40.51,40.508274,40.514922,-74.24,-74.24387,-74.242663,11/18/2020,10307,0,1,0,0
3,staten island,misdemeanor,344,assault 3 & related offenses,134612657,40.51,40.508274,40.511812,-74.24,-74.24387,-74.239979,11/12/2020,10307,1,0,1,0
4,staten island,misdemeanor,351,criminal mischief & related of,500683857,40.51,40.508274,40.511437,-74.24,-74.24387,-74.243459,4/7/2020,10307,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309860,bronx,violation,578,harrassment 2,601861845,40.90,40.900629,40.899198,-73.86,-73.86072,-73.858230,5/8/2020,10470,2,0,0,1
309861,bronx,violation,578,harrassment 2,883034470,40.90,40.900629,40.897260,-73.86,-73.86072,-73.855250,5/22/2020,10470,2,0,0,1
309862,bronx,violation,578,harrassment 2,464564363,40.90,40.900629,40.898259,-73.86,-73.86072,-73.855816,5/23/2020,10470,2,0,0,1
309863,bronx,violation,578,harrassment 2,411266231,40.90,40.900629,40.900696,-73.86,-73.86072,-73.857113,5/8/2020,10470,2,0,0,1


In [None]:
# # creating instance of one-hot-encoder
# enc = OneHotEncoder(handle_unknown='ignore')

# # passing bridge-types-cat column (label encoded values of bridge_types)
# enc_df = pd.DataFrame(enc.fit_transform(df[['category_tier']]).toarray())
# enc_df

# # merge with main df on key values
# crimes_df = df.join(enc_df)
# crimes_df

In [None]:
# # generate binary values using get_dummies
# dum_df = pd.get_dummies(bridge_df, columns=["Bridge_Types"], prefix=["Type_is"] )
# # merge with main df bridge_df on key values
# bridge_df = bridge_df.join(dum_df)
# bridge_df