### New York City Crash Analysis

##### Import numpy, pandas, plotly and matplotlib

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

import matplotlib.pyplot as plt
%matplotlib inline

#### Load Crash data

In [None]:
# Columns that must be read as text rather than inferred.
# Keys have to match the names assigned through `columnNames`; the street
# column is named 'ON_STREET_NAME' there (underscores), so the previous key
# 'ON STREET NAME' never matched and its dtype was not applied.
columnDataTypes = {
    'ON_STREET_NAME' : 'str',
    'COLLISION_ID' : 'str',
    'VEHICLE_TYPE' : 'str',
    'BOROUGH' : 'str',
    'ZIP CODE' : 'str',
}

# Subset of columns the analysis uses (passed to `usecols`).
columnsToRead = [ 'CRASH_DATE', 'CRASH_TIME', 'BOROUGH', 'ZIP CODE', 'COLLISION_ID', 'VEHICLE_TYPE',
                 'LATITUDE', 'LONGITUDE', 'ON_STREET_NAME',
                 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
                 'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
                 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
                 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED', ]
# 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']

# Names assigned positionally to every column of crashes.csv; together with
# header = 0 they replace the header row stored in the file.
columnNames = [ 'CRASH_DATE','CRASH_TIME','BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
                'LOCATION','ON_STREET_NAME','CROSS STREET NAME','OFF STREET NAME',
                'NUMBER OF PERSONS INJURED','NUMBER OF PERSONS KILLED',
                'NUMBER OF PEDESTRIANS INJURED','NUMBER OF PEDESTRIANS KILLED',
                'NUMBER OF CYCLIST INJURED','NUMBER OF CYCLIST KILLED',
                'NUMBER OF MOTORIST INJURED','NUMBER OF MOTORIST KILLED',
               'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
               'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
               'CONTRIBUTING FACTOR VEHICLE 5', 'COLLISION_ID',
               'VEHICLE_TYPE', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3',
               'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5' ]

# The date and time columns are read as plain text and combined below.
# read_csv's nested `parse_dates` mapping and `infer_datetime_format` are
# deprecated, so the timestamp is built explicitly with pd.to_datetime.
df1 = pd.read_csv('crashes.csv',
                  dtype = { **columnDataTypes, 'CRASH_DATE' : 'str', 'CRASH_TIME' : 'str' },
                  names = columnNames,
                  header = 0,
                  skipinitialspace = True,
                  usecols = columnsToRead )

# Build the DATETIME column from "<date> <time>" and place it first; the two
# source columns are removed (pop), matching the layout produced by the old
# parse_dates = {'DATETIME': [0, 1]} option.
crashTimestamps = pd.to_datetime(df1.pop('CRASH_DATE') + ' ' + df1.pop('CRASH_TIME'))
df1.insert(0, 'DATETIME', crashTimestamps)

In [None]:
df1

In [None]:
# Print the dtype pandas infers from the actual values of each df1 column.
for columnName in df1.columns:
    inferredKind = pd.api.types.infer_dtype(df1[columnName])
    print(columnName, " - ", inferredKind)

In [None]:
# 'degree' = row-wise total of the pedestrian, cyclist and motorist
# injured/killed counts (the PERSONS totals are not part of the sum).
df1['degree'] = df1.loc[:, [
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]].sum(axis = 'columns')

In [None]:
fig = px.histogram(df1, x="degree")
fig.show()

In [None]:
#df1.drop( axis = 0, index = df1[ df1['degree'] == 0 ].index, inplace = True )

In [None]:
fig = px.histogram(df1, x="degree")
fig.show()

In [None]:
df1.shape

In [None]:
df1['pedestrians'] = df1[['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',]].sum(axis = 1)

In [None]:
# Collapse the pedestrian casualty count into a 0/1 indicator.
df1['pedestrians'] = df1['pedestrians'].gt(0).astype('int64')

In [None]:
df1['pedestrians'].value_counts()

In [None]:
# Label for each participant-class code.
# The labels combine the letters M, P and C, and the codes line up with a
# 3-bit mask: M = 1, P = 2, C = 4 (e.g. 3 -> 'MP', 6 -> 'CP', 7 -> 'MPC').
classMap = {
    0 : 'S',
    1 : 'M',
    2 : 'P',
    3 : 'MP',
    4 : 'C',
    5 : 'MC',
    6 : 'CP',
    7 : 'MPC'
}

# The notebook never created df1['class'] before mapping it, so this cell
# raised KeyError on a fresh kernel. The numeric code is rebuilt here from the
# casualty columns.
# NOTE(review): the bit weights and the reading M = motorist, P = pedestrian,
# C = cyclist are inferred from classMap's labels; the cell that originally
# produced the codes is not in the notebook -- confirm.
motoristHit   = df1[['NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']].sum(axis = 1) > 0
pedestrianHit = df1[['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED']].sum(axis = 1) > 0
cyclistHit    = df1[['NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED']].sum(axis = 1) > 0

classCode = motoristHit.astype(int) + 2 * pedestrianHit.astype(int) + 4 * cyclistHit.astype(int)

# Mapping from the freshly computed code (not from df1['class'] itself) keeps
# the cell safe to re-run.
df1['class'] = classCode.map(classMap)
df1['class'].value_counts()

In [None]:
fig = px.histogram(df1, x="class")
fig.show()

#### -ignore- Drop data before 2019

In [None]:
df1.set_index(['DATETIME'], inplace=True)

In [None]:
df1.shape

In [None]:
df1.columns

In [None]:
# Discard crashes dated before 2019-01-01.
# A boolean mask is used instead of the string slice df1[:"12/31/2018"],
# because label slicing relies on the DatetimeIndex being sorted and can raise
# on an unsorted one. Negating the "<" test keeps rows whose timestamp is NaT,
# which the slice-based drop also left in place.
df1 = df1[ ~(df1.index < "2019-01-01") ]
df1.shape

In [None]:
# Keep rows whose latitude lies strictly between 40.4 and 41.0 and is present.
df2 = df1.loc[ df1.LATITUDE.gt(40.4) & df1.LATITUDE.lt(41.0) & df1.LATITUDE.notna() ]

In [None]:
df2.shape

In [None]:
# Keep rows whose longitude lies strictly between -74.4 and -73.6 and is present.
df3 = df2.loc[ df2.LONGITUDE.gt(-74.4) & df2.LONGITUDE.lt(-73.6) & df2.LONGITUDE.notna() ]

In [None]:
df3.shape

In [None]:
df4 = df3.dropna( axis = 0, subset = ['VEHICLE_TYPE'] )

In [None]:
df4.shape

In [None]:
df4['STREET_NAME'] = df4['ON_STREET_NAME'].str.strip()

In [None]:
df4.shape

In [None]:
df4.columns

In [None]:
df4.reset_index(inplace = True)
df4.columns

In [None]:
df4.insert( loc = 1, column = 'Week',  value = df4['DATETIME'].dt.isocalendar().week )

In [None]:
week53index = df4[df4.Week == 53].index
df4.drop( axis = 0, index = week53index, inplace = True)

In [None]:
df4.insert( loc = 1, column = 'Weekday', value  = df4['DATETIME'].dt.weekday )

In [None]:
#df4.insert( loc = 1, column = 'Year', value = df4['DATETIME'].dt.year.astype('str') )

In [None]:
# Hour of day (0-23) -> named part of the day, built from contiguous hour bands.
timeOfDayMap = {
    hour : label
    for label, hours in [
        ("midnight",  [0]),
        ("weehours",  range(1, 5)),
        ("dawn",      range(5, 8)),
        ("morning",   range(8, 12)),
        ("noon",      range(12, 15)),
        ("afternoon", range(15, 17)),
        ("evening",   range(17, 21)),
        ("night",     range(21, 24)),
    ]
    for hour in hours
}
# Add the label as the second column of df4, derived from the crash hour.
df4.insert( loc = 1, column = 'timeOfDay', value = df4['DATETIME'].dt.hour.map(timeOfDayMap))

In [None]:
df4.insert( loc = 1, column = 'hour', value = df4['DATETIME'].dt.hour)

In [None]:
df4.shape

In [None]:
df4.columns

In [None]:
df4.dtypes

In [None]:
df4

In [None]:
#df4.loc[ df4.VEHICLE_TYPE.isna(), ('VEHICLE_TYPE') ] = "unknown"
#df4.VEHICLE_TYPE.isna()

In [None]:
fig = px.histogram(df4, x="class")
fig.show()

In [None]:
# Columns loaded from vehicles.csv. (The earlier, longer column list was
# overwritten on the very next line and never used, so it has been removed.)
vehicleColumns = ['COLLISION_ID','VEHICLE_TYPE', 'TRAVEL_DIRECTION','DRIVER_SEX',]

# Read every loaded column as text. A dedicated name is used so the crash-file
# `columnDataTypes` is not overwritten, which would break a re-run of the
# crash load cell.
vehicleColumnTypes = { 'COLLISION_ID' : 'str','VEHICLE_TYPE': 'str', 'TRAVEL_DIRECTION' : 'str','DRIVER_SEX' : 'str' }
carDF = pd.read_csv('vehicles.csv', dtype = vehicleColumnTypes, usecols = vehicleColumns )

In [None]:
# Normalises TRAVEL_DIRECTION values: full compass names are lower-cased,
# single-letter abbreviations are expanded, and the 'Unknown' / '-' markers
# become NaN.
dishaMap = {
    compassName : compassName.lower()
    for compassName in ( 'North', 'South', 'East', 'West',
                         'Northeast', 'Northwest', 'Southeast', 'Southwest' )
}
dishaMap.update( N = 'north', S = 'south', E = 'east', W = 'west' )
dishaMap.update({ 'Unknown' : np.nan, '-' : np.nan })

In [None]:
carDF.TRAVEL_DIRECTION = carDF.TRAVEL_DIRECTION.map(dishaMap)

In [None]:
carDF2 = carDF.dropna( axis = 0, subset = ['DRIVER_SEX', 'TRAVEL_DIRECTION'] )

In [None]:
carDF2

In [None]:
# Replace missing vehicle types with the literal "unknown".
# The previous form, carDF2.VEHICLE_TYPE[mask] = ..., was chained assignment:
# it triggers SettingWithCopyWarning and is not guaranteed to modify carDF2.
carDF2 = carDF2.fillna({ 'VEHICLE_TYPE' : "unknown" })

In [None]:
carDF2.VEHICLE_TYPE.isna().value_counts()

In [None]:
#print("{")
#for x in list(carDF.VEHICLE_TYPE.unique()):
#    print("'" + x + "' : ," )
#print("}")

In [None]:
# Inner-join crashes with vehicle records that share both the collision id and
# the vehicle type; this brings TRAVEL_DIRECTION and DRIVER_SEX onto each crash row.
carCrashDF = df4.merge( carDF2, on = ['COLLISION_ID', 'VEHICLE_TYPE'] )

In [None]:
carCrashDF.columns

In [None]:
carCrashDF

In [None]:
#list(carCrashDF.VEHICLE_TYPE.unique())

In [None]:
personColumns = [ 'COLLISION_ID', 'PERSON_AGE', 'PED_ROLE', 'PERSON_SEX']
columnTypes = {  'COLLISION_ID' : 'str', 'VEHICLE_ID' : 'str', 'PERSON_AGE' : 'Int64', 'PED_ROLE' : 'str', 'PERSON_SEX' : 'str'}
personDF = pd.read_csv('person.csv', dtype = columnTypes, usecols = personColumns )
personDF

In [None]:
personDF2 = personDF.dropna( axis = 0, subset = ['PERSON_SEX', 'PERSON_AGE'] )

In [None]:
driverDF = personDF2[ (personDF2.PED_ROLE == "Driver") & (personDF2.PERSON_SEX != 'U') ].copy()
driverDF.shape

In [None]:
driverDF.PERSON_SEX.value_counts()

In [None]:
# Attach driver records (age, role, sex) to each crash/vehicle row with an
# inner join (pd.merge default) on the collision id and on the vehicle's
# DRIVER_SEX equalling the person's PERSON_SEX.
# NOTE(review): the join keys are only COLLISION_ID and sex, so when a
# collision has several drivers of the same sex every matching vehicle row is
# paired with each of them and rows are multiplied. Confirm this is acceptable,
# or join on a vehicle identifier instead.
driverCarCrashDF = pd.merge(carCrashDF, driverDF,left_on=['COLLISION_ID', 'DRIVER_SEX'], right_on =['COLLISION_ID', 'PERSON_SEX'] )
driverCarCrashDF

In [None]:
# Maps raw VEHICLE_TYPE strings to a coarse vehicle category.
# Keys are matched exactly (case, spacing and truncation included), which is
# why many spellings of the same vehicle appear separately.
# Original note on the intended categories:
#ATV, bicycle, car/suv, ebike, escooter, truck/bus, motorcycle, other)
# The NOTE(review) comments below flag entries that disagree with sibling
# entries in this same map; the code itself is unchanged.

vehicleTypeMap = { 'Station Wagon/Sport Utility Vehicle' : 'suv', 
  'Moped' : 'motorcycle', 
  'Tanker' : 'truck',
  '4 dr sedan' : 'car', 
  # NOTE(review): e-scooter spellings are split across three categories in this
  # map: 'motorcycle' here, 'ebike' for 'E-Sco' / 'E-scooter', and 'escooter'
  # for 'ELECTRIC S' / 'E REVEL SC' -- confirm the intended one.
  'E-Scooter': 'motorcycle', 
  'Taxi' : 'car', 
  'Motorcycle': 'motorcycle', 
  'Sedan' : 'car', 
  'E-Bike' : 'ebike',
  'SCOOTER': 'motorcycle', 
  'Box Truck': 'truck', 
  'Bike': 'bicycle', 
  'Motorscooter': 'motorcycle', 
  'TAXI' : 'car' ,
  'PASSENGER VEHICLE' : 'bus', 
   np.nan: 'other', 
  'Tractor Truck Diesel': 'truck', 
  'Pick-up Truck': 'truck',
  'BUS' : 'bus', 
  'SPORT UTILITY / STATION WAGON' : 'suv', 
  'Tow Truck / Wrecker': 'truck',
  '3-Door' : 'car', 
  'Flat Bed' : 'truck', 
  'Van' : 'bus', 
  'OTHER' : 'other', 
  # NOTE(review): this key ends with a space, so it only matches values that
  # carry the same trailing space.
  'SMALL COM VEH(4 TIRES) ' : 'truck',
  'Dump': 'truck', 
  # NOTE(review): 'van' is used as a category only on this line; the keys
  # 'Van', 'VAN' and 'van' all map to 'bus' -- confirm.
  'LIVERY VEHICLE' : 'van', 
  'MOTORCYCLE' : 'motorcycle', 
  'E-Bik' : 'ebike', 
  'Concrete Mixer': 'truck',
  'VAN' : 'bus', 
  # NOTE(review): 'UNKNOWN' and 'UNKNO' map to 'unknown' while 'Unknown' and
  # 'unkow' map to 'other' -- confirm which label is intended.
  'UNKNOWN' : 'unknown', 
  'Bus' : 'bus', 
  'BICYCLE' : 'bicycle',
  'LARGE COM VEH(6 OR MORE TIRES)': 'truck', 
  'AMBU': 'truck', 
  'AMBULANCE' : 'truck',
  'PICK-UP TRUCK': 'truck', 
  'Convertible': 'car', 
  '2 dr sedan' : 'car', 
  'FIRE TRUCK': 'truck',
  'seagr': 'other', 
  'Gas Scoote' : 'motorcycle', 
  'E BIK' : 'ebike', 
  'MOPED': 'motorcycle', 
  'Ambulance' : 'truck', 
  'Com' : 'truck',
  'Open Body' : 'car', 
  'P/SH' : 'suv',  
  'Minicycle' : 'bicycle', 
  'Garbage or Refuse': 'truck', 
  'BICYC' : 'bicycle',
  'POSTAL TRU': 'truck', 
  'PK' : 'truck', 
  'Motorbike' : 'motorcycle', 
  'School Bus' : 'bus' , 
  'Hopper' : 'truck', 
  'UTILITY' : 'truck',
  'Carry All' : 'truck', 
  'bus' : 'bus', 
  'DELIV': 'truck', 
  'TRuck': 'truck', 
  'dump': 'truck', 
  'BOBCA': 'utility', 
  'FORKL': 'utility',
  'NY CITY MT' : 'bus', 
  'MACK' : 'truck', 
  'limou' : 'car', 
  'GARBA' : 'truck', 
  'STAK' : 'truck', 
  'Refrigerated Van' : 'truck',
  'MTA B' : 'bus', 
  'Power Shov' : 'utility', 
  'scoo': 'motorcycle', 
  'Dodge' : 'truck', 
  'Fork lift' : 'utility', 
  'motor' : 'motorcycle',
  'TRUCK' : 'truck', 
  'LIMO' : 'car', 
  'tow' : 'truck', 
  'PICK UP' : 'truck', 
  'scooter': 'motorcycle', 
  'fork' : 'utility', 
  'fire' : 'truck',
  'E-Sco' : 'ebike', 
  'Lift Boom' : 'utility', 
  'Forklift' : 'utility', 
  'GARBAGE TR' : 'truck', 
  'Flat Rack' : 'truck',
  'Multi-Wheeled Vehicle' : 'truck', 
  'Tractor Truck Gasoline' : 'truck', 
  # NOTE(review): 'Pedicab' maps to 'bicycle' but 'PEDICAB' and 'pedicab'
  # map to 'other' -- confirm.
  'Pedicab' : 'bicycle',
  'Posta' : 'truck', 
  'Pass' : 'bus', 
  'USPS' : 'truck', 
  'MOTOR' : 'motorcycle', 
  'moped' : 'motorcycle', 
  'FDNY' : 'truck', 
  'TRAC' : 'truck', 
  'van' : 'bus',
  'Rescu' : 'truck', 
  'FORK' : 'utility', 
  'Chassis Cab' : 'truck', 
  'COM' : 'truck', 
  '4D' : 'suv', 
  'Pick' : 'truck', 
  'DELV' : 'truck',
  'kick scoot' : 'bicycle', 
  'Yamaha': 'motorcycle', 
  'Tow Truck' : 'truck', 
  'Beverage Truck' : 'truck',
  'PEPSI DELI' : 'truck', 
  'PEDICAB' : 'other', 
  'pedicab' : 'other', 
  'Pickup with mounted Camper' : 'truck',
  'NYPD' : 'car', 
  'Armored Truck' : 'truck', 
  'COMM' : 'truck', 
  'Sweeper' : 'truck', 
  'TRANSIT' : 'bus',  
  'POWER' : 'truck',
  'fdny' : 'truck', 
  'SUV' : 'suv', 
  'SCHOO' : 'bus', 
  'ELECT' : 'ebike', 
  'BOX TRUCK' : 'truck',  
  'TRACTOR' : 'truck', 
  'AMBUL' : 'truck',
  'A,n' : 'other', 
  'Suv' : 'suv', 
  'FDNY EMS' : 'truck', 
  'truck' : 'truck', 
  'OMR' : 'other', 
  'TRACT' : 'truck', 
  'FLATBED' : 'truck',
  'unkow' : 'other', 
  'MOTOR SCOO': 'motorcycle', 
  'US POSTAL' : 'truck', 
  '2 WHE' : 'motorcycle', 
  'Lunch Wagon' : 'truck',
  'WAGON' : 'truck', 
  'OMT' : 'truck', 
  'Stake or Rack' : 'truck', 
  'nat grid t' : 'truck', 
  'police rep' : 'car',
  'TRAIL' : 'truck', 
  'SCHOOL BUS' : 'bus', 
  'FIRE' : 'truck', 
  'SEGWA' : 'ebike', 
  'Ambul' : 'truck', 
  'trail' : 'truck',
  'Bulk Agriculture' : 'utility', 
  'U-HAL' : 'truck', 
  'Scoot': 'motorcycle', 
  'E-scooter' : 'ebike', 
  'AMBULENCE' : 'truck',
  'SELF INSUR' : 'other', 
  'Motorscoot': 'motorcycle', 
  'Subr' : 'bus', 
  'CONST' : 'truck', 
  'tr/tr' : 'truck', 
  'SCOOT': 'motorcycle',
  'Post offic' : 'truck', 
  'Mopped': 'motorcycle', 
  'work' : 'truck', 
  'FIRET' : 'truck', 
  'WHITE' : 'truck', 
  'TANK' : 'truck',
  'Unknown' : 'other', 
  'CEMENT TRU' : 'truck', 
  'TOW TRUCK' : 'truck', 
  'MOPAD': 'motorcycle', 
  '50cc mini': 'motorcycle',
  # NOTE(review): maps to an empty-string category, unlike every other entry
  # -- confirm whether 'other' was intended.
  'APPOR' : '', 
  'Scooter': 'motorcycle', 
  'ROAD SWEEP' : 'truck', 
  'SELF' : 'truck', 
  'AMB' : 'truck', 
  'GAS SCOOTE' : 'motorcycle',
  'RV' : 'bus', 
  'Motor' : 'motorcycle', 
  'gas scoote' : 'motorcycle', 
  'UNKNO' : 'unknown', 
  'Tractor' : 'truck', 
  'SEDAN' : 'car',
  'Ford picku' : 'truck', 
  'ELECTRIC S' : 'escooter', 
  'PAS' : 'bus', 
  'Elect' : 'ebike', 
  'suv' : 'suv', 
  'US MA' : 'truck',
  'Dump truck' : 'truck', 
  'comm' : 'truck',  
  'Fire' : 'truck', 
  'PC' : 'truck', 
  'COMME' : 'truck', 
  'DUMP' : 'truck', 
  'Minibike': 'motorcycle',
  'Snow Plow' : 'truck', 
  'PSD' : 'truck', 
  'Trail' : 'truck', 
  'trailer' : 'truck', 
  'COMMERCIAL' : 'truck', 
  'MOPD': 'motorcycle',
  'BOX T' : 'truck', 
  'SUBR' : 'bus', 
  'ambul' : 'truck', 
  'E-BIK' : 'ebike', 
  'DEIV' : 'truck', 
  'FDNY FIRE' : 'truck',
  'Enclosed Body - Removable Enclosure' : 'truck', 
  'GEICO' : 'other' , 
  'LMB'  : 'truck',
  'Fire Truck' : 'truck', 
  'scoot': 'motorcycle', 
  'fire truck' : 'truck', 
  'Grumman LL' : 'truck', 
  'posta' : 'truck', 
  'SCL' : 'bus',
  'E-bike' : 'ebike', 
  'Tractor tr' : 'truck', 
  'garbage tr' : 'truck', 
  'VAN T' : 'truck', 
  'BOX H' : 'truck', 
  'stree' : 'car',
  'tow t' : 'truck', 
  'Cat 9' : 'truck', 
  'VERZION VA' : 'truck', 
  'School bus' : 'bus', 
  'SANIT' : 'truck',
  'E REVEL SC' : 'escooter', 
  'Motorized' : 'bicycle', 
  'box t' : 'truck', 
  'OIL T' : 'truck', 
  'TRAILER' : 'truck', 
  'schoo' : 'bus' }

In [None]:
driverCarCrashDF.insert( loc = 1, column = 'vehicleType', value = driverCarCrashDF['VEHICLE_TYPE'].map(vehicleTypeMap))

In [None]:
# Any vehicleType outside the recognised categories (including missing values
# left by the mapping) is folded into "other".
recognisedTypes = driverCarCrashDF.vehicleType.isin(
    ['suv', 'truck', 'car', 'motorcycle', 'bicycle', 'bus', 'ebike', 'utility', 'escooter'] )
driverCarCrashDF.vehicleType = driverCarCrashDF.vehicleType.where( recognisedTypes, "other" )
driverCarCrashDF.vehicleType

In [None]:
driverCarCrashDF

In [None]:
#df4.vehicleType.hist()
fig = px.histogram(driverCarCrashDF, x="vehicleType")
fig.show()

In [None]:
#df4.timeOfDay.hist()
times = [ 'weehours', 'dawn', 'morning', 'noon', 'afternoon', 'evening', 'night', 'midnight']
fig = px.histogram(driverCarCrashDF, 
                   x="timeOfDay", 
                   category_orders=dict(timeOfDay = times ))
fig.show()

In [None]:
Weeks = sorted(driverCarCrashDF.Week.unique().astype(int))
fig = px.histogram(driverCarCrashDF, x="Week", category_orders=dict(Week=Weeks))
fig.show()

In [None]:
# One row per distinct (LATITUDE, LONGITUDE) pair, keeping the first street
# name seen for that coordinate, written to streets.csv without the index.
streets = (
    driverCarCrashDF[['LATITUDE', 'LONGITUDE', 'STREET_NAME']]
    .drop_duplicates(subset = ['LATITUDE', 'LONGITUDE'])
)
streets.to_csv('streets.csv', index=False)

In [None]:
# Stratified 60% sample: rows are drawn within each (street, week, degree)
# group so the sample keeps the group proportions.
# random_state is fixed so Restart & Run All reproduces the same sample and
# therefore the same downstream plots and clusters; the earlier unseeded
# .apply(lambda x: x.sample(frac=0.6)) gave a different sample on every run.
driverCarCrashDF2 = (
    driverCarCrashDF
    .groupby(['STREET_NAME', 'Week', 'degree'])
    .sample(frac = 0.6, random_state = 42)
)

In [None]:
driverCarCrashDF2.shape

In [None]:
fig = px.histogram(driverCarCrashDF2, x="class")
fig.show()

In [None]:
#df4.vehicleType.hist()
fig = px.histogram(driverCarCrashDF2, x="vehicleType")
fig.show()

In [None]:
#df4.timeOfDay.hist()
times = [ 'weehours', 'dawn', 'morning', 'noon', 'afternoon', 'evening', 'night', 'midnight']
fig = px.histogram(driverCarCrashDF2, 
                   x="timeOfDay", 
                   category_orders=dict(timeOfDay = times ))
fig.show()

In [None]:
Weeks = sorted(driverCarCrashDF2.Week.unique().astype(int))
fig = px.histogram(driverCarCrashDF2, x="Week", category_orders=dict(Week=Weeks))
fig.show()

In [None]:
# Print the dtype pandas infers from the values of each sampled-frame column.
for columnName in driverCarCrashDF2.columns:
    inferredKind = pd.api.types.infer_dtype(driverCarCrashDF2[columnName])
    print(columnName, " - ", inferredKind)

In [None]:
#driverCarCrashDF.to_csv('personCarCrash_big.csv')
#driverCarCrashDF2.to_csv('personCarCrash_sampled.csv')

In [None]:
#columnsToAggregate = ['LATITUDE', 'LONGITUDE', 'COLLISION_ID',
#                      'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
#                      'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
#                      'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
#                      'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED', 'degree' ]#
#
#df6 =  df5.groupby(['ON STREET NAME', 'Week', 'Weekday'], as_index=False)[columnsToAggregate].mean( numeric_only = False)

In [None]:
#df6.shape

In [None]:
#df6.columns

In [None]:
#df6[columnsToAggregate].sum(axis=0)

In [None]:
#import numpy as np
#
#columnsToAggregate = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
#                      'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
#                      'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
#                      'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED' ]
#df6[columnsToAggregate] = np.log(df6[columnsToAggregate])

In [None]:
# dawn, morning,  noon,   afternoon,   dusk   evening,    night,  weehours
# 5-8,  8-11,    11-14,    14-17,     17-20,  20-23,      23-2,   2-5

In [None]:
#px.set_mapbox_access_token(open(".mapbox_token").read())
#fig = px.scatter_mapbox(driverCarCrashDF2, 
#                        lat="LATITUDE", 
#                        lon="LONGITUDE",
#                        color="degree",
#                        color_continuous_scale=px.colors.sequential.Jet, 
#                        size_max=150, 
#                        zoom=10,
#                        height=800)
#fig.show()

In [None]:
driverCarCrashDF2.shape

In [None]:
driverCarCrashDF2.notna().sum()

#### Clustering

In [None]:
driverCarCrashDF2.columns

In [None]:
driverCarCrashDF2.TRAVEL_DIRECTION.unique()

In [None]:
driverCarCrashDF3 = driverCarCrashDF2[['hour', 'Weekday', 'Week', 
                       'LATITUDE', 'LONGITUDE',  'STREET_NAME',
                       'NUMBER OF PERSONS INJURED','NUMBER OF PERSONS KILLED', 
                       'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED', 
                       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 
                       'NUMBER OF MOTORIST INJURED','NUMBER OF MOTORIST KILLED', 
                       'degree',
                       'vehicleType', 'TRAVEL_DIRECTION', 
                       'DRIVER_SEX', 'PERSON_AGE', ]].copy()

In [None]:
driverCarCrashDF3.shape

In [None]:
# Print the dtype pandas infers from the values of each clustering-input column.
for columnName in driverCarCrashDF3.columns:
    inferredKind = pd.api.types.infer_dtype(driverCarCrashDF3[columnName])
    print(columnName, " - ", inferredKind)

In [None]:
vehicleOHE_DF = pd.get_dummies(driverCarCrashDF3.vehicleType)

In [None]:
directionOHE_DF = pd.get_dummies(driverCarCrashDF3.TRAVEL_DIRECTION)

In [None]:
sexOHE_DF = pd.get_dummies(driverCarCrashDF3.DRIVER_SEX)

In [None]:
# Whether the one-hot encoded vehicle / direction / sex columns take part in
# the clustering input.
# The original cell built the joined frame and then immediately overwrote it
# with driverCarCrashDF3, so the join was dead work and the one-hot columns
# were never used. The default (False) keeps that effective behaviour.
# NOTE(review): confirm whether the one-hot features were meant to be included.
USE_ONE_HOT_FEATURES = False

if USE_ONE_HOT_FEATURES:
    driverCarCrashDF4 = driverCarCrashDF3.join( [vehicleOHE_DF, directionOHE_DF, sexOHE_DF] )
else:
    # Same object as driverCarCrashDF3 (an alias, not a copy), as before.
    driverCarCrashDF4 = driverCarCrashDF3

In [None]:
driverCarCrashDF4.columns

In [None]:
driverCarCrashDF4.dropna( axis = 0, subset = ['NUMBER OF PERSONS INJURED'], inplace = True )

In [None]:
#driverCarCrashDF5 = driverCarCrashDF4.drop(
#    labels = ['vehicleType', 'TRAVEL_DIRECTION', 'DRIVER_SEX',  'STREET_NAME', 'degree',
#              'NUMBER OF PERSONS INJURED', 
#              'NUMBER OF PEDESTRIANS INJURED',
#              'NUMBER OF CYCLIST INJURED', 
#              'NUMBER OF MOTORIST INJURED', ], 
#    axis = 1)

In [None]:
driverCarCrashDF5 = driverCarCrashDF4.drop(
    labels = ['vehicleType', 'TRAVEL_DIRECTION', 'DRIVER_SEX',  'STREET_NAME', 'degree',
              'NUMBER OF PERSONS KILLED',
              'NUMBER OF PEDESTRIANS KILLED',
              'NUMBER OF CYCLIST KILLED',
              'NUMBER OF MOTORIST KILLED'], 
    axis = 1)

In [None]:
driverCarCrashDF5.shape

In [None]:
driverCarCrashDF5.isna().sum()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(driverCarCrashDF5)

In [None]:
X

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit 4 clusters on the standardised features.
# random_state pins the centroid initialisation so cluster labels (and every
# table derived from them) are reproducible between runs. n_init is given
# explicitly because its default differs between scikit-learn versions.
cluster = KMeans(n_clusters = 4, n_init = 10, random_state = 42)
model = cluster.fit(X)

In [None]:
driverCarCrashDF4['cluster'] = model.labels_

In [None]:
r1 = driverCarCrashDF4.groupby(['LATITUDE', 'LONGITUDE'])['cluster'].max()
#d1 = r1.reset_index()
#d1
r1.value_counts()

In [None]:
r2 = driverCarCrashDF4.groupby(['STREET_NAME'])['cluster'].max()
r2.value_counts()
#d2 = r2.reset_index()
#d2

In [None]:
r3 = driverCarCrashDF4.groupby(['cluster'])['NUMBER OF PERSONS KILLED'].sum()
r3

In [None]:
r4 = driverCarCrashDF4.groupby(['cluster'])['NUMBER OF PERSONS INJURED'].sum()
r4

In [None]:
r4.sort_values()

In [None]:
# Relabel clusters by rank of total persons injured: the cluster with the
# smallest total becomes 0, the largest becomes len(r4) - 1.
newClusters = dict( zip( r4.sort_values().index, range(len(r4)) ) )
newClusters

In [None]:
driverCarCrashDF4['cluster2'] = driverCarCrashDF4.cluster.map(newClusters)
driverCarCrashDF4['cluster2']

In [None]:
# Read the Mapbox token from a local file that is kept out of the notebook.
# The context manager closes the file handle, and strip() removes the trailing
# newline that editors usually add, which would otherwise be part of the token.
with open(".mapbox_token") as tokenFile:
    px.set_mapbox_access_token(tokenFile.read().strip())

In [None]:
#fig = px.density_mapbox(driverCarCrashDF4, 
#                        lat="LATITUDE", 
#                        lon="LONGITUDE",
#                        z='cluster2', 
#                        radius=10,
#                        zoom=10,
#                        height=800,
#                        mapbox_style="stamen-terrain")
#fig.show()

In [None]:
#fig = px.scatter_mapbox(driverCarCrashDF4, 
#                        lat="LATITUDE", 
#                        lon="LONGITUDE",
#                        color="cluster2",
#                        color_continuous_scale=px.colors.sequential.Jet, 
#                        size_max=150, 
#                        zoom=10,
#                        height=800)
#fig.show()