In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Chicago traffic crashes prediction

You've been given a crucial role as a data scientist for Chicago. 
Your job is to predict which accidents might require a response, like medical aid, towing, or both.
You'll analyze factors like accident location, road conditions, speed limits, and time. 
Chicago wants to use this information to better allocate its resources, considering factors like weather and time of day.

**Note**: This dataset is a small subset of the one available at the [Chicago Data Portal](https://data.cityofchicago.org/). 
We've chosen this subset because you'll be using a `KNeighborsClassifier`, which performs efficiently with small to medium-sized datasets but can be quite slow with larger ones. 
In future assignments, you'll work with the entire dataset.

In [2]:
# load data
# Source CSV hosted on GitHub (a small subset of the Chicago Data Portal crash data).
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/traffic_crashes_Chicago.csv'
# Use CRASH_DATE as the index; parse_dates=True asks pandas to parse that index as datetimes.
data = pd.read_csv(url, index_col='CRASH_DATE', parse_dates=True)
# Save a local copy of the downloaded frame in the current working directory.
data.to_csv('traffic_crashes_Chicago.csv')
# Display the loaded frame as the cell output.
data

Unnamed: 0_level_0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION,YEAR
CRASH_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-31 07:34:00,a0cdc2e317e24a87ffb5ed39a0f1ab99054fe04167615b...,JG205578,,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,DAYLIGHT,REAR END,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,2.0,0.0,7,6,3,41.909494,-87.747824,POINT (-87.747823796021 41.909493550808),2023
2023-06-26 16:15:00,00e93310a117dc0228ee5e00affc77ab0bd3334e54db75...,JG317047,,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,OTHER,...,0.0,2.0,0.0,16,2,6,,,,2023
2023-04-07 17:15:00,07c772b5d5b0264284f35a7769114ae681037a123d9872...,JG214567,,15,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,PARKING LOT,...,0.0,2.0,0.0,17,6,4,41.834402,-87.616894,POINT (-87.61689418428 41.834401691989),2023
2023-02-01 16:00:00,4d0d885dfa2da00a8d196c58a8d4f249c3c697fb478ecb...,JG138027,,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,2.0,0.0,16,4,2,41.962140,-87.645937,POINT (-87.645936592224 41.962140154293),2023
2023-02-27 09:55:00,2630202e4794a8b4dd665b5ad172b09f0be849937eb5f7...,JG167558,,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,ONE-WAY,...,0.0,2.0,0.0,9,2,2,41.891604,-87.625307,POINT (-87.625306944978 41.89160410607),2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-23 08:41:00,39e3191301443098210c420c84157dec9a9fcd3b982f51...,JG271497,Y,25,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,DIVIDED - W/MEDIAN (NOT RAISED),...,0.0,2.0,0.0,8,3,5,41.707211,-87.628239,POINT (-87.628239101889 41.707211473793),2023
2023-04-21 21:15:00,ee4a15023569327d9ac20fe8a06dbc79aa4e353c7dcdf9...,JG232982,,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,...,0.0,1.0,0.0,21,6,4,41.659773,-87.636651,POINT (-87.636650606697 41.659773314849),2023
2023-09-14 15:15:00,a0e267ee446b134cbdab5b9ae1f64c698a622c87984768...,JG424169,,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,...,0.0,2.0,0.0,15,5,9,41.793506,-87.711398,POINT (-87.711398027946 41.793506266409),2023
2023-10-11 08:51:00,9316f75a7f7d6aee7380b7347705907c336b427cea5898...,,,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,...,0.0,2.0,0.0,8,4,10,41.799679,-87.733000,POINT (-87.732999967493 41.799679470254),2023


In [None]:
# names of every column whose dtype is `object` (string-like / categorical candidates)
data.select_dtypes(include=['object']).columns.tolist()

Train, fine-tune, and test a `KNeighborsClassifier` model for predicting the `CRASH_TYPE` column in the dataset. Create a brief report summarizing your findings for the city of Chicago.

In [None]:
# Keep only columns that have at least 90% non-missing values
# (`thresh` is the minimum number of non-NA entries a column needs to survive),
# i.e. columns with more than 10% missing values are dropped.
data = data.dropna(thresh=0.9*len(data), axis=1)
# Remaining missing-value count per column.
data.isna().sum()

In [None]:
# Remove record identifiers and raw coordinate columns before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: running it twice, or running it after one of these columns was already
# removed by the missing-value filter, no longer raises KeyError.
data = data.drop(
    columns=['CRASH_RECORD_ID', 'RD_NO', 'LATITUDE', 'LONGITUDE', 'LOCATION'],
    errors='ignore',
)

In [None]:
# your code here
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Base estimator; its hyperparameters are reachable as `knn_clf__*` in the final pipeline.
knnclf = KNeighborsClassifier()

# Categorical predictors: every object-dtype column except the target `CRASH_TYPE`.
categorical_features = [
    col
    for col in data.select_dtypes(include=['object']).columns
    if col != 'CRASH_TYPE'
]

# Impute BEFORE encoding. The imputer has to see the raw categories so it can fill gaps
# with the most frequent one; placed after the one-hot encoder (and with strategy='mean')
# it never sees a missing value and has no effect.
# Step names ('imputer', 'encoder') are kept so `preprocessor__cat__encoder__*` grid keys
# continue to resolve.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# NOTE(review): only the categorical columns are listed here, and ColumnTransformer's
# default `remainder='drop'` discards everything else, so numeric columns (e.g. speed
# limit, crash hour) are NOT used by the model. Add a scaled numeric branch if they
# should be, after checking that none of them leak the target.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full classification pipeline: preprocessing followed by k-nearest neighbours.
pipe_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn_clf', knnclf),
])

pipe_clf

In [None]:
# Inspect the column labels currently present in `data`.
data.columns

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Hyperparameter grid. Only settings that can change the predictions are searched:
# - `algorithm` merely selects how neighbours are looked up (the result is the same exact
#   k-NN answer, and scikit-learn falls back to brute force on sparse one-hot input), so
#   searching over it only multiplies the run time.
# - `handle_unknown='error'` makes a CV fit/score fail whenever a validation fold contains
#   a category unseen in the training fold, so it is not a useful candidate.
param_grid = {
    'knn_clf__n_neighbors': list(range(2, 11)),
    'knn_clf__weights': ['uniform', 'distance'],
}

# 5-fold cross-validated grid search over the full pipeline, scored by accuracy.
grid_clf = GridSearchCV(pipe_clf, param_grid, cv=5, scoring='accuracy')

# Separate predictors from the target.
X = data.drop(columns=['CRASH_TYPE'])
y = data['CRASH_TYPE']

# Fixed seed makes the split (and every reported number) reproducible; stratifying keeps
# the class proportions of CRASH_TYPE the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Fit the grid search on the training split only.
grid_clf.fit(X_train, y_train)

# Print the results explicitly: a notebook cell only displays its LAST expression, so
# bare `grid_clf.best_params_` / `best_score_` / `score(...)` lines would show nothing.
print('Best parameters:', grid_clf.best_params_)
print(f'Best cross-validated accuracy: {grid_clf.best_score_:.3f}')
print(f'Test accuracy: {grid_clf.score(X_test, y_test):.3f}')

# Confusion matrix on the held-out test split.
from sklearn.metrics import ConfusionMatrixDisplay

y_pred = grid_clf.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred);