In [19]:
import json
import os
import sklearn
import numpy
import xgboost as xgb
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from collections import namedtuple
import sys
from math import log,tan,pi,radians
import pandas as pd
from sodapy import Socrata
from config import CHI_API_KEY
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## Access Data

In [199]:
client = Socrata("data.cityofchicago.org", app_token = CHI_API_KEY)

results = client.get("85ca-t3if", limit= 10000)
results_df = pd.DataFrame.from_records(results)

## Chicago Neighborhood Label

In [200]:
Pt = namedtuple('Pt','x,y')
Edge = namedtuple('Edge','a,b')
Poly = namedtuple('Poly','name,edges')
_eps = 1e-5
_huge = sys.float_info.max
_tiny = sys.float_info.min

In [201]:
def load_json():
	file_in = open(os.getcwd()+"/Neighborhoods_2012_polygons.json")
	d = json.load(file_in)
	return d

In [202]:
def spherical_mercator_projection(longitude,latitude):
    x = -longitude
    y = log(tan(radians(pi/4 + latitude/2)))
    return (x,y)

In [203]:
def rayintersectseg(p, edge):
    a,b = edge
    if a.y > b.y:
        a,b = b,a
    if p.y == a.y or p.y == b.y:
        p = Pt(p.x, p.y + _eps)
    intersect = False
 
    if (p.y > b.y or p.y < a.y) or (
        p.x > max(a.x, b.x)):
        return False
 
    if p.x < min(a.x, b.x):
        intersect = True
    else:
        if abs(a.x - b.x) > _tiny:
            m_red = (b.y - a.y) / float(b.x - a.x)
        else:
            m_red = _huge
        if abs(a.x - p.x) > _tiny:
            m_blue = (p.y - a.y) / float(p.x - a.x)
        else:
            m_blue = _huge
        
        intersect = m_blue >= m_red
    return intersect

In [204]:
def is_odd(x): 
	return x%2 == 1

def ispointinside(p, poly):
    ln = len(poly)
    return is_odd(sum(rayintersectseg(p, edge)
                    for edge in poly.edges ))

In [205]:
def get_all_neighborhoods():
	d = load_json()
	shape_list=[]
	for shape_idx in range(len(d['features'])):
		name = d['features'][shape_idx]['properties']['SEC_NEIGH']

		edges =[]
		for coordinate_idx in range(len(d['features'][shape_idx]['geometry']['coordinates'][0])-1):
			lon_1 = d['features'][shape_idx]['geometry']['coordinates'][0][coordinate_idx][0]
			lat_1 = d['features'][shape_idx]['geometry']['coordinates'][0][coordinate_idx][1]
			
			lon_2 = d['features'][shape_idx]['geometry']['coordinates'][0][coordinate_idx+1][0]
			lat_2 = d['features'][shape_idx]['geometry']['coordinates'][0][coordinate_idx+1][1]
			
			x1,y1 = spherical_mercator_projection(lon_1,lat_1)
			x2,y2 = spherical_mercator_projection(lon_2,lat_2)
			edges.append(Edge(a=Pt(x=x1,y=y1),b=Pt(x=x2,y=y2)))
		
		shape_list.append(Poly(name=name,edges=tuple(edges)))
	return shape_list

In [206]:
def find_neighborhood(test_long,test_lat,all_neighborhoods):
	x,y = spherical_mercator_projection(test_long,test_lat)
	for neighborhood in all_neighborhoods:
		correct_neighborhood = ispointinside(Pt(x=x,y=y),neighborhood)
		if correct_neighborhood:
			return neighborhood.name

all_neighborhoods = get_all_neighborhoods()

In [207]:
neighborhood = []
for row in results_df["longitude"]:
    test_long = float(row)
for row_1 in results_df["latitude"]:
    test_lat = float(row_1)
    neighborhood.append(find_neighborhood(test_long,test_lat,all_neighborhoods))

results_df['hood_label'] = neighborhood

results_df["hood_label"].unique()

array(['SOUTH SHORE, GRAND CROSSING', None, 'BRONZEVILLE',
       'WASHINGTON PARK', 'CHATHAM,BURNSIDE', 'GRANT PARK',
       'WEST PULLMAN', 'NEAR SOUTH SIDE', 'LOOP', 'MUSEUM CAMPUS',
       'PRINTERS ROW', 'WASHINGTON HEIGHTS,ROSELAND', 'MILLENIUM PARK',
       'RIVERDALE'], dtype=object)

In [208]:
for col in results_df.columns: 
    print(col) 

crash_record_id
crash_date
posted_speed_limit
traffic_control_device
device_condition
weather_condition
lighting_condition
first_crash_type
trafficway_type
alignment
roadway_surface_cond
road_defect
report_type
crash_type
intersection_related_i
damage
date_police_notified
prim_contributory_cause
sec_contributory_cause
street_no
street_direction
street_name
beat_of_occurrence
num_units
most_severe_injury
injuries_total
injuries_fatal
injuries_incapacitating
injuries_non_incapacitating
injuries_reported_not_evident
injuries_no_indication
injuries_unknown
crash_hour
crash_day_of_week
crash_month
latitude
longitude
location
hit_and_run_i
crash_date_est_i
photos_taken_i
private_property_i
statements_taken_i
work_zone_i
work_zone_type
workers_present_i
dooring_i
rd_no
hood_label


In [209]:
results_df["street_direction"].unique()

array(['S', 'N', 'W', 'E'], dtype=object)

In [210]:

results_df = results_df[results_df['hood_label'].notna()]
# filter(None, results_df)

## One Hot Encoding

In [238]:

values = array(results_df["hood_label"])

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)


onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
hl_onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(hour_onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [245]:

values = array(results_df["street_direction"])

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)


onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
SD_onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# hour_onehot_encoded

In [246]:
values = array(results_df["damage"])

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)


onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
damage_onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

## XGBoost Model

In [264]:
X = hl_onehot_encoded
y = damage_onehot_encoded
print(X.shape)
print(y.shape)

(6257, 13)
(6257, 3)


In [265]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)


In [266]:
xgb_estimator = xgb.XGBClassifier(objective='binary:logistic')


In [267]:
multilabel_model = MultiOutputClassifier(xgb_estimator)


In [268]:
multilabel_model.fit(X_train, y_train)


MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None, gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=None,
                                              max_delta_step=None,
                                              max_depth=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=100, n_jobs=None,
                         

In [269]:
print('Accuracy on test data: {:.1f}%'.format(accuracy_score(y_test, multilabel_model.predict(X_test))*100))

Accuracy on test data: 68.5%


## Neural Network

In [134]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [188]:
X = hour_onehot_encoded
y = damage_onehot_encoded
print(X.shape)
print(y.shape)

(364, 9)
(364, 3)


In [189]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
# y_train_categorical = to_categorical(y_train)
# y_test_categorical = to_categorical(y_test)
# y_train_categorical

In [190]:
model = Sequential()

number_inputs = 3
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [191]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [192]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2
)

ValueError: Error when checking input: expected dense_6_input to have shape (3,) but got array with shape (9,)

## Deep Learning

In [221]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense

In [223]:
import numpy as np
import pandas as pd

In [247]:
ml_df = pd.DataFrame(columns = [["SD", "HL"]]) 
ml_df["SD"] = SD_onehot_encoded

Exception: Data must be 1-dimensional

In [229]:
X = hour_onehot_encoded
y = damage_onehot_encoded

In [250]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [251]:
X_scaler = StandardScaler().fit(X_train)

In [252]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [253]:
model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=2))
model.add(Dense(units=2, activation='softmax'))

In [254]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [255]:
model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    shuffle=True,
    verbose=2
)

ValueError: Error when checking input: expected dense_8_input to have shape (2,) but got array with shape (13,)