In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels
import matplotlib.pyplot as plt
import pprint
import matplotlib as mpl
import calendar

In [2]:
collisions = pd.read_csv(r"C:\Users\61102\Downloads\California-Traffic-Collisions-Data\collisions.csv",  low_memory=False,  parse_dates=["collision_date"])

In [3]:
len(collisions)

9424334

In [4]:
collisions.isnull().sum()

case_id                     0
jurisdiction            11407
officer_id              22367
reporting_district    5572459
chp_shift                   0
                       ...   
latitude              6730338
longitude             6730338
collision_date              0
collision_time          82415
process_date                0
Length: 75, dtype: int64

In [5]:
collisions.columns[collisions.isnull().any()]

Index(['jurisdiction', 'officer_id', 'reporting_district', 'population',
       'special_condition', 'city_division_lapd', 'chp_beat_class',
       'beat_number', 'primary_road', 'secondary_road', 'distance',
       'direction', 'intersection', 'weather_1', 'weather_2',
       'state_highway_indicator', 'caltrans_county', 'caltrans_district',
       'state_route', 'route_suffix', 'postmile_prefix', 'postmile',
       'location_type', 'ramp_intersection', 'side_of_highway', 'tow_away',
       'killed_victims', 'injured_victims', 'party_count',
       'primary_collision_factor', 'pcf_violation_code',
       'pcf_violation_category', 'pcf_violation', 'pcf_violation_subsection',
       'hit_and_run', 'type_of_collision', 'motor_vehicle_involved_with',
       'pedestrian_action', 'road_surface', 'road_condition_1',
       'road_condition_2', 'lighting', 'control_device', 'chp_road_type',
       'not_private_property', 'alcohol_involved',
       'statewide_vehicle_type_at_fault', 'chp_vehicl

In [6]:
collisions["type_of_collision"].fillna("Unknown", inplace = True) 
collisions["killed_victims"].fillna(0, inplace = True)
collisions["injured_victims"].fillna(0, inplace = True)
collisions["party_count"].fillna(0, inplace = True)
collisions["weather_1"].fillna("Unknown", inplace = True)
collisions["statewide_vehicle_type_at_fault"].fillna("Unknown", inplace = True)
collisions["chp_vehicle_type_at_fault"].fillna("Unknown", inplace = True)

In [7]:
collisions.columns[collisions.isnull().any()]

Index(['jurisdiction', 'officer_id', 'reporting_district', 'population',
       'special_condition', 'city_division_lapd', 'chp_beat_class',
       'beat_number', 'primary_road', 'secondary_road', 'distance',
       'direction', 'intersection', 'weather_2', 'state_highway_indicator',
       'caltrans_county', 'caltrans_district', 'state_route', 'route_suffix',
       'postmile_prefix', 'postmile', 'location_type', 'ramp_intersection',
       'side_of_highway', 'tow_away', 'primary_collision_factor',
       'pcf_violation_code', 'pcf_violation_category', 'pcf_violation',
       'pcf_violation_subsection', 'hit_and_run',
       'motor_vehicle_involved_with', 'pedestrian_action', 'road_surface',
       'road_condition_1', 'road_condition_2', 'lighting', 'control_device',
       'chp_road_type', 'not_private_property', 'alcohol_involved',
       'motorcyclist_injured_count', 'primary_ramp', 'secondary_ramp',
       'latitude', 'longitude', 'collision_time'],
      dtype='object')

In [8]:
collisions = collisions.dropna(axis=1)

In [9]:
collisions['injuries_cases'] = collisions["killed_victims"]+collisions["injured_victims"]

In [28]:
collisions[collisions['injuries_cases']>1]

Unnamed: 0,chp_shift,county_city_location,county_location,beat_type,chp_beat_type,weather_1,collision_severity,killed_victims,injured_victims,party_count,...,severe_injury_count,other_visible_injury_count,complaint_of_pain_injury_count,pedestrian_killed_count,pedestrian_injured_count,bicyclist_killed_count,bicyclist_injured_count,motorcyclist_killed_count,collision_date,injuries_cases
10,1400 thru 2159,1900,los angeles,chp county roadarea,county road area,cloudy,pain,0.0,2.0,3.0,...,0,0,2,0,2,0,0,0,2009-02-08,2.0
20,0600 thru 1359,1900,los angeles,chp county roadarea,county road area,cloudy,pain,0.0,2.0,2.0,...,0,0,2,0,0,0,0,0,2009-02-13,2.0
32,0600 thru 1359,1942,los angeles,chp state highway,interstate,clear,other injury,0.0,2.0,1.0,...,0,1,1,0,0,0,0,0,2009-02-23,2.0
47,1400 thru 2159,400,butte,chp county roadarea,county road area,clear,pain,0.0,5.0,2.0,...,0,0,5,0,0,0,0,0,2009-01-06,5.0
56,1400 thru 2159,3801,san francisco,chp state highway,interstate,cloudy,pain,0.0,4.0,3.0,...,0,0,4,0,0,0,0,0,2009-01-30,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9424292,not chp,1933,los angeles,not chp,not chp,clear,pain,0.0,2.0,2.0,...,0,0,2,0,0,0,0,0,2016-10-10,2.0
9424298,2200 thru 0559,3801,san francisco,chp state highway,us highway,clear,other injury,0.0,2.0,3.0,...,0,1,1,0,0,0,0,0,2015-09-10,2.0
9424305,not chp,3009,orange,not chp,not chp,clear,fatal,1.0,3.0,2.0,...,3,0,0,0,0,0,0,0,2015-12-06,4.0
9424313,2200 thru 0559,1502,kern,chp state highway,state route,raining,other injury,0.0,2.0,2.0,...,0,1,1,0,0,0,0,0,2015-07-01,2.0


In [30]:
category_columns = collisions.select_dtypes('object').columns.tolist()
category_columns

['chp_shift',
 'county_location',
 'beat_type',
 'chp_beat_type',
 'weather_1',
 'collision_severity',
 'type_of_collision',
 'statewide_vehicle_type_at_fault',
 'chp_vehicle_type_at_fault']

In [26]:
collisions = collisions.drop('process_date', 1)
collisions = collisions.drop('case_id', 1)

In [24]:
numeric_cols=collisions.select_dtypes('number').columns.tolist()

['case_id',
 'county_city_location',
 'killed_victims',
 'injured_victims',
 'party_count',
 'pedestrian_collision',
 'bicycle_collision',
 'motorcycle_collision',
 'truck_collision',
 'severe_injury_count',
 'other_visible_injury_count',
 'complaint_of_pain_injury_count',
 'pedestrian_killed_count',
 'pedestrian_injured_count',
 'bicyclist_killed_count',
 'bicyclist_injured_count',
 'motorcyclist_killed_count',
 'injuries_cases']

In [34]:
from sklearn.preprocessing import LabelEncoder

def toNumeric(data, to):
    le = LabelEncoder()
    collisions[to] = le.fit_transform(collisions[data].astype(str))

for category in category_columns:
    toNumeric(category, 'n_'+str(category))

In [62]:
X = collisions[['n_chp_shift','n_county_location','n_beat_type','n_chp_beat_type','n_weather_1','n_collision_severity','n_type_of_collision','n_statewide_vehicle_type_at_fault','n_chp_vehicle_type_at_fault', 'county_city_location','party_count','pedestrian_collision','bicycle_collision','motorcycle_collision','truck_collision']]
y = collisions['injuries_cases']

In [63]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

In [64]:
X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.3)

In [65]:
def run_model(model, X_train, Y_train, x_test, y_test, verbose=False):
    # Y_train = Y_train[:, np.newaxis].ravel()
    model.fit(X_train, Y_train)
    y_predict = model.predict(x_test)
    rmsle = np.sqrt(np.mean(np.power(np.log1p(y_test) - np.log1p(y_predict), 2)))
    rmsle = round(rmsle,14)
    print(rmsle)
    return model, rmsle


In [66]:
lr = LinearRegression()
print("Linear Regression")
print("----------------")
model_1, rmsle_1 = run_model(lr, X_train, Y_train, x_test, y_test)

Linear Regression
----------------
0.35294940524806


  """


In [67]:
pd.DataFrame(lr.coef_, X.columns, columns=['coef']).sort_values(by='coef', ascending=False)

Unnamed: 0,coef
party_count,0.198148
motorcycle_collision,0.049706
n_county_location,0.010266
n_chp_beat_type,0.008534
n_chp_shift,0.00816
n_statewide_vehicle_type_at_fault,0.006412
county_city_location,-0.0001
n_chp_vehicle_type_at_fault,-0.000114
n_weather_1,-0.002843
n_beat_type,-0.012398
