In [79]:
# Code illustrating import of gluon library
# !sudo pip install mxnet
from __future__ import division

# Silence warnings
import warnings
warnings.filterwarnings('ignore')

import mxnet as mx
from mxnet import gluon, autograd, ndarray
import numpy as np

%matplotlib inline
from __future__ import division
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter

Let's decide on some graphic options. Since these data sets have so many columns, it might be nice to see more of them than the default.

In [5]:
# Show up to 110 columns when displaying the wide data tables instead of
# truncating them at the pandas default. The fully qualified option key is
# used because the bare 'max_columns' pattern can match more than one option
# on newer pandas releases, which raises an OptionError.
pd.set_option('display.max_columns', 110)
# Apply the ggplot style to every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')

Here we're reading in our data files and saving them as variables.

In [6]:
# Read the three CSV tables (accident, person, vehicle) from the local
# data/ directory into DataFrames.
accidents = pd.read_csv('data/accident.csv')
persons = pd.read_csv('data/person.csv')
# The vehicle file is read as latin1 in case the default decoding raises
# encoding errors.
vehicles = pd.read_csv('data/vehicle.csv', encoding='latin1')

Let's get a baseline assessment of a model's ability to accurately classify an Accident without considering information contained within the Person or Vehicle tables.  Let's prepare our data for the Gluon multi-class classification model.

We will want to use the following features of the Accident table:
#Page 32 of FARS Data Manual

    'PEDS': Persons not in vehicles
    'PERNOTMVIT': Non-motorists in crash
    'VE_TOTAL': All vehicles in crash
    'VE_FORMS': Number of vehicles in transport
    'PVH_INVL': Number of parked, working vehicles
    'PERSONS': Number of persons involved
    'PERMVIT': Number of motorists in accident
    'COUNTY': County where accident occurred
    'CITY': City where accident occurred
    'MONTH': Month of accident
    'DAY': Day of month of accident
    'DAY_WEEK': Day of week of accident
    'YEAR': Year of crash
    'HOUR': Hour of crash
    'MINUTE': Minute of hour of crash
    'TWAY_ID': Traffic Direction at time of crash
    'TWAY_ID2': Traffic Direction at time of crash
    # Not in data, but in data manual
    # 'CL_TWAY': Routing signal at time of accident
    'ROUTE': Routing signal at time of accident
    'SP_JUR': Special Jurisdiction
    'MILEPT' : Closest mile point
    'LATITUDE' : Latitude
    'LONGITUD': Longitude
    'TYP_INT': Type of intersection
    'REL_ROAD': Relation to Trafficway
    'C_M_ZONE': Work Zone
    'WRK_ZONE': Work Zone
    'LGT_COND': Light condition
    'WEATHER': Weather condition
    'WEATHE1': Weather condition
    'WEATHE2': Weather condition

    # Label
    'MAN_COLL': Manner of collision
    'HARM_EV': First injury or damage producing event in crash
    
    
    
*All features have scale/index discontinuities over the history of the data; this must be accounted for on a feature-by-feature basis
    
    

In [77]:
# Columns of the accident table used for the baseline model: the selected
# features followed by the label column. Commented-out names are alternatives
# that are deliberately left out of this run.
sub_features = [
    'PEDS',
    'PERNOTMVIT',
    'VE_TOTAL',
    'VE_FORMS',
    'PVH_INVL',
    'PERSONS',
    'PERMVIT',
    'COUNTY',
    'CITY',
    'MONTH',
    'DAY',
    'DAY_WEEK',
    'HOUR',
    # Following 2 features are the same over different time horizons
    # 'TWAY_ID',
    # 'TWAY_ID2',
    # Following 2 features are the same over different time horizons
    # 'CL_TWAY',
    'ROUTE',
    'TYP_INT',
    'LGT_COND',
    # Following 3 features are the same over different time horizons
    'WEATHER',
    # 'WEATHER1',
    # 'WEATHER2'

    # LABELS: We can choose which to predict
    # Manner of Collision
    'MAN_COLL',
    # First Harmful Event
    # 'HARM_EV'
]

# Take an explicit copy so that later column edits act on an independent
# frame rather than on a selection of `accidents` (avoids
# SettingWithCopyWarning and keeps the raw table untouched).
df_accidents = accidents[sub_features].copy()


Now that we have a sparse dataframe with features and a label (MAN_COLL), we'll want to encode our categorical features.  Before we do that, let's decrease some of the sparsity by removing features whose values are zero in more than half of the rows.

In [None]:
# Almost all instances of 0 map to None, so we will remove columns that have an abundance of them
def trim_features(data_frame, ratio, keep=None):
    """Return a copy of ``data_frame`` without its mostly-zero columns.

    A column is dropped when the share of its values that are truthy
    (``astype(bool)``, so anything other than 0/False) is strictly below
    ``ratio``. The input frame is left unmodified, so re-running the calling
    cell always starts from the same data.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose columns are inspected.
    ratio : float
        Minimum share of non-zero values a column needs in order to be kept.
    keep : iterable of str, optional
        Column names that are never dropped regardless of their share of
        zeros (for example a label column). Defaults to protecting nothing.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the surviving columns, in their original order.
    """
    protected = set(keep or ())
    # Single pre-formatted strings print identically on Python 2 and 3.
    print('Begining columns: %d' % len(data_frame.columns))

    rows = data_frame.shape[0]
    if rows == 0:
        # Without rows no share can be computed; keep every column.
        trimmed = data_frame.copy()
    else:
        # float() keeps this a true division even without the __future__ import.
        non_zero_share = data_frame.astype(bool).sum(axis=0) / float(rows)
        to_drop = [
            col for col in data_frame.columns
            if col not in protected and non_zero_share[col] < ratio
        ]
        trimmed = data_frame.drop(to_drop, axis=1)

    print('Ending columns: %d' % len(trimmed.columns))
    return trimmed
        
    
# Trim geographical columns
def trim_geo(data_frame, columns, threshold=9995):
    """Return a copy of ``data_frame`` with oversized codes in ``columns`` set to 0.

    Every value strictly greater than ``threshold`` in the listed columns is
    replaced by 0; all other values, and all other columns, are unchanged.
    The input frame is not modified.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the columns to clean.
    columns : iterable of str
        Names of the columns to process.
    threshold : number, optional
        Values above this are replaced by 0. Defaults to 9995.
        NOTE(review): presumably codes above 9995 are "unknown/not reported"
        sentinels -- confirm against the data manual.

    Returns
    -------
    pandas.DataFrame
        New frame with the replacements applied.
    """
    trimmed = data_frame.copy()
    for col in columns:
        # Vectorized replacement; NaN compares False and is therefore kept.
        trimmed[col] = trimmed[col].mask(trimmed[col] > threshold, 0)
    return trimmed
        

# TODO: trim_geo on CITY too
# Start from df_accidents: df_accidents_trimmed does not exist yet on a fresh
# kernel. Geographic codes are zeroed first so that they count as zeros when
# the mostly-zero columns are dropped afterwards.
df_accidents_geo = trim_geo(df_accidents, ['COUNTY'])
df_accidents_trimmed = trim_features(df_accidents_geo, 0.5)

df_accidents_trimmed.head()

Now that we have semi-dense data, let's one-hot encode our categorical features

In [None]:
# Number of classes assumed for each categorical column, keyed by column name.
# NOTE(review): these counts are hard-coded here and are not checked against
# the data -- confirm them against the data manual.
sub_feature_classes = {
    'COUNTY': 999,
    'CITY': 999,
    'MONTH': 12,
    'DAY': 31,
    'DAY_WEEK': 7,
    'HOUR': 24,
    'ROUTE': 9,
    'TYP_INT': 10,
    'LGT_COND': 9,
    'WEATHER': 14,

    # LABEL
    'MAN_COLL': 12,
    'HARM_EV': 100
}


def one_hot_encode_matrix(feature_matrix, label):
    """Placeholder for a custom one-hot encoder; not implemented yet.

    Currently does nothing and returns None.
    """
    # TODO: implement, or remove in favour of pd.get_dummies below.
    pass


# NOTE(review): with no `columns` argument, pd.get_dummies only encodes
# object/string/category columns. If the categorical features are stored as
# integer codes this call leaves them unchanged -- confirm the dtypes and pass
# `columns=[...]` if encoding of numeric codes is intended.
df_accidents_trimmed = pd.get_dummies(df_accidents_trimmed)
df_accidents_trimmed.head()