# <center> Accident table visualization </center>

## Introduction

In this notebook, we visualize the data in the accident table to find out which factors are most strongly associated with drunk-driver involvement.  

## Method
In particular, we plot the probability distribution of each variable in the accident table. The distributions are plotted as histograms, each normalized by the total number of events it contains. For each variable we therefore have two distributions: one for accidents with a drunk driver involved and one for accidents without. The two distributions are then compared, and the variables are ranked by the difference between them (the sum of the absolute differences between the two normalized histograms). 

In [None]:
# import libraries and data.
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

# load the data
# The accident table is read into a DataFrame that the cells below access as
# the global `acc_df`. The path is relative, so it is resolved against the
# working directory the notebook is run from.
acc_df = pd.read_csv('data/fars_train/accident_train.csv')

# Statistical distance between the variable distributions for drunk and sober drivers

In [None]:
def varDiff(varName):
    '''
    Measure how differently a variable is distributed in accidents with and
    without a drunk driver.

    The values of ``varName`` in the module-level ``acc_df`` are split by the
    ``DRUNK_DR`` column. Each group is turned into a histogram with unit-width
    bins that is normalized to sum to one, and the two histograms are compared.

    Parameters
    ----------
    varName : str
        Name of a numeric column of ``acc_df``.

    Returns
    -------
    float
        Sum of the absolute differences between the two normalized histograms:
        0 when they coincide, 2 when they do not overlap at all.

    Raises
    ------
    ValueError
        If either group has no non-missing value for ``varName``.
    '''
    var_df = acc_df[[varName, 'DRUNK_DR']]

    # get the data for drunk driver and sober driver
    # Missing values are dropped: they say nothing about the distribution and
    # would otherwise make the bin bounds below NaN.
    drunk = var_df.loc[var_df.DRUNK_DR == True, varName].dropna().values
    sober = var_df.loc[var_df.DRUNK_DR == False, varName].dropna().values
    if drunk.size == 0 or sober.size == 0:
        raise ValueError(
            'no values of %r for the drunk or the sober group' % (varName,))

    # determine the lower and upper bound for the histogram
    # (array reductions instead of the builtin min/max, which would loop over
    # every element in Python)
    minBin = int(np.floor(min(drunk.min(), sober.min())))
    maxBin = int(np.ceil(max(drunk.max(), sober.max())))

    # get the normalized histograms
    # One bin per integer value; the edges run to maxBin + 1 so that the
    # largest value gets a bin of its own. With unit-width bins,
    # density=True yields the fraction of events in each bin.
    bins = np.arange(minBin, maxBin + 2)
    drunk_hist = np.histogram(drunk, bins=bins, density=True)[0]
    sober_hist = np.histogram(sober, bins=bins, density=True)[0]

    # calculate the difference between these two histograms
    diff = np.sum(np.abs(drunk_hist - sober_hist))
    return diff

In [None]:
# Columns of the accident table to be scored.
varList = [
    'STATE', 'VE_FORMS', 'PEDS', 'PERSONS', 'COUNTY', 'CITY', 'YEAR',
    'DAY', 'MONTH', 'DAY_WEEK', 'HOUR', 'MINUTE', 'NHS', 'ROAD_FNC',
    'ROUTE', 'MILEPT', 'SP_JUR', 'HARM_EV', 'MAN_COLL', 'REL_ROAD',
    'LGT_COND', 'WEATHER', 'CF1', 'CF2', 'CF3', 'FATALS',
]

# Score every column with varDiff.
varImp = {}
for varName in varList:
    varImp[varName] = varDiff(varName)

# Turn the mapping into a list of (name, score) pairs ordered by score and,
# for equal scores, by name -- both descending. Reversing each pair gives the
# (score, name) sort key.
varImp = sorted(varImp.items(), key=lambda pair: pair[::-1], reverse=True)

In [None]:
# Show the ranked (variable name, distance) pairs, largest distance first.
varImp

In [None]:
# Keep the 20 highest-ranked variables; as an array of (name, distance) rows
# both columns are stored as strings, so the distances are parsed back to float.
varImp = np.array(varImp[:20])
diffValue = varImp[:, 1].astype(float)

# One bar per variable, labelled with the variable name.
plt.bar(range(len(diffValue)), diffValue, align='center')
plt.xticks(range(len(varImp)), list(varImp[:, 0]), size='small', rotation=60)
plt.xlim(-0.5, len(diffValue))

# Accident calendar

In [None]:
def day_year(year, month, day):
    '''
    Return the zero-based position of a date within its year.

    January 1st maps to 0. December 25th maps to 358 in a common year
    (365 days) and to 359 in a leap year (366 days).

    year  -- calendar year, used only to decide the length of February
    month -- month number, 1 for January
    day   -- day of the month, 1 for the first day
    '''
    # Gregorian rule: every 4th year is a leap year, except centuries that
    # are not divisible by 400.
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    month_lengths = (31, 29 if is_leap else 28, 31, 30, 31, 30,
                     31, 31, 30, 31, 30, 31)

    # days_through[m] is the number of days from January 1st up to and
    # including the last day of month m
    days_through = {}
    elapsed = 0
    for number, length in enumerate(month_lengths, start=1):
        elapsed += length
        days_through[number] = elapsed

    if month == 1:
        return day - 1
    return days_through[month - 1] + day - 1
    
def plot_cal(ax, acc_day, sad_year):
    '''
    Draw a calendar heat map of the number of accidents per day of one year.

    Every column of the image is one week and every row one weekday, Monday
    at the top; the colour of a cell encodes the number of accidents on that
    day. A colour bar is attached to the right of the axes.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw the calendar into.
    acc_day : array-like of shape (n, 2)
        One row per accident, with the month (1 for January) in column 0 and
        the day of the month in column 1. May be empty.
    sad_year : int
        Year the accidents belong to. It decides whether February has 29
        days and on which weekday the calendar starts.
    '''
    # function-scope import: nothing else in this file needs datetime
    from datetime import date

    # zero-based day of the year of every accident
    acc_day = np.asarray(acc_day)
    day_in_year = np.array(
        [day_year(sad_year, row[0], row[1]) for row in acc_day], dtype=int)

    # Accidents per day, indexed by day of the year. minlength keeps the
    # index aligned with the day even when the first or last days of the
    # year have no accident (or when there are no accidents at all).
    year = int(sad_year)
    n_days = (date(year + 1, 1, 1) - date(year, 1, 1)).days
    hist_acc_day = np.bincount(day_in_year, minlength=n_days)

    # Shift every day by the weekday of January 1st (Monday == 0) so that
    # the rows really match the 'Mon' .. 'Sun' labels below.
    first_weekday = date(year, 1, 1).weekday()
    slots = np.arange(len(hist_acc_day)) + first_weekday

    # 53 week columns are enough unless a leap year starts on a Sunday
    n_weeks = max(53, int(slots[-1] // 7) + 1)
    acc_cal = np.zeros((7, n_weeks), dtype=int)
    acc_cal[slots % 7, slots // 7] = hist_acc_day

    labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    ax.yaxis.set_ticks(range(0, 7))
    ax.yaxis.set_ticklabels(labels)
    im = ax.imshow(acc_cal, interpolation='nearest', cmap='OrRd')
    ax.axes.get_xaxis().set_ticks([])

    # narrow colour bar in its own axes to the right of the calendar
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="1%", pad=0.05)
    cb = plt.colorbar(im, cax=cax)
    # NOTE: the colour scale is chosen per call. To compare several calendars
    # on one scale, fix it here, e.g. im.set_clim(0, 100) together with
    # cb.set_ticks(np.arange(0, 110, 10)).

In [None]:
cal_df = acc_df[['YEAR', 'MONTH', 'DAY', 'DRUNK_DR']]
sad_year = 2007

# Accidents of the selected year whose month and day lie in the ranges
# 1..12 and 1..31 (both bounds included).
valid_date = (
    (cal_df.YEAR == sad_year)
    & cal_df.MONTH.between(1, 12)
    & cal_df.DAY.between(1, 31)
)
is_drunk = cal_df.DRUNK_DR == True
is_sober = cal_df.DRUNK_DR == False

# (month, day) of every accident, of those with a drunk driver, and of
# those without
all_day = cal_df.loc[valid_date, ['MONTH', 'DAY']]
drunk_day = cal_df.loc[valid_date & is_drunk, ['MONTH', 'DAY']]
sober_day = cal_df.loc[valid_date & is_sober, ['MONTH', 'DAY']]

In [None]:
# Three calendars stacked vertically: all accidents, accidents with a drunk
# driver, accidents without one.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(20,6))
plot_cal(ax1, all_day, sad_year)
plot_cal(ax2, drunk_day, sad_year)
plot_cal(ax3, sober_day, sad_year)

# Lay out the panels before writing the file, then close the figure.
plt.tight_layout()
# NOTE(review): absolute, machine-specific output path -- this fails on any
# machine without a /Users/Wenbo/Desktop directory; confirm where the file
# should be written.
plt.savefig('/Users/Wenbo/Desktop/Accident_calendar.pdf', dpi=300)
plt.close()