In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Plot styling: 2.0-wide patch outlines, seaborn "white" style, "dark" palette.
sns.set_context(rc = {'patch.linewidth': 2.0})
sns.set_style("white")
sns.set_palette(sns.color_palette("dark"))
# Force edges to be drawn on patches.
plt.rcParams["patch.force_edgecolor"] = True
# NOTE(review): applied after the seaborn calls above, so it presumably wins for
# any rc key both touch. Newer matplotlib releases renamed the bundled seaborn
# styles -- confirm 'seaborn-notebook' still resolves in the target environment.
plt.style.use('seaborn-notebook')

# Directory (relative to the notebook's working directory) that the CSV inputs
# are read from and the feature table is written to.
data_dir = '../data/'

In [2]:
# Load the per-segment AADT table.
# `DataFrame.from_csv` is deprecated (and removed in pandas 1.0); `read_csv`
# with `index_col=0` is the supported equivalent. The first CSV column is a
# plain integer row index, so no date parsing of the index is requested.
df = pd.read_csv(data_dir + 'df_aadt.csv', index_col=0)
df.head()

Unnamed: 0,Segment_ID,Segment_Num,Year,Route,County,District,Postmile,Postmile_Boundary,Postmile_Distance,Latitude,Longitude,Back_Peak_Hourly,Back_Peak_Monthly,Back_AADT,Ahead_Peak_Hourly,Ahead_Peak_Monthly,Ahead_AADT
0,0,1,2010,1,ORA,12,0.129,8.43,8.301,33.467051,-117.669809,0,0,0,3750,40000,37000
1,1,4,2010,1,ORA,12,8.43,9.418,0.988,33.531752,-117.774872,2850,38500,36000,2850,38500,36000
2,2,5,2010,1,ORA,12,9.418,19.797,10.379,33.542738,-117.785319,2850,38500,36000,3600,43000,40000
3,3,11,2010,1,ORA,12,19.797,21.549,1.752,33.621271,-117.929359,4100,50000,46000,4400,42500,40000
4,4,13,2010,1,ORA,12,21.549,22.09,0.541,33.630667,-117.956623,3550,34000,32000,4300,41500,39000


In [3]:
# Inclusive range of years present in the AADT table.
min_year, max_year = df.Year.min(), df.Year.max()
years = range(min_year, 1 + max_year)

# Direction prefixes used when building column names.
directions = ['Ahead', 'Back']

# Time-of-day buckets. Each label starts at the hour in the same position of
# `time_splits` and runs until the next entry; the last bucket wraps around to
# the first entry.
time_labels = ['Morning', 'Daytime', 'Evening', 'Nighttime']
time_splits = [5.0, 10.0, 14.0, 20.0]

In [4]:
# Load the collision table.
# `DataFrame.from_csv` is deprecated (and removed in pandas 1.0); `read_csv`
# with `index_col=0` is the supported equivalent. The first CSV column holds
# integer ids, so no date parsing of the index is requested.
df_switrs = pd.read_csv(data_dir + 'df_switrs.csv', index_col=0)
df_switrs.head()

Unnamed: 0,Segment_ID,State_Route,Caltrans_District,Caltrans_County,Postmile,Side_Of_Highway,Forward,Latitude,Longitude,GPS_Valid,Collision_Year,Collision_Month,Collision_Day,Collision_DayOfWeek,Collision_Hour,Collision_Minute,Collision_Hours,Collision_Minutes
4577214,1954,60.0,8.0,RIV,2.87,E,True,34.01847,-117.50958,True,2010,1,19,1,1,20,1.333333,80.0
4577218,946,15.0,8.0,SBD,7.68,S,False,,,False,2010,2,6,5,4,50,4.833333,290.0
4577222,-1,58.0,8.0,SBD,31.09,E,True,,,False,2010,1,17,6,18,35,18.583333,1115.0
4577223,1521,40.0,8.0,SBD,54.3,W,False,34.72775,-116.08762,True,2010,1,13,2,10,25,10.416667,625.0
4577224,2299,79.0,8.0,RIV,35.37,N,True,,,False,2010,1,23,5,22,57,22.95,1377.0


In [5]:
def get_df_collisions(df_switrs, df):
    """Index collision rows by segment and travel direction.

    Parameters
    ----------
    df_switrs : DataFrame
        Collision rows; must have a ``Segment_ID`` column and a boolean
        ``Forward`` column.
    df : DataFrame
        Segment table; the unique values of its ``Segment_ID`` column become
        the keys of the result.

    Returns
    -------
    dict
        ``{segment_id: {'total': rows, 'ahead': rows, 'back': rows}}`` where
        ``'ahead'`` holds the rows with ``Forward`` true and ``'back'`` the
        rows with ``Forward`` false. Segments without any collision row map to
        empty frames.
    """
    # Split the collision table once instead of re-scanning it for every
    # segment (the per-segment boolean filter was O(segments * rows)).
    by_segment = {key: rows for key, rows in df_switrs.groupby('Segment_ID')}
    # Zero-row frame with the same columns, used for segments with no collisions.
    no_rows = df_switrs.iloc[0:0]

    df_collisions = {}

    for segment in df.Segment_ID.unique():
        df_base = by_segment.get(segment, no_rows)

        df_collisions[segment] = {
            'total': df_base,
            'ahead': df_base[df_base.Forward],
            'back': df_base[~df_base.Forward],
        }

    return df_collisions

In [6]:
# Per-segment lookup of collision rows ('total' / 'ahead' / 'back'), keyed by
# the Segment_ID values of the AADT table.
df_collisions = get_df_collisions(df_switrs=df_switrs, df=df)

In [7]:
def get_collisions(s, ahead = True):
    """Count the collisions recorded for segment `s` in one direction.

    Reads the module-level `df_collisions` lookup. `ahead` selects the
    'ahead' rows when true and the 'back' rows otherwise. Returns 0 when the
    segment is not a key of the lookup.
    """
    direction_key = 'ahead' if ahead else 'back'

    segment_frames = df_collisions.get(s)
    if segment_frames is None:
        return 0

    return len(segment_frames[direction_key])

In [8]:
# Collision counts per segment, split by direction.
df['Ahead_Collisions'] = df.Segment_ID.apply(get_collisions, args=(True, ))
df['Back_Collisions'] = df.Segment_ID.apply(get_collisions, args=(False, ))

df['Total_Collisions'] = df.Ahead_Collisions + df.Back_Collisions

# Share of each direction in the segment total. Segments with no collisions
# produce 0/0 = NaN, which is stored as 0. The filled result is assigned back
# explicitly: calling `fillna(..., inplace=True)` on a column selected from the
# frame is chained in-place mutation and is not guaranteed to update `df`.
df['Ahead_Collision_Ratio'] = (df.Ahead_Collisions / df.Total_Collisions).fillna(0)
df['Back_Collision_Ratio'] = (df.Back_Collisions / df.Total_Collisions).fillna(0)

In [9]:
def get_time_collisions(s, ti, tf, ahead = True):
    """Count collisions on segment `s` whose `Collision_Hours` fall in a window.

    The window is the half-open interval [ti, tf). When `tf` is not greater
    than `ti` the window wraps around: hours at or after `ti`, or before `tf`.
    `ahead` selects the 'ahead' rows when true and the 'back' rows otherwise.
    Reads the module-level `df_collisions` lookup and returns 0 when the
    segment is not one of its keys.
    """
    if s not in df_collisions:
        return 0

    hours = df_collisions[s]['ahead' if ahead else 'back'].Collision_Hours

    at_or_after_start = hours >= ti
    before_end = hours < tf

    if tf > ti:
        in_window = at_or_after_start & before_end
    else:
        # Wrap-around window.
        in_window = before_end | at_or_after_start

    return int(in_window.sum())

In [10]:
# Add one '<Direction>_Collisions_<Label>' column per direction and time bucket.
for direction in directions:
    print('Setting %s Collisions...' % direction)

    ahead = (direction == 'Ahead')

    # End of each bucket is the start of the next; the last wraps to the first.
    bucket_ends = time_splits[1:] + time_splits[:1]

    for label, ti, tf in zip(time_labels, time_splits, bucket_ends):
        print('   %s...' % label)

        col = '%s_Collisions_%s' % (direction, label)

        df[col] = df.Segment_ID.apply(get_time_collisions, args=(ti, tf, ahead))

Setting Ahead Collisions...
   Morning...
   Daytime...
   Evening...
   Nighttime...
Setting Back Collisions...
   Morning...
   Daytime...
   Evening...
   Nighttime...


In [11]:
def get_accident_spread(s, ahead = True):
    """Standard deviation of `Postmile` over one direction's collisions on `s`.

    Reads the module-level `df_collisions` lookup. Returns 0 when the segment
    is not one of its keys and -1 when the selected direction has at most one
    collision row; otherwise returns `Postmile.std()` of those rows.
    """
    if s not in df_collisions:
        return 0

    collisions = df_collisions[s]['ahead' if ahead else 'back']

    if len(collisions) <= 1:
        return -1

    return collisions.Postmile.std()

In [12]:
# Postmile spread of collisions per segment, one column per direction.
for spread_col, is_ahead in (('Ahead_Collision_Spread', True),
                             ('Back_Collision_Spread', False)):
    df[spread_col] = df.Segment_ID.apply(get_accident_spread, args=(is_ahead, ))

In [13]:
# Set of county codes present in each year.
counties = {}

for year in years:
    df_year = df[df.Year == year]
    counties[year] = set(df_year.County.unique())
    print('%s - Total Counties: %d' % (year, len(counties[year])))

# Pairwise overlap matrix: entry (year1, year2) is the number of counties the
# two years share. Use a real set intersection here: `np.intersect1d` expects
# array-likes, and handing it Python sets only "worked" while every year had
# the identical set (indexing its result with [0] fails as soon as two years
# differ).
print('')
for year1 in years:
    print(' '.join(str(len(counties[year1] & counties[year2])) for year2 in years))

2010 - Total Counties: 58
2011 - Total Counties: 58
2012 - Total Counties: 58
2013 - Total Counties: 58
2014 - Total Counties: 58
2015 - Total Counties: 58
2016 - Total Counties: 58

58 58 58 58 58 58 58
58 58 58 58 58 58 58
58 58 58 58 58 58 58
58 58 58 58 58 58 58
58 58 58 58 58 58 58
58 58 58 58 58 58 58
58 58 58 58 58 58 58


In [14]:
# Map each county code to a 1-based integer id, numbered in the order the
# codes first appear in the County column.
counties_dict = {
    county: number
    for number, county in enumerate(df.County.unique(), 1)
}

print(counties_dict)

{'SIS': 15, 'DN': 57, 'RIV': 36, 'GLE': 29, 'MOD': 58, 'NAP': 37, 'SUT': 42, 'SIE': 56, 'CAL': 19, 'NEV': 44, 'ALA': 39, 'COL': 28, 'KIN': 23, 'TEH': 30, 'SCL': 35, 'YUB': 43, 'MAD': 51, 'AMA': 40, 'LA': 2, 'SAC': 26, 'HUM': 48, 'LAS': 50, 'SOL': 38, 'BUT': 47, 'SON': 11, 'MNO': 33, 'SCR': 7, 'TRI': 14, 'YOL': 27, 'STA': 18, 'FRE': 24, 'CC': 16, 'ED': 55, 'TUO': 54, 'MPA': 52, 'SHA': 31, 'MRN': 10, 'SLO': 5, 'VEN': 3, 'PLA': 45, 'LAK': 41, 'INY': 32, 'PLU': 49, 'SM': 8, 'SBT': 46, 'IMP': 34, 'MEN': 12, 'SF': 9, 'SJ': 17, 'TUL': 53, 'MON': 6, 'SBD': 13, 'MER': 25, 'SB': 4, 'ORA': 1, 'KER': 22, 'ALP': 20, 'SD': 21}


In [15]:
# Keep the original county code, then replace County with its integer id.
# Not idempotent: a second run would look up the integer ids as keys.
df['County_Name'] = df['County']
df['County'] = df['County'].apply(counties_dict.__getitem__)

In [16]:
# Columns kept for the feature table, in output order (County_Name is dropped).
cols = [
u'Segment_ID', u'Segment_Num', u'Year', u'Route', u'County', u'District',
u'Postmile', u'Postmile_Boundary', u'Postmile_Distance',
u'Latitude', u'Longitude',
u'Back_Peak_Hourly', u'Back_Peak_Monthly', u'Back_AADT', 
u'Ahead_Peak_Hourly', u'Ahead_Peak_Monthly', u'Ahead_AADT',
u'Ahead_Collisions', u'Back_Collisions', u'Total_Collisions',
u'Ahead_Collisions_Morning', u'Ahead_Collisions_Daytime',
u'Ahead_Collisions_Evening', u'Ahead_Collisions_Nighttime',
u'Back_Collisions_Morning', u'Back_Collisions_Daytime',
u'Back_Collisions_Evening', u'Back_Collisions_Nighttime', 
u'Ahead_Collision_Ratio', u'Back_Collision_Ratio',
u'Ahead_Collision_Spread', u'Back_Collision_Spread'
]

# Take an explicit copy: later cells add columns to `df`, and assigning into a
# bare column selection raises SettingWithCopyWarning.
df = df[cols].copy()
df.head()

Unnamed: 0,Segment_ID,Segment_Num,Year,Route,County,District,Postmile,Postmile_Boundary,Postmile_Distance,Latitude,...,Ahead_Collisions_Evening,Ahead_Collisions_Nighttime,Back_Collisions_Morning,Back_Collisions_Daytime,Back_Collisions_Evening,Back_Collisions_Nighttime,Ahead_Collision_Ratio,Back_Collision_Ratio,Ahead_Collision_Spread,Back_Collision_Spread
0,0,1,2010,1,1,12,0.129,8.43,8.301,33.467051,...,23,13,9,13,26,9,0.512821,0.487179,2.221142,2.752451
1,1,4,2010,1,1,12,8.43,9.418,0.988,33.531752,...,25,5,2,9,20,8,0.571429,0.428571,0.30015,0.327369
2,2,5,2010,1,1,12,9.418,19.797,10.379,33.542738,...,27,6,14,20,43,12,0.398649,0.601351,4.463006,3.988941
3,3,11,2010,1,1,12,19.797,21.549,1.752,33.621271,...,10,6,3,5,8,4,0.534884,0.465116,0.409102,0.518883
4,4,13,2010,1,1,12,21.549,22.09,0.541,33.630667,...,0,0,0,0,0,0,1.0,0.0,0.0,-1.0


In [17]:
# Derived per-direction features. Each result is assigned back explicitly;
# `df[col].fillna(..., inplace=True)` / `.replace(..., inplace=True)` mutate a
# column selection and are not guaranteed to update `df` itself.
for d in directions:
    collisions = df['%s_Collisions' % d]

    # Collisions per unit of AADT; 0/0 (NaN) and x/0 (inf) are both stored as 0.
    df['%s_Collisions_Rate' % d] = (collisions / df['%s_AADT' % d]) \
                                       .fillna(0) \
                                       .replace(np.inf, 0)

    # Collisions per postmile of segment length; 0/0 (NaN) is stored as 0.
    # NOTE(review): unlike the rate above, inf from a zero Postmile_Distance is
    # left in place, as in the original -- confirm whether that is intended.
    df['%s_Collisions_Per_Distance' % d] = (collisions / df['Postmile_Distance']) \
                                               .fillna(0)

    # Share of this direction's collisions falling in each time bucket.
    for t in time_labels:
        df['%s_Collisions_%s_Ratio' % (d, t)] = (df['%s_Collisions_%s' % (d, t)] / collisions) \
                                                    .fillna(0)

In [45]:
def is_gps_valid(row, frame=None, max_distance=0.2):
    """Check a segment's coordinates against the next segment on the same route.

    The "next" segment is the row of `frame` with the same `Year` and `Route`
    and a `Segment_ID` one greater than `row.Segment_ID`.

    Parameters
    ----------
    row : Series-like
        Must expose `Latitude`, `Longitude`, `Year`, `Route` and `Segment_ID`.
    frame : DataFrame, optional
        Table searched for the next segment. Defaults to the notebook-level
        `df`. The original body read a global `df_features` that no cell
        defines, so it failed with NameError on a fresh kernel.
        NOTE(review): `df` is what gets written to df_features.csv, so it is
        presumably the intended table -- confirm.
    max_distance : float, optional
        Upper bound (exclusive) on the straight-line distance between the two
        points, measured directly on the latitude/longitude values.

    Returns
    -------
    bool
        True when no next segment exists, otherwise whether the distance to
        the first matching row is below `max_distance`.
    """
    if frame is None:
        frame = df

    df_next = frame[(frame.Year == row.Year)
                  & (frame.Route == row.Route)
                  & (frame.Segment_ID == (row.Segment_ID + 1))]

    if len(df_next) == 0:
        return True

    next_segment = df_next.iloc[0]

    d_lat = row.Latitude - next_segment.Latitude
    d_lng = row.Longitude - next_segment.Longitude

    return np.sqrt(d_lat**2 + d_lng**2) < max_distance

In [46]:
# Row-wise flag from is_gps_valid (one call per row of df).
df['GPS_Valid'] = df.apply(is_gps_valid, axis='columns')

In [47]:
# Total rows, then the number of rows flagged GPS_Valid.
print('%s %s' % (len(df), df.GPS_Valid.sum()))

44289 41407


In [18]:
# Preview the first rows of the feature table.
df.head(n=5)

Unnamed: 0,Segment_ID,Segment_Num,Year,Route,County,District,Postmile,Postmile_Boundary,Postmile_Distance,Latitude,...,Ahead_Collisions_Morning_Ratio,Ahead_Collisions_Daytime_Ratio,Ahead_Collisions_Evening_Ratio,Ahead_Collisions_Nighttime_Ratio,Back_Collisions_Rate,Back_Collisions_Per_Distance,Back_Collisions_Morning_Ratio,Back_Collisions_Daytime_Ratio,Back_Collisions_Evening_Ratio,Back_Collisions_Nighttime_Ratio
0,0,1,2010,1,1,12,0.129,8.43,8.301,33.467051,...,0.116667,0.283333,0.383333,0.216667,0.0,6.866643,0.157895,0.22807,0.45614,0.157895
1,1,4,2010,1,1,12,8.43,9.418,0.988,33.531752,...,0.096154,0.326923,0.480769,0.096154,0.001083,39.473684,0.051282,0.230769,0.512821,0.205128
2,2,5,2010,1,1,12,9.418,19.797,10.379,33.542738,...,0.084746,0.355932,0.457627,0.101695,0.002472,8.575007,0.157303,0.224719,0.483146,0.134831
3,3,11,2010,1,1,12,19.797,21.549,1.752,33.621271,...,0.173913,0.130435,0.434783,0.26087,0.000435,11.415525,0.15,0.25,0.4,0.2
4,4,13,2010,1,1,12,21.549,22.09,0.541,33.630667,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Write the finished feature table next to the input CSVs.
features_path = data_dir + 'df_features.csv'
df.to_csv(features_path)