In [1]:
%matplotlib inline
data_5min_path = "../station_5min/2015/d11/"
import pandas as pd
import numpy as np
import gzip
import time
from bokeh.io import curdoc, vform, output_notebook, push_notebook, output_file, show
from bokeh.models import ColumnDataSource, HBox, VBox
from bokeh.models.widgets import Slider, Button, DataTable, DateFormatter, TableColumn
from bokeh.plotting import Figure,show
from bokeh.models.layouts import WidgetBox
from bokeh.layouts import row, column
from os import listdir
from os.path import isfile, join
from ipywidgets import interact
import datetime as dt

In [2]:
# listdir() returns names in arbitrary order; sort them so that code which
# takes only the first few entries sees the same files on every run.
onlyfiles = sorted(f for f in listdir(data_5min_path) if isfile(join(data_5min_path, f)))

In [3]:
# Column names applied to the data files when they are read: twelve
# station-level fields followed by five per-lane fields for lanes 1 through 8.
_station_fields = ['Timestamp', 'Station', 'District', 'Freeway #', 'Direction', 'Lane Type',
                   'Station Length', 'Samples', '% Observed', 'TotalFlow', 'AvgOccupancy',
                   'AvgSpeed']
_lane_fields = ['Samples', 'Flow', 'Avg Occ', 'Avg Speed', 'Observed']
colnames = _station_fields + ['Lane %d %s' % (lane, field)
                              for lane in range(1, 9)
                              for field in _lane_fields]

### Get Meta Data into DF

In [4]:
meta_path = "../station_5min/2015/meta_data/d11/"
# listdir() returns entries in arbitrary order; sort so that meta_files[0]
# is the same file on every run (the lexicographically first name).
meta_files = sorted(f for f in listdir(meta_path) if isfile(join(meta_path, f)))
# Only the first metadata file is read into meta_data.
meta_data = pd.read_table(join(meta_path, meta_files[0]))

### Combine all files into big_df

In [7]:
# Upper bound on how many data files are read (the first MAX_FILES entries
# of onlyfiles); raise it to load more of the directory.
MAX_FILES = 31

df_list = []
for i, filename in enumerate(onlyfiles[:MAX_FILES]):
    t1 = time.time()
    # Each file is a gzip-compressed CSV without a header row, so the column
    # names are supplied explicitly.
    with gzip.open(join(data_5min_path, filename), 'rb') as f:
        df_list.append(pd.read_csv(f, header=None, names=colnames))
    t2 = time.time()
    # Single-argument print works on both Python 2 and Python 3 and yields
    # the same "<file number> <seconds>" line as before.
    print("%d %s" % (i, t2 - t1))

0 2.23941993713
1 2.25090003014
2 2.28697896004
3 2.24717879295
4 2.28673696518
5 2.16076183319
6 2.19041705132
7 2.30132508278
8 2.28615307808
9 2.29717111588
10 2.20239210129
11 2.28300094604
12 2.24759793282
13 2.31274580956
14 2.22716403008
15 2.20623111725
16 2.17238092422
17 2.1325109005
18 2.16779398918
19 2.27893400192
20 2.21576309204
21 2.18940901756
22 2.23531699181
23 2.24864315987
24 2.15287017822
25 2.26993894577
26 2.21131396294
27 2.18698811531
28 2.134319067
29 2.24546599388
30 2.18769311905


In [8]:
# Each per-file frame carries its own 0..n-1 index; ignore_index=True gives
# the combined frame unique row labels instead of repeated ones.
big_df = pd.concat(df_list, ignore_index=True)

In [9]:
# Drop this reference to the list of per-file frames so their memory can be
# reclaimed; big_df is what the later cells work from.
# NOTE: once this has run, the pd.concat(df_list) cell cannot be re-run
# without first rebuilding df_list.
df_list = None

### Filter and create small_df focused on Main Line, I-15, Southbound traffic

In [10]:
# Keep only main-line ('ML') rows for freeway 15, direction 'S', and only the
# station-level columns (the per-lane columns are dropped).
station_level_cols = ['Timestamp', 'Station', 'District', 'Freeway #', 'Direction', 'Lane Type',
                      'Station Length', 'Samples', '% Observed', 'TotalFlow', 'AvgOccupancy',
                      'AvgSpeed']
is_i15_south_ml = ((big_df['Freeway #'] == 15)
                   & (big_df['Direction'] == 'S')
                   & (big_df['Lane Type'] == 'ML'))
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
small_df = big_df.loc[is_i15_south_ml, station_level_cols]

# Attach each station's coordinates.  rename() is a no-op once 'ID' has
# already been renamed, so this cell can be re-run without a KeyError.
meta_data = meta_data.rename(columns={'ID': 'Station'})[['Station', 'Latitude', 'Longitude']]
small_df = small_df.merge(meta_data, on='Station')

# Creates an index for each station from N to S.  Index 0 is the northernmost station.  Index N is the southernmost.
# The second reset_index() turns the 0..N positions into a column named 'index'.
station_index = (small_df[['Station', 'Latitude']]
                 .drop_duplicates()
                 .sort_values('Latitude', ascending=False)
                 .reset_index(drop=True)
                 .reset_index())

small_df = small_df.merge(station_index, on=['Station', 'Latitude'])
small_df['Timestamp'] = pd.to_datetime(small_df['Timestamp'])
# The vectorized .dt accessor yields the same datetime.time / datetime.date
# objects as a row-by-row apply, without a Python-level call per row.
small_df['Time'] = small_df['Timestamp'].dt.time
small_df['Date'] = small_df['Timestamp'].dt.date

### Functions for Plot

In [11]:
def scale_time(i):
    """Convert a five-minute slot number into a time of day.

    Slot 0 is 00:00:00 and each further slot adds five minutes, so slot 287
    is 23:55:00.  Only the time-of-day part is returned, so slot numbers past
    the end of the day wrap around to the start.
    """
    midnight = dt.datetime(1, 1, 1)
    offset = dt.timedelta(minutes=5 * i)
    return (midnight + offset).time()

def get_day_of_week(date_value):
    """Return the English weekday name ('Monday' .. 'Sunday') for a date.

    date_value must provide weekday() returning 0 for Monday through 6 for
    Sunday, as datetime.date and datetime.datetime do.
    """
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    return weekday_names[date_value.weekday()]

### Bokeh Plot

In [12]:
output_notebook()


def _select_snapshot(date_value, time_value):
    """Return the (station index, AvgSpeed) Series pair for one date and time of day.

    Both Series come from the small_df rows whose 'Date' and 'Time' columns
    equal the given values; they are empty when no row matches.
    """
    at_instant = (small_df['Date'] == date_value) & (small_df['Time'] == time_value)
    # .loc replaces the deprecated .ix indexer (removed in pandas 1.0); the
    # row mask is built once and reused for both columns.
    return small_df.loc[at_instant, 'index'], small_df.loc[at_instant, 'AvgSpeed']


def _table_fields(date_value, time_value):
    """Build the one-row column dict shown in the info table under the plot."""
    return dict(
        dates=[date_value],
        hour=[time_value.hour],
        minute=[time_value.minute],
        day_of_week=[get_day_of_week(date_value)]
    )


date_value = dt.date(2015,1,1)
time_value = dt.time(0,0,0)
# Set up data
x, y = _select_snapshot(date_value, time_value)
source = ColumnDataSource(data=dict(x=x, y=y))

# Set up plot
# Series.max() skips NaN values, whereas the builtin max() gives an
# order-dependent answer when NaN is present.  The axis ranges are fixed from
# this first snapshot and are not rescaled when the sliders move.
plot = Figure(plot_height=600, plot_width=900, title="Eigenvector Analysis",
              tools="",
              x_range=[0, int(x.max())], y_range=[0, float(y.max()) + 10],
              x_axis_label='Stations', y_axis_label='AvgSpeed')
plot.scatter('x', 'y', source=source)

# Set up table
source_table = ColumnDataSource(_table_fields(date_value, time_value))

columns = [
        TableColumn(field="dates", title="Date", formatter=DateFormatter()),
        TableColumn(field="hour", title="Hour"),
        TableColumn(field="minute", title="Minute"),
        TableColumn(field="day_of_week", title="Day"),
    ]
data_table = DataTable(source=source_table, columns=columns, width=600, height=50)

# Set up callbacks
def update_data(my_date,my_time):
    """Redraw the scatter plot and info table for a new day and time slot.

    my_date -- day of the month in January 2015.
    my_time -- five-minute slot number, converted to a time of day by scale_time().
    """
    date_value = dt.date(2015,1,my_date)
    time_value = scale_time(my_time)

    x, y = _select_snapshot(date_value, time_value)
    source.data = dict(x=x, y=y)
    source_table.data = _table_fields(date_value, time_value)

    # Push to this cell's output explicitly rather than to whichever
    # notebook handle happened to be created most recently.
    push_notebook(handle=plot_handle)

plot_handle = show(column(plot,data_table), notebook_handle=True)

In [14]:
# Slider ranges: my_date is the day of January 2015 passed to update_data;
# my_time is a five-minute slot number (0 -> 00:00, 287 -> 23:55 via scale_time).
# NOTE(review): the date slider stops at 30 although January has 31 days --
# confirm whether the 31st is left out on purpose.
interact(update_data, my_date = (1,30), my_time= (0,287))

Note: the setting `my_date = 17`, `my_time = 101` (2015-01-17 at 08:25) is interesting.