In [80]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
from os import listdir
from os.path import isfile, join
from datetime import time
import glob
import sys

from pyspark.sql.functions import hour, mean,minute, stddev, count,max as psmax,min as psmin, date_format

from pyspark.sql import SQLContext
from pyspark.sql.types import *

In [60]:
# Data locations (relative paths).
# data_5min_path: directory holding the gzipped 5-minute station files that
# are listed and loaded by the cells below.
# meta_path: station meta-data directory.
# NOTE(review): meta_path is not referenced by any cell visible here
# (loadMeta reads from ../data/External/meta/...) -- confirm it is still needed.
data_5min_path = "../data/station_5min/2015/d11/"
meta_path = "../data/station_5min/2015/meta_data/d11/"

# Get data file names

In [61]:
# Collect the names of all regular files in the data directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the listing -- and anything derived from it, such as slicing off the
# first N files -- deterministic from run to run.
onlyfiles = sorted(
    f for f in listdir(data_5min_path) if isfile(join(data_5min_path, f))
)

In [62]:
# Peek at the first three file names to confirm the directory listing worked.
onlyfiles[0:3]

['d11_text_station_5min_2015_01_01.txt.gz',
 'd11_text_station_5min_2015_01_02.txt.gz',
 'd11_text_station_5min_2015_01_03.txt.gz']

In [63]:
# Column names for the headerless 5-minute station files: twelve station-level
# fields followed by the same five per-lane fields repeated for lanes 1-8.
station_fields = [
    'Timestamp', 'Station', 'District', 'Freeway', 'Direction_of_Travel',
    'LaneType', 'StationLength', 'Samples',
    'Perc_Observed', 'TotalFlow', 'AvgOccupancy', 'AvgSpeed',
]
lane_fields = ['Samples', 'Flow', 'AvgOcc', 'AvgSpeed', 'Observed']

colnames = list(station_fields)
for lane in range(1, 9):
    colnames.extend('Lane%d_%s' % (lane, field) for field in lane_fields)

# The schema and all later column references use lower-case names.
colnames = [name.lower() for name in colnames]

In [64]:
# Sanity check: 12 station-level columns + 8 lanes x 5 per-lane columns = 52.
len(colnames)

52

# Make spark schema

The struct list below was generated with the following code after reading the files with `inferSchema=True`, and was then modified by hand:
```
'[' + ','.join(['StructField("%s",%s(),True)'% (colnames[idx], str(i.dataType))
for idx, i in enumerate(rdd.schema)]) + ']'
```

In [66]:
#print '[\n    ' + ",\n    ".join(['StructField("%s",%s(),True)'% (colnames[idx], str(i.dataType))
#for idx, i in enumerate(rdd.schema)]) + '\n]'

# Build dataframe with spark

In [67]:
# Explicit schema for the 5-minute station files, so Spark does not have to
# infer types.  Every field is nullable.

# The five per-lane measurements and their Spark types; the same layout is
# repeated for lanes 1 through 8.
lane_field_types = [
    ("samples", IntegerType),
    ("flow", IntegerType),
    ("avgocc", DoubleType),
    ("avgspeed", DoubleType),
    ("observed", IntegerType),
]

struct_list = [
    # Station-level fields.
    StructField("timestamp", TimestampType(), True),
    StructField("station", IntegerType(), True),
    StructField("district", IntegerType(), True),
    StructField("freeway", IntegerType(), True),
    StructField("direction_of_travel", StringType(), True),
    StructField("lanetype", StringType(), True),
    StructField("stationlength", DoubleType(), True),
    StructField("samples", IntegerType(), True),
    StructField("perc_observed", IntegerType(), True),
    StructField("totalflow", IntegerType(), True),
    StructField("avgoccupancy", DoubleType(), True),
    StructField("avgspeed", DoubleType(), True),
] + [
    # Per-lane fields: lane1_samples ... lane8_observed, in file order.
    StructField("lane%d_%s" % (lane, suffix), field_type(), True)
    for lane in range(1, 9)
    for suffix, field_type in lane_field_types
]

schema_struct = StructType(struct_list)

In [68]:
# Note: every file in `onlyfiles` is loaded.  To work with only the first few
# files while developing, slice the list, e.g. `onlyfiles[:5]`.
# os.path.join is used so the paths stay valid even if `data_5min_path` is
# ever given without a trailing slash.
files = [join(data_5min_path, filename) for filename in onlyfiles]

# Despite its name, `rdd` is the DataFrame returned by spark.read.csv.
# The files have no header row, so the explicit schema supplies both the
# column names and the types (no inference pass over the data).
# NOTE(review): `spark` is not defined in this notebook -- presumably it is
# provided by the PySpark kernel; confirm.
rdd = spark.read.csv(
    files,
    header='false',
    timestampFormat='MM/dd/yyyy HH:mm:ss',
    schema=schema_struct,
    inferSchema='false'
)

# Show one parsed row as a quick check that the schema lines up with the data.
rdd.take(1)

[Row(timestamp=datetime.datetime(2015, 2, 20, 0, 0), station=1100313, district=11, freeway=5, direction_of_travel=u'N', lanetype=u'FR', stationlength=None, samples=10, perc_observed=100, totalflow=12, avgoccupancy=None, avgspeed=None, lane1_samples=10, lane1_flow=12, lane1_avgocc=None, lane1_avgspeed=None, lane1_observed=1, lane2_samples=None, lane2_flow=None, lane2_avgocc=None, lane2_avgspeed=None, lane2_observed=0, lane3_samples=None, lane3_flow=None, lane3_avgocc=None, lane3_avgspeed=None, lane3_observed=0, lane4_samples=None, lane4_flow=None, lane4_avgocc=None, lane4_avgspeed=None, lane4_observed=0, lane5_samples=None, lane5_flow=None, lane5_avgocc=None, lane5_avgspeed=None, lane5_observed=0, lane6_samples=None, lane6_flow=None, lane6_avgocc=None, lane6_avgspeed=None, lane6_observed=0, lane7_samples=None, lane7_flow=None, lane7_avgocc=None, lane7_avgspeed=None, lane7_observed=0, lane8_samples=None, lane8_flow=None, lane8_avgocc=None, lane8_avgspeed=None, lane8_observed=0)]

In [69]:
# Total number of rows across all loaded files (runs a full pass over the data).
rdd.count()

154289957

### build freeway STM station order from meta data

In [70]:
def loadMeta(meta_glob='../data/External/meta/2015/d11/d11_text_meta_2015_*.txt',
             usefwy=(56, 125, 805, 52, 163, 8, 15, 5, 905, 78, 94, 54)):
    """Load the station meta-data files and return one row per station ID.

    Every file matching ``meta_glob`` is read as a tab-delimited table with a
    header row and tagged with a ``file_date`` column parsed from its file
    name.  The files are concatenated in sorted (i.e. chronological, given the
    zero-padded dates in the names) order, so that when a station ID appears
    in several files the record from the latest file is the one kept.

    Parameters
    ----------
    meta_glob : str
        Glob pattern selecting the meta-data files.
    usefwy : iterable of int
        Freeway numbers to keep; rows for any other freeway are dropped.

    Returns
    -------
    pandas.DataFrame
        De-duplicated meta data with all column names lower-cased, plus a
        ``file_date`` column and a ``freeway`` column holding the freeway
        number followed by the direction (e.g. ``'5S'``).

    Raises
    ------
    ValueError
        If no file matches ``meta_glob``.
    """
    import os

    # glob returns matches in arbitrary order; sort so that `keep='last'`
    # below reliably means "the most recent file".
    meta_files = sorted(glob.glob(meta_glob))
    if not meta_files:
        raise ValueError('no station meta-data files match %r' % (meta_glob,))

    meta_file_list = []
    for meta_file in meta_files:
        # Parse the date from the file name only, so underscores in the
        # directory part of the path cannot shift the token positions.
        # For 'd11_text_meta_2015_01_01.txt' this yields '01_01'.
        # NOTE(review): the year token is not part of file_date -- confirm
        # that month_day is what downstream code expects.
        file_name = os.path.basename(meta_file)
        date = '_'.join(file_name.split('_')[4:7]).split('.')[0]

        df = pd.read_table(meta_file, index_col=None, header=0)
        df['file_date'] = date
        meta_file_list.append(df)

    meta_frame = pd.concat(meta_file_list).drop_duplicates(subset='ID', keep='last')

    # Take an explicit copy of the filtered rows so that adding a column below
    # does not write into a view of the concatenated frame.
    meta_frame = meta_frame[meta_frame.Fwy.isin(list(usefwy))].copy()

    # Add freeway name: <Fwy><Dir>
    meta_frame['freeway'] = meta_frame.Fwy.apply(str) + meta_frame.Dir

    meta_frame = meta_frame.rename(columns=lambda c: c.lower())
    return meta_frame

# Keep only the `id`, `abs_pm` and `type` columns of the meta data and expose
# the id as `station`, the column name used by the 5-minute data schema.
station_meta = loadMeta()[['id', 'abs_pm', 'type']].rename(columns={'id': 'station'})

# NOTE(review): `sqlCtx` is not defined in this notebook -- presumably it is
# provided by the PySpark kernel; confirm.
meta_data = sqlCtx.createDataFrame(station_meta)

In [71]:
# Print the first rows of the station meta data as a quick visual check.
meta_data.show()

+-------+-----------------+----+
|station|           abs_pm|type|
+-------+-----------------+----+
|1113072|            7.885|  ML|
|1113073|            7.885|  OR|
|1113680|            7.364|  ML|
|1113683|            7.885|  FR|
|1119041|            3.282|  ML|
|1119042|            5.428|  ML|
|1119059|            4.067|  ML|
|1119060|            6.346|  ML|
|1119093|            7.026|  ML|
|1119094|            4.747|  ML|
|1119109|             7.93|  ML|
|1119110|            5.575|  ML|
|1119178|           12.009|  ML|
|1119179|           11.364|  ML|
|1122687|            8.103|  FR|
|1122983|            9.712|  ML|
|1122991|9.277000000000001|  ML|
|1119020|4.718999999999999|  ML|
|1119021|            2.344|  ML|
|1119050|            3.928|  ML|
+-------+-----------------+----+
only showing top 20 rows



# Filter for weekdays on I-5 South
# Group by station and time of day (hour, minute)


In [72]:
# Per-station, per-time-of-day summary statistics for southbound freeway 5 on
# weekdays.

# Source column -> prefix of the five summary columns derived from it.
measurements = [
    ("totalflow", "flow"),
    ("avgoccupancy", "occ"),
    ("avgspeed", "speed"),
]

# mean / std / count / max / min for each measurement, in that order.
summary_columns = []
for source_col, prefix in measurements:
    summary_columns.extend([
        mean(source_col).alias(prefix + "_mean"),
        stddev(source_col).alias(prefix + "_std"),
        count(source_col).alias(prefix + "_count"),
        psmax(source_col).alias(prefix + "_max"),
        psmin(source_col).alias(prefix + "_min"),
    ])

# Rows for freeway 5, direction of travel "S".
i5_south = rdd.where('freeway = 5').where('direction_of_travel = "S"')

# Keep only the columns needed below and drop rows whose day-of-week number
# is 6 or higher.
# NOTE(review): the 'u' pattern is assumed to give 1 = Monday ... 7 = Sunday,
# so `< 6` keeps Monday-Friday; datetime patterns differ between Spark
# versions -- confirm for the version in use.
weekday_rows = (
    i5_south
    .select(
        'timestamp',
        'station',
        'totalflow',
        'avgoccupancy',
        'avgspeed',
        date_format('timestamp', 'u').alias('dayofweek'),
    )
    .filter('dayofweek < 6')
)

# One output row per (station, hour, minute).
station_time = (
    weekday_rows
    .groupBy([
        'station',
        hour("timestamp").alias("hour"),
        minute("timestamp").alias("minute"),
    ])
    .agg(*summary_columns)
)

In [73]:
# Bring the aggregated result back from Spark as a pandas DataFrame.
df = station_time.toPandas()

In [74]:
# Number of distinct stations present in the aggregated frame.
df.station.unique().shape

(182,)

In [75]:
# Combine the `hour` and `minute` columns into one time-of-day value per row.
# `time` here is datetime.time: the `from datetime import time` import comes
# after, and therefore shadows, the earlier `import time`.
# Iterating over the two columns avoids building a Series for every row, as
# DataFrame.apply(axis=1) does, and also behaves sensibly on an empty frame.
df['time'] = [time(int(h), int(m)) for h, m in zip(df['hour'], df['minute'])]

In [76]:
# Order the rows by time of day; `df` itself is sorted in place.
df.sort_values(by='time', inplace=True)

In [77]:
# List the columns of the aggregated frame, including the derived `time`.
df.columns

Index([u'station', u'hour', u'minute', u'flow_mean', u'flow_std',
       u'flow_count', u'flow_max', u'flow_min', u'occ_mean', u'occ_std',
       u'occ_count', u'occ_max', u'occ_min', u'speed_mean', u'speed_std',
       u'speed_count', u'speed_max', u'speed_min', u'time'],
      dtype='object')

In [82]:
# Show only the first rows: the frame holds one row per station and
# (hour, minute) slot, which is too much to render as cell output.
df.head(10)

Unnamed: 0,station,hour,minute,flow_mean,flow_std,flow_count,flow_max,flow_min,occ_mean,occ_std,occ_count,occ_max,occ_min,speed_mean,speed_std,speed_count,speed_max,speed_min,time
757,1108435,0,0,4.232932,2.286434,249,14.0,0.0,,,0,,,,,0,,,00:00:00
39425,1100384,0,0,3.568000,2.072331,250,11.0,0.0,,,0,,,,,0,,,00:00:00
26906,1114832,0,0,60.260536,32.696641,261,202.0,12.0,0.010267,0.005398,261,0.0356,0.0024,72.207663,2.446210,261,75.2,56.5,00:00:00
26907,1118326,0,0,30.095785,9.421458,261,88.0,14.0,0.006778,0.002244,261,0.0221,0.0033,68.311494,0.893536,261,71.5,66.2,00:00:00
7534,1108453,0,0,8.641593,3.484549,226,18.0,1.0,,,0,,,,,0,,,00:00:00
27163,1108726,0,0,4.357430,2.539359,249,19.0,0.0,,,0,,,,,0,,,00:00:00
38386,1114240,0,0,90.678161,20.858771,261,195.0,47.0,0.018870,0.004424,261,0.0406,0.0097,67.729885,0.509702,261,69.2,64.7,00:00:00
52141,1118152,0,0,2.898785,2.270767,247,12.0,0.0,,,0,,,,,0,,,00:00:00
38385,1113960,0,0,3.471774,2.251228,248,12.0,0.0,,,0,,,,,0,,,00:00:00
51604,1108680,0,0,89.731801,30.574702,261,183.0,25.0,0.023224,0.006714,261,0.0458,0.0060,68.533716,0.783099,261,70.8,64.1,00:00:00
