In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
from os import listdir
from os.path import isfile, join
from datetime import time
import glob
import sys
import datetime

from pyspark.sql.functions import udf, hour, mean, minute, stddev, count, max as psmax, min as psmin, date_format

from pyspark.sql import SQLContext
from pyspark.sql.types import *

See the PySpark SQL API reference for the functions used in this notebook:
http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html

In [2]:
# Data locations (relative paths).
# data_5min_path: directory that is scanned for the '*.txt.gz' 5-minute station files.
data_5min_path = "../../../d11_traffic_data/d11/"
# meta_path: directory holding the 'd11_text_meta_2015_*.txt' station metadata files.
meta_path = "../../../d11_traffic_data/meta/d11/"

# Get data file names

These are all of the filenames that are going to be fed into the Spark instance.

In [3]:
# Collect the names of every gzipped 5-minute data file in data_5min_path.
# os.listdir returns entries in arbitrary order, so the names are sorted: the
# file names embed the date as YYYY_MM_DD, which makes lexicographic order
# chronological and keeps slices such as onlyfiles[:5] reproducible.
# endswith() is used instead of a substring test so that only real
# '.txt.gz' files are picked up.
onlyfiles = sorted(
    f for f in listdir(data_5min_path)
    if isfile(join(data_5min_path, f)) and f.endswith('.txt.gz')
)

In [None]:
# Peek at the first three file names as a sanity check.
onlyfiles[0:3]

['d11_text_station_5min_2015_01_01.txt.gz',
 'd11_text_station_5min_2015_01_02.txt.gz',
 'd11_text_station_5min_2015_01_03.txt.gz']

# Make spark schema

The struct list was generated with the following code after reading the files with `inferSchema=true`, and was then modified by hand:
```
'[' + ','.join(['StructField("%s",%s(),True)'% (colnames[idx], str(i.dataType))
for idx, i in enumerate(rdd.schema)]) + ']'
```

Column names are not defined in the files, so they are defined here for readability.

In [4]:
# Lower-cased column names for the headerless 5-minute station files:
# twelve station-level columns followed by five per-lane columns
# (samples, flow, avgocc, avgspeed, observed) for each of lanes 1 through 8.
colnames = [
    label.lower()
    for label in (
        [
            'Timestamp', 'Station', 'District', 'Freeway', 'Direction_of_Travel',
            'LaneType', 'StationLength', 'Samples',
            'Perc_Observed', 'TotalFlow', 'AvgOccupancy', 'AvgSpeed',
        ]
        + [
            'Lane%d_%s' % (lane, field)
            for lane in range(1, 9)
            for field in ('Samples', 'Flow', 'AvgOcc', 'AvgSpeed', 'Observed')
        ]
    )
]

Spark was run once on a limited dataset to identify the data type of each column; the snippet below was used to help generate the schema definition that follows.

In [None]:
#print '[\n    ' + ",\n    ".join(['StructField("%s",%s(),True)'% (colnames[idx], str(i.dataType))
#for idx, i in enumerate(rdd.schema)]) + '\n]'

# Build dataframe with spark

Define the schema of the files being read. `StructType` builds the DataFrame schema (the equivalent of a `CREATE TABLE` statement in SQL).

In [5]:
# Explicit schema for the headerless 5-minute files: one nullable StructField
# per column, in file order. The twelve station-level fields are listed by
# hand; the per-lane fields repeat the same five-column pattern for lanes 1-8
# and are therefore generated in a loop.
struct_list = [
    StructField("timestamp", TimestampType(), True),
    StructField("station", IntegerType(), True),
    StructField("district", IntegerType(), True),
    StructField("freeway", IntegerType(), True),
    StructField("direction_of_travel", StringType(), True),
    StructField("lanetype", StringType(), True),
    StructField("stationlength", DoubleType(), True),
    StructField("samples", IntegerType(), True),
    StructField("perc_observed", IntegerType(), True),
    StructField("totalflow", IntegerType(), True),
    StructField("avgoccupancy", DoubleType(), True),
    StructField("avgspeed", DoubleType(), True),
]

for lane in range(1, 9):
    struct_list.extend([
        StructField("lane%d_samples" % lane, IntegerType(), True),
        StructField("lane%d_flow" % lane, IntegerType(), True),
        StructField("lane%d_avgocc" % lane, DoubleType(), True),
        StructField("lane%d_avgspeed" % lane, DoubleType(), True),
        StructField("lane%d_observed" % lane, IntegerType(), True),
    ])

schema_struct = StructType(struct_list)

Load the data from the files into a Spark DataFrame (the equivalent of bulk `INSERT` statements from files in SQL).

In [6]:
# NOTE: only the first 5 days of files are loaded for now.
# The names are sorted before slicing so that "first 5" is chronological no
# matter what order the directory listing came back in, and the paths are
# built with os.path.join rather than string concatenation.
files = [join(data_5min_path, filename) for filename in sorted(onlyfiles)[:5]]

# The files have no header row, so they are parsed with the explicit schema
# (schema_struct) instead of schema inference. Despite its name, `rdd` is a
# Spark DataFrame: that is what spark.read.csv returns.
rdd = spark.read.csv(
    files,
    header='false',
    timestampFormat='MM/dd/yyyy HH:mm:ss',
    schema=schema_struct,
    inferSchema='false'
)

# Show the first parsed row as a sanity check.
rdd.take(1)

[Row(timestamp=datetime.datetime(2015, 1, 5, 0, 0), station=1100313, district=11, freeway=5, direction_of_travel=u'N', lanetype=u'FR', stationlength=None, samples=10, perc_observed=100, totalflow=6, avgoccupancy=None, avgspeed=None, lane1_samples=10, lane1_flow=6, lane1_avgocc=None, lane1_avgspeed=None, lane1_observed=1, lane2_samples=None, lane2_flow=None, lane2_avgocc=None, lane2_avgspeed=None, lane2_observed=0, lane3_samples=None, lane3_flow=None, lane3_avgocc=None, lane3_avgspeed=None, lane3_observed=0, lane4_samples=None, lane4_flow=None, lane4_avgocc=None, lane4_avgspeed=None, lane4_observed=0, lane5_samples=None, lane5_flow=None, lane5_avgocc=None, lane5_avgspeed=None, lane5_observed=0, lane6_samples=None, lane6_flow=None, lane6_avgocc=None, lane6_avgspeed=None, lane6_observed=0, lane7_samples=None, lane7_flow=None, lane7_avgocc=None, lane7_avgspeed=None, lane7_observed=0, lane8_samples=None, lane8_flow=None, lane8_avgocc=None, lane8_avgspeed=None, lane8_observed=0)]

In [None]:
# Number of rows loaded from the selected files.
rdd.count()

### Build freeway STM station order from meta data

Build a sparksql dataframe with the metadata

In [7]:
# Directory containing the station metadata files read by loadMeta().
# Same value as the meta_path assigned in the data-locations cell near the top.
meta_path = "../../../d11_traffic_data/meta/d11/"

def loadMeta(meta_dir=None):
    """Load the 2015 station metadata files into one pandas DataFrame.

    Every file matching ``d11_text_meta_2015_*.txt`` in ``meta_dir`` is read
    with ``pd.read_table`` and tagged with a ``file_date`` column taken from
    the date part of its file name (e.g. ``2015_01_01``). The files are
    stacked, only the row from the most recent file is kept for each ``ID``,
    the result is restricted to the freeways listed in ``usefwy``, a
    ``freeway`` label (``Fwy`` followed by ``Dir``, e.g. ``5N``) is added and
    all column names are lower-cased.

    Parameters
    ----------
    meta_dir : str, optional
        Directory holding the metadata files. Defaults to the module-level
        ``meta_path``.

    Returns
    -------
    pandas.DataFrame
        One row per station ID on the selected freeways, lower-case columns.

    Raises
    ------
    ValueError
        If no metadata file matches the pattern.
    """
    if meta_dir is None:
        meta_dir = meta_path

    pattern = join(meta_dir, 'd11_text_meta_2015_*.txt')
    # glob returns matches in arbitrary order. Sorting makes the order
    # chronological (the date is embedded as YYYY_MM_DD), so keep='last'
    # below really keeps the most recent record for each station.
    meta_files = sorted(glob.glob(pattern))
    if not meta_files:
        raise ValueError('No metadata files match %s' % pattern)

    meta_file_list = []
    for meta_file in meta_files:
        # Take the date from the LAST three '_'-separated tokens. Counting
        # tokens from the front of the full path breaks as soon as a
        # directory name contains an underscore (e.g. 'd11_traffic_data').
        date = '_'.join(meta_file.split('_')[-3:]).split('.')[0]
        df = pd.read_table(meta_file, index_col=None, header=0)
        df['file_date'] = date
        # Rows with missing Latitude/Longitude are intentionally kept.
        meta_file_list.append(df)

    meta_frame = pd.concat(meta_file_list).drop_duplicates(subset='ID', keep='last')

    usefwy = [56, 125, 805, 52, 163, 8, 15, 5, 905, 78, 94, 54]

    # .copy() so the column added below is written to an independent frame
    # rather than to a filtered view (avoids SettingWithCopyWarning).
    meta_frame = meta_frame[meta_frame.Fwy.isin(usefwy)].copy()

    # Add freeway name: freeway number followed by direction.
    meta_frame['freeway'] = meta_frame.Fwy.apply(str) + meta_frame.Dir

    return meta_frame.rename(columns=lambda c: c.lower())

# Build the Spark DataFrame of station metadata from the pandas frame returned
# by loadMeta(): keep only the id, abs_pm and type columns, and rename 'id' to
# 'station' (presumably so it lines up with the 'station' column of the
# 5-minute data).
# The `spark` session is used here, as in the data-loading cell, instead of
# the legacy `sqlCtx` alias, which this notebook never creates itself.
meta_data = spark.createDataFrame(
    loadMeta().loc[:, ['id', 'abs_pm', 'type']].rename(columns={'id': 'station'})
)

In [None]:
# Display up to the first 100 rows of the station metadata.
meta_data.show(100)

# filter for weekdays I5 S
# group by station, time


Modify this to make all queries

In [13]:
# UDF: map a day-of-week number to "weekday" (values below 6) or "weekend".
# It is applied to the 'dayofweek' column produced by
# date_format('timestamp', 'u'). A null input yields null instead of raising
# inside the Spark task.
weekdaySelector = udf(
    lambda x: None if x is None else ("weekday" if int(x) < 6 else "weekend")
)

# UDF: format a timestamp as a zero-padded "HH:MM" string (seconds dropped).
# The string is built directly from the hour/minute attributes, so the result
# does not depend on which `time` name is currently bound (the `time` module
# and datetime.time are both imported at the top of the notebook). A null
# timestamp yields null.
timeOfDay = udf(
    lambda x: None if x is None else "%02d:%02d" % (int(x.hour), int(x.minute))
)


In [18]:
# Per-station profile by day type and time of day.
# 1. Keep the identifying columns and the three measures, and derive the
#    day-of-week number from the timestamp.
# 2. Label each row as weekday/weekend and with its HH:MM time of day.
# 3. Group by freeway, direction, station, day type and time of day, and
#    compute mean / stddev / count / max / min for flow, occupancy and speed.
station_time = (
    rdd
    .select(
        'freeway',
        'direction_of_travel',
        'timestamp',
        'station',
        'totalflow',
        'avgoccupancy',
        'avgspeed',
        date_format('timestamp', 'u').alias('dayofweek'),
    )
    .withColumn('dayType', weekdaySelector('dayofweek'))
    .withColumn('timeOfDay', timeOfDay('timestamp'))
    .groupBy('freeway', 'direction_of_travel', 'station', 'dayType', 'timeOfDay')
    .agg(*[
        # Output columns are named <prefix>_<suffix>, e.g. flow_mean, occ_std.
        aggregate(source).alias('%s_%s' % (prefix, suffix))
        for source, prefix in (
            ('totalflow', 'flow'),
            ('avgoccupancy', 'occ'),
            ('avgspeed', 'speed'),
        )
        for suffix, aggregate in (
            ('mean', mean),
            ('std', stddev),
            ('count', count),
            ('max', psmax),
            ('min', psmin),
        )
    ])
)



#station_time = (
#    station_time
#    .withColumn(
#        'timeOfDay', 
#        timeOfDay(station_time)
#    )
#)
#station_time.show(10)

In [19]:
# Collect the aggregated Spark result into a local pandas DataFrame.
df = station_time.toPandas()

In [None]:
# Shape of the array of distinct station ids, i.e. how many stations are present.
df.station.unique().shape

In [None]:
#print df.columns
# List the distinct dayType values present in the aggregated data.
df['dayType'].unique()

In [None]:
# Convert the 'HH:MM' timeOfDay strings of the first 10 rows into
# datetime.time objects. The aggregated frame is grouped by timeOfDay and has
# no separate hour/minute columns, so that column is parsed instead of reading
# x.hour / x.minute (which no longer exist and raised AttributeError).
df['timeOfDay'][:10].apply(
    lambda hhmm: time(*[int(part) for part in hhmm.split(':')]) if hhmm else None
)

In [None]:
# Order the rows by time of day, in place. The column is named 'timeOfDay';
# there is no 'time' column, so the previous sort key raised a KeyError.
# Zero-padded 'HH:MM' strings sort chronologically.
df.sort_values('timeOfDay', inplace=True)

In [11]:
# Show the column names of the aggregated frame.
df.columns

Index([u'freeway', u'direction_of_travel', u'station', u'dayType',
       u'dayofweek', u'timeOfDay', u'flow_mean', u'flow_std', u'flow_count',
       u'flow_max', u'flow_min', u'occ_mean', u'occ_std', u'occ_count',
       u'occ_max', u'occ_min', u'speed_mean', u'speed_std', u'speed_count',
       u'speed_max', u'speed_min'],
      dtype='object')

In [20]:
# Display the aggregated rows ordered by time of day. sort_values returns a
# sorted copy here (no inplace=True), so df itself is not modified.
df.sort_values('timeOfDay')

Unnamed: 0,freeway,direction_of_travel,station,dayType,timeOfDay,flow_mean,flow_std,flow_count,flow_max,flow_min,occ_mean,occ_std,occ_count,occ_max,occ_min,speed_mean,speed_std,speed_count,speed_max,speed_min
0,8,W,1108338,weekday,00:00,1.333333,1.154701,3,2,0,,,0,,,,,0,,
772932,15,S,1100586,weekend,00:00,16.500000,7.778175,2,22,11,,,0,,,,,0,,
772931,8,W,1100475,weekend,00:00,29.500000,0.707107,2,30,29,,,0,,,,,0,,
419878,5,S,1100381,weekend,00:00,7.000000,1.414214,2,8,6,,,0,,,,,0,,
419879,94,W,1108294,weekend,00:00,7.000000,1.414214,2,8,6,,,0,,,,,0,,
419880,5,N,1108509,weekend,00:00,120.000000,8.485281,2,126,114,0.024850,0.000495,2,0.0252,0.0245,69.050000,0.636396,2,69.5,68.6
419881,5,S,1108726,weekend,00:00,8.500000,0.707107,2,9,8,,,0,,,,,0,,
419882,56,W,1113072,weekend,00:00,61.000000,5.656854,2,65,57,0.022900,0.001697,2,0.0241,0.0217,68.550000,0.070711,2,68.6,68.5
419883,78,E,1113565,weekend,00:00,10.000000,1.414214,2,11,9,,,0,,,,,0,,
419884,125,S,1114018,weekend,00:00,66.500000,10.606602,2,74,59,0.018400,0.002404,2,0.0201,0.0167,67.950000,0.212132,2,68.1,67.8
