In [1]:

import pandas as pd
import numpy as np
import time
from os import listdir
from os.path import isfile, join
from datetime import time
import glob
import sys

from pyspark.sql.functions import hour, mean,minute, stddev, count,max as psmax,min as psmin, date_format

from pyspark.sql import SQLContext
from pyspark.sql.types import *

In [2]:
# Mount the first S3 bucket into DBFS under /mnt/<MOUNT_NAME>.
#
# Replace ACCESS_KEY and SECRET_KEY with your own values.
# NOTE: Set the access to this notebook appropriately to protect the security of your keys,
# and never commit real credentials in this cell.
ACCESS_KEY = ""
SECRET_KEY = ""
# Percent-encode "/" so the secret can be embedded in the s3a URI below.
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = "dse-team2-2014"
MOUNT_NAME = "ch1data_1"

# dbutils.fs.mount raises when the mount point is already in use, which made this
# cell fail on every re-run. Only mount when the mount point does not exist yet.
mount_point = "/mnt/%s" % MOUNT_NAME
if not any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), mount_point)

In [3]:
# Mount the second S3 bucket into DBFS under /mnt/<MOUNT_NAME>.
#
# This cell does not define credentials itself: ACCESS_KEY and SECRET_KEY must
# already have been set by an earlier cell in this notebook.
# NOTE: Set the access to this notebook appropriately to protect the security of your keys.
# Percent-encode "/" so the secret can be embedded in the s3a URI below.
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = "dse-team1-2015"
MOUNT_NAME = "ch2data"

# dbutils.fs.mount raises when the mount point is already in use, which made this
# cell fail on every re-run. Only mount when the mount point does not exist yet.
mount_point = "/mnt/%s" % MOUNT_NAME
if not any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), mount_point)

In [4]:
# One input directory per year, 2008 through 2015 inclusive, under the current mount.
station_dir_template = "/mnt/%s/dse_traffic/station_5min/%i/d11/"
years = range(2008, 2016)
dir_list = [station_dir_template % (MOUNT_NAME, year) for year in years]

In [5]:
# Flatten the listings of every yearly directory into a single list of file paths.
# The outer loop (over directories) must come first in the comprehension: the
# original order read `dl` before it was bound and raised NameError on a fresh kernel.
onlyfiles = [entry.path for dl in dir_list for entry in dbutils.fs.ls(dl)]

In [6]:
#len(files)

In [7]:
#files = dbutils.fs.ls("/mnt/%s/dse_traffic/station_5min/2015/d11/" % MOUNT_NAME)

In [8]:
#onlyfiles = [f.path for f in files]

In [9]:
# Explicit column schema used when reading the CSV files: twelve station-level
# columns followed by the same five measurements repeated for lanes 1 through 8.
# Every column is nullable.
station_columns = [
    ("timestamp", TimestampType),
    ("station", IntegerType),
    ("district", IntegerType),
    ("freeway", IntegerType),
    ("direction_of_travel", StringType),
    ("lanetype", StringType),
    ("stationlength", DoubleType),
    ("samples", IntegerType),
    ("perc_observed", IntegerType),
    ("totalflow", IntegerType),
    ("avgoccupancy", DoubleType),
    ("avgspeed", DoubleType),
]

# Per-lane measurements, in file order; the column name is "lane<N>_<suffix>".
lane_columns = [
    ("samples", IntegerType),
    ("flow", IntegerType),
    ("avgocc", DoubleType),
    ("avgspeed", DoubleType),
    ("observed", IntegerType),
]

struct_list = [
    StructField(col_name, col_type(), True)
    for col_name, col_type in station_columns
]
struct_list += [
    StructField("lane%d_%s" % (lane, suffix), col_type(), True)
    for lane in range(1, 9)
    for suffix, col_type in lane_columns
]

schema_struct = StructType(struct_list)

In [10]:
# Read all listed CSV files into one Spark DataFrame (despite the variable name,
# spark.read.csv returns a DataFrame, not an RDD). The files have no header row
# and the schema is supplied explicitly rather than inferred.
csv_options = dict(
    header='false',
    timestampFormat='MM/dd/yyyy HH:mm:ss',
    schema=schema_struct,
    inferSchema='false',
)
rdd = spark.read.csv(onlyfiles, **csv_options)

# Show the first parsed row as a quick sanity check.
rdd.take(1)

In [11]:
# Count the rows loaded across all input files (this runs a full Spark job over the data).
rdd.count()

In [12]:
def _summary_stats(column, prefix):
    """Build the mean/std/count/max/min aggregates for `column`, aliased `<prefix>_<stat>`."""
    return [
        mean(column).alias(prefix + "_mean"),
        stddev(column).alias(prefix + "_std"),
        count(column).alias(prefix + "_count"),
        psmax(column).alias(prefix + "_max"),
        psmin(column).alias(prefix + "_min"),
    ]


# Keep only rows for freeway 5 in direction "S".
freeway5_south = rdd.where('freeway = 5').where('direction_of_travel = "S"')

# Project the columns needed downstream and derive a day-of-week number.
# NOTE(review): the 'u' pattern presumably yields 1 (Monday) .. 7 (Sunday), so
# `dayofweek < 6` keeps Monday-Friday; newer Spark releases may reject 'u' — confirm.
weekday_rows = (
    freeway5_south
    .select(
        'timestamp',
        'station',
        'totalflow',
        'avgoccupancy',
        'avgspeed',
        date_format('timestamp', 'u').alias('dayofweek'),
    )
    .filter('dayofweek < 6')
)

# One output row per station and time-of-day slot (hour and minute of the timestamp).
slot_keys = [
    'station',
    hour("timestamp").alias("hour"),
    minute("timestamp").alias("minute"),
]

# Same five statistics for flow, occupancy and speed, in that order.
slot_aggregates = (
    _summary_stats("totalflow", "flow")
    + _summary_stats("avgoccupancy", "occ")
    + _summary_stats("avgspeed", "speed")
)

station_time = weekday_rows.groupBy(slot_keys).agg(*slot_aggregates)
#station_time.show(10)

In [13]:
# Collect the aggregated Spark result onto the driver as a pandas DataFrame.
df = station_time.toPandas()

In [14]:
# Serialize the pandas frame to CSV text: with no path argument, to_csv returns a string.
# With default settings the row index is written as the first column.
string = df.to_csv()

In [15]:
# Write the CSV text to a file under the mounted bucket.
# NOTE(review): the mount name is hard-coded here instead of using MOUNT_NAME, and no
# overwrite flag is passed — presumably this fails if the file already exists; confirm.
dbutils.fs.put("/mnt/ch2data/share_data/weekday_stats_2008_2015.csv", string)