In [1]:
import pandas as pd
import numpy as np
import csv

In [12]:
stations = pd.read_csv("Data/station.csv")
status = pd.read_csv("Data/status.csv")
trip = pd.read_csv("Data/trip.csv")
weather = pd.read_csv("Data/weather.csv")

## How does the data look like?

In [13]:
stations.head(5)

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


Stations table has 7 fields with station information - location (lat, long, city), dock count and installation date are the main columns.

In [14]:
status.head(5)

Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013/08/29 12:06:01
1,2,2,25,2013/08/29 12:07:01
2,2,2,25,2013/08/29 12:08:01
3,2,2,25,2013/08/29 12:09:01
4,2,2,25,2013/08/29 12:10:01


Status table has 4 columns with info of number of bikes available at a station for a given time. Further, looks like it is updated every minute.

In [15]:
trip.head(5)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127
1,4607,70,8/29/2013 14:42,San Jose City Hall,10,8/29/2013 14:43,San Jose City Hall,10,661,Subscriber,95138
2,4130,71,8/29/2013 10:16,Mountain View City Hall,27,8/29/2013 10:17,Mountain View City Hall,27,48,Subscriber,97214
3,4251,77,8/29/2013 11:29,San Jose City Hall,10,8/29/2013 11:30,San Jose City Hall,10,26,Subscriber,95060
4,4299,83,8/29/2013 12:02,South Van Ness at Market,66,8/29/2013 12:04,Market at 10th,67,319,Subscriber,94103


Trip table has 11 fields with trip info - start and stop info, what time and which station, subscriber type, bike id.

In [17]:
weather.head(5)

Unnamed: 0,date,max_temperature_f,mean_temperature_f,min_temperature_f,max_dew_point_f,mean_dew_point_f,min_dew_point_f,max_humidity,mean_humidity,min_humidity,...,mean_visibility_miles,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees,zip_code
0,8/29/2013,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,57.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
1,8/30/2013,78.0,69.0,60.0,61.0,58.0,56.0,90.0,70.0,50.0,...,10.0,7.0,29.0,13.0,35.0,0,2.0,,291.0,94107
2,8/31/2013,71.0,64.0,57.0,57.0,56.0,54.0,93.0,75.0,57.0,...,10.0,10.0,26.0,15.0,31.0,0,4.0,,284.0,94107
3,9/1/2013,74.0,66.0,58.0,60.0,56.0,53.0,87.0,68.0,49.0,...,10.0,10.0,25.0,13.0,29.0,0,4.0,,284.0,94107
4,9/2/2013,75.0,69.0,62.0,61.0,60.0,58.0,93.0,77.0,61.0,...,10.0,6.0,23.0,12.0,30.0,0,6.0,,277.0,94107


Weather info - min/max temperature, dew point and humidity.

## Possible analyses:

Time series based predictions:
- Predict number of trips per station/day. Further can be extended to show demand vs supply at these stations.
- Predict travel time for station/day. Is the current pricing strategy which is maximum usage of 30 minutes apt?

Clustering:
- Cluster stations. What are the primary traits common to stations in one cluster? This gives us factors that stand out - demand? location? number of docks? installation date? travel time?

Recommendation based:
- Recommend locations where new stations can be installed based on current supply/demand. (Prediction component not necessarily required)


## Using SparkSQL

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
path = "/Users/deena/Documents/Intersession/ProjectWork/"
station = sqlContext.read.load(path + "Data/station.csv", 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')

status = sqlContext.read.load(path + "Data/status.csv", 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')

trip = sqlContext.read.load(path + "Data/trip.csv", 
                      format='com.databricks.spark.csv', 
                      header='true',
                      inferSchema='true')

weather = sqlContext.read.load(path + "Data/weather.csv", 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')

In [4]:
station.write.saveAsTable("station")
status.write.saveAsTable("status")
trip.write.saveAsTable("trip")
weather.write.saveAsTable("weather")

In [5]:
# Station table
sqlContext.sql("""select id, name, lat, long, dock_count, city, 
    DATEDIFF(cast('2017-01-01 00:00:00' as TIMESTAMP), installation_date) as age from station limit 2""").collect()

[Row(id=2, name=u'San Jose Diridon Caltrain Station', lat=37.329732, long=-121.901782, dock_count=27, city=u'San Jose', age=1244),
 Row(id=3, name=u'San Jose Civic Center', lat=37.330698, long=-121.888979, dock_count=15, city=u'San Jose', age=1245)]

In [6]:
# Status table
sqlContext.sql("""select station_id, hour(time) as hour, 
(case when date_format(time, 'EEEE') in ('Saturday', 'Sunday') then 1 else 0 end) as weekday_flag, docks_available,
avg(bikes_available) as bikes_available
from status group by 1,2,3,4 limit 2""").collect()

[Row(station_id=3, hour=2, weekday_flag=0, docks_available=7, bikes_available=7.9980506822612085),
 Row(station_id=3, hour=19, weekday_flag=1, docks_available=3, bikes_available=12.0)]

In [7]:
# Weather table
sqlContext.sql("""select zip_code, hour(date) as hour, 
(case when date_format(date, 'EEEE') in ('Saturday', 'Sunday') then 1 else 0 end) as weekday_flag,
avg(max_temperature_f) as max_temperature_f, avg(mean_temperature_f) as mean_temperature_f,
avg(min_temperature_f) as min_temperature_f, avg(max_dew_point_f) as max_dew_point_f,
avg(mean_dew_point_f) as mean_dew_point_f, avg(min_dew_point_f) as min_dew_point_f,
avg(max_humidity) as max_humidity, avg(mean_humidity) as mean_humidity,
avg(min_humidity) as min_humidity, avg(max_sea_level_pressure_inches) as max_sea_level_pressure_inches,
avg(mean_sea_level_pressure_inches) as mean_sea_level_pressure_inches,
avg(min_sea_level_pressure_inches) as min_sea_level_pressure_inches,
avg(max_visibility_miles) as max_visibility_miles, avg(mean_visibility_miles) as mean_visibility_miles,
avg(min_visibility_miles) as min_visibility_miles, avg(max_wind_Speed_mph) as max_wind_Speed_mph,
avg(mean_wind_speed_mph) as mean_wind_speed_mph, avg(max_gust_speed_mph) as max_gust_speed_mph,
avg(precipitation_inches) as precipitation_inches, avg(cloud_cover) as cloud_cover,
avg(wind_dir_degrees) as wind_dir_degrees,
avg(case when events = 'Fog' then 1 when (events = 'rain' or events = 'Rain') then 2 when events = 'Fog-Rain' then 3
 when events = 'Rain-Thunderstorm' then 4 else 0 end) as events
from weather group by 1,2,3 limit 1""").collect()

[Row(zip_code=95113, hour=0, weekday_flag=0, max_temperature_f=72.36328871892925, mean_temperature_f=61.975143403441685, min_temperature_f=51.087954110898664, max_dew_point_f=51.76099426386233, mean_dew_point_f=47.48374760994264, min_dew_point_f=42.69407265774379, max_humidity=85.39005736137668, mean_humidity=63.48565965583174, min_humidity=41.04780114722753, max_sea_level_pressure_inches=30.0689292543021, mean_sea_level_pressure_inches=30.013441682600387, min_sea_level_pressure_inches=29.956883365200742, max_visibility_miles=9.996175908221797, mean_visibility_miles=9.592734225621415, min_visibility_miles=8.294455066921605, max_wind_Speed_mph=16.437858508604208, mean_wind_speed_mph=5.927342256214149, max_gust_speed_mph=20.9140625, precipitation_inches=0.0320408163265306, cloud_cover=3.0478011472275335, wind_dir_degrees=259.5621414913958, events=0.2390057361376673)]

## Final combined dataset

In [8]:
final_df = sqlContext.sql("""select a.*, b.*, c.*, d.*
from
    (
    select zip_code, a.start_station_id as station_id, a.hour,
    (case when date_format(start_date, 'EEEE') in ('Saturday', 'Sunday') then 1 else 0 end) as weekday_flag,
    month(start_date) as month, year(start_date) as year,
    avg(outgoing_bikes) as outgoing_bikes, avg(incoming_bikes) as incoming_bikes
    from
        (select zip_code, hour(start_date) as hour, start_station_id, cast(start_date as DATE) as start_date,
        avg(duration) as avg_duration, sum(1) as outgoing_bikes
        from trip
        group by 1,2,3,4
        ) as a
    left join
        (select hour(end_date) as hour, end_station_id, cast(end_date as DATE) as end_date, sum(1) as incoming_bikes
        from trip
        group by 1,2,3
        ) as b
        on a.start_station_id = b.end_station_id and a.hour = b.hour and a.start_date = b.end_date
    group by 1,2,3,4,5,6
    ) as a
left join
    (select id, name, lat, long, dock_count, city, 
    DATEDIFF(cast('2017-01-01 00:00:00' as TIMESTAMP), installation_date) as age 
    from station) as b
    on a.station_id = b.id
left join
    (select zip_code, hour(date) as hour, month(date) as month, year(date) as year,
    (case when date_format(date, 'EEEE') in ('Saturday', 'Sunday') then 1 else 0 end) as weekday_flag,
    avg(max_temperature_f) as max_temperature_f, avg(mean_temperature_f) as mean_temperature_f,
    avg(min_temperature_f) as min_temperature_f, avg(max_dew_point_f) as max_dew_point_f,
    avg(mean_dew_point_f) as mean_dew_point_f, avg(min_dew_point_f) as min_dew_point_f,
    avg(max_humidity) as max_humidity, avg(mean_humidity) as mean_humidity,
    avg(min_humidity) as min_humidity, avg(max_sea_level_pressure_inches) as max_sea_level_pressure_inches,
    avg(mean_sea_level_pressure_inches) as mean_sea_level_pressure_inches,
    avg(min_sea_level_pressure_inches) as min_sea_level_pressure_inches,
    avg(max_visibility_miles) as max_visibility_miles, avg(mean_visibility_miles) as mean_visibility_miles,
    avg(min_visibility_miles) as min_visibility_miles, avg(max_wind_Speed_mph) as max_wind_Speed_mph,
    avg(mean_wind_speed_mph) as mean_wind_speed_mph, avg(max_gust_speed_mph) as max_gust_speed_mph,
    avg(precipitation_inches) as precipitation_inches, avg(cloud_cover) as cloud_cover,
    avg(wind_dir_degrees) as wind_dir_degrees,
    avg(case when events = 'Fog' then 1 when (events = 'rain' or events = 'Rain') then 2 when events = 'Fog-Rain' then 3
     when events = 'Rain-Thunderstorm' then 4 else 0 end) as events
    from weather 
    group by 1,2,3,4,5) as c
    on a.zip_code=c.zip_code and a.hour=c.hour and a.weekday_flag=c.weekday_flag and a.month=c.month and a.year=c.year
left join
    (select station_id, hour(time) as hour, month(time) as month, year(time) as year,
    (case when date_format(time, 'EEEE') in ('Saturday', 'Sunday') then 1 else 0 end) as weekday_flag, docks_available,
    avg(bikes_available) as bikes_available
    from status 
    group by 1,2,3,4,5,6) as d
    on a.station_id=d.station_id and a.hour=d.hour and a.weekday_flag=d.weekday_flag and a.month=d.month and a.year=d.year
""")

In [9]:
final_df.printSchema()

root
 |-- zip_code: string (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- weekday_flag: integer (nullable = false)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- outgoing_bikes: double (nullable = true)
 |-- incoming_bikes: double (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- zip_code: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday_flag: integer (nullable = true)
 |-- max_temperature_f: double (nullable = true)
 |-- mean_temperature_f: double (nullable = true)
 |-- min_temperature_f: double (nullable = true)
 |-- max_dew_point_f: double (nullable = true)
 |-- mean_de

In [10]:
final_df.take(1)

[Row(zip_code=u'94502', station_id=10, hour=16, weekday_flag=0, month=9, year=2013, outgoing_bikes=1.0, incoming_bikes=None, id=10, name=u'San Jose City Hall', lat=37.337391, long=-121.886995, dock_count=15, city=u'San Jose', age=1244, zip_code=None, hour=None, month=None, year=None, weekday_flag=None, max_temperature_f=None, mean_temperature_f=None, min_temperature_f=None, max_dew_point_f=None, mean_dew_point_f=None, min_dew_point_f=None, max_humidity=None, mean_humidity=None, min_humidity=None, max_sea_level_pressure_inches=None, mean_sea_level_pressure_inches=None, min_sea_level_pressure_inches=None, max_visibility_miles=None, mean_visibility_miles=None, min_visibility_miles=None, max_wind_Speed_mph=None, mean_wind_speed_mph=None, max_gust_speed_mph=None, precipitation_inches=None, cloud_cover=None, wind_dir_degrees=None, events=None, station_id=None, hour=None, month=None, year=None, weekday_flag=None, docks_available=None, bikes_available=None)]