In [55]:
from pyspark.sql.functions import *
from pyspark.sql.types import DateType

### Importing DataFrame objects from MongoDB 

In [76]:
# Connection settings shared by every collection load below, kept in one place
# so the connector class and database URI are not repeated per collection.
MONGO_SOURCE = "com.mongodb.spark.sql.DefaultSource"
MONGO_DB_URI = "mongodb://127.0.0.1/msan697"


def load_collection(collection):
    """Read one collection of the msan697 MongoDB database into a DataFrame.

    Parameters
    ----------
    collection : str
        Collection name; appended to ``MONGO_DB_URI`` as ``<database>.<collection>``.

    Returns
    -------
    The DataFrame produced by ``spark.read ... .load()`` for that collection.
    """
    uri = "{}.{}".format(MONGO_DB_URI, collection)
    return spark.read.format(MONGO_SOURCE).option("uri", uri).load()


statusDF = load_collection("status")
stationDF = load_collection("station")
weatherDF = load_collection("weather")
tripDF = load_collection("trip")

In [23]:
# Preview the first 5 rows of the status collection.
statusDF.show(n=5)

+--------------------+---------------+---------------+----------+-------------------+
|                 _id|bikes_available|docks_available|station_id|               time|
+--------------------+---------------+---------------+----------+-------------------+
|[5a5d49a291bd3626...|              2|             25|         2|2013/08/29 12:06:01|
|[5a5d49a291bd3626...|              2|             25|         2|2013/08/29 12:07:01|
|[5a5d49a291bd3626...|              2|             25|         2|2013/08/29 12:08:01|
|[5a5d49a291bd3626...|              2|             25|         2|2013/08/29 12:09:01|
|[5a5d49a291bd3626...|              2|             25|         2|2013/08/29 12:10:01|
+--------------------+---------------+---------------+----------+-------------------+
only showing top 5 rows



In [22]:
# Preview the first 5 rows of the station collection.
stationDF.show(n=5)

+--------------------+--------+----------+---+-----------------+------------------+-------------------+--------------------+
|                 _id|    city|dock_count| id|installation_date|               lat|               long|                name|
+--------------------+--------+----------+---+-----------------+------------------+-------------------+--------------------+
|[5a5d498991bd3626...|San Jose|        19|  5|         8/5/2013|         37.331415|          -121.8932|    Adobe on Almaden|
|[5a5d498991bd3626...|San Jose|        15|  6|         8/7/2013|37.336721000000004|        -121.894074|    San Pedro Square|
|[5a5d498991bd3626...|San Jose|        15|  7|         8/7/2013|         37.333798|-121.88694299999999|Paseo de San Antonio|
|[5a5d498991bd3626...|San Jose|        15|  8|         8/5/2013|         37.330165|-121.88583100000001| San Salvador at 1st|
|[5a5d498991bd3626...|San Jose|        15|  9|         8/5/2013|         37.348742|-121.89471499999999|           Japantown|


In [9]:
# Preview the first 2 rows of the weather collection (the table is very wide).
weatherDF.show(n=2)

+--------------------+-----------+---------+------+---------------+------------------+------------+-----------------------------+-----------------+--------------------+------------------+----------------+-------------+------------------------------+------------------+---------------------+-------------------+---------------+------------+-----------------------------+-----------------+--------------------+--------------------+----------------+--------+
|                 _id|cloud_cover|     date|events|max_dew_point_f|max_gust_speed_mph|max_humidity|max_sea_level_pressure_inches|max_temperature_f|max_visibility_miles|max_wind_Speed_mph|mean_dew_point_f|mean_humidity|mean_sea_level_pressure_inches|mean_temperature_f|mean_visibility_miles|mean_wind_speed_mph|min_dew_point_f|min_humidity|min_sea_level_pressure_inches|min_temperature_f|min_visibility_miles|precipitation_inches|wind_dir_degrees|zip_code|
+--------------------+-----------+---------+------+---------------+------------------+--

In [11]:
# Preview the first 5 rows of the trip collection.
tripDF.show(n=5)

+--------------------+-------+--------+---------------+--------------+--------------------+----+---------------+----------------+--------------------+-----------------+--------+
|                 _id|bike_id|duration|       end_date|end_station_id|    end_station_name|  id|     start_date|start_station_id|  start_station_name|subscription_type|zip_code|
+--------------------+-------+--------+---------------+--------------+--------------------+----+---------------+----------------+--------------------+-----------------+--------+
|[5a5d4ec891bd3626...|    520|      63|8/29/2013 14:14|            66|South Van Ness at...|4576|8/29/2013 14:13|              66|South Van Ness at...|       Subscriber|   94127|
|[5a5d4ec891bd3626...|    661|      70|8/29/2013 14:43|            10|  San Jose City Hall|4607|8/29/2013 14:42|              10|  San Jose City Hall|       Subscriber|   95138|
|[5a5d4ec891bd3626...|     48|      71|8/29/2013 10:17|            27|Mountain View Cit...|4130|8/29/2013 10:1

In [184]:
# List the column names of statusDF.
# NOTE(review): the recorded output already contains the derived feature columns
# (dayofweek ... month) that are only added by cells further down, so this cell
# was presumably run after them. TODO: re-run the notebook top to bottom.
statusDF.columns

['_id',
 'bikes_available',
 'docks_available',
 'station_id',
 'time',
 'dayofweek',
 'weekend',
 'weekday',
 'hourofday',
 'morning',
 'afternoon',
 'evening',
 'night',
 'month']

In [183]:
# List the column names of tripDF.
tripDF.columns

['_id',
 'bike_id',
 'duration',
 'end_date',
 'end_station_id',
 'end_station_name',
 'id',
 'start_date',
 'start_station_id',
 'start_station_name',
 'subscription_type',
 'zip_code']

### Adding features 

#### Weekday/Weekend 

In [110]:
# Add the weekday name (e.g. "Thursday") derived from the date part of `time`.
# Slicing a Column maps to Column.substr(startPos, length); it is spelled out here.
date_text = statusDF["time"].substr(0, 10)
parsed_date = from_unixtime(unix_timestamp(date_text, 'yyyy/MM/dd'))
statusDF = statusDF.withColumn('dayofweek', date_format(parsed_date, 'EEEE'))

In [179]:
# Spot-check the derived weekday name against the raw timestamp.
dayofweek_preview_cols = ['time', 'dayofweek']
statusDF.select(*dayofweek_preview_cols).show(n=5)

+-------------------+---------+
|               time|dayofweek|
+-------------------+---------+
|2013/08/29 12:06:01| Thursday|
|2013/08/29 12:07:01| Thursday|
|2013/08/29 12:08:01| Thursday|
|2013/08/29 12:09:01| Thursday|
|2013/08/29 12:10:01| Thursday|
+-------------------+---------+
only showing top 5 rows



In [101]:
# Add complementary 0/1 indicators: `weekend` is 1 on Saturday/Sunday,
# `weekday` is 1 on every other value of `dayofweek`.
is_weekend_day = col('dayofweek').isin('Saturday', 'Sunday')
statusDF = (
    statusDF
    .withColumn("weekend", when(is_weekend_day, 1).otherwise(0))
    .withColumn("weekday", when(is_weekend_day, 0).otherwise(1))
)

In [178]:
# Sanity check: a Sunday row should have weekend=1 and weekday=0.
sunday_rows = (
    statusDF
    .select('time', 'dayofweek', 'weekend', 'weekday')
    .filter(col('dayofweek') == "Sunday")
)
sunday_rows.show(n=1)

+-------------------+---------+-------+-------+
|               time|dayofweek|weekend|weekday|
+-------------------+---------+-------+-------+
|2013/09/01 00:00:02|   Sunday|      1|      0|
+-------------------+---------+-------+-------+
only showing top 1 row



#### Hour of Day/Morning/Afternoon/Evening/Night 

In [155]:
#Adding hourofday column
# `time` is formatted like '2013/08/29 12:06:01', so the hour is the two
# characters starting at 1-based position 12. Column slicing is
# substr(startPos, length), not a Python slice, so the call is written
# explicitly instead of as `[12:2]`. The value is cast to an integer so the
# range checks on `hourofday` compare numbers rather than relying on implicit
# string-to-number coercion.
statusDF = statusDF.withColumn('hourofday', statusDF["time"].substr(12, 2).cast('int'))

We define morning as 5am–12pm, afternoon as 12pm–5pm, evening as 5pm–11pm, and night as 11pm–5am. Each period includes its starting hour and excludes its ending hour (for example, morning covers hours 5 through 11).

In [172]:
# Add 0/1 time-of-day indicator columns from `hourofday`.
# `between` bounds are inclusive on both ends.
hour_col = col('hourofday')
late_night = hour_col.between(23, 24)
early_hours = hour_col.between(0, 4)

period_flags = [
    ("morning", when(hour_col.between(5, 11), 1).otherwise(0)),
    ("afternoon", when(hour_col.between(12, 16), 1).otherwise(0)),
    ("evening", when(hour_col.between(17, 22), 1).otherwise(0)),
    # Night wraps around midnight, hence the two separate ranges.
    ("night", when(late_night, 1).when(early_hours, 1).otherwise(0)),
]
for flag_name, flag_expr in period_flags:
    statusDF = statusDF.withColumn(flag_name, flag_expr)

In [177]:
# Spot-check the hour and the four time-of-day flags against the raw timestamp.
time_of_day_preview_cols = ['time', 'hourofday', 'morning', 'afternoon', 'evening', 'night']
statusDF.select(*time_of_day_preview_cols).show(n=5)

+-------------------+---------+-------+---------+-------+-----+
|               time|hourofday|morning|afternoon|evening|night|
+-------------------+---------+-------+---------+-------+-----+
|2013/08/29 12:06:01|       12|      0|        1|      0|    0|
|2013/08/29 12:07:01|       12|      0|        1|      0|    0|
|2013/08/29 12:08:01|       12|      0|        1|      0|    0|
|2013/08/29 12:09:01|       12|      0|        1|      0|    0|
|2013/08/29 12:10:01|       12|      0|        1|      0|    0|
+-------------------+---------+-------+---------+-------+-----+
only showing top 5 rows



#### Month

In [176]:
# Add the calendar month number parsed from the date part of `time`.
# Slicing a Column maps to Column.substr(startPos, length); it is spelled out here.
date_part = statusDF["time"].substr(0, 10)
date_as_timestamp = from_unixtime(unix_timestamp(date_part, 'yyyy/MM/dd'))
statusDF = statusDF.withColumn('month', month(date_as_timestamp))

In [186]:
# Spot-check the derived month against the raw timestamp.
month_preview_cols = ['time', 'month']
statusDF.select(*month_preview_cols).show(n=5)

+-------------------+-----+
|               time|month|
+-------------------+-----+
|2013/08/29 12:06:01|    8|
|2013/08/29 12:07:01|    8|
|2013/08/29 12:08:01|    8|
|2013/08/29 12:09:01|    8|
|2013/08/29 12:10:01|    8|
+-------------------+-----+
only showing top 5 rows



In [188]:
# Features so far: station id plus every derived calendar/time-of-day column.
feature_cols = [
    'station_id',
    'weekend', 'weekday',
    'morning', 'afternoon', 'evening', 'night',
    'hourofday', 'month',
]
statusDF.select(*feature_cols).show(n=10)

+----------+-------+-------+-------+---------+-------+-----+---------+-----+
|station_id|weekend|weekday|morning|afternoon|evening|night|hourofday|month|
+----------+-------+-------+-------+---------+-------+-----+---------+-----+
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|
|         2|      0|      1|      0|        1|      0|    0|       12|    8|