In [1]:
from pyspark.sql import Row, SQLContext
from pyspark.sql.types  import *
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import StringIndexer

In [2]:
statusDF = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://127.0.0.1/msan697.status").load()

stationDF = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://127.0.0.1/msan697.station").load()

weatherDF = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://127.0.0.1/msan697.weather").load()

tripDF = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://127.0.0.1/msan697.trip").load()

In [89]:
stationDF.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- city: string (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- installation_date: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- name: string (nullable = true)



##### Status Table

In [25]:
statusDF = statusDF.withColumn('dayofweek',date_format(from_unixtime(unix_timestamp(statusDF["time"][0:10], 'yyyy/MM/dd')),'EEEE'))

In [26]:
statusDF = statusDF.withColumn("dayofweek", 
                    when(col("dayofweek").isNull(), date_format(from_unixtime(unix_timestamp(statusDF["time"][0:10], 'yyyy-MM-dd')),'EEEE')).
                        otherwise(col('dayofweek')))

In [27]:
#Adding weekend column
statusDF = statusDF.withColumn("weekend", when(col('dayofweek') == 'Saturday',1).when(col('dayofweek') == 'Sunday', 1).otherwise(0))
#Adding weekday column
statusDF = statusDF.withColumn("weekday", when(col('dayofweek') == 'Saturday',0).when(col('dayofweek') == 'Sunday', 0).otherwise(1))


In [28]:
#Adding hourofday column
statusDF = statusDF.withColumn('hourofday',statusDF["time"][12:2].cast(IntegerType()))

In [29]:
#Adding morning column
statusDF = statusDF.withColumn("morning", when(col('hourofday').between(5,11),1).otherwise(0))
#Adding afternoon column
statusDF = statusDF.withColumn("afternoon", when(col('hourofday').between(12,16),1).otherwise(0))
#Adding evening column
statusDF = statusDF.withColumn("evening", when(col('hourofday').between(17,22),1).otherwise(0))
#Adding night column
statusDF = statusDF.withColumn("night", when(col('hourofday').between(23,24), 1).when(col('hourofday').between(0,4),1).otherwise(0))

In [30]:
#Adding month column
statusDF = statusDF.withColumn('month',month(from_unixtime(unix_timestamp(statusDF["time"][0:10], 'yyyy/MM/dd'))))
# Adding year column
statusDF = statusDF.withColumn('year',year(from_unixtime(unix_timestamp(statusDF["time"][0:10], 'yyyy/MM/dd'))))

In [31]:
#Cleaning month column
statusDF = statusDF.withColumn("month", 
                    when(col("month").isNull(), month(from_unixtime(unix_timestamp(statusDF["time"][0:10], 'yyyy-MM-dd')))).
                        otherwise(col('month')))
#Cleaning year column
statusDF = statusDF.withColumn("year", 
                    when(col("year").isNull(), year(from_unixtime(unix_timestamp(statusDF["time"][0:10], 'yyyy-MM-dd')))).
                        otherwise(col('year')))

In [32]:
statusDF_avg = sqlContext.sql("""
SELECT station_id, weekend, weekday, hourofday, month, year, dayofweek, morning, afternoon, evening, night,
avg(bikes_available) AS avg_bikes_available, 
avg(docks_available) AS avg_docks_available
FROM statusDF
GROUP BY 1,2,3,4,5,6,7,8,9,10,11
""")

#### Weather Table

In [23]:
weatherDF = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://127.0.0.1/msan697.weather").load()

In [24]:
tocols_to_change = ['cloud_cover',
 'events',
 'max_dew_point_f',
 'max_gust_speed_mph',
 'max_humidity',
 'max_sea_level_pressure_inches',
 'max_temperature_f',
 'max_visibility_miles',
 'max_wind_Speed_mph',
 'mean_dew_point_f',
 'mean_humidity',
 'mean_sea_level_pressure_inches',
 'mean_temperature_f',
 'mean_visibility_miles',
 'mean_wind_speed_mph',
 'min_dew_point_f',
 'min_humidity',
 'min_sea_level_pressure_inches',
 'min_temperature_f',
 'min_visibility_miles',
 'precipitation_inches',
 'wind_dir_degrees',
 'zip_code']

In [25]:
weatherDF = weatherDF.select([*(col(c).cast("float").alias(c) for c in tocols_to_change), 'date'])

In [26]:
weatherDF = weatherDF.withColumn("events", when(col('events') == 'Fog', 1).\
                                 when(col('events').like ('%ain'),2).\
                                 when(col('events') == 'Fog-Rain',3).\
                                 when(col('events') == 'Rain-Thunderstorm',4).\
                                 otherwise(0))

In [27]:
#Adding day of week column
weatherDF = weatherDF.withColumn('dayofweek',date_format(from_unixtime(unix_timestamp(weatherDF["date"], 'M/dd/yyyy')),'EEEE'))
#Adding weekend column
weatherDF = weatherDF.withColumn("weekend", when(col('dayofweek') == 'Saturday',1).when(col('dayofweek') == 'Sunday', 1).otherwise(0))
#Adding weekday column
weatherDF = weatherDF.withColumn("weekday", when(col('dayofweek') == 'Saturday',0).when(col('dayofweek') == 'Sunday', 0).otherwise(1))
#Adding month column
weatherDF = weatherDF.withColumn('month',month(from_unixtime(unix_timestamp(weatherDF["date"], 'M/dd/yyyy'))))
# Adding year column
weatherDF = weatherDF.withColumn('year',year(from_unixtime(unix_timestamp(weatherDF["date"], 'M/dd/yyyy'))))

In [28]:
weatherDF = weatherDF.fillna(0)

In [32]:
weatherDF.take(1)

[Row(dayofweek='Monday', month=10, year=2014, weekend=0, weekday=1, max_temperature_f=76.6, mean_temperature_f=65.05, min_temperature_f=53.35, max_dew_point_f=58.65, mean_dew_point_f=51.4, min_dew_point_f=43.55, max_humidity=90.3, mean_humidity=65.4, min_humidity=38.5, max_sea_level_pressure_inches=30.015499973297118, mean_sea_level_pressure_inches=29.96599988937378, min_sea_level_pressure_inches=29.91900005340576, max_visibility_miles=11.25, mean_visibility_miles=10.05, min_visibility_miles=8.2, max_wind_Speed_mph=14.55, mean_wind_speed_mph=4.75, max_gust_speed_mph=16.55, precipitation_inches=0.0014999999664723873, cloud_cover=1.85, wind_dir_degrees=298.65, events=0.0)]

In [30]:
weatherDF.printSchema()

root
 |-- cloud_cover: float (nullable = false)
 |-- events: integer (nullable = false)
 |-- max_dew_point_f: float (nullable = false)
 |-- max_gust_speed_mph: float (nullable = false)
 |-- max_humidity: float (nullable = false)
 |-- max_sea_level_pressure_inches: float (nullable = false)
 |-- max_temperature_f: float (nullable = false)
 |-- max_visibility_miles: float (nullable = false)
 |-- max_wind_Speed_mph: float (nullable = false)
 |-- mean_dew_point_f: float (nullable = false)
 |-- mean_humidity: float (nullable = false)
 |-- mean_sea_level_pressure_inches: float (nullable = false)
 |-- mean_temperature_f: float (nullable = false)
 |-- mean_visibility_miles: float (nullable = false)
 |-- mean_wind_speed_mph: float (nullable = false)
 |-- min_dew_point_f: float (nullable = false)
 |-- min_humidity: float (nullable = false)
 |-- min_sea_level_pressure_inches: float (nullable = false)
 |-- min_temperature_f: float (nullable = false)
 |-- min_visibility_miles: float (nullable = fals

In [31]:
weatherDF = weatherDF.drop("date").groupBy("dayofweek", "month","year","weekend","weekday")\
.agg(avg("max_temperature_f").alias("max_temperature_f"), \
avg("mean_temperature_f").alias("mean_temperature_f"),\
avg("min_temperature_f").alias("min_temperature_f"),\
avg("max_dew_point_f").alias("max_dew_point_f"),\
avg("mean_dew_point_f").alias("mean_dew_point_f"), \
avg("min_dew_point_f").alias("min_dew_point_f"),\
avg("max_humidity").alias("max_humidity"), \
avg("mean_humidity").alias("mean_humidity"),\
avg("min_humidity").alias("min_humidity"), \
avg("max_sea_level_pressure_inches").alias("max_sea_level_pressure_inches"),\
avg("mean_sea_level_pressure_inches").alias("mean_sea_level_pressure_inches"),\
avg("min_sea_level_pressure_inches").alias("min_sea_level_pressure_inches"),\
avg("max_visibility_miles").alias("max_visibility_miles"), \
avg("mean_visibility_miles").alias("mean_visibility_miles"),\
avg("min_visibility_miles").alias("min_visibility_miles"), \
avg("max_wind_Speed_mph").alias("max_wind_Speed_mph"),\
avg("mean_wind_speed_mph").alias("mean_wind_speed_mph"),\
avg("max_gust_speed_mph").alias("max_gust_speed_mph"),\
avg("precipitation_inches").alias("precipitation_inches"),\
avg("cloud_cover").alias("cloud_cover"),\
avg("wind_dir_degrees").alias("wind_dir_degrees"),\
avg("events").alias("events")
    )

In [33]:
sqlContext.sql('drop table if exists weatherDF')
weatherDF.write.saveAsTable('weatherDF')

#### Stations Table

In [34]:
# add age of the docks
stationDF = stationDF.withColumn('age', \
               datediff(from_unixtime(unix_timestamp(date_format(current_date(), "M/d/y"), 'MM/dd/yyy')),\
                              from_unixtime(unix_timestamp(stationDF['installation_date'], 'MM/dd/yyy'))))

In [36]:
sqlContext.sql('drop table if exists stationDF')
stationDF.write.saveAsTable('stationDF')

#### Trip Table

In [35]:
tripDF = tripDF.withColumn('start_date', concat(col('start_date'),lit(':00'))).withColumn('end_date', concat(col('end_date'),lit(':00')))

In [36]:
tripDF = tripDF.withColumn('dayofweek',date_format(from_unixtime(unix_timestamp('start_date', 'MM/dd/yyy HH:mm:ss')),'EEEE'))\
.withColumn("weekend", when(col('dayofweek') == 'Saturday',1).when(col('dayofweek') == 'Sunday', 1).otherwise(0))\
.withColumn("weekday", when(col('dayofweek') == 'Saturday',0).when(col('dayofweek') == 'Sunday', 0).otherwise(1))\
.withColumn('hourofday',hour(from_unixtime(unix_timestamp('start_date', 'MM/dd/yyy HH:mm:ss'))))\
.withColumn('month',month(from_unixtime(unix_timestamp('start_date', 'MM/dd/yyy HH:mm:ss'))))\
.withColumn('year',year(from_unixtime(unix_timestamp('start_date', 'MM/dd/yyy HH:mm:ss'))))

In [37]:
tripDF = tripDF.withColumn('start_date_part', date_format(to_date(from_unixtime(unix_timestamp('start_date', 'MM/dd/yyy HH:mm:ss'))), 'MM/dd/yyy'))

In [38]:
tripDF = tripDF.withColumn('end_date_part', date_format(to_date(from_unixtime(unix_timestamp('end_date', 'MM/dd/yyy HH:mm:ss'))), 'MM/dd/yyy'))

In [41]:
sqlContext.sql('drop table if exists tripDF')
tripDF.write.saveAsTable('tripDF')

In [39]:
tripDF_final = sqlContext.sql("""
select a.start_station_id, a.hourofday,
        weekend, weekday, month, year, dayofweek,
        avg(outgoing_bikes) as outgoing_bikes, avg(incoming_bikes) as incoming_bikes,
        (coalesce(avg(outgoing_bikes),0) - coalesce(avg(incoming_bikes),0)) as traffic
        from
        (
            select hourofday, start_station_id, start_date_part, weekend, weekday, month, year, dayofweek,
            sum(1) as outgoing_bikes
            from tripDF
            group by 1,2,3,4,5,6,7,8
        ) as a
        left join
        (
            select hourofday, end_station_id, end_date_part, sum(1) as incoming_bikes
            from tripDF
            group by 1,2,3
        ) as b
        on a.start_station_id = b.end_station_id and a.hourofday = b.hourofday and a.start_date_part = b.end_date_part
        group by 1,2,3,4,5,6,7
""")

In [40]:
tripDF_final.printSchema()

root
 |-- start_station_id: integer (nullable = true)
 |-- hourofday: integer (nullable = true)
 |-- weekend: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- dayofweek: string (nullable = true)
 |-- outgoing_bikes: double (nullable = true)
 |-- incoming_bikes: double (nullable = true)
 |-- traffic: double (nullable = false)



In [44]:
sqlContext.sql('drop table if exists tripDF_final')
tripDF_final.write.saveAsTable('tripDF_final')

In [41]:
joined_df = stationDF.join(tripDF_final, stationDF.id == tripDF_final.start_station_id, how='left')

In [42]:
joined_df = joined_df.drop("start_station_id", "end_station_id", "_id", "installation_date")

In [43]:
joined_df = joined_df.withColumnRenamed("id", "station_id").withColumnRenamed("name", "station_name")

In [44]:
joined_df2 = joined_df.join(statusDF_avg, ["station_id", "hourofday", "dayofweek", "weekend", "weekday", "month", "year"])

In [45]:
joined_df2 = joined_df2.withColumnRenamed("dock_count", "total_capacity").withColumnRenamed("age", "station_age")

In [46]:
weatherDF.take(1)

[Row(dayofweek='Sunday', month=9, year=2013, weekend=1, weekday=0, max_temperature_f=77.12, mean_temperature_f=67.08, min_temperature_f=57.24, max_dew_point_f=59.4, mean_dew_point_f=55.36, min_dew_point_f=51.16, max_humidity=86.92, mean_humidity=69.36, min_humidity=46.36, max_sea_level_pressure_inches=29.9747998046875, mean_sea_level_pressure_inches=29.940800170898438, min_sea_level_pressure_inches=29.852399826049805, max_visibility_miles=10.0, mean_visibility_miles=10.0, min_visibility_miles=9.84, max_wind_Speed_mph=17.28, mean_wind_speed_mph=6.0, max_gust_speed_mph=20.6, precipitation_inches=0.00039999999105930326, cloud_cover=1.92, wind_dir_degrees=312.68, events=0.0)]

In [47]:
final_joined = joined_df2.join(weatherDF, [ "dayofweek", "weekend", "weekday", "month", "year"])

In [55]:
sqlContext.sql('drop table if exists final_features')
final_joined.write.saveAsTable('final_features')

In [48]:
final_nodup = final_joined.distinct()

In [58]:
sqlContext.sql('drop table if exists final_no_duplicates')
final_nodup.write.saveAsTable('final_no_duplicates')

In [3]:
final_nodup = sqlContext.sql('select * from final_no_duplicates')

In [3]:
final_nodup.printSchema()

root
 |-- dayofweek: string (nullable = true)
 |-- weekend: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- hourofday: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- total_capacity: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- station_name: string (nullable = true)
 |-- station_age: integer (nullable = true)
 |-- outgoing_bikes: double (nullable = true)
 |-- incoming_bikes: double (nullable = true)
 |-- traffic: double (nullable = true)
 |-- morning: integer (nullable = true)
 |-- afternoon: integer (nullable = true)
 |-- evening: integer (nullable = true)
 |-- night: integer (nullable = true)
 |-- avg_bikes_available: double (nullable = true)
 |-- avg_docks_available: double (nullable = true)
 |-- max_temperature_f: double (nullable = true)
 |-- mean_temperature_f: doubl

#### Predictions

In [4]:
#converting strings to numeric values
from pyspark.ml.feature import StringIndexer

In [5]:
def indexStringColumns(df, cols):
    #variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        #For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        sm = si.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        #and then drops the original columns.
        #and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

In [6]:
final_nodup_numeric = indexStringColumns(final_nodup, ["dayofweek", "city", "station_name"])

In [7]:
final_nodup_numeric.printSchema()

root
 |-- weekend: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- hourofday: integer (nullable = true)
 |-- total_capacity: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- station_age: integer (nullable = true)
 |-- outgoing_bikes: double (nullable = true)
 |-- incoming_bikes: double (nullable = true)
 |-- traffic: double (nullable = true)
 |-- morning: integer (nullable = true)
 |-- afternoon: integer (nullable = true)
 |-- evening: integer (nullable = true)
 |-- night: integer (nullable = true)
 |-- avg_bikes_available: double (nullable = true)
 |-- avg_docks_available: double (nullable = true)
 |-- max_temperature_f: double (nullable = true)
 |-- mean_temperature_f: double (nullable = true)
 |-- min_temperature_f: double (nullable = true)
 |-- max_dew_point_f: double (nullable = true)
 |-- 

In [8]:
final_nodup_numeric = final_nodup_numeric.fillna(0)

## For Traffic Data

In [9]:
final_nodup_numeric_traffic = final_nodup_numeric.drop('avg_bikes_available', 'avg_docks_available')

In [10]:
input_cols = final_nodup_numeric_traffic.schema.names

In [11]:
input_cols.remove('traffic')

In [12]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols = input_cols)

In [13]:
#lpoints - labeled data.
lpoints = va.transform(final_nodup_numeric_traffic).select("features", "traffic").withColumnRenamed("traffic", "label")

In [14]:
lpoints.show()

+--------------------+-------------------+
|            features|              label|
+--------------------+-------------------+
|[0.0,1.0,4.0,2014...|               -0.5|
|[0.0,1.0,4.0,2014...|-0.6666666666666667|
|[0.0,1.0,4.0,2014...| 0.6666666666666667|
|[0.0,1.0,4.0,2014...|                1.0|
|[0.0,1.0,4.0,2014...|                1.0|
|[0.0,1.0,4.0,2014...|                1.0|
|[0.0,1.0,4.0,2014...|                1.0|
|[0.0,1.0,4.0,2014...|               -0.5|
|[0.0,1.0,4.0,2014...|                0.0|
|[0.0,1.0,4.0,2014...|                0.0|
|[0.0,1.0,4.0,2014...|               1.25|
|[0.0,1.0,4.0,2014...|               -1.0|
|[0.0,1.0,4.0,2014...|                0.0|
|[0.0,1.0,4.0,2014...|                1.0|
|[0.0,1.0,4.0,2014...|                0.5|
|[0.0,1.0,4.0,2014...| 1.6666666666666667|
|[0.0,1.0,4.0,2014...|  2.916666666666667|
|[0.0,1.0,4.0,2014...|                1.0|
|[0.0,1.0,4.0,2014...|                0.0|
|[0.0,1.0,4.0,2014...|               0.75|
+----------

In [15]:
#Divide the dataset into training and testing sets.
splits = lpoints.randomSplit([0.8, 0.2])

In [16]:
#cache() : the algorithm is interative and training and data sets are going to be reused many times.
train_df = splits[0].cache()
valid_df = splits[1].cache()

### Linear Regression for Traffic

In [70]:
#Train the model.
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train_df)

In [71]:
# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.571816
r2: 0.917037


In [72]:
# Fit on validation set
validpredicts = lrModel.transform(valid_df)
validpredicts.show()

+--------------------+--------------------+--------------------+
|            features|               label|          prediction|
+--------------------+--------------------+--------------------+
|[0.0,1.0,1.0,2014...|                 1.0|  0.8995087999842729|
|[0.0,1.0,1.0,2014...| -1.0000000000000002| -0.5866447239642745|
|[0.0,1.0,1.0,2014...|                 3.0|  2.2483556641043645|
|[0.0,1.0,1.0,2014...|-0.33333333333333326|-0.11741386547255964|
|[0.0,1.0,1.0,2014...|                2.75|  2.0209040947343007|
|[0.0,1.0,1.0,2015...|                 0.0| 0.12210537305288557|
|[0.0,1.0,1.0,2015...|                 1.0|  0.8995087999842729|
|[0.0,1.0,2.0,2014...|                 1.0|  0.8995087999842729|
|[0.0,1.0,2.0,2014...|                 0.0|  0.1662396565691747|
|[0.0,1.0,2.0,2015...|                0.75|  0.6720572306142092|
|[0.0,1.0,2.0,2015...|  2.3333333333333335|  1.7398943313759483|
|[0.0,1.0,3.0,2014...|                -1.0| -0.5670294868459235|
|[0.0,1.0,3.0,2014...|   

In [73]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(validpredicts)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.56702


#### Generalized Linear Regression

In [110]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)
# Fit the model
glrmodel = glr.fit(lpoints)

In [111]:
# Fit on validation set
validpredicts = glrmodel.transform(lpoints)
validpredicts.show()

+--------------------+-------------------+--------------------+
|            features|              label|          prediction|
+--------------------+-------------------+--------------------+
|[0.0,1.0,4.0,2014...|               -0.5|-0.13092343534083228|
|[0.0,1.0,4.0,2014...|-0.6666666666666667|-0.49274437642551777|
|[0.0,1.0,4.0,2014...| 0.6666666666666667|  0.5573660449098027|
|[0.0,1.0,4.0,2014...|                1.0|  0.9459357108710079|
|[0.0,1.0,4.0,2014...|                1.0|    0.95467990402772|
|[0.0,1.0,4.0,2014...|                1.0|  0.9561703654777277|
|[0.0,1.0,4.0,2014...|                1.0|  0.9075470802903581|
|[0.0,1.0,4.0,2014...|               -0.5|-0.32919074983942453|
|[0.0,1.0,4.0,2014...|                0.0| 0.05532153668088524|
|[0.0,1.0,4.0,2014...|                0.0| 0.06662025170452501|
|[0.0,1.0,4.0,2014...|               1.25|  0.9371638551499849|
|[0.0,1.0,4.0,2014...|               -1.0| -0.7016509497861689|
|[0.0,1.0,4.0,2014...|                0.

In [118]:
"""
|-- weekend: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- hourofday: integer (nullable = true)
 |-- total_capacity: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 ....
 |-- dayofweek: double (nullable = true)
 |-- city: double (nullable = true)
 |-- station_name: double (nullable = true)
"""

traffic_predictions = validpredicts.select('prediction').collect()
traffic_features = validpredicts.select('features').collect()

In [45]:
traffic_predictions[0][0]

Row(prediction=-0.7597863470875004)

In [48]:
traffic_features[0][0][len(traffic_features[0][0])-3]

0.0

In [47]:
len(traffic_features[0][0][])

41

In [50]:
# prediction, station_id, lat, long, hourofday, month, year, dayofweek, city, station_name
list_tupes = list()
for i in range(len(traffic_features)):
    list_tupes.append((traffic_predictions[i][0], traffic_features[i][0][4], traffic_features[i][0][7], traffic_features[i][0][8],\
                      traffic_features[i][0][5], traffic_features[i][0][2], traffic_features[i][0][3], \
                       traffic_features[i][0][len(traffic_features[0][0])-3],\
                      traffic_features[i][0][len(traffic_features[0][0])-2],\
                      traffic_features[i][0][len(traffic_features[0][0])-1]))

In [76]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(validpredicts)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.460254


### Prediction for availability

In [53]:
final_nodup_numeric_availability = final_nodup_numeric.drop('traffic', 'avg_docks_available')

In [54]:
input_cols = final_nodup_numeric_availability.schema.names

In [55]:
input_cols.remove('avg_bikes_available')

In [56]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols = input_cols)

In [57]:
#lpoints - labeled data.
lpoints_availability = va.transform(final_nodup_numeric_availability).select("features", "avg_bikes_available").withColumnRenamed("avg_bikes_available", "label")

In [22]:
lpoints_availability.show()

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|[0.0,1.0,4.0,2014...| 6.604166666666667|
|[0.0,1.0,4.0,2014...|18.916666666666668|
|[0.0,1.0,4.0,2014...| 4.204166666666667|
|[0.0,1.0,4.0,2014...| 6.591666666666667|
|[0.0,1.0,4.0,2014...| 9.316666666666666|
|[0.0,1.0,4.0,2014...| 8.016666666666667|
|[0.0,1.0,4.0,2014...| 4.920833333333333|
|[0.0,1.0,4.0,2014...| 7.016666666666667|
|[0.0,1.0,4.0,2014...| 6.170212765957447|
|[0.0,1.0,4.0,2014...|             2.775|
|[0.0,1.0,4.0,2014...|            18.775|
|[0.0,1.0,4.0,2014...|              7.95|
|[0.0,1.0,4.0,2014...| 6.729166666666667|
|[0.0,1.0,4.0,2014...| 7.083333333333333|
|[0.0,1.0,4.0,2014...|            5.7625|
|[0.0,1.0,4.0,2014...|            6.1375|
|[0.0,1.0,4.0,2014...|10.904166666666667|
|[0.0,1.0,4.0,2014...|             6.475|
|[0.0,1.0,4.0,2014...| 9.995833333333334|
|[0.0,1.0,4.0,2014...|              10.5|
+--------------------+------------

In [58]:
#Divide the dataset into training and testing sets.
splits_availability = lpoints_availability.randomSplit([0.8, 0.2])

In [59]:
#cache() : the algorithm is interative and training and data sets are going to be reused many times.
train_df_availability = splits_availability[0].cache()
valid_df_availability = splits_availability[1].cache()

### Linear Regression for Availability

In [112]:
#Train the model.
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel_availability = lr.fit(lpoints_availability)

In [86]:
# Summarize the model over the training set and print out some metrics
trainingSummary_availability = lrModel_availability.summary
print("RMSE: %f" % trainingSummary_availability.rootMeanSquaredError)
print("r2: %f" % trainingSummary_availability.r2)

RMSE: 2.704588
r2: 0.291949


In [87]:
# Fit on validation set
validpredicts_availability = lrModel.transform(valid_df_availability)
validpredicts_availability.show()

+--------------------+------------------+--------------------+
|            features|             label|          prediction|
+--------------------+------------------+--------------------+
|[0.0,1.0,1.0,2014...|14.043333333333333| 0.10739394521412235|
|[0.0,1.0,1.0,2014...|15.133333333333333|  -0.398423628830912|
|[0.0,1.0,1.0,2014...|              6.49| -0.9336640585534726|
|[0.0,1.0,1.0,2014...| 6.733333333333333|  0.8995087999842729|
|[0.0,1.0,1.0,2015...| 9.856666666666667|  0.8995087999842729|
|[0.0,1.0,2.0,2014...| 7.191666666666666|  0.1662396565691747|
|[0.0,1.0,2.0,2014...|              6.15|  1.9111439480743417|
|[0.0,1.0,2.0,2014...|           17.7375| -1.9385190548549236|
|[0.0,1.0,2.0,2014...|10.720833333333333| -0.8702982460060087|
|[0.0,1.0,2.0,2014...|12.070833333333333|   -1.11020119261863|
|[0.0,1.0,2.0,2015...|            4.2875|-0.07818339123585796|
|[0.0,1.0,2.0,2015...|10.645833333333334| -0.2003949151383743|
|[0.0,1.0,2.0,2015...| 6.658333333333333| -0.7141437652

### Generalized Linear Regression for Availability

In [88]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)
# Fit the model
glrmodel_availability = glr.fit(train_df_availability)

In [89]:
# Fit on validation set
validpredicts_availability = glrmodel.transform(valid_df_availability)
validpredicts_availability.show()

+--------------------+------------------+--------------------+
|            features|             label|          prediction|
+--------------------+------------------+--------------------+
|[0.0,1.0,1.0,2014...|14.043333333333333| 0.14919182670136166|
|[0.0,1.0,1.0,2014...|15.133333333333333| -0.5037824354786058|
|[0.0,1.0,1.0,2014...|              6.49|  -1.117434153350874|
|[0.0,1.0,1.0,2014...| 6.733333333333333|  0.8577138474827808|
|[0.0,1.0,1.0,2015...| 9.856666666666667|  0.8347164154325073|
|[0.0,1.0,2.0,2014...| 7.191666666666666| 0.14265552048899943|
|[0.0,1.0,2.0,2014...|              6.15|  1.9945993346308954|
|[0.0,1.0,2.0,2014...|           17.7375|  -2.078060191849284|
|[0.0,1.0,2.0,2014...|10.720833333333333| -0.9543413699910148|
|[0.0,1.0,2.0,2014...|12.070833333333333| -1.1925562869292394|
|[0.0,1.0,2.0,2015...|            4.2875|-0.20203965741448426|
|[0.0,1.0,2.0,2015...|10.645833333333334|-0.24229871945506565|
|[0.0,1.0,2.0,2015...| 6.658333333333333| -0.7185371084

### Random Forest for availability

In [113]:
rf = RandomForestRegressor(maxBins = 70)
rfmodel = rf.fit(lpoints_availability)

In [114]:
rfpredicts = rfmodel.transform(lpoints_availability)

In [115]:
# Select example rows to display.
availability_features = rfpredicts.select("features").collect()
availability_prediction = rfpredicts.select("label").collect()

In [63]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(rfpredicts)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 2.43219


In [119]:
len(availability_features), len(availability_prediction), len(traffic_features)

(120757, 120757, 120757)

In [72]:
availability_features[0][0][0]

0.0

In [120]:
# prediction_available, prediction_traffic, station_id, lat, long, hourofday, month, year, dayofweek, city, station_name
available_traffic_pred_list = list()
for i in range(len(availability_features)):
    available_traffic_pred_list.append((availability_prediction[i][0],traffic_predictions[i][0],\
                                availability_features[i][0][4], \
                       availability_features[i][0][7], availability_features[i][0][8],\
                      availability_features[i][0][5], availability_features[i][0][2], \
                       availability_features[i][0][3], \
                       availability_features[i][0][len(availability_features[0][0])-3],\
                      availability_features[i][0][len(availability_features[0][0])-2],\
                      availability_features[i][0][len(availability_features[0][0])-1]))

In [121]:
import pandas as pd
df = pd.DataFrame(available_traffic_pred_list)

In [122]:
df.columns = ['prediction_available', 'prediction_traffic', 'station_id', 'lat', 'long', 'hourofday', 'month', 'year', 'dayofweek', 'city', 'station_name']

In [123]:
station_pd = pd.read_csv('../bikeshare/Data/station.csv')

In [124]:
station_pd[:1]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013


In [125]:
new_df = df.merge(station_pd, left_on='station_id', right_on='id')

In [126]:
new_df = new_df.drop_duplicates()

In [127]:
new_df.to_csv('available_traffic.csv')

#### Recommendation

In [None]:
"""
Beale at Market

Temporary Transbay Terminal (Howard at Beale)

Steuart at Market

Spear at Folsom
"""

In [158]:
nearest_station_df = new_df[(new_df['name'].str.contains('Beale at Market', case = False)) \
       & (new_df['month'] == 8.0) & (new_df['year'] == 2015.0)
      & (new_df['dayofweek'] == 3.0) & (new_df['hourofday'] == 10.0)]

In [160]:
nearest_station_df = nearest_station_df.append(new_df[(new_df['name'].str.contains('Howard at Beale', case = False)) \
       & (new_df['month'] == 8.0) & (new_df['year'] == 2015.0)\
      & (new_df['dayofweek'] == 3.0) & (new_df['hourofday'] == 10.0)])

In [161]:
nearest_station_df = nearest_station_df.append(new_df[(new_df['name'].str.contains('Steuart at Market', case = False)) \
       & (new_df['month'] == 8.0) & (new_df['year'] == 2015.0)\
      & (new_df['dayofweek'] == 3.0) & (new_df['hourofday'] == 10.0)])

In [162]:
nearest_station_df = nearest_station_df.append(new_df[(new_df['name'].str.contains('Spear at Folsom', case = False)) \
       & (new_df['month'] == 8.0) & (new_df['year'] == 2015.0)\
      & (new_df['dayofweek'] == 3.0) & (new_df['hourofday'] == 10.0)])

In [164]:
nearest_station_df.to_csv('nearest_station_df.csv')