## ALS

The final goal was to attempt to create a basic recommendation system using past reservations. This uses [Alternating Least Squares](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.recommendation.ALS.html), a native recommender in PySpark's ML library. This documentation, along with [this walkthrough](https://github.com/shashwatwork/Building-Recommeder-System-in-PySpark/blob/master/Crafting%20Recommedation%20System%20with%20PySpark.ipynb) guided this implementation.

ALS is fairly straightforward, using three inputs, all integers, to build the model:
- userCol - person record in the transaction, customerzip was used in this model
- itemCol - facilityid, the product identifier
- ratingCol - a score to the item assigned by the user, used a "days stayed" calculation to simulate this value

In [1]:
from pyspark.sql import SparkSession

MAX_MEMORY = "8g"

spark = SparkSession.builder.appName('recreation.gov reservations') \
    .config("spark.executor.memory", MAX_MEMORY) \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

In [17]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

schemaRating = StructType([
    StructField("productid", IntegerType(), True),
    StructField("cleanzip", FloatType(), True),
    StructField("participation", FloatType(), True),
])

In [18]:
dfReservations2021 = spark.read.format('csv').schema(schemaRating).csv('REC_ALS.csv', header=True, ignoreTrailingWhiteSpace=True)

In [19]:
dfReservations2021 = dfReservations2021.withColumn('cleanzip', dfReservations2021['cleanzip'].cast(IntegerType()))

In [20]:
dfReservations2021 = dfReservations2021.withColumn('participation', dfReservations2021['participation'].cast(IntegerType()))

In [21]:
dfReservations2021.filter(dfReservations2021.participation.isNull()).show()

+---------+--------+-------------+
|productid|cleanzip|participation|
+---------+--------+-------------+
|   250877|   96555|         null|
|   232490|   55424|         null|
|   232490|   91501|         null|
|   232490|   33774|         null|
|   247571|   33176|         null|
|   247571|   92831|         null|
|   247661|   16823|         null|
|   247661|   16823|         null|
|   247661|   16823|         null|
|   247661|   16823|         null|
|   247661|   16823|         null|
|   247661|   16823|         null|
|   247661|   16823|         null|
|   247661|   14830|         null|
|   250009|   80516|         null|
|   258830|   15017|         null|
|   272266|   11771|         null|
|   272266|   20782|         null|
|   258887|   49046|         null|
|   251833|   21401|         null|
+---------+--------+-------------+
only showing top 20 rows



In [22]:
dfReservations2021.show(truncate=False)

+---------+--------+-------------+
|productid|cleanzip|participation|
+---------+--------+-------------+
|639772   |99709   |2            |
|639772   |99706   |0            |
|639772   |99706   |2            |
|639772   |99709   |0            |
|639772   |99709   |1            |
|639772   |84401   |4            |
|639772   |99709   |4            |
|99821    |99709   |2            |
|639772   |99743   |0            |
|639772   |99743   |4            |
|99799    |99708   |12           |
|639772   |84401   |4            |
|639772   |99710   |2            |
|99821    |99705   |2            |
|99799    |99709   |2            |
|99821    |99709   |2            |
|99862    |99712   |4            |
|639822   |99556   |0            |
|99799    |99709   |2            |
|99766    |99712   |3            |
+---------+--------+-------------+
only showing top 20 rows



In [23]:
dfReservations2021 = dfReservations2021.dropna()

In [24]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

#split train and test
trainDF, testDF = dfReservations2021.randomSplit([0.8, 0.2])
trainDF.cache()

# build model
# coldStartStrategy - helped drop nulls
als = ALS(maxIter=5, regParam=0.01, userCol="cleanzip", itemCol="productid", ratingCol="participation", coldStartStrategy="drop")
model = als.fit(trainDF)

In [25]:
# generate predictions
predictions = model.transform(testDF)

# evalute model using root mean squared evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="participation",predictionCol="prediction")
evaluator.evaluate(predictions)

48.55433850515385

In [26]:
# create a test user from Silver Spring, MD
test_user = testDF.filter('cleanzip == 20901').select('cleanzip', 'productid', 'participation')
test_user.show()

+--------+---------+-------------+
|cleanzip|productid|participation|
+--------+---------+-------------+
|   20901|     1550|            2|
|   20901|     1810|           15|
|   20901|     5713|           10|
|   20901|     5741|            4|
|   20901|     5745|           12|
|   20901|     5766|            6|
|   20901|     5771|           12|
|   20901|     5787|            2|
|   20901|     5796|            6|
|   20901|     5796|            8|
|   20901|     6256|            4|
|   20901|     6459|            2|
|   20901|     6460|            2|
|   20901|     6462|            6|
|   20901|     6474|            6|
|   20901|     7481|           12|
|   20901|    10330|            4|
|   20901|    10379|           18|
|   20901|    76418|            0|
|   20901|    77841|           28|
+--------+---------+-------------+
only showing top 20 rows



In [27]:
# get recommendations for test user
recommendations = model.transform(test_user)
recommendations.sort('prediction', ascending=False).show()

+--------+---------+-------------+----------+
|cleanzip|productid|participation|prediction|
+--------+---------+-------------+----------+
|   20901|     6256|            4|  68.52154|
|   20901|    37740|           24|  59.58333|
|   20901|    37740|           16|  59.58333|
|   20901|    83907|           21| 39.448288|
|   20901|    78821|            6|  34.85572|
|   20901|    19718|            4| 31.878809|
|   20901|     5787|            2| 28.387165|
|   20901|    53218|            5| 26.003227|
|   20901|   100231|            8| 20.048817|
|   20901|    40059|            8| 15.074044|
|   20901|     5741|            4|  13.98929|
|   20901|    90163|            3| 11.828209|
|   20901|    90163|            9| 11.828209|
|   20901|    40525|            1| 11.456576|
|   20901|    19706|            8|  9.517392|
|   20901|    19743|            6|  9.347351|
|   20901|     6462|            6|  8.621582|
|   20901| 10098728|            3|  8.451751|
|   20901|   100801|            8|

In [29]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.


In [30]:
import requests
r = requests.get('https://ridb.recreation.gov/api/v1/campsites/19758', headers={'apikey': '6DD62AA61A9A4211BD4414728184BFE4'})
r.json()

[{'CampsiteID': '19758',
  'FacilityID': '232095',
  'CampsiteName': '053',
  'CampsiteType': 'WALK TO',
  'TypeOfUse': 'Overnight',
  'Loop': 'WALK',
  'CampsiteAccessible': False,
  'CampsiteReservable': True,
  'CampsiteLongitude': -79.381987,
  'CampsiteLatitude': 38.8316930000001,
  'CreatedDate': '2014-05-02',
  'LastUpdatedDate': '2020-10-15',
  'ATTRIBUTES': [{'AttributeName': 'Checkin Time',
    'AttributeValue': '2:00 PM'},
   {'AttributeName': 'Min Num of People', 'AttributeValue': '1'},
   {'AttributeName': 'Campfire Allowed', 'AttributeValue': 'Yes'},
   {'AttributeName': 'Checkout Time', 'AttributeValue': '1:00 PM'},
   {'AttributeName': 'IS EQUIPMENT MANDATORY', 'AttributeValue': 'true'},
   {'AttributeName': 'Picnic Table', 'AttributeValue': 'Y'},
   {'AttributeName': 'Site Rating', 'AttributeValue': 'Preferred'},
   {'AttributeName': 'Max Vehicle Length', 'AttributeValue': '0'},
   {'AttributeName': 'Placed on Map', 'AttributeValue': '1'},
   {'AttributeName': 'Fire Pi