In [3]:
import pyspark
import numpy as np
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import functions, types
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
spark = (pyspark.sql.SparkSession.builder 
  .master("local[*]")
  .getOrCreate())

In [5]:
!head data/ratings.json

{"user_id": 6040, "movie_id": 858, "rating": 4, "timestamp": 956678732.0}
{"user_id": 6040, "movie_id": 2384, "rating": 4, "timestamp": 956678754.0}
{"user_id": 6040, "movie_id": 593, "rating": 5, "timestamp": 956678754.0}
{"user_id": 6040, "movie_id": 1961, "rating": 4, "timestamp": 956678777.0}
{"user_id": 6040, "movie_id": 1419, "rating": 3, "timestamp": 956678856.0}
{"user_id": 6040, "movie_id": 213, "rating": 5, "timestamp": 956678856.0}
{"user_id": 6040, "movie_id": 3111, "rating": 5, "timestamp": 956678856.0}
{"user_id": 6040, "movie_id": 573, "rating": 4, "timestamp": 956678856.0}
{"user_id": 6040, "movie_id": 3505, "rating": 4, "timestamp": 956678856.0}
{"user_id": 6040, "movie_id": 1734, "rating": 2, "timestamp": 956678881.0}


In [6]:
path_users = 'data/users.dat'
users = (spark.read.load("data/users.dat",
                         format="csv", sep=":", inferSchema="true")
                        .drop('_c1', '_c3', '_c5', '_c7'))

In [7]:
users = (users.withColumnRenamed(users.schema.names[0], 'userID')
                        .withColumnRenamed(users.schema.names[1], 'gender')
                        .withColumnRenamed(users.schema.names[2], 'age')
                        .withColumnRenamed(users.schema.names[3], 'occupation')
                        .withColumnRenamed(users.schema.names[4], 'zip'))

In [8]:
users.schema.names

['userID', 'gender', 'age', 'occupation', 'zip']

In [9]:
print((users.count(), len(users.columns)))

(6040, 5)


In [10]:
users.show(5)

+------+------+---+----------+-----+
|userID|gender|age|occupation|  zip|
+------+------+---+----------+-----+
|     1|     F|  1|        10|48067|
|     2|     M| 56|        16|70072|
|     3|     M| 25|        15|55117|
|     4|     M| 45|         7|02460|
|     5|     M| 25|        20|55455|
+------+------+---+----------+-----+
only showing top 5 rows



In [11]:
# read in the dataset into pyspark DataFrame
path_ratings = 'data/ratings.json'
ratings = spark.read.json(path_ratings)

In [12]:
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [13]:
print((ratings.count(), len(ratings.columns)))

(719949, 4)


In [14]:
ratings.schema.names

['movie_id', 'rating', 'timestamp', 'user_id']

In [15]:
ratings.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [16]:
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [17]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

#### inspect requests dataset

In [18]:
requests = spark.read.json('data/requests.json')
requests.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint]

In [19]:
requests.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [20]:
requests.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|    2019|   NaN|9.56678777E8|   6040|
|     759|   NaN|9.56679248E8|   6040|
|    2858|   NaN|9.56679275E8|   6040|
|     246|   NaN|9.56679413E8|   6040|
|    1617|   NaN|9.56679473E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [21]:
print((requests.count(), len(requests.columns)))

(280260, 4)


### transforming timestamp column of ratings

In [22]:
# convert format of datetime column 'timestamp' from epoch to standard 
ratings = (ratings.withColumn('timestamp',
                    functions.date_format(ratings.timestamp.cast(dataType=types.TimestampType()),
                    "yyyy-MM-dd HH:mm:ss")))

In [23]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [24]:
ratings.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [25]:
ratings = ratings.sort(ratings.timestamp.asc())

In [26]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [27]:
ratings.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [28]:
print((ratings.count(), len(users.columns)))

(719949, 5)


In [29]:
719949*.8

575959.2000000001

In [30]:
719949 *.2

143989.80000000002

In [31]:
# Sort by index and get first 4000 rows
ratings_train = ratings.sort(ratings.timestamp.asc()).limit(575959)

In [32]:
ratings_train.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [33]:
ratings_train.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [34]:
print((ratings_train.count(), len(ratings_train.columns)))

(575959, 4)


In [35]:
ratings_train.sort(ratings_train.timestamp.asc()).show(10)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
|     213|     5|2000-04-25 09:07:36|   6040|
|    3111|     5|2000-04-25 09:07:36|   6040|
|     573|     4|2000-04-25 09:07:36|   6040|
|    3505|     4|2000-04-25 09:07:36|   6040|
|    1734|     2|2000-04-25 09:08:01|   6040|
+--------+------+-------------------+-------+
only showing top 10 rows



In [36]:
ratings_test = ratings.subtract(ratings_train)

In [37]:
ratings_test.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [38]:
print((ratings_test.count(), len(ratings_test.columns)))

(143990, 4)


### model

In [39]:
# build recommendation model using ALS on the training data
als = ALS(
    rank=10,
    maxIter=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
)

In [40]:
# fit the ALS model on training set

als_model = als.fit(ratings_train)

### predict on ratings_test with fitted model

In [41]:
# generate predictions with your model for the test set by using the transform method on your ALS model
preds_test = als_model.transform(ratings_test)
preds_test.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint, prediction: float]

In [42]:
# evaluate your model and print out the RMSE from your test set

In [43]:
preds_test.sort(preds_test.timestamp.asc()).show(10)

+--------+------+-------------------+-------+----------+
|movie_id|rating|          timestamp|user_id|prediction|
+--------+------+-------------------+-------+----------+
|    3354|     3|2000-11-20 00:28:20|   1577| 3.2754974|
|    2571|     3|2000-11-20 00:28:25|   1579| 3.7229302|
|     377|     4|2000-11-20 00:28:25|   1579| 3.0375485|
|       6|     3|2000-11-20 00:28:25|   1579| 3.5706792|
|    2427|     2|2000-11-20 00:28:25|   1579| 3.4963312|
|    1276|     4|2000-11-20 00:28:32|   1592| 3.9360816|
|    1270|     4|2000-11-20 00:28:32|   1574|  4.696215|
|    2396|     3|2000-11-20 00:28:32|   1574|  4.134782|
|    1079|     4|2000-11-20 00:28:32|   1574| 3.7328296|
|     858|     5|2000-11-20 00:28:33|   1575| 4.3419867|
+--------+------+-------------------+-------+----------+
only showing top 10 rows



In [44]:
preds_test.where(preds_test['prediction'].isNotNull()).show()

+--------+------+-------------------+-------+----------+
|movie_id|rating|          timestamp|user_id|prediction|
+--------+------+-------------------+-------+----------+
|     148|     5|2000-11-30 05:47:04|    673|       NaN|
|     148|     3|2000-11-22 00:19:36|   1242|       NaN|
|     148|     2|2000-11-22 10:05:35|   1069|       NaN|
|     148|     2|2000-11-22 05:57:01|   1605| 1.8249925|
|     148|     2|2000-11-21 14:38:26|   1150|       NaN|
|     463|     3|2000-12-01 01:03:09|    660|       NaN|
|     463|     2|2000-11-22 10:02:18|   1069|       NaN|
|     463|     2|2000-11-22 08:33:30|   1146|       NaN|
|     463|     1|2000-11-28 12:05:54|    746|       NaN|
|     463|     2|2000-11-26 01:52:43|   1980| 2.4116402|
|     471|     5|2000-11-23 02:51:07|   1395|       NaN|
|     471|     4|2000-11-20 14:30:07|   1303|       NaN|
|     471|     3|2000-11-22 03:09:41|   1199|       NaN|
|     471|     3|2000-11-20 08:03:22|   1404|       NaN|
|     471|     5|2000-11-20 06:

In [47]:
# Evaluate the model by computing the RMSE on the test data
# preds_test = als_model.transform(ratings_test)

In [48]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [49]:
rmse = evaluator.evaluate(preds_test)

In [50]:
rmse

nan

In [51]:
# inspect user_factors and item_factors

In [52]:
user_factors = als_model.userFactors

In [53]:
user_factors.sort(user_factors.id.asc()).show(10)

+----+--------------------+
|  id|            features|
+----+--------------------+
|1570|[0.07491307, -0.5...|
|1571|[-0.96726793, 0.0...|
|1572|[-0.6420927, 0.81...|
|1573|[-0.6550203, 0.16...|
|1574|[0.05617068, 1.16...|
|1575|[0.05610694, 0.04...|
|1576|[-0.7933536, 0.41...|
|1577|[-0.2509155, 0.10...|
|1578|[-0.34301838, -0....|
|1579|[-0.6010333, -0.1...|
+----+--------------------+
only showing top 10 rows



In [54]:
user_factors.show(10)

+----+--------------------+
|  id|            features|
+----+--------------------+
|1570|[0.07491307, -0.5...|
|1580|[-0.1162439, 0.43...|
|1590|[-1.0390608, 0.16...|
|1600|[-0.5776814, 0.12...|
|1610|[-0.7560675, 0.68...|
|1620|[-0.587719, -0.13...|
|1630|[-0.4968645, 0.08...|
|1640|[-0.5812046, 0.18...|
|1650|[-0.5737709, 0.22...|
|1660|[-0.19235167, 0.0...|
+----+--------------------+
only showing top 10 rows



In [55]:
print((user_factors.count(), len(user_factors.columns)))

(4464, 2)


In [56]:
item_factors = als_model.itemFactors

In [57]:
item_factors.show(10)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.21958895, 0.4...|
| 20|[-0.44277176, 0.4...|
| 30|[-0.17176531, 0.2...|
| 40|[0.07197746, 0.25...|
| 50|[-0.49478978, 0.0...|
| 60|[-0.19229119, 0.0...|
| 70|[-0.11067993, 0.2...|
| 80|[-0.58575565, 0.0...|
| 90|[-0.29340154, 0.0...|
|100|[-0.20213532, 0.3...|
+---+--------------------+
only showing top 10 rows



In [58]:
print((item_factors.count(), len(item_factors.columns)))

(3576, 2)


In [59]:
user_1570_row = user_factors[user_factors['id'] == 1570].first()

In [60]:
# note - Alex and I got significantly different values for the below
user_1570_row

Row(id=1570, features=[0.07491306960582733, -0.5896221399307251, 1.1276379823684692, -0.3849954903125763, 1.1354124546051025, -0.19439247250556946, 0.5728875994682312, -0.7282409071922302, 0.26278382539749146, -1.548317790031433])

In [61]:
user_1570_factors = np.array(user_1570_row['features'])

In [62]:
user_1570_factors

array([ 0.07491307, -0.58962214,  1.12763798, -0.38499549,  1.13541245,
       -0.19439247,  0.5728876 , -0.72824091,  0.26278383, -1.54831779])

In [63]:
toy_story_row = item_factors[item_factors['id'] == 1].first()
toy_story_factors = np.array(toy_story_row['features'])

In [64]:
toy_story_row

Row(id=1, features=[-0.5367518067359924, 0.04210188239812851, 0.5631994009017944, 0.25970175862312317, 0.8728556036949158, 0.0016247003804892302, 0.5866580605506897, -0.9223973155021667, -0.22879548370838165, -0.9024826884269714])

In [65]:
toy_story_factors

array([-0.53675181,  0.04210188,  0.5631994 ,  0.25970176,  0.8728556 ,
        0.0016247 ,  0.58665806, -0.92239732, -0.22879548, -0.90248269])

In [66]:
user_1570_factors

array([ 0.07491307, -0.58962214,  1.12763798, -0.38499549,  1.13541245,
       -0.19439247,  0.5728876 , -0.72824091,  0.26278383, -1.54831779])

In [72]:
toy_story_factors

array([-0.53675181,  0.04210188,  0.5631994 ,  0.25970176,  0.8728556 ,
        0.0016247 ,  0.58665806, -0.92239732, -0.22879548, -0.90248269])

In [68]:
user_1570_factors @ toy_story_factors

3.805825231832004

In [69]:
# predictions.sort(predictions.timestamp.asc()).show(10)

In [70]:
# ratings.where(col('rating').isNull())

In [73]:
user_1570_preds = preds_test[preds_test['user_id'] == 1570]

In [74]:
user_1570_preds.sort('movie_id').show()

+--------+------+-------------------+-------+----------+
|movie_id|rating|          timestamp|user_id|prediction|
+--------+------+-------------------+-------+----------+
|     164|     4|2000-11-20 00:33:53|   1570|  3.909648|
|     270|     2|2000-11-20 00:29:52|   1570| 2.2307768|
|     296|     5|2000-11-20 00:31:37|   1570|  4.317787|
|     306|     4|2000-11-20 00:31:37|   1570| 4.4758215|
|     350|     2|2000-11-20 00:35:20|   1570| 2.3341775|
|     517|     3|2000-11-20 00:36:06|   1570| 2.1073813|
|     541|     4|2000-11-20 00:38:01|   1570|  4.431365|
|     648|     3|2000-11-20 00:36:06|   1570| 2.4821692|
|     800|     5|2000-11-20 00:32:54|   1570| 4.3803797|
|     858|     5|2000-11-20 00:29:52|   1570| 4.3533316|
|     903|     4|2000-11-20 00:32:38|   1570| 4.2394323|
|     904|     5|2000-11-20 00:32:38|   1570|  4.351304|
|     919|     4|2000-11-20 00:31:11|   1570| 3.8894572|
|     922|     4|2000-11-20 00:38:01|   1570| 4.6088367|
|     923|     5|2000-11-20 00:

In [75]:
# produces 
recs = als_model.recommendForAllUsers(numItems=10)
recs.persist()

DataFrame[user_id: int, recommendations: array<struct<movie_id:int,rating:float>>]

In [76]:
# returns list of lists
recs.sort(recs.user_id.asc()).show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|   1570|[[2192, 5.718519]...|
|   1571|[[572, 5.5701656]...|
|   1572|[[3523, 4.8673015...|
|   1573|[[2197, 5.4305997...|
|   1574|[[572, 6.227351],...|
|   1575|[[2192, 5.661892]...|
|   1576|[[572, 5.4530873]...|
|   1577|[[3306, 5.266887]...|
|   1578|[[3338, 5.2708817...|
|   1579|[[751, 4.683699],...|
|   1580|[[3338, 4.4096313...|
|   1582|[[3523, 6.149501]...|
|   1583|[[1361, 5.14452],...|
|   1584|[[751, 4.6355004]...|
|   1585|[[572, 4.6860504]...|
|   1586|[[572, 5.6415596]...|
|   1587|[[2309, 4.5818915...|
|   1588|[[3140, 5.468425]...|
|   1589|[[3338, 5.107899]...|
|   1590|[[3338, 5.1533346...|
+-------+--------------------+
only showing top 20 rows



In [77]:
# normalized or standard_scalar, row-wise, normalize per movie

In [80]:
# recs[recs['user_id']==10].first()['recommendations']

TypeError: 'NoneType' object is not subscriptable

In [81]:
# !grep 3086 < data/movies.csv

In [82]:
preds_requests = als_model.transform(requests)

In [83]:
preds_requests.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint, prediction: float]

In [84]:
preds_requests.where(preds_requests['prediction'].isNotNull()).show()

+--------+------+-------------+-------+----------+
|movie_id|rating|    timestamp|user_id|prediction|
+--------+------+-------------+-------+----------+
|     148|   NaN| 9.77959026E8|     53|       NaN|
|     148|   NaN| 9.76559602E8|   4169| 3.1917603|
|     148|   NaN| 9.89024856E8|   5333| 2.4790075|
|     148|   NaN| 9.77005381E8|   4387|  2.254983|
|     148|   NaN| 9.66907208E8|   3539| 2.7710366|
|     148|   NaN| 9.76266538E8|    840|       NaN|
|     148|   NaN| 9.76841639E8|    216|       NaN|
|     148|   NaN| 9.76191154E8|    482|       NaN|
|     148|   NaN|1.029283935E9|    752|       NaN|
|     148|   NaN|1.026978024E9|    424|       NaN|
|     148|   NaN| 9.74150193E8|   2456|  2.615888|
|     148|   NaN|  9.7014489E8|   3053| 2.8342977|
|     463|   NaN| 9.80596453E8|    970|       NaN|
|     463|   NaN| 9.76560887E8|   4169| 2.4588823|
|     463|   NaN| 9.78242788E8|     26|       NaN|
|     463|   NaN| 9.76395651E8|    319|       NaN|
|     463|   NaN| 9.76907712E8|

In [85]:
preds_requests.sort(preds_requests.timestamp.asc()).show(10)

+--------+------+------------+-------+----------+
|movie_id|rating|   timestamp|user_id|prediction|
+--------+------+------------+-------+----------+
|    2019|   NaN|9.56678777E8|   6040| 4.2240205|
|     759|   NaN|9.56679248E8|   6040| 3.7661352|
|    2858|   NaN|9.56679275E8|   6040| 3.8562121|
|     246|   NaN|9.56679413E8|   6040| 3.9449675|
|    1617|   NaN|9.56679473E8|   6040| 3.9109821|
|    2324|   NaN|9.56679629E8|   6040| 3.5924335|
|    1089|   NaN|9.56679796E8|   6040| 3.7035742|
|    2804|   NaN|9.56680123E8|   6039| 3.9610476|
|     933|   NaN| 9.5668027E8|   6039| 3.9462368|
|    1304|   NaN|9.56680308E8|   6039| 4.0060763|
+--------+------+------------+-------+----------+
only showing top 10 rows



In [86]:
print((preds_requests.count(), len(preds_requests.columns)))

(280260, 5)
