In [3]:
import pyspark
import numpy as np
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import functions, types
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
spark = (pyspark.sql.SparkSession.builder 
  .master("local[*]")
  .getOrCreate())

In [5]:
path_users = 'data/users.dat'
users = (spark.read.load("data/users.dat",
                         format="csv", sep=":", inferSchema="true")
                        .drop('_c1', '_c3', '_c5', '_c7'))

In [6]:
users = (users.withColumnRenamed(users.schema.names[0], 'userID')
                        .withColumnRenamed(users.schema.names[1], 'gender')
                        .withColumnRenamed(users.schema.names[2], 'age')
                        .withColumnRenamed(users.schema.names[3], 'occupation')
                        .withColumnRenamed(users.schema.names[4], 'zip'))

In [7]:
users.schema.names

['userID', 'gender', 'age', 'occupation', 'zip']

In [8]:
print((users.count(), len(users.columns)))

(6040, 5)


In [9]:
users.show(5)

+------+------+---+----------+-----+
|userID|gender|age|occupation|  zip|
+------+------+---+----------+-----+
|     1|     F|  1|        10|48067|
|     2|     M| 56|        16|70072|
|     3|     M| 25|        15|55117|
|     4|     M| 45|         7|02460|
|     5|     M| 25|        20|55455|
+------+------+---+----------+-----+
only showing top 5 rows



In [10]:
# read in the dataset into pyspark DataFrame
path_ratings = 'data/ratings.json'
ratings = spark.read.json(path_ratings)

In [11]:
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [12]:
print((ratings.count(), len(ratings.columns)))

(719949, 4)


In [13]:
ratings.schema.names

['movie_id', 'rating', 'timestamp', 'user_id']

In [14]:
ratings.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [15]:
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [16]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

#### inspect requests dataset

In [17]:
requests = spark.read.json('data/requests.json')
requests.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint]

In [18]:
requests.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [19]:
requests.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|    2019|   NaN|9.56678777E8|   6040|
|     759|   NaN|9.56679248E8|   6040|
|    2858|   NaN|9.56679275E8|   6040|
|     246|   NaN|9.56679413E8|   6040|
|    1617|   NaN|9.56679473E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [20]:
print((requests.count(), len(requests.columns)))

(280260, 4)


### transforming timestamp column of ratings

In [21]:
# convert format of datetime column 'timestamp' from epoch to standard 
ratings = (ratings.withColumn('timestamp',
                    functions.date_format(ratings.timestamp.cast(dataType=types.TimestampType()),
                    "yyyy-MM-dd HH:mm:ss")))

In [22]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [23]:
ratings.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [24]:
ratings = ratings.sort(ratings.timestamp.asc())

In [25]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [26]:
ratings.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [27]:
print((ratings.count(), len(users.columns)))

(719949, 5)


In [28]:
719949*.8

575959.2000000001

In [29]:
719949 *.2

143989.80000000002

In [30]:
# Sort by index and get first 4000 rows
ratings_train = ratings.sort(ratings.timestamp.asc()).limit(575959)

In [31]:
ratings_train.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [32]:
ratings_train.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [33]:
print((ratings_train.count(), len(ratings_train.columns)))

(575959, 4)


In [34]:
ratings_train.sort(ratings_train.timestamp.asc()).show(10)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
|     213|     5|2000-04-25 09:07:36|   6040|
|    3111|     5|2000-04-25 09:07:36|   6040|
|     573|     4|2000-04-25 09:07:36|   6040|
|    3505|     4|2000-04-25 09:07:36|   6040|
|    1734|     2|2000-04-25 09:08:01|   6040|
+--------+------+-------------------+-------+
only showing top 10 rows



In [35]:
ratings_test = ratings.subtract(ratings_train)

In [36]:
ratings_test.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [37]:
print((ratings_test.count(), len(ratings_test.columns)))

(143990, 4)


### model

In [38]:
# build recommendation model using ALS on the training data
als = ALS(
    rank=10,
    maxIter=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
)

In [39]:
# fit the ALS model on training set

als_model = als.fit(ratings_train)

### predict on ratings_test with fitted model

In [40]:
# generate predictions with your model for the test set by using the transform method on your ALS model
preds_test = als_model.transform(ratings_test)
preds_test.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint, prediction: float]

In [41]:
# evaluate your model and print out the RMSE from your test set

In [42]:
preds_test.sort(preds_test.timestamp.asc()).show(10)

+--------+------+-------------------+-------+----------+
|movie_id|rating|          timestamp|user_id|prediction|
+--------+------+-------------------+-------+----------+
|    3354|     3|2000-11-20 00:28:20|   1577| 3.2429955|
|     377|     4|2000-11-20 00:28:25|   1579| 3.0146024|
|    2571|     3|2000-11-20 00:28:25|   1579| 3.7643135|
|       6|     3|2000-11-20 00:28:25|   1579|  3.515823|
|    2427|     2|2000-11-20 00:28:25|   1579|   3.38514|
|    2396|     3|2000-11-20 00:28:32|   1574|  3.962767|
|    1270|     4|2000-11-20 00:28:32|   1574| 4.6968465|
|    1276|     4|2000-11-20 00:28:32|   1592| 3.9603846|
|    1079|     4|2000-11-20 00:28:32|   1574| 3.7616415|
|    3685|     2|2000-11-20 00:28:33|   1576| 3.1399944|
+--------+------+-------------------+-------+----------+
only showing top 10 rows



In [43]:
preds_test.where(preds_test['prediction'].isNotNull()).show()

+--------+------+-------------------+-------+----------+
|movie_id|rating|          timestamp|user_id|prediction|
+--------+------+-------------------+-------+----------+
|     148|     5|2000-11-30 05:47:04|    673|       NaN|
|     148|     3|2000-11-22 00:19:36|   1242|       NaN|
|     148|     2|2000-11-22 10:05:35|   1069|       NaN|
|     148|     2|2000-11-22 05:57:01|   1605| 1.9305981|
|     148|     2|2000-11-21 14:38:26|   1150|       NaN|
|     463|     3|2000-12-01 01:03:09|    660|       NaN|
|     463|     2|2000-11-22 10:02:18|   1069|       NaN|
|     463|     2|2000-11-22 08:33:30|   1146|       NaN|
|     463|     1|2000-11-28 12:05:54|    746|       NaN|
|     463|     2|2000-11-26 01:52:43|   1980| 2.3886154|
|     471|     5|2000-11-23 02:51:07|   1395|       NaN|
|     471|     4|2000-11-20 14:30:07|   1303|       NaN|
|     471|     3|2000-11-22 03:09:41|   1199|       NaN|
|     471|     3|2000-11-20 08:03:22|   1404|       NaN|
|     471|     5|2000-11-20 06:

In [44]:
# Evaluate the model by computing the RMSE on the test data
# preds_test = als_model.transform(ratings_test)

In [45]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [46]:
rmse = evaluator.evaluate(preds_test)

In [47]:
rmse

nan

In [48]:
# inspect user_factors and item_factors

In [49]:
user_factors = als_model.userFactors

In [50]:
user_factors.sort(user_factors.id.asc()).show(10)

+----+--------------------+
|  id|            features|
+----+--------------------+
|1570|[-0.098242715, -1...|
|1571|[-0.27003378, -1....|
|1572|[-0.42812914, -0....|
|1573|[-1.1849321, -1.3...|
|1574|[0.011200594, -0....|
|1575|[-0.05512247, -0....|
|1576|[0.38811678, -1.2...|
|1577|[-0.8838546, -0.9...|
|1578|[-0.5934041, -1.7...|
|1579|[-0.30370015, -1....|
+----+--------------------+
only showing top 10 rows



In [51]:
user_factors.show(10)

+----+--------------------+
|  id|            features|
+----+--------------------+
|1570|[-0.098242715, -1...|
|1580|[-0.5938925, -0.8...|
|1590|[-0.64532673, -1....|
|1600|[-0.22160304, -1....|
|1610|[-0.83903724, -1....|
|1620|[-0.36547416, -1....|
|1630|[-0.390614, -0.92...|
|1640|[-0.39971396, -1....|
|1650|[-0.09299978, -1....|
|1660|[-0.15269476, -1....|
+----+--------------------+
only showing top 10 rows



In [52]:
print((user_factors.count(), len(user_factors.columns)))

(4464, 2)


In [53]:
item_factors = als_model.itemFactors

In [54]:
item_factors.show(10)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.54918313, -0....|
| 20|[-0.69458765, -0....|
| 30|[-0.156775, -0.87...|
| 40|[-0.06158027, -0....|
| 50|[-0.36162597, -1....|
| 60|[-0.16453812, -0....|
| 70|[-0.6149566, -0.4...|
| 80|[-0.3703191, -0.7...|
| 90|[-0.32752094, -0....|
|100|[-0.67923605, -0....|
+---+--------------------+
only showing top 10 rows



In [55]:
print((item_factors.count(), len(item_factors.columns)))

(3576, 2)


In [56]:
user_1570_row = user_factors[user_factors['id'] == 1570].first()

In [57]:
# note - Alex and I got significantly different values for the below
user_1570_row

Row(id=1570, features=[-0.09824271500110626, -1.124036431312561, 1.4023230075836182, 1.017096757888794, 0.3638903796672821, 0.004380527418106794, -0.9986612200737, -0.5456803441047668, 0.3612426221370697, 0.7719910740852356])

In [58]:
user_1570_factors = np.array(user_1570_row['features'])

In [59]:
user_1570_factors

array([-0.09824272, -1.12403643,  1.40232301,  1.01709676,  0.36389038,
        0.00438053, -0.99866122, -0.54568034,  0.36124262,  0.77199107])

In [60]:
toy_story_row = item_factors[item_factors['id'] == 1].first()
toy_story_factors = np.array(toy_story_row['features'])

In [61]:
toy_story_row

Row(id=1, features=[0.08965704590082169, -1.0805866718292236, 0.42876312136650085, 0.531302809715271, -0.11097260564565659, -0.5822567939758301, -1.01619553565979, -0.4469119608402252, 0.4968760907649994, 0.15137770771980286])

In [62]:
toy_story_factors

array([ 0.08965705, -1.08058667,  0.42876312,  0.53130281, -0.11097261,
       -0.58225679, -1.01619554, -0.44691196,  0.49687609,  0.15137771])

In [63]:
user_1570_factors

array([-0.09824272, -1.12403643,  1.40232301,  1.01709676,  0.36389038,
        0.00438053, -0.99866122, -0.54568034,  0.36124262,  0.77199107])

In [64]:
m_factors

NameError: name 'm_factors' is not defined

In [None]:
user_1570_factors @ toy_story_factors

In [None]:
# predictions.sort(predictions.timestamp.asc()).show(10)

In [None]:
# ratings.where(col('rating').isNull())

In [None]:
user_1570_preds = predictions[predictions['user_id'] == 1570]

In [None]:
user_1570_preds.sort('movie_id').show()

In [None]:
# produces 
recs = als_model.recommendForAllUsers(numItems=10)
recs.persist()

In [None]:
# returns list of lists
recs.sort(recs.user_id.asc()).show()

In [None]:
# normalized or standard_scalar, row-wise, normalize per movie

In [None]:
recs[recs['userId']==10].first()['recommendations']

In [None]:
# !grep 3086 < data/movies.csv

In [None]:
preds_requests = als_model.transform(requests)

In [None]:
preds_requests.persist()

In [None]:
preds_requests.sort(preds_requests.timestamp.asc()).show(10)

In [None]:
print((preds_requests.count(), len(preds_requests.columns)))