In [None]:
import pyspark
import numpy as np
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import functions, types
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
spark = (pyspark.sql.SparkSession.builder 
  .master("local[*]")
  .getOrCreate())

In [49]:
path_users = 'data/users.dat'
users = (spark.read.load("data/users.dat",
                         format="csv", sep=":", inferSchema="true")
                        .drop('_c1', '_c3', '_c5', '_c7'))

In [50]:
users = (users.withColumnRenamed(users.schema.names[0], 'userID')
                        .withColumnRenamed(users.schema.names[1], 'gender')
                        .withColumnRenamed(users.schema.names[2], 'age')
                        .withColumnRenamed(users.schema.names[3], 'occupation')
                        .withColumnRenamed(users.schema.names[4], 'zip'))

In [51]:
users.schema.names

['userID', 'gender', 'age', 'occupation', 'zip']

In [52]:
print((users.count(), len(users.columns)))

(6040, 5)


In [53]:
users.show(5)

+------+------+---+----------+-----+
|userID|gender|age|occupation|  zip|
+------+------+---+----------+-----+
|     1|     F|  1|        10|48067|
|     2|     M| 56|        16|70072|
|     3|     M| 25|        15|55117|
|     4|     M| 45|         7|02460|
|     5|     M| 25|        20|55455|
+------+------+---+----------+-----+
only showing top 5 rows



In [54]:
# read in the dataset into pyspark DataFrame
path_ratings = 'data/ratings.json'
ratings = spark.read.json(path_ratings)

In [55]:
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [56]:
print((ratings.count(), len(ratings.columns)))

(719949, 4)


In [57]:
ratings.schema.names

['movie_id', 'rating', 'timestamp', 'user_id']

In [58]:
ratings.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [59]:
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [60]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

#### inspect requests dataset

In [61]:
requests = spark.read.json('data/requests.json')
requests.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint]

In [62]:
requests.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: double (nullable = true)
 |-- user_id: long (nullable = true)



In [63]:
requests.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|    2019|   NaN|9.56678777E8|   6040|
|     759|   NaN|9.56679248E8|   6040|
|    2858|   NaN|9.56679275E8|   6040|
|     246|   NaN|9.56679413E8|   6040|
|    1617|   NaN|9.56679473E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [64]:
print((requests.count(), len(requests.columns)))

(280260, 4)


### transforming timestamp column of ratings

In [65]:
# convert format of datetime column 'timestamp' from epoch to standard 
ratings = (ratings.withColumn('timestamp',
                    functions.date_format(ratings.timestamp.cast(dataType=types.TimestampType()),
                    "yyyy-MM-dd HH:mm:ss")))

In [66]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [67]:
ratings.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [68]:
ratings = ratings.sort(ratings.timestamp.asc())

In [69]:
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [70]:
ratings.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [71]:
print((ratings.count(), len(users.columns)))

(719949, 5)


In [72]:
719949*.8

575959.2000000001

In [73]:
719949 *.2

143989.80000000002

In [74]:
# Sort by index and get first 4000 rows
ratings_train = ratings.sort(ratings.timestamp.asc()).limit(575959)

In [75]:
ratings_train.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [76]:
ratings_train.show(5)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
+--------+------+-------------------+-------+
only showing top 5 rows



In [77]:
print((ratings_train.count(), len(ratings_train.columns)))

(575959, 4)


In [78]:
ratings_train.sort(ratings_train.timestamp.asc()).show(10)

+--------+------+-------------------+-------+
|movie_id|rating|          timestamp|user_id|
+--------+------+-------------------+-------+
|     858|     4|2000-04-25 09:05:32|   6040|
|    2384|     4|2000-04-25 09:05:54|   6040|
|     593|     5|2000-04-25 09:05:54|   6040|
|    1961|     4|2000-04-25 09:06:17|   6040|
|    1419|     3|2000-04-25 09:07:36|   6040|
|     213|     5|2000-04-25 09:07:36|   6040|
|    3111|     5|2000-04-25 09:07:36|   6040|
|     573|     4|2000-04-25 09:07:36|   6040|
|    3505|     4|2000-04-25 09:07:36|   6040|
|    1734|     2|2000-04-25 09:08:01|   6040|
+--------+------+-------------------+-------+
only showing top 10 rows



In [79]:
ratings_test = ratings.subtract(ratings_train)

In [80]:
ratings_test.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: string, user_id: bigint]

In [None]:
print((ratings_test.count(), len(ratings_test.columns)))

### model

In [None]:
# build recommendation model using ALS on the training data
als = ALS(
    rank=10,
    maxIter=10,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
)

In [None]:
# fit the ALS model on training set

als_model = als.fit(ratings_train)

### predict on ratings_test with fitted model

In [None]:
# generate predictions with your model for the test set by using the transform method on your ALS model
preds_test = als_model.transform(ratings_test)
preds_test.persist()

In [None]:
# evaluate your model and print out the RMSE from your test set

In [None]:
preds_test.sort(preds_test.timestamp.asc()).show(10)

In [45]:
preds_test.where(preds_test['prediction'].isNotNull()).show()

+--------+------+-------------------+-------+----------+
|movie_id|rating|          timestamp|user_id|prediction|
+--------+------+-------------------+-------+----------+
|     148|     5|2000-11-30 05:47:04|    673|       NaN|
|     148|     3|2000-11-22 00:19:36|   1242|       NaN|
|     148|     2|2000-11-22 10:05:35|   1069|       NaN|
|     148|     2|2000-11-22 05:57:01|   1605| 1.8003508|
|     148|     2|2000-11-21 14:38:26|   1150|       NaN|
|     463|     3|2000-12-01 01:03:09|    660|       NaN|
|     463|     2|2000-11-22 10:02:18|   1069|       NaN|
|     463|     2|2000-11-22 08:33:30|   1146|       NaN|
|     463|     1|2000-11-28 12:05:54|    746|       NaN|
|     463|     2|2000-11-26 01:52:43|   1980|  2.339059|
|     471|     5|2000-11-23 02:51:07|   1395|       NaN|
|     471|     4|2000-11-20 14:30:07|   1303|       NaN|
|     471|     3|2000-11-22 03:09:41|   1199|       NaN|
|     471|     3|2000-11-20 08:03:22|   1404|       NaN|
|     471|     5|2000-11-20 06:

In [None]:
# Evaluate the model by computing the RMSE on the test data
# preds_test = als_model.transform(ratings_test)

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [None]:
rmse = evaluator.evaluate(preds_test)

In [None]:
rmse

In [None]:
# inspect user_factors and item_factors

In [None]:
user_factors = als_model.userFactors

In [None]:
user_factors.sort(user_factors.id.asc()).show(10)

In [None]:
user_factors.show(10)

In [None]:
print((user_factors.count(), len(user_factors.columns)))

In [None]:
item_factors = als_model.itemFactors

In [None]:
item_factors.show(10)

In [None]:
print((item_factors.count(), len(item_factors.columns)))

In [None]:
user_1570_row = user_factors[user_factors['id'] == 1570].first()

In [None]:
# note - Alex and I got significantly different values for the below
user_1570_row

In [None]:
user_1570_factors = np.array(user_1570_row['features'])

In [None]:
user_1570_factors

In [None]:
toy_story_row = item_factors[item_factors['id'] == 1].first()
toy_story_factors = np.array(toy_story_row['features'])

In [None]:
toy_story_row

In [None]:
toy_story_factors

In [None]:
user_1570_factors

In [None]:
m_factors

In [None]:
user_1570_factors @ toy_story_factors

In [None]:
# predictions.sort(predictions.timestamp.asc()).show(10)

In [None]:
# ratings.where(col('rating').isNull())

In [None]:
user_1570_preds = predictions[predictions['user_id'] == 1570]

In [None]:
user_1570_preds.sort('movie_id').show()

In [None]:
# produces 
recs = als_model.recommendForAllUsers(numItems=10)
recs.persist()

In [None]:
# returns list of lists
recs.sort(recs.user_id.asc()).show()

In [None]:
# normalized or standard_scalar, row-wise, normalize per movie

In [None]:
recs[recs['userId']==10].first()['recommendations']

In [None]:
# !grep 3086 < data/movies.csv

In [None]:
preds_requests = als_model.transform(requests)

In [None]:
preds_requests.persist()

In [None]:
preds_requests.sort(preds_requests.timestamp.asc()).show(10)

In [None]:
print((preds_requests.count(), len(preds_requests.columns)))