### Exploring the ALS algorithm

In [92]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
import pyspark.sql.functions as F

from pyspark.ml.recommendation import ALS #, Rating #, MatrixFactorizationModel

In [18]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

import matplotlib.pyplot as plt
plt.rc('figure', figsize=(18, 4))

import seaborn as sns
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})

np.set_printoptions(precision=4, suppress=True)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [19]:
# setting random seed for notebook reproducability
rnd_seed=42
np.random.seed=rnd_seed
np.random.set_state=rnd_seed

## 2. Creating the Spark Session

In [20]:
os.environ['SPARK_HOME']

KeyError: 'SPARK_HOME'

In [21]:
spark = SparkSession.builder.master("local[2]").appName("movie-recommender").getOrCreate()
spark

In [22]:
sc = spark.sparkContext
sc

In [23]:
sqlContext = SQLContext(spark.sparkContext)
sqlContext

<pyspark.sql.context.SQLContext at 0x7fd44da19710>

#### Loading the rating data set

In [32]:
schema = StructType([
    StructField("user_id", IntegerType(), nullable=False),
    StructField("movie_id", IntegerType(), nullable=False),
    StructField("rating", IntegerType(), nullable=False),
    StructField("timestamp", LongType(), nullable=False)]
  )

In [38]:
ratings_df = spark.read.csv("data/ml-100k/u.data", sep='\t', schema=schema)
ratings_df.first()

Row(user_id=196, movie_id=242, rating=3, timestamp=881250949)

In [39]:
# Load and parse the data
#ratings = rating_data.rdd.map(lambda record: Rating(int(record[0]), int(record[1]), float(record[2]))).toDF()

In [40]:
ratings_df.show()

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
|    298|     474|     4|884182806|
|    115|     265|     2|881171488|
|    253|     465|     5|891628467|
|    305|     451|     3|886324817|
|      6|      86|     3|883603013|
|     62|     257|     2|879372434|
|    286|    1014|     5|879781125|
|    200|     222|     5|876042340|
|    210|      40|     3|891035994|
|    224|      29|     3|888104457|
|    303|     785|     3|879485318|
|    122|     387|     5|879270459|
|    194|     274|     2|879539794|
|    291|    1042|     4|874834944|
|    234|    1184|     2|892079237|
+-------+--------+------+---------+
only showing top 20 rows



In [41]:
# Build the recommendation model using Alternating Least Squares
#rank = 10
#iterations = 10
#lambda_ = 0.01

In [42]:
als = ALS(rank=10, maxIter=10, regParam=0.01, userCol='user_id', itemCol='movie_id', ratingCol='rating', seed=rnd_seed)

In [43]:
model = als.fit(ratings_df)

In [60]:
model.userFactors.show(10, 100)

+---+----------------------------------------------------------------------------------------------------+
| id|                                                                                            features|
+---+----------------------------------------------------------------------------------------------------+
| 10|[-0.54010206, -0.84998554, 0.31779268, -0.2886768, 0.23462527, 0.5715468, 0.46049356, 0.83112156,...|
| 20|[-1.033407, -0.2879485, 1.2173023, -1.8964379, 0.22374271, 0.99104583, -0.50815064, 0.53431267, 0...|
| 30|[-0.6141571, -1.2998686, -0.35556945, -0.49162617, 0.2718643, 0.38572592, 0.5292643, 0.8750414, 1...|
| 40|[-0.97532904, -0.34082136, 0.7854108, -0.2649228, -0.3352775, 0.0044832723, 0.17719568, 0.6497875...|
| 50|[-0.49595955, 1.5031621, 1.1059343, -1.40197, 1.6300769, 0.5451597, 0.93870777, -0.42847547, 1.67...|
| 60|[-0.5351085, -1.138615, -0.0115808295, -0.6367293, 0.10694349, 0.6845996, 0.12883754, 0.8526163, ...|
| 70|[-0.2827961, -0.6142456, 0.52760

In [61]:
model.itemFactors.show(10, 100)

+---+----------------------------------------------------------------------------------------------------+
| id|                                                                                            features|
+---+----------------------------------------------------------------------------------------------------+
| 10|[-0.69717366, -1.2098774, -0.7123832, -0.82355976, 0.79044044, 1.1545237, 1.1774782, 0.476662, 0....|
| 20|[-0.37784293, -1.2001355, 0.68151534, -0.05149225, -0.25763956, 1.4960907, 1.7991132, 0.2601949, ...|
| 30|[-0.85375524, -1.4291214, -0.12453982, -0.010670745, 1.2250174, 0.2534764, 1.3009347, 1.0005549, ...|
| 40|[-0.91000855, -1.0365245, 0.22827172, -0.8106577, 0.9263951, 1.3194665, -0.25297254, 0.07458614, ...|
| 50|[-0.7174377, -0.80812925, 0.22620797, -0.6710367, -0.4492444, 1.1530977, 0.50407505, 1.2884802, 0...|
| 60|[-0.16236, -1.6731288, -0.5522944, -0.30527198, 0.2654121, 1.7868787, 0.83437276, 0.33115077, 1.4...|
| 70|[-0.3798019, -1.1385587, 0.45205

In [44]:
model.rank

10

In [46]:
test_df = ratings_df.select('user_id', 'movie_id').sample(fraction=0.01, seed=rnd_seed).limit(5)

In [47]:
test_df.show()

+-------+--------+
|user_id|movie_id|
+-------+--------+
|    115|     265|
|     90|     382|
|     90|     648|
|    184|     187|
|    307|     174|
+-------+--------+



In [63]:
predictions = model.transform(test_df)

In [64]:
predictions.show()

+-------+--------+----------+
|user_id|movie_id|prediction|
+-------+--------+----------+
|     90|     648|  4.927757|
|     90|     382| 4.3809342|
|    115|     265| 3.3165402|
|    307|     174| 4.1435966|
|    184|     187|  3.857808|
+-------+--------+----------+



In [68]:
(predictions
 .join(
     ratings_df, on=['user_id', 'movie_id'], 
     how='inner').show())

+-------+--------+----------+------+---------+
|user_id|movie_id|prediction|rating|timestamp|
+-------+--------+----------+------+---------+
|     90|     648|  4.927757|     4|891384754|
|     90|     382| 4.3809342|     5|891383835|
|    115|     265| 3.3165402|     2|881171488|
|    307|     174| 4.1435966|     4|879283480|
|    184|     187|  3.857808|     4|889909024|
+-------+--------+----------+------+---------+



### Top 3 Movie Recommendations for All Users

In [69]:
# top 3 movies recommended for each user, for all users.
user_recs = model.recommendForAllUsers(3)

In [73]:
user_recs.show(truncate=False)

+-------+--------------------------------------------------------+
|user_id|recommendations                                         |
+-------+--------------------------------------------------------+
|471    |[[1286, 12.138259], [1157, 9.89244], [909, 9.815423]]   |
|463    |[[1159, 7.4255023], [1473, 6.0584435], [1269, 5.988885]]|
|833    |[[1368, 6.954572], [1205, 6.66879], [390, 6.251658]]    |
|496    |[[1288, 10.510149], [1120, 9.43979], [267, 8.924306]]   |
|148    |[[1314, 11.984594], [700, 11.881374], [1615, 11.267052]]|
|540    |[[1643, 5.667913], [1449, 5.588215], [1463, 5.3312726]] |
|392    |[[1121, 6.8301516], [1367, 6.3946433], [1643, 6.308919]]|
|243    |[[1643, 5.6771665], [1512, 5.649006], [1131, 5.055449]] |
|623    |[[1120, 6.458686], [574, 6.229914], [1286, 6.126714]]   |
|737    |[[695, 8.843273], [1005, 7.6665554], [516, 7.2666683]]  |
|897    |[[914, 7.284672], [624, 6.965919], [1446, 6.859709]]    |
|858    |[[1643, 7.6432023], [1438, 7.2011805], [1242, 7.11601

In [87]:
user_recs.where(user_recs.user_id == 1).select("recommendations.movie_id", "recommendations.rating").collect()

[Row(movie_id=[1434, 793, 1286], rating=[6.207646369934082, 6.116008758544922, 6.007221698760986])]

In [99]:
user_recs.where(user_recs.user_id == 1).select("user_id", "recommendations.movie_id", "recommendations.rating").show(truncate=False)

+-------+-----------------+---------------------------------+
|user_id|movie_id         |rating                           |
+-------+-----------------+---------------------------------+
|1      |[1434, 793, 1286]|[6.2076464, 6.1160088, 6.0072217]|
+-------+-----------------+---------------------------------+



In [105]:
user_recs_one = user_recs.where(user_recs.user_id == 1)

In [119]:
x = user_recs_one.select("user_id", F.explode(col("recommendations.movie_id")).alias("movie_id"))
x.show()

+-------+--------+
|user_id|movie_id|
+-------+--------+
|      1|    1434|
|      1|     793|
|      1|    1286|
+-------+--------+



In [120]:
y = user_recs_one.select("user_id", F.explode(col("recommendations.rating")).alias("rating"))
y.show()

+-------+---------+
|user_id|   rating|
+-------+---------+
|      1|6.2076464|
|      1|6.1160088|
|      1|6.0072217|
+-------+---------+



In [128]:
x.join(y, on='user_id', how='inner').show()

+-------+--------+---------+
|user_id|movie_id|   rating|
+-------+--------+---------+
|      1|    1434|6.2076464|
|      1|    1434|6.1160088|
|      1|    1434|6.0072217|
|      1|     793|6.2076464|
|      1|     793|6.1160088|
|      1|     793|6.0072217|
|      1|    1286|6.2076464|
|      1|    1286|6.1160088|
|      1|    1286|6.0072217|
+-------+--------+---------+



### Movie Recommendations for a Particular User

**Filter out the user we are interested from the set of all predictions:**

In [139]:
user_recs.where(user_recs.user_id == 2).show(truncate=False)

+-------+---------------------------------------------------------+
|user_id|recommendations                                          |
+-------+---------------------------------------------------------+
|2      |[[1286, 6.4112782], [1473, 6.2863865], [1126, 6.1219964]]|
+-------+---------------------------------------------------------+



**Create a subset user dataframe and invoke `recommendForUserSubset` on the model:**

In [134]:
user_subset = ratings_df.where(ratings_df.user_id == 2)

In [135]:
user_subset_recs = model.recommendForUserSubset(user_subset, 3)

In [138]:
user_subset_recs.select("recommendations.movie_id", "recommendations.rating").show(truncate=False)

+------------------+---------------------------------+
|movie_id          |rating                           |
+------------------+---------------------------------+
|[1286, 1473, 1126]|[6.4112782, 6.2863865, 6.1219964]|
+------------------+---------------------------------+



### Top 3 User Recommendations for All Movies

In [71]:
# top 3 users recommended for each movie, for all movies.
movie_recs = model.recommendForAllItems(3)

In [75]:
movie_recs.show(truncate=False)

+--------+------------------------------------------------------+
|movie_id|recommendations                                       |
+--------+------------------------------------------------------+
|1580    |[[36, 1.3616927], [38, 1.182214], [562, 1.1041416]]   |
|471     |[[688, 5.694274], [849, 5.287952], [810, 5.163946]]   |
|1591    |[[310, 7.459932], [651, 7.0653586], [148, 6.6757364]] |
|1342    |[[820, 5.078589], [153, 4.873718], [461, 4.4691515]]  |
|463     |[[219, 7.262149], [98, 6.8233185], [809, 6.5238543]]  |
|833     |[[9, 6.6418223], [261, 6.598943], [124, 6.5040503]]   |
|1645    |[[809, 6.4994106], [98, 6.410225], [366, 6.3061366]]  |
|496     |[[809, 8.542983], [36, 7.5537395], [61, 6.986885]]    |
|148     |[[688, 5.9158826], [507, 5.723487], [850, 5.718377]]  |
|1088    |[[366, 9.839123], [153, 9.565502], [228, 8.943895]]   |
|1238    |[[148, 6.5439024], [258, 6.339524], [443, 6.311713]]  |
|540     |[[212, 5.9102707], [258, 5.144593], [726, 5.0746555]] |
|1460    |

### Top 3 User Recommendations for a Particular Movie

**Filter out the movie we are interested from the set of all predictions:**

In [141]:
movie_recs.where(movie_recs.movie_id == 2).select("recommendations.user_id", "recommendations.rating").collect()

[Row(user_id=[39, 340, 688], rating=[5.865912437438965, 5.156042098999023, 5.124417304992676])]

**Create a subset movie dataframe and invoke `recommendForItemSubset` on the model:**

In [143]:
movie_subset = ratings_df.where(ratings_df.movie_id == 36)

In [144]:
movie_subset_recs = model.recommendForItemSubset(movie_subset, 3)

In [146]:
movie_subset_recs.select("recommendations.user_id", "recommendations.rating").show(truncate=False)

+---------------+------------------------------+
|user_id        |rating                        |
+---------------+------------------------------+
|[366, 153, 261]|[9.047767, 8.196543, 7.674849]|
+---------------+------------------------------+



In [147]:
spark.stop()

In [None]:
# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

In [None]:
# Let's get the predicted rating for movie 123 for user 789:
predictedRating = model.predict(789, 123)

In [None]:
predictedRating

In [None]:
# Let's generate the top 10 recommended items for user 789:
userId = 789
K = 10
topKRecs = model.recommendProducts(userId, K)

In [None]:
topKRecs

In [None]:
# Let's find the top 10 rated items by user 789:
topKRated = ratings.keyBy(lambda r: r.user).lookup(userId)
topKRated.sort(key = lambda r: r.rating, reverse=True)

In [None]:
topKRated = topKRated[:K]
topKRated

#### Loading the movie data set

In [None]:
movies = sc.textFile("data/ml-100k/u.item")
print movies.first()

In [None]:
titles = movies.map(lambda line: line.split("|")[:2]).map(lambda record: (int(record[0]), record[1])).collectAsMap()

In [None]:
titles[123]

In [None]:
# check the top recommended items
for rating in topKRecs:
    print (titles[rating.product], rating.rating)

In [None]:
# check the top rated items
for rating in topKRated:
    print (titles[rating.product], rating.rating)