### Exploring the ALS algorithm with MovieLens DataSet

In [1]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
import pyspark.sql.functions as F

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS #, Rating #, MatrixFactorizationModel

In [2]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

import matplotlib.pyplot as plt
plt.rc('figure', figsize=(18, 4))

import seaborn as sns
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})

np.set_printoptions(precision=4, suppress=True)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# setting random seed for notebook reproducability
rnd_seed=42
np.random.seed=rnd_seed
np.random.set_state=rnd_seed

## 2. Creating the Spark Session

In [4]:
#os.environ['SPARK_HOME']

In [5]:
spark = SparkSession.builder.master("local[2]").appName("movie-recommender").getOrCreate()
spark

In [6]:
sc = spark.sparkContext
sc

In [7]:
sqlContext = SQLContext(spark.sparkContext)
sqlContext

<pyspark.sql.context.SQLContext at 0x7efe4408b208>

## 3. Load The Data From a File Into a Dataframe

### Loading the Ratings Data Set

In [8]:
ratings_schema = StructType([
    StructField("user_id", ShortType(), nullable=False),
    StructField("movie_id", ShortType(), nullable=False),
    StructField("rating", ByteType(), nullable=False),
    StructField("timestamp", LongType(), nullable=False)]
  )

In [9]:
# load the dats set
ratings_df = spark.read.csv("data/ml-100k/u.data", sep='\t', schema=ratings_schema)
ratings_df = ratings_df.drop('timestamp')
ratings_df.first()

Row(user_id=196, movie_id=242, rating=3)

In [10]:
ratings_df.show(5)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|    196|     242|     3|
|    186|     302|     3|
|     22|     377|     1|
|    244|      51|     2|
|    166|     346|     1|
+-------+--------+------+
only showing top 5 rows



## 4. Split the data set into training and test set

In [11]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2], seed=rnd_seed)

In [12]:
train_df.cache()
test_df.cache()

DataFrame[user_id: smallint, movie_id: smallint, rating: tinyint]

DataFrame[user_id: smallint, movie_id: smallint, rating: tinyint]

## 5. Build the ALS Model

In [13]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(rank=10, maxIter=10, regParam=0.01, userCol='user_id', itemCol='movie_id', ratingCol='rating', coldStartStrategy="drop", seed=rnd_seed)

In [14]:
model = als.fit(train_df)

**Latent Factors for the Users:**

In [15]:
# How many latent factors for users
model.userFactors.show(10, 100)

+---+----------------------------------------------------------------------------------------------------+
| id|                                                                                            features|
+---+----------------------------------------------------------------------------------------------------+
| 10|[-0.39050436, -0.68759257, 0.42521605, -0.4053905, -0.2358239, 0.6315711, 0.5784358, 0.87188184, ...|
| 20|[-0.2737031, -1.242235, 0.20510903, -0.7996875, 0.6034063, 0.48831555, -1.1382004, 1.0760351, -0....|
| 30|[0.20306396, -1.1352254, 0.4857387, -0.40179467, -0.36963782, 0.13361134, 0.51082736, 1.236135, 0...|
| 40|[-1.2356386, -0.907456, -0.32608572, -0.90455693, -0.3631856, -0.3337034, -0.42267585, 0.06201465...|
| 50|[-0.320256, -0.95856935, 0.5764343, -0.5063094, 1.1949714, -0.26180014, 0.43938398, -1.1194192, 2...|
| 60|[-0.41627225, -0.79917467, 0.1304494, -0.5075391, -0.22121532, 0.8499223, 0.35839885, 0.62732285,...|
| 70|[-0.47078156, -0.60464656, 0.148

In [16]:
# How many users
model.userFactors.count()

943

**Latent Factors for the Movies:**

In [17]:
# How many latent factors for items
model.itemFactors.show(10, 100)

+---+----------------------------------------------------------------------------------------------------+
| id|                                                                                            features|
+---+----------------------------------------------------------------------------------------------------+
| 10|[-0.6240372, -0.7733903, -0.14778545, -1.4894197, 0.14048776, 1.3905952, 1.3593376, 0.75925183, 0...|
| 20|[0.06499194, -1.5283695, 0.8059704, -0.8520253, -1.043475, 2.0158541, 1.719008, -0.06771716, -0.1...|
| 30|[-0.14091678, -1.5833839, -0.7239627, -0.6747181, -0.020277992, 0.59907234, 1.0775168, 1.4524177,...|
| 40|[-0.9057549, -1.7316684, 0.6150692, 0.10144663, 0.6366782, 1.0053815, 0.684743, -0.15172155, 0.62...|
| 50|[-1.318344, -0.89823127, 0.012314563, -0.37916046, -0.019412791, 1.2713493, -0.027276067, 1.08483...|
| 60|[-0.7614841, -1.2726282, -0.9319344, -0.9241905, -0.2590239, 0.9237887, 2.05774, 0.62158406, 1.15...|
| 70|[-0.34923142, -1.2910129, 0.4178

In [18]:
# How many Movies
model.itemFactors.count()

1651

In [19]:
# How may Latent Factors?
model.rank

10

## 6. Generate Predictions

In [20]:
test_df.sample(fraction=0.01).limit(10).show(10)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|      5|     194|     4|
|      6|     284|     2|
|      7|     168|     5|
|     10|     447|     4|
|     13|     757|     3|
|     18|      91|     3|
|     20|     288|     1|
|     29|     748|     2|
|     31|     504|     5|
|     33|     323|     4|
+-------+--------+------+



In [21]:
movie_rating_preds = model.transform(test_df).cache()

In [22]:
movie_rating_preds.show(10)

+-------+--------+------+----------+
|user_id|movie_id|rating|prediction|
+-------+--------+------+----------+
|    251|     148|     2| 2.4135408|
|     26|     148|     3|   2.75505|
|     27|     148|     3|  4.452523|
|    606|     148|     3| 3.7467272|
|    916|     148|     2| 2.2659419|
|    190|     148|     4| 3.3019488|
|    297|     148|     3|  2.685518|
|     15|     148|     3| 3.2057416|
|    269|     148|     1| 0.6581736|
|    423|     148|     3| 2.8569834|
+-------+--------+------+----------+
only showing top 10 rows



In [23]:
movie_rating_preds = movie_rating_preds.withColumn("prediction_rounded", F.rint('prediction'))

In [24]:
movie_rating_preds.show(10)

+-------+--------+------+----------+------------------+
|user_id|movie_id|rating|prediction|prediction_rounded|
+-------+--------+------+----------+------------------+
|    251|     148|     2| 2.4135408|               2.0|
|     26|     148|     3|   2.75505|               3.0|
|     27|     148|     3|  4.452523|               4.0|
|    606|     148|     3| 3.7467272|               4.0|
|    916|     148|     2| 2.2659419|               2.0|
|    190|     148|     4| 3.3019488|               3.0|
|    297|     148|     3|  2.685518|               3.0|
|     15|     148|     3| 3.2057416|               3.0|
|    269|     148|     1| 0.6581736|               1.0|
|    423|     148|     3| 2.8569834|               3.0|
+-------+--------+------+----------+------------------+
only showing top 10 rows



### Evaluate the Predictions

In [25]:
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='rating', metricName='rmse')

In [26]:
RMSE = evaluator.evaluate(movie_rating_preds)

In [27]:
RMSE

1.0741757524628464

In [28]:
print("Root Mean Squared Error = {0}".format(RMSE))

Root Mean Squared Error = 1.0741757524628464


In [29]:
# RMSE with rating rounded column
RegressionEvaluator(predictionCol='prediction_rounded', labelCol='rating', metricName='rmse').evaluate(movie_rating_preds)

1.1077614249908798

### Top 3 Movie Recommendations for All Users

In [30]:
# top 3 movies recommended for each user
user_recs = model.recommendForAllUsers(3)

In [31]:
user_recs.show(truncate=False)

+-------+---------------------------------------------------------+
|user_id|recommendations                                          |
+-------+---------------------------------------------------------+
|471    |[[775, 15.412552], [1001, 14.694576], [1264, 11.060541]] |
|463    |[[697, 6.775662], [543, 6.3009353], [601, 6.115304]]     |
|833    |[[1005, 5.7406154], [896, 5.7370267], [1174, 5.6021786]] |
|496    |[[1019, 8.455024], [1159, 8.205235], [1005, 8.202681]]   |
|148    |[[1038, 13.820026], [974, 13.600708], [1315, 12.948069]] |
|540    |[[1643, 5.618454], [1449, 5.286267], [1368, 5.2669053]]  |
|392    |[[1643, 7.4311757], [1121, 6.83654], [962, 6.368367]]    |
|243    |[[1643, 6.50006], [320, 6.1525774], [718, 5.9883833]]    |
|623    |[[1120, 7.199861], [1159, 6.737999], [532, 6.485504]]    |
|737    |[[1245, 10.361174], [1192, 10.1093855], [1163, 9.538757]]|
|897    |[[1421, 7.339346], [1093, 7.29515], [958, 7.224092]]     |
|858    |[[960, 8.113142], [1038, 7.770085], [12

In [32]:
# python trick to display data types inside the dataframe
user_recs

DataFrame[user_id: int, recommendations: array<struct<movie_id:int,rating:float>>]

**Reshape the movie_id and rating arrays into columns for an user:**

In [33]:
user_recs.where(user_recs.user_id == 1).select("user_id", "recommendations.movie_id", "recommendations.rating").show()

+-------+-----------------+--------------------+
|user_id|         movie_id|              rating|
+-------+-----------------+--------------------+
|      1|[1085, 1203, 745]|[6.1223927, 6.075...|
+-------+-----------------+--------------------+



In [34]:
user_recs.where(user_recs.user_id == 1).select("user_id", "recommendations.movie_id", "recommendations.rating").collect()

[Row(user_id=1, movie_id=[1085, 1203, 745], rating=[6.122392654418945, 6.075290679931641, 5.954720973968506])]

In [35]:
user_recs_one = user_recs.where(user_recs.user_id == 1)

In [36]:
user_recs_one.show(truncate=False)

+-------+-------------------------------------------------------+
|user_id|recommendations                                        |
+-------+-------------------------------------------------------+
|1      |[[1085, 6.1223927], [1203, 6.0752907], [745, 5.954721]]|
+-------+-------------------------------------------------------+



In [37]:
user_recs_one

DataFrame[user_id: int, recommendations: array<struct<movie_id:int,rating:float>>]

In [38]:
user_recs_one = user_recs_one.select("user_id", "recommendations.movie_id", "recommendations.rating")
user_recs_one.show(truncate=False)

+-------+-----------------+--------------------------------+
|user_id|movie_id         |rating                          |
+-------+-----------------+--------------------------------+
|1      |[1085, 1203, 745]|[6.1223927, 6.0752907, 5.954721]|
+-------+-----------------+--------------------------------+



In [39]:
user_recs_one

DataFrame[user_id: int, movie_id: array<int>, rating: array<float>]

**Introduce a Sequence Id:**

In order to join the recommended movies and recommended ratings we need to introduce an additional id column. In order to ensure that the values in the id column is increasing we use the monotonically_increasing_id() function. This function is guaranteed to produce increasing numbers but not guaranteed to produce sequential increasing numbers if there are more than 1 partition in the dataframe. So we also repartition the exploded dataframe into 1 partition.

In [40]:
only_movies = user_recs_one.select("user_id", F.explode(col("movie_id")).alias("movie_id"))
only_movies = only_movies.repartition(1).withColumn('id', F.monotonically_increasing_id())
only_movies = only_movies.select('id', 'user_id', 'movie_id')
only_movies.show()

+---+-------+--------+
| id|user_id|movie_id|
+---+-------+--------+
|  0|      1|    1085|
|  1|      1|    1203|
|  2|      1|     745|
+---+-------+--------+



In [41]:
only_ratings = user_recs_one.select("user_id", F.explode(col("rating")).alias("rating"))
only_ratings = only_ratings.repartition(1).withColumn('id', F.monotonically_increasing_id())
only_ratings = only_ratings.select('id', 'user_id', 'rating')
only_ratings.show()

+---+-------+---------+
| id|user_id|   rating|
+---+-------+---------+
|  0|      1|6.1223927|
|  1|      1|6.0752907|
|  2|      1| 5.954721|
+---+-------+---------+



In [42]:
only_movies.join(only_ratings.drop('user_id'), on='id', how='inner').drop('id').show()

+-------+--------+---------+
|user_id|movie_id|   rating|
+-------+--------+---------+
|      1|    1085|6.1223927|
|      1|    1203|6.0752907|
|      1|     745| 5.954721|
+-------+--------+---------+



### Movie Recommendations for a Particular User

**Filter out the user we are interested from the set of all predictions:**

In [43]:
user_recs.where(user_recs.user_id == 2).show(truncate=False)

+-------+----------------------------------------------------+
|user_id|recommendations                                     |
+-------+----------------------------------------------------+
|2      |[[534, 8.056154], [601, 7.446925], [634, 6.5791397]]|
+-------+----------------------------------------------------+



**Create a subset user dataframe and invoke `recommendForUserSubset` on the model:**

In [44]:
user_subset = ratings_df.where(ratings_df.user_id == 2)

In [45]:
user_subset_recs = model.recommendForUserSubset(user_subset, 3)

In [46]:
user_subset_recs.select("recommendations.movie_id", "recommendations.rating").show(truncate=False)

+---------------+-------------------------------+
|movie_id       |rating                         |
+---------------+-------------------------------+
|[534, 601, 634]|[8.056154, 7.446925, 6.5791397]|
+---------------+-------------------------------+



### Top 3 User Recommendations for All Movies

In [47]:
# top 3 users recommended for each movie, for all movies.
movie_recs = model.recommendForAllItems(3)

In [48]:
movie_recs.show(truncate=False)

+--------+------------------------------------------------------+
|movie_id|recommendations                                       |
+--------+------------------------------------------------------+
|1580    |[[471, 1.5046953], [818, 1.1132871], [434, 1.1100312]]|
|471     |[[688, 5.251872], [939, 5.0367737], [357, 4.964419]]  |
|1591    |[[231, 10.863197], [212, 9.507906], [131, 9.288162]]  |
|1342    |[[589, 2.206946], [38, 2.0421913], [97, 1.941875]]    |
|463     |[[702, 7.7433176], [772, 6.674911], [126, 6.324211]]  |
|833     |[[438, 6.989997], [636, 5.9221754], [89, 5.703195]]   |
|1645    |[[89, 6.98346], [39, 6.9104958], [810, 6.581453]]     |
|496     |[[702, 7.4271765], [310, 6.937604], [166, 6.9291487]] |
|148     |[[93, 7.2038307], [202, 6.9377155], [260, 6.195242]]  |
|1088    |[[434, 8.120373], [677, 8.094688], [614, 7.9772997]]  |
|1238    |[[333, 6.86069], [168, 6.535516], [134, 6.350939]]    |
|540     |[[61, 5.666322], [333, 5.2608666], [261, 5.230006]]   |
|392     |

### Top 3 User Recommendations for a Particular Movie

**Filter out the movie we are interested from the set of all predictions:**

In [49]:
movie_recs.where(movie_recs.movie_id == 36).select("recommendations.user_id", "recommendations.rating").collect()

[Row(user_id=[418, 765, 461], rating=[11.34333610534668, 6.878576278686523, 6.868257999420166])]

**Create a subset movie dataframe and invoke `recommendForItemSubset` on the model:**

In [50]:
movie_subset = ratings_df.where(ratings_df.movie_id == 36)

In [51]:
movie_subset_recs = model.recommendForItemSubset(movie_subset, 3)

In [52]:
movie_subset_recs.select("recommendations.user_id", "recommendations.rating").show(truncate=False)

+---------------+--------------------------------+
|user_id        |rating                          |
+---------------+--------------------------------+
|[418, 765, 461]|[11.343336, 6.8785763, 6.868258]|
+---------------+--------------------------------+



## Incorporating Movie Details Data

**Load the Movie Details:**

In [53]:
movies_schema = StructType([
    StructField("movie_id", ShortType(), nullable=True),
    StructField("title", StringType(), nullable=True),
    StructField("release_date", StringType(), nullable=False),
    StructField("video_release_date", StringType(), nullable=False),
    StructField("imdb_url", StringType(), nullable=False),
    StructField("unknown", ByteType(), nullable=False),
    StructField("action", ByteType(), nullable=False),
    StructField("adventure", ByteType(), nullable=False),
    StructField("animation", ByteType(), nullable=False),
    StructField("childrens", ByteType(), nullable=False),
    StructField("comedy", ByteType(), nullable=False),
    StructField("crime", ByteType(), nullable=False),
    StructField("documentary", ByteType(), nullable=False),
    StructField("drama", ByteType(), nullable=False),
    StructField("fantasy", ByteType(), nullable=False),
    StructField("filmnoir", ByteType(), nullable=False),
    StructField("horror", ByteType(), nullable=False),
    StructField("musical", ByteType(), nullable=False),
    StructField("mystery", ByteType(), nullable=False),
    StructField("romance", ByteType(), nullable=False),
    StructField("scifi", ByteType(), nullable=False),
    StructField("thriller", ByteType(), nullable=False),
    StructField("war", ByteType(), nullable=False),
    StructField("western", ByteType(), nullable=False)]
  )

In [54]:
movies_df = spark.read.csv("data/ml-100k/u.item", sep="|", schema=movies_schema).cache()

In [55]:
movies_df.limit(10).toPandas()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,action,adventure,animation,childrens,comedy,crime,documentary,drama,fantasy,filmnoir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [56]:
from pyspark.sql import Row
test_user_movie_df = spark.createDataFrame([Row(user_id=789, movie_id=123)], ['user_id', 'movie_id'])

In [57]:
model.transform(test_user_movie_df).show()

+-------+--------+----------+
|user_id|movie_id|prediction|
+-------+--------+----------+
|    123|     789| 2.6726801|
+-------+--------+----------+



**Generate top 10 Predictions for User 789:**

In [58]:
from pyspark.sql import Row
test_user_df = spark.createDataFrame([Row(user_id=789)], ['user_id'])

In [59]:
test_user_df.show()

+-------+
|user_id|
+-------+
|    789|
+-------+



In [60]:
test_user_recs = model.recommendForUserSubset(test_user_df, 10).cache()

In [61]:
test_user_recs.select("user_id", "recommendations.movie_id", "recommendations.rating").show(1, 50)

+-------+--------------------------------------------------+--------------------------------------------------+
|user_id|                                          movie_id|                                            rating|
+-------+--------------------------------------------------+--------------------------------------------------+
|    789|[793, 1019, 962, 6, 904, 1643, 1368, 1121, 1113...|[9.706029, 8.729713, 8.645201, 8.499296, 8.1531...|
+-------+--------------------------------------------------+--------------------------------------------------+



In [62]:
test_user_recs.select("user_id", F.explode(col("recommendations.movie_id")).alias("movie_id")).show()

+-------+--------+
|user_id|movie_id|
+-------+--------+
|    789|     793|
|    789|    1019|
|    789|     962|
|    789|       6|
|    789|     904|
|    789|    1643|
|    789|    1368|
|    789|    1121|
|    789|    1113|
|    789|     337|
+-------+--------+



In [63]:
movie_genres = (['unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 
                 'fantasy', 'filmnoir', 'horror', 'musical', 'mystery', 'romance', 'scifi', 'thriller', 'war', 'western'])

In [64]:
# An utility function to combine all active genres of the movie into an array
from numpy import ma
def combine_genres(genres):
    x = ma.masked_array(movie_genres, mask=(1 - np.array(genres)))
    return x[~x.mask].data.tolist()

In [65]:
combine_genres([0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0])

['adventure', 'comedy', 'documentary', 'fantasy', 'mystery', 'scifi', 'war']

In [66]:
udf_combine_genres = udf(lambda genres: combine_genres(genres), ArrayType(elementType=StringType()))

In [67]:
print('Recommended Top 10 Movies For User 789:')
(test_user_recs
 .select('user_id', F.explode(col("recommendations.movie_id")).alias("movie_id"))
 .join(movies_df, on=['movie_id'], how='inner')
 .withColumn('genres', udf_combine_genres(F.array('unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'musical', 'mystery', 'romance', 'scifi', 'thriller', 'war', 'western')))
 .select('user_id', 'movie_id', 'title', 'genres')
).show(truncate=False)

Recommended Top 10 Movies For User 789:
+-------+--------+-----------------------------------------------------------------+-------------------------+
|user_id|movie_id|title                                                            |genres                   |
+-------+--------+-----------------------------------------------------------------+-------------------------+
|789    |6       |Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)             |[drama]                  |
|789    |337     |House of Yes, The (1997)                                         |[comedy, drama, thriller]|
|789    |793     |Crooklyn (1994)                                                  |[comedy]                 |
|789    |904     |Ma vie en rose (My Life in Pink) (1997)                          |[comedy, drama]          |
|789    |962     |Ruby in Paradise (1993)                                          |[drama]                  |
|789    |1019    |Die xue shuang xiong (Killer, The) (1989)             

**Retrieve top 10 Movies rated by User 789:**

In [68]:
movies_df.createOrReplaceTempView('movies')
ratings_df.createOrReplaceTempView('movie_ratings')

In [69]:
import re

def strip_margin(text):
    nomargin = re.sub('\n[ \t]*\|', ' ', text)
    trimmed = re.sub('\s+', ' ', nomargin)
    return trimmed

In [70]:
sqlContext.udf.register("sql_udf_combine_genres", combine_genres, ArrayType(elementType=StringType()))

<function __main__.combine_genres>

In [71]:
# Top 10 movies rated by user 789
print('Top 10 Rated Movies by User 789:')
spark.sql(strip_margin(
                    """SELECT r.user_id, r.movie_id, m.title, sql_udf_combine_genres(array(unknown, action, adventure, animation, childrens, comedy, crime, documentary, drama, fantasy, filmnoir, horror, musical, mystery, romance, scifi, thriller, war, western)) as genres
                      |FROM movie_ratings r, movies m
                      |WHERE r.user_id = 789
                      |AND r.movie_id = m.movie_id
                      |ORDER BY r.rating DESC, r.movie_id 
                      |LIMIT 10
                    """)).show(truncate=False)

Top 10 Rated Movies by User 789:
+-------+--------+-------------------------------+----------------------------------------+
|user_id|movie_id|title                          |genres                                  |
+-------+--------+-------------------------------+----------------------------------------+
|789    |9       |Dead Man Walking (1995)        |[drama]                                 |
|789    |50      |Star Wars (1977)               |[action, adventure, romance, scifi, war]|
|789    |100     |Fargo (1996)                   |[crime, drama, thriller]                |
|789    |127     |Godfather, The (1972)          |[action, crime, drama]                  |
|789    |129     |Bound (1996)                   |[crime, drama, romance, thriller]       |
|789    |150     |Swingers (1996)                |[comedy, drama]                         |
|789    |276     |Leaving Las Vegas (1995)       |[drama, romance]                        |
|789    |475     |Trainspotting (1996)         

Comparing the Recommendations with the Movies that the user already Rated with high rating it appears he likes movies around 1995 time frame and of genre Drama.

In [72]:
spark.stop()