### Exploring the ALS algorithm

In [1]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
import pyspark.sql.functions as F

from pyspark.ml.recommendation import ALS #, Rating #, MatrixFactorizationModel

In [2]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

import matplotlib.pyplot as plt
plt.rc('figure', figsize=(18, 4))

import seaborn as sns
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})

np.set_printoptions(precision=4, suppress=True)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# setting random seed for notebook reproducability
rnd_seed=42
np.random.seed=rnd_seed
np.random.set_state=rnd_seed

## 2. Creating the Spark Session

In [4]:
#os.environ['SPARK_HOME']

In [5]:
spark = SparkSession.builder.master("local[2]").appName("movie-recommender").getOrCreate()
spark

In [6]:
sc = spark.sparkContext
sc

In [7]:
sqlContext = SQLContext(spark.sparkContext)
sqlContext

<pyspark.sql.context.SQLContext at 0x7fccb17307f0>

## 3. Load The Data From a File Into a Dataframe

### Loading the Ratings Data Set

In [8]:
ratings_schema = StructType([
    StructField("user_id", IntegerType(), nullable=False),
    StructField("movie_id", IntegerType(), nullable=False),
    StructField("rating", IntegerType(), nullable=False),
    StructField("timestamp", LongType(), nullable=False)]
  )

In [9]:
# load the dats set
ratings_df = spark.read.csv("data/ml-100k/u.data", sep='\t', schema=ratings_schema)
ratings_df = ratings_df.drop('timestamp')
ratings_df.first()

Row(user_id=196, movie_id=242, rating=3)

In [10]:
ratings_df.show(5)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|    196|     242|     3|
|    186|     302|     3|
|     22|     377|     1|
|    244|      51|     2|
|    166|     346|     1|
+-------+--------+------+
only showing top 5 rows



## 4. Build the ALS Model

In [11]:
# Build the recommendation model using Alternating Least Squares
als = ALS(rank=10, maxIter=10, regParam=0.01, userCol='user_id', itemCol='movie_id', ratingCol='rating', seed=rnd_seed)

In [12]:
model = als.fit(ratings_df)

**Latent Factors for the Users:**

In [13]:
model.userFactors.show(10, 100)

+---+----------------------------------------------------------------------------------------------------+
| id|                                                                                            features|
+---+----------------------------------------------------------------------------------------------------+
| 10|[-0.54010206, -0.84998554, 0.31779268, -0.2886768, 0.23462527, 0.5715468, 0.46049356, 0.83112156,...|
| 20|[-1.033407, -0.2879485, 1.2173023, -1.8964379, 0.22374271, 0.99104583, -0.50815064, 0.53431267, 0...|
| 30|[-0.6141571, -1.2998686, -0.35556945, -0.49162617, 0.2718643, 0.38572592, 0.5292643, 0.8750414, 1...|
| 40|[-0.97532904, -0.34082136, 0.7854108, -0.2649228, -0.3352775, 0.0044832723, 0.17719568, 0.6497875...|
| 50|[-0.49595955, 1.5031621, 1.1059343, -1.40197, 1.6300769, 0.5451597, 0.93870777, -0.42847547, 1.67...|
| 60|[-0.5351085, -1.138615, -0.0115808295, -0.6367293, 0.10694349, 0.6845996, 0.12883754, 0.8526163, ...|
| 70|[-0.2827961, -0.6142456, 0.52760

**Latent Factors for the Movies:**

In [14]:
model.itemFactors.show(10, 100)

+---+----------------------------------------------------------------------------------------------------+
| id|                                                                                            features|
+---+----------------------------------------------------------------------------------------------------+
| 10|[-0.69717366, -1.2098774, -0.7123832, -0.82355976, 0.79044044, 1.1545237, 1.1774782, 0.476662, 0....|
| 20|[-0.37784293, -1.2001355, 0.68151534, -0.05149225, -0.25763956, 1.4960907, 1.7991132, 0.2601949, ...|
| 30|[-0.85375524, -1.4291214, -0.12453982, -0.010670745, 1.2250174, 0.2534764, 1.3009347, 1.0005549, ...|
| 40|[-0.91000855, -1.0365245, 0.22827172, -0.8106577, 0.9263951, 1.3194665, -0.25297254, 0.07458614, ...|
| 50|[-0.7174377, -0.80812925, 0.22620797, -0.6710367, -0.4492444, 1.1530977, 0.50407505, 1.2884802, 0...|
| 60|[-0.16236, -1.6731288, -0.5522944, -0.30527198, 0.2654121, 1.7868787, 0.83437276, 0.33115077, 1.4...|
| 70|[-0.3798019, -1.1385587, 0.45205

In [15]:
model.rank

10

### Generate Predictions

**Create a Test Set:**

In [16]:
# take a sample of the available data and generate predictions
test_ratings_df = ratings_df.select('user_id', 'movie_id').sample(fraction=0.01, seed=rnd_seed).limit(5)

In [17]:
test_ratings_df.show()

+-------+--------+
|user_id|movie_id|
+-------+--------+
|    115|     265|
|     90|     382|
|     90|     648|
|    184|     187|
|    307|     174|
+-------+--------+



In [18]:
movie_rating_preds = model.transform(test_ratings_df)

In [19]:
(movie_rating_preds
 .join(ratings_df, on=['user_id', 'movie_id'], how='inner').show())

+-------+--------+----------+------+
|user_id|movie_id|prediction|rating|
+-------+--------+----------+------+
|     90|     648|  4.927757|     4|
|     90|     382| 4.3809342|     5|
|    115|     265| 3.3165402|     2|
|    307|     174| 4.1435966|     4|
|    184|     187|  3.857808|     4|
+-------+--------+----------+------+



### Top 3 Movie Recommendations for All Users

In [20]:
# top 3 movies recommended for each user, for all users.
user_recs = model.recommendForAllUsers(3)

In [21]:
user_recs.show(truncate=False)

+-------+--------------------------------------------------------+
|user_id|recommendations                                         |
+-------+--------------------------------------------------------+
|471    |[[1286, 12.138259], [1157, 9.89244], [909, 9.815423]]   |
|463    |[[1159, 7.4255023], [1473, 6.0584435], [1269, 5.988885]]|
|833    |[[1368, 6.954572], [1205, 6.66879], [390, 6.251658]]    |
|496    |[[1288, 10.510149], [1120, 9.43979], [267, 8.924306]]   |
|148    |[[1314, 11.984594], [700, 11.881374], [1615, 11.267052]]|
|540    |[[1643, 5.667913], [1449, 5.588215], [1463, 5.3312726]] |
|392    |[[1121, 6.8301516], [1367, 6.3946433], [1643, 6.308919]]|
|243    |[[1643, 5.6771665], [1512, 5.649006], [1131, 5.055449]] |
|623    |[[1120, 6.458686], [574, 6.229914], [1286, 6.126714]]   |
|737    |[[695, 8.843273], [1005, 7.6665554], [516, 7.2666683]]  |
|897    |[[914, 7.284672], [624, 6.965919], [1446, 6.859709]]    |
|858    |[[1643, 7.6432023], [1438, 7.2011805], [1242, 7.11601

**Reshape the movie_id and rating arrays into columns for an user:**

In [22]:
user_recs.where(user_recs.user_id == 1).select("user_id", "recommendations.movie_id", "recommendations.rating").collect()

[Row(user_id=1, movie_id=[1434, 793, 1286], rating=[6.207646369934082, 6.116008758544922, 6.007221698760986])]

In [23]:
user_recs.where(user_recs.user_id == 1).select("user_id", "recommendations.movie_id", "recommendations.rating").show(truncate=False)

+-------+-----------------+---------------------------------+
|user_id|movie_id         |rating                           |
+-------+-----------------+---------------------------------+
|1      |[1434, 793, 1286]|[6.2076464, 6.1160088, 6.0072217]|
+-------+-----------------+---------------------------------+



In [24]:
user_recs_one = user_recs.where(user_recs.user_id == 1)

In [25]:
user_recs_one.show(truncate=False)

+-------+--------------------------------------------------------+
|user_id|recommendations                                         |
+-------+--------------------------------------------------------+
|1      |[[1434, 6.2076464], [793, 6.1160088], [1286, 6.0072217]]|
+-------+--------------------------------------------------------+



In [26]:
user_recs_one.dtypes

[('user_id', 'int'),
 ('recommendations', 'array<struct<movie_id:int,rating:float>>')]

In [27]:
x = user_recs_one.select("user_id", F.explode(col("recommendations.movie_id")).alias("movie_id"))
x.show()

+-------+--------+
|user_id|movie_id|
+-------+--------+
|      1|    1434|
|      1|     793|
|      1|    1286|
+-------+--------+



In [28]:
y = user_recs_one.select("user_id", F.explode(col("recommendations.rating")).alias("rating"))
y.show()

+-------+---------+
|user_id|   rating|
+-------+---------+
|      1|6.2076464|
|      1|6.1160088|
|      1|6.0072217|
+-------+---------+



In [29]:
x.join(y, on=['user_id'], how='inner').show()

+-------+--------+---------+
|user_id|movie_id|   rating|
+-------+--------+---------+
|      1|    1434|6.2076464|
|      1|    1434|6.1160088|
|      1|    1434|6.0072217|
|      1|     793|6.2076464|
|      1|     793|6.1160088|
|      1|     793|6.0072217|
|      1|    1286|6.2076464|
|      1|    1286|6.1160088|
|      1|    1286|6.0072217|
+-------+--------+---------+



### Movie Recommendations for a Particular User

**Filter out the user we are interested from the set of all predictions:**

In [30]:
user_recs.where(user_recs.user_id == 2).show(truncate=False)

+-------+---------------------------------------------------------+
|user_id|recommendations                                          |
+-------+---------------------------------------------------------+
|2      |[[1286, 6.4112782], [1473, 6.2863865], [1126, 6.1219964]]|
+-------+---------------------------------------------------------+



**Create a subset user dataframe and invoke `recommendForUserSubset` on the model:**

In [31]:
user_subset = ratings_df.where(ratings_df.user_id == 2)

In [32]:
user_subset_recs = model.recommendForUserSubset(user_subset, 3)

In [33]:
user_subset_recs.select("recommendations.movie_id", "recommendations.rating").show(truncate=False)

+------------------+---------------------------------+
|movie_id          |rating                           |
+------------------+---------------------------------+
|[1286, 1473, 1126]|[6.4112782, 6.2863865, 6.1219964]|
+------------------+---------------------------------+



### Top 3 User Recommendations for All Movies

In [34]:
# top 3 users recommended for each movie, for all movies.
movie_recs = model.recommendForAllItems(3)

In [35]:
movie_recs.show(truncate=False)

+--------+------------------------------------------------------+
|movie_id|recommendations                                       |
+--------+------------------------------------------------------+
|1580    |[[36, 1.3616927], [38, 1.182214], [562, 1.1041416]]   |
|471     |[[688, 5.694274], [849, 5.287952], [810, 5.163946]]   |
|1591    |[[310, 7.459932], [651, 7.0653586], [148, 6.6757364]] |
|1342    |[[820, 5.078589], [153, 4.873718], [461, 4.4691515]]  |
|463     |[[219, 7.262149], [98, 6.8233185], [809, 6.5238543]]  |
|833     |[[9, 6.6418223], [261, 6.598943], [124, 6.5040503]]   |
|1645    |[[809, 6.4994106], [98, 6.410225], [366, 6.3061366]]  |
|496     |[[809, 8.542983], [36, 7.5537395], [61, 6.986885]]    |
|148     |[[688, 5.9158826], [507, 5.723487], [850, 5.718377]]  |
|1088    |[[366, 9.839123], [153, 9.565502], [228, 8.943895]]   |
|1238    |[[148, 6.5439024], [258, 6.339524], [443, 6.311713]]  |
|540     |[[212, 5.9102707], [258, 5.144593], [726, 5.0746555]] |
|1460    |

### Top 3 User Recommendations for a Particular Movie

**Filter out the movie we are interested from the set of all predictions:**

In [36]:
movie_recs.where(movie_recs.movie_id == 36).select("recommendations.user_id", "recommendations.rating").collect()

[Row(user_id=[366, 153, 261], rating=[9.04776668548584, 8.196542739868164, 7.674849033355713])]

**Create a subset movie dataframe and invoke `recommendForItemSubset` on the model:**

In [37]:
movie_subset = ratings_df.where(ratings_df.movie_id == 36)

In [38]:
movie_subset_recs = model.recommendForItemSubset(movie_subset, 3)

In [39]:
movie_subset_recs.select("recommendations.user_id", "recommendations.rating").show(truncate=False)

+---------------+------------------------------+
|user_id        |rating                        |
+---------------+------------------------------+
|[366, 153, 261]|[9.047767, 8.196543, 7.674849]|
+---------------+------------------------------+



## Evaluate the Predictions

In [40]:
ratings_and_preds = movie_rating_preds.join(ratings_df, on=['user_id', 'movie_id'], how='inner').cache()

In [41]:
ratings_and_preds.show(5)

+-------+--------+----------+------+
|user_id|movie_id|prediction|rating|
+-------+--------+----------+------+
|     90|     648|  4.927757|     4|
|     90|     382| 4.3809342|     5|
|    115|     265| 3.3165402|     2|
|    307|     174| 4.1435966|     4|
|    184|     187|  3.857808|     4|
+-------+--------+----------+------+



In [42]:
MSE = ratings_and_preds.select(F.avg(F.pow((col('rating') - col('prediction')), 2)).alias('mse')).first().mse

In [43]:
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 0.6036183617834012


In [44]:
user_movie_rating_preds = model.transform(ratings_df.filter((ratings_df.user_id == 789) & (ratings_df.movie_id == 123)))

In [45]:
ratings_df.filter((ratings_df.user_id == 789) & (ratings_df.movie_id == 123)).show()

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
+-------+--------+------+



In [46]:
user_movie_rating_preds.show()

+-------+--------+------+----------+
|user_id|movie_id|rating|prediction|
+-------+--------+------+----------+
+-------+--------+------+----------+



In [47]:
ratings_df.createOrReplaceTempView('movie_ratings')

In [48]:
import re

def strip_margin(text):
    nomargin = re.sub('\n[ \t]*\|', ' ', text)
    trimmed = re.sub('\s+', ' ', nomargin)
    return trimmed

In [49]:
# Top 5 longest departure delays using SQL
spark.sql(strip_margin(
                    """SELECT user_id, movie_id, rating
                      |FROM movie_ratings
                      |WHERE user_id = 789
                      |ORDER BY rating DESC, movie_id LIMIT 10
                    """)).show()

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|    789|       9|     5|
|    789|      50|     5|
|    789|     100|     5|
|    789|     127|     5|
|    789|     129|     5|
|    789|     150|     5|
|    789|     276|     5|
|    789|     475|     5|
|    789|     741|     5|
|    789|      93|     4|
+-------+--------+------+



In [50]:
ratings_df.filter(col('user_id') == 789).orderBy(['rating', 'movie_id'], ascending=[0, 1]).limit(10).show()

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|    789|       9|     5|
|    789|      50|     5|
|    789|     100|     5|
|    789|     127|     5|
|    789|     129|     5|
|    789|     150|     5|
|    789|     276|     5|
|    789|     475|     5|
|    789|     741|     5|
|    789|      93|     4|
+-------+--------+------+



#### Loading the movie data set

In [51]:
movies_schema = StructType([
    StructField("movie_id", IntegerType(), nullable=True),
    StructField("title", StringType(), nullable=True),
    StructField("release_date", StringType(), nullable=False),
    StructField("video_release_date", StringType(), nullable=False),
    StructField("imdb_url", StringType(), nullable=False),
    StructField("unknown", IntegerType(), nullable=False),
    StructField("action", IntegerType(), nullable=False),
    StructField("adventure", IntegerType(), nullable=False),
    StructField("animation", IntegerType(), nullable=False),
    StructField("childrens", IntegerType(), nullable=False),
    StructField("comedy", IntegerType(), nullable=False),
    StructField("crime", IntegerType(), nullable=False),
    StructField("documentary", IntegerType(), nullable=False),
    StructField("drama", IntegerType(), nullable=False),
    StructField("fantasy", IntegerType(), nullable=False),
    StructField("film-noir", IntegerType(), nullable=False),
    StructField("horror", IntegerType(), nullable=False),
    StructField("musical", IntegerType(), nullable=False),
    StructField("mystery", IntegerType(), nullable=False),
    StructField("romance", IntegerType(), nullable=False),
    StructField("sci-fi", IntegerType(), nullable=False),
    StructField("thriller", IntegerType(), nullable=False),
    StructField("war", IntegerType(), nullable=False),
    StructField("western", IntegerType(), nullable=False)]
  )

In [52]:
movies_df = spark.read.csv("data/ml-100k/u.item", sep="|", schema=movies_schema).cache()

In [53]:
movies_df.limit(10).toPandas()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,action,adventure,animation,childrens,comedy,crime,documentary,drama,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [54]:
spark.stop()