# Import libraries

In [None]:
#pip install pyspark

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Recommender System").config("spark.sql.crossJoin.enabled","true").getOrCreate()

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as f

# Load and verify data

In [None]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructType, StructField
schema = StructType([StructField('userId', IntegerType(), True),
                     StructField('movieId', IntegerType(), True),
                     StructField('rating', IntegerType(), True),
                     StructField('timestamp', DoubleType(), True)])


In [None]:
data = spark.read.csv('ratings.dat',sep = '::', header = False, schema = schema)
# Data taken from https://grouplens.org/datasets/movielens/1m/

In [None]:
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: double (nullable = true)



In [None]:
data.head(3)

[Row(userId=1, movieId=1193, rating=5, timestamp=978300760.0),
 Row(userId=1, movieId=661, rating=3, timestamp=978302109.0),
 Row(userId=1, movieId=914, rating=3, timestamp=978301968.0)]

In [None]:
for item in data.head(1)[0]:
    print(item)

1
1193
5
978300760.0


In [None]:
data.columns

['userId', 'movieId', 'rating', 'timestamp']

In [None]:
data.describe().show()

+-------+------------------+------------------+------------------+--------------------+
|summary|            userId|           movieId|            rating|           timestamp|
+-------+------------------+------------------+------------------+--------------------+
|  count|            769087|            769087|            769087|              769087|
|   mean|2332.4967331394237|1879.2739898086952|3.5693699152371576| 9.747231800665334E8|
| stddev|1326.6839275368804| 1104.108134693938| 1.119513046863757|1.0933486896116085E7|
|    min|                 1|                 1|                 1|        9.64384455E8|
|    max|              4579|              3952|                 5|       1.046393499E9|
+-------+------------------+------------------+------------------+--------------------+



# Train Test split

In [None]:
train_data,test_data = data.randomSplit([0.7,0.3])

In [None]:
train_data.describe().show()

+-------+------------------+------------------+-----------------+-------------------+
|summary|            userId|           movieId|           rating|          timestamp|
+-------+------------------+------------------+-----------------+-------------------+
|  count|            537697|            537697|           537697|             537697|
|   mean|  2332.92298264636|1880.8771575813143|3.567907204243282|9.747217079465033E8|
| stddev|1326.8470208291915| 1104.357293903732|1.120196636856997|1.092769448377752E7|
|    min|                 1|                 1|                1|       9.64384455E8|
|    max|              4579|              3952|                5|      1.046393499E9|
+-------+------------------+------------------+-----------------+-------------------+



In [None]:
test_data.describe().show()

+-------+------------------+------------------+-----------------+--------------------+
|summary|            userId|           movieId|           rating|           timestamp|
+-------+------------------+------------------+-----------------+--------------------+
|  count|            231390|            231390|           231390|              231390|
|   mean|  2331.50622758114|1875.5485976057737|3.572768918276503| 9.747266009340032E8|
| stddev|1326.3071965222293|1103.5223201739313|1.117917954765692|1.0946958155504022E7|
|    min|                 1|                 1|                1|        9.64385114E8|
|    max|              4579|              3952|                5|       1.046351654E9|
+-------+------------------+------------------+-----------------+--------------------+



# Build Model

In [None]:
recommender = ALS(maxIter = 5, regParam = 0.01, userCol='userId', itemCol='movieId', ratingCol='rating')
# recommender = ALS(maxIter = 5, regParam = 0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy="drop")

In [None]:
model = recommender.fit(train_data)

# Evaluate Model

In [None]:
pred_data = model.transform(test_data)

In [None]:
pred_data.show()

+------+-------+------+------------+----------+
|userId|movieId|rating|   timestamp|prediction|
+------+-------+------+------------+----------+
|     1|   1721|     4|9.78300055E8|  5.143253|
|     1|   2797|     4|9.78302039E8| 4.1459026|
|     1|   2294|     4|9.78824291E8| 4.1674776|
|     1|   2018|     4|9.78301777E8|  5.086158|
|     1|   2804|     5|9.78300719E8|  4.121637|
|     1|   2692|     4| 9.7830157E8| 3.2970908|
|     1|   2791|     4|9.78302188E8| 3.4589617|
|     2|    292|     3|9.78300123E8|  3.572881|
|     1|   1029|     5|9.78302205E8|  4.635245|
|     2|    163|     4|9.78299809E8| 2.6820014|
|     1|   1035|     5|9.78301753E8|  4.731936|
|     1|   1545|     4|9.78824139E8|  3.519789|
|     2|    380|     5|9.78299809E8| 3.6033304|
|     2|    434|     2|9.78300174E8| 3.0063083|
|     1|    938|     4|9.78301752E8| 4.1122117|
|     2|    356|     5|9.78299686E8| 4.5251327|
|     1|   2321|     3|9.78302205E8| 3.3810415|
|     1|   1907|     4| 9.7882433E8|  4.

In [None]:
evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')
print("Root-mean-square error = " + str(evaluator.evaluate(pred_data)))

Root-mean-square error = nan


In [None]:
# A NaN result is due to SPARK-14489 and because the model can't predict values for users for which there's no data.
# A temporary workaround is to exclude rows with predicted NaN values or to replace them with a constant, for instance,
# the general mean rating. However, to map to a real business problem, the data scientist, in collaboration with the
# business owner, must define what happens if such an event occurs. For example, you can provide no recommendation for
# a user until that user rates a few items. Alternatively, before user rates five items, you can use a user-based recommender
# system that's based on the user's profile (that's another recommender system to develop).

# Replace predicted NaN values with the average rating and evaluate the model:

In [None]:
avgRatings = data.select('rating').groupBy().avg().first()[0]
print ('The average rating in the dataset is: {}'.format(avgRatings))

The average rating in the dataset is: 3.5693699152371576


In [None]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
print ('The root mean squared error for our model is: {}'.format(evaluator.evaluate(pred_data.na.fill(avgRatings))))

The root mean squared error for our model is: 0.9185122992096438


In [None]:
# Now exclude predicted NaN values and evaluate the model:

In [None]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
print ('The root mean squared error for our model is: {}'.format(evaluator.evaluate(pred_data.na.drop())))

The root mean squared error for our model is: 0.9183040628504067


# Define Movie details

In [None]:
schema = StructType([StructField('movieId', IntegerType(), True),
                     StructField('title', StringType(), True),
                     StructField('genres', StringType(), True)])
movieDetails = spark.read.csv('movies.dat',sep = '::', header = False, schema = schema)
movieDetails.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
movieDetails.head(3)

[Row(movieId=1, title='Toy Story (1995)', genres="Animation|Children's|Comedy"),
 Row(movieId=2, title='Jumanji (1995)', genres="Adventure|Children's|Fantasy"),
 Row(movieId=3, title='Grumpier Old Men (1995)', genres='Comedy|Romance')]

In [None]:
schema = StructType([StructField('UserID', IntegerType(), True),
                     StructField('Gender', StringType(), True),
                     StructField('Age', IntegerType(), True),
                     StructField('Occupation', IntegerType(), True),
                     StructField('Zipcode', IntegerType(), True)])
occupation ={
  0:  "other",
  1:  "academic/educator",
  2:  "artist",
  3:  "clerical/admin",
  4:  "college/grad student",
  5:  "customer service",
  6:  "doctor/health care",
  7:  "executive/managerial",
  8:  "farmer",
  9:  "homemaker",
 10:  "K-12 student",
 11:  "lawyer",
 12:  "programmer",
 13:  "retired",
 14:  "sales/marketing",
 15:  "scientist",
 16:  "self-employed",
 17:  "technician/engineer",
 18:  "tradesman/craftsman",
 19:  "unemployed",
 20:  "writer"

}
userDetails = spark.read.csv('users.dat',sep = '::', header = False, schema = schema)
userDetails.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)



In [None]:
userDetails.head(3)

[Row(UserID=1, Gender='F', Age=1, Occupation=10, Zipcode=48067),
 Row(UserID=2, Gender='M', Age=56, Occupation=16, Zipcode=70072),
 Row(UserID=3, Gender='M', Age=25, Occupation=15, Zipcode=55117)]

# Movie recommendations

In [None]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{2913, 10.849915...|
|     3|[{2874, 7.6534467...|
|     5|[{659, 6.7760086}...|
|     6|[{2305, 8.729602}...|
|     9|[{862, 6.222961},...|
|    12|[{862, 12.147653}...|
|    13|[{1749, 6.6426344...|
|    15|[{1749, 5.922634}...|
|    16|[{1471, 11.969424...|
|    17|[{2964, 6.4469056...|
|    19|[{1749, 8.616003}...|
|    20|[{411, 8.306289},...|
|    22|[{2964, 5.8524256...|
|    26|[{3885, 6.9804587...|
|    27|[{3531, 8.415962}...|
|    28|[{1046, 5.6865506...|
|    31|[{2809, 8.148663}...|
|    34|[{980, 9.069587},...|
|    35|[{3531, 6.024864}...|
|    37|[{3349, 9.12696},...|
+------+--------------------+
only showing top 20 rows



In [None]:
userRecsExplode = userRecs.select(userRecs.userId,f.explode(userRecs.recommendations)).orderBy(userRecs.userId)
userRecsExplode.show()

+------+-----------------+
|userId|              col|
+------+-----------------+
|     1|{2913, 10.849915}|
|     1|  {1715, 8.61826}|
|     1| {2626, 8.278925}|
|     1|  {598, 7.550913}|
|     1|{1117, 7.2084966}|
|     1|{1317, 7.1806145}|
|     1| {3106, 7.141319}|
|     1|{1651, 7.0557613}|
|     1| {1529, 6.996839}|
|     1|{2998, 6.9615116}|
|     2|{2913, 6.4855647}|
|     2|{3456, 6.2140985}|
|     2|  {824, 6.183017}|
|     2|{3913, 6.1469707}|
|     2|{1117, 5.8745103}|
|     2| {2800, 5.792125}|
|     2|{2257, 5.7253056}|
|     2|{1715, 5.6650496}|
|     2|   {572, 5.65858}|
|     2|{1749, 5.6473136}|
+------+-----------------+
only showing top 20 rows



In [None]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.join(movieDetails,movieRecs.movieId==movieDetails.movieId,"left").select([movieRecs.movieId,movieDetails.title,movieDetails.genres,movieRecs.recommendations]).show()

+-------+--------------------+--------------------+--------------------+
|movieId|               title|              genres|     recommendations|
+-------+--------------------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|[{2269, 5.8271937...|
|      3|Grumpier Old Men ...|      Comedy|Romance|[{1070, 6.285731}...|
|      5|Father of the Bri...|              Comedy|[{4419, 7.8553452...|
|      6|         Heat (1995)|Action|Crime|Thri...|[{3149, 6.155888}...|
|      9| Sudden Death (1995)|              Action|[{1213, 7.5812917...|
|     12|Dracula: Dead and...|       Comedy|Horror|[{1664, 8.210773}...|
|     13|        Balto (1995)|Animation|Children's|[{2432, 6.6584587...|
|     15|Cutthroat Island ...|Action|Adventure|...|[{1070, 8.677435}...|
|     16|       Casino (1995)|      Drama|Thriller|[{745, 7.294376},...|
|     17|Sense and Sensibi...|       Drama|Romance|[{1213, 6.6665998...|
|     19|Ace Ventura: When...|              Comedy|

In [None]:
movieRecsExplode = movieRecs.select(movieRecs.movieId,f.explode(movieRecs.recommendations)).orderBy(movieRecs.movieId)
movieRecsExplode.join(movieDetails,movieRecsExplode.movieId==movieDetails.movieId,"left").select([movieRecsExplode.movieId,movieDetails.title,movieDetails.genres,movieRecsExplode.col.alias('recommendation')]).show()

+-------+--------------------+--------------------+-----------------+
|movieId|               title|              genres|   recommendation|
+-------+--------------------+--------------------+-----------------+
|      1|    Toy Story (1995)|Animation|Childre...|{2269, 5.8271937}|
|      1|    Toy Story (1995)|Animation|Childre...|{2534, 5.8148546}|
|      1|    Toy Story (1995)|Animation|Childre...| {4143, 5.787467}|
|      1|    Toy Story (1995)|Animation|Childre...| {1414, 5.694665}|
|      1|    Toy Story (1995)|Animation|Childre...|{1190, 5.6699004}|
|      1|    Toy Story (1995)|Animation|Childre...| {4166, 5.640699}|
|      1|    Toy Story (1995)|Animation|Childre...| {3204, 5.597963}|
|      1|    Toy Story (1995)|Animation|Childre...|{2432, 5.5800285}|
|      1|    Toy Story (1995)|Animation|Childre...| {2412, 5.578589}|
|      1|    Toy Story (1995)|Animation|Childre...|{1039, 5.5421066}|
|      3|Grumpier Old Men ...|      Comedy|Romance| {1070, 6.285731}|
|      3|Grumpier Ol

In [None]:
# Generate top 10 movie recommendations for a specified set of users
singleUser = test_data.filter(test_data.userId==11).select(['movieId','userId'])
singleUser.join(movieDetails,singleUser.movieId==movieDetails.movieId,"left").select([singleUser.userId,singleUser.movieId,movieDetails.title,movieDetails.genres]).show()

+------+-------+--------------------+--------------------+
|userId|movieId|               title|              genres|
+------+-------+--------------------+--------------------+
|    11|     36|Dead Man Walking ...|               Drama|
|    11|    110|   Braveheart (1995)|    Action|Drama|War|
|    11|    333|    Tommy Boy (1995)|              Comedy|
|    11|    435|    Coneheads (1993)|       Comedy|Sci-Fi|
|    11|    441|Dazed and Confuse...|              Comedy|
|    11|    481|   Kalifornia (1993)|      Drama|Thriller|
|    11|    531|Secret Garden, Th...|    Children's|Drama|
|    11|    543|So I Married an A...|Comedy|Romance|Th...|
|    11|    608|        Fargo (1996)|Crime|Drama|Thriller|
|    11|    663|Kids in the Hall:...|              Comedy|
|    11|    708|Truth About Cats ...|      Comedy|Romance|
|    11|   1042|That Thing You Do...|              Comedy|
|    11|   1059|William Shakespea...|       Drama|Romance|
|    11|   1244|    Manhattan (1979)|Comedy|Drama|Romanc

In [None]:
userSubsetRecs = model.recommendForUserSubset(singleUser, 10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    11|[{659, 8.511926},...|
+------+--------------------+



In [None]:
userSubsetRecsExplode = userSubsetRecs.select(userSubsetRecs.userId,f.explode(userSubsetRecs.recommendations))
userSubsetRecsExplode.show()

+------+-----------------+
|userId|              col|
+------+-----------------+
|    11|  {659, 8.511926}|
|    11|{3587, 7.4576254}|
|    11|{2813, 7.1647534}|
|    11|{1846, 7.1045833}|
|    11| {103, 6.8451047}|
|    11|{3900, 6.7213573}|
|    11|{1613, 6.6876445}|
|    11|{1450, 6.6132026}|
|    11|{1685, 6.4672146}|
|    11| {2627, 6.434741}|
+------+-----------------+



In [None]:
recommendations = model.transform(singleUser)
userRecommendations= recommendations.orderBy('prediction',ascending=False)
userRecommendations.show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|    333|    11|  4.296135|
|    543|    11| 4.1670003|
|   1916|    11|  4.134341|
|    441|    11| 4.1017694|
|   1704|    11|  3.960262|
|    608|    11| 3.9440448|
|   2329|    11| 3.9005089|
|   1265|    11| 3.8445477|
|    110|    11|  3.797694|
|    481|    11| 3.6844852|
|   1259|    11|  3.647805|
|   3148|    11|  3.644402|
|   1244|    11| 3.5904589|
|   2706|    11| 3.5192206|
|   2762|    11| 3.4867923|
|   1278|    11| 3.4645953|
|   1673|    11| 3.4592566|
|   2282|    11| 3.4097736|
|   2321|    11|  3.393981|
|   3174|    11| 3.3334394|
+-------+------+----------+
only showing top 20 rows



In [None]:
userRecommendations.join(movieDetails,userRecommendations.movieId==movieDetails.movieId,"left").select([userRecommendations.userId,movieDetails.title,movieDetails.genres,userRecommendations.prediction]).show()

+------+--------------------+--------------------+----------+
|userId|               title|              genres|prediction|
+------+--------------------+--------------------+----------+
|    11|Dead Man Walking ...|               Drama|  2.851837|
|    11|   Braveheart (1995)|    Action|Drama|War|  3.797694|
|    11|    Tommy Boy (1995)|              Comedy|  4.296135|
|    11|    Coneheads (1993)|       Comedy|Sci-Fi| 1.9362859|
|    11|Dazed and Confuse...|              Comedy| 4.1017694|
|    11|   Kalifornia (1993)|      Drama|Thriller| 3.6844852|
|    11|Secret Garden, Th...|    Children's|Drama| 2.1065564|
|    11|So I Married an A...|Comedy|Romance|Th...| 4.1670003|
|    11|        Fargo (1996)|Crime|Drama|Thriller| 3.9440448|
|    11|Kids in the Hall:...|              Comedy| 2.8981588|
|    11|Truth About Cats ...|      Comedy|Romance| 2.2880802|
|    11|That Thing You Do...|              Comedy| 2.6426508|
|    11|William Shakespea...|       Drama|Romance| 2.9554908|
|    11|

In [None]:
# Generate top 10 user recommendations for a specified set of movies
singleMovie = test_data.filter(test_data.movieId==1).select(['movieId','userId'])
singleMovie.join(movieDetails,singleMovie.movieId==movieDetails.movieId,"left").select([singleMovie.movieId,movieDetails.title,movieDetails.genres,singleMovie.userId]).show()

+-------+----------------+--------------------+------+
|movieId|           title|              genres|userId|
+-------+----------------+--------------------+------+
|      1|Toy Story (1995)|Animation|Childre...|     6|
|      1|Toy Story (1995)|Animation|Childre...|     9|
|      1|Toy Story (1995)|Animation|Childre...|    44|
|      1|Toy Story (1995)|Animation|Childre...|    48|
|      1|Toy Story (1995)|Animation|Childre...|    51|
|      1|Toy Story (1995)|Animation|Childre...|    56|
|      1|Toy Story (1995)|Animation|Childre...|    60|
|      1|Toy Story (1995)|Animation|Childre...|    65|
|      1|Toy Story (1995)|Animation|Childre...|    76|
|      1|Toy Story (1995)|Animation|Childre...|    80|
|      1|Toy Story (1995)|Animation|Childre...|    90|
|      1|Toy Story (1995)|Animation|Childre...|   114|
|      1|Toy Story (1995)|Animation|Childre...|   119|
|      1|Toy Story (1995)|Animation|Childre...|   123|
|      1|Toy Story (1995)|Animation|Childre...|   131|
|      1|T

In [None]:
movieSubSetRecs = model.recommendForItemSubset(singleMovie, 10)
movieSubSetRecs.join(movieDetails,movieSubSetRecs.movieId==movieDetails.movieId,"left").select([movieSubSetRecs.movieId,movieDetails.title,movieDetails.genres,movieSubSetRecs.recommendations]).show()

+-------+----------------+--------------------+--------------------+
|movieId|           title|              genres|     recommendations|
+-------+----------------+--------------------+--------------------+
|      1|Toy Story (1995)|Animation|Childre...|[{2269, 5.8271937...|
+-------+----------------+--------------------+--------------------+



In [None]:
movieSubSetRecsExplode = movieSubSetRecs.select(movieSubSetRecs.movieId,f.explode(movieSubSetRecs.recommendations))
movieSubSetRecsExplode.join(movieDetails,movieSubSetRecsExplode.movieId==movieDetails.movieId,"left").select([movieSubSetRecsExplode.movieId,movieDetails.title,movieDetails.genres,movieSubSetRecsExplode.col.alias('recommendation')]).show()

+-------+----------------+--------------------+-----------------+
|movieId|           title|              genres|   recommendation|
+-------+----------------+--------------------+-----------------+
|      1|Toy Story (1995)|Animation|Childre...|{2269, 5.8271937}|
|      1|Toy Story (1995)|Animation|Childre...|{2534, 5.8148546}|
|      1|Toy Story (1995)|Animation|Childre...| {4143, 5.787467}|
|      1|Toy Story (1995)|Animation|Childre...| {1414, 5.694665}|
|      1|Toy Story (1995)|Animation|Childre...|{1190, 5.6699004}|
|      1|Toy Story (1995)|Animation|Childre...| {4166, 5.640699}|
|      1|Toy Story (1995)|Animation|Childre...| {3204, 5.597963}|
|      1|Toy Story (1995)|Animation|Childre...|{2432, 5.5800285}|
|      1|Toy Story (1995)|Animation|Childre...| {2412, 5.578589}|
|      1|Toy Story (1995)|Animation|Childre...|{1039, 5.5421066}|
+-------+----------------+--------------------+-----------------+

