# Problem 3: Collaborative Filtering Implementation

# Spark ALS based colloborative filtering model

Get the dataset from s3 bucket

In [1]:
import os
dbfs_dir = 's3://netflixdata-yr/'
movies_filename = dbfs_dir + 'movie_titles.txt'
testing_filename = dbfs_dir + 'TestingRatings.txt'
training_filename = dbfs_dir + 'TrainingRatings.txt'
# https://github.com/snehalnair/als-recommender-pyspark/blob/master/Recommendation_Engine_MovieLens.ipynb

Define the schemas

In [2]:
from pyspark.sql.types import *

movies_df_schema = StructType(
  [StructField('movie_Id', IntegerType()),
   StructField('year', IntegerType()),
   StructField('title', StringType())]
)

testing_df_schema = StructType(
  [StructField('movie_Id', IntegerType()),
   StructField('userId', IntegerType()),
   StructField('rating', DoubleType())]
)

training_df_schema = StructType(
  [StructField('movie_Id', IntegerType()),
   StructField('userId', IntegerType()),
   StructField('rating', DoubleType())]
)

Load and cache the data 

In [3]:
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import *

movies_titles_df = sqlContext.read.options(header=False, inferSchema=False).schema(movies_df_schema).csv("s3://netflixdata-yr/movie_titles.txt")
testing_df = sqlContext.read.options(header=False, inferSchema=False).schema(testing_df_schema).csv("s3://netflixdata-yr/TestingRatings.txt")
training_df = sqlContext.read.options(header=False, inferSchema=False).schema(testing_df_schema).csv("s3://netflixdata-yr/TrainingRatings.txt")

movies_titles_df.cache()
testing_df.cache()
training_df.cache()

DataFrame[movie_Id: int, userId: int, rating: double]

 Merge movie titles dataframe with testing and training dataframe and drop the duplicate column movieID 

In [4]:
# testing_df = movies_titles_df.join(testing_ratings_df, movies_titles_df["movie_Id"]==testing_ratings_df["movieId"])
# testing_df = testing_df.drop("movieId")
# print(testing_df.show(3))

In [5]:
# training_df = movies_titles_df.join(training_ratings_df, movies_titles_df["movie_Id"]==training_ratings_df["movieId"])
# training_df = training_df.drop("movieId")
# print(training_df.show(3))

Creating temp view for analysis

In [6]:
from pyspark.sql import SQLContext
import pyspark
sqlContext = pyspark.SQLContext(sc)  
testing_df.createOrReplaceTempView('testing_df')
training_df.createOrReplaceTempView('training_df')

In [7]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [8]:
# Create & Confirm ALS model
als = ALS(userCol="userId", itemCol="movie_Id", ratingCol="rating", nonnegative = True, implicitPrefs = False, coldStartStrategy="drop")
type(als)

pyspark.ml.recommendation.ALS

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [5,10,15]) \
            .addGrid(als.regParam, [.01, .05, 0.1]) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  9


In [10]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_60c9bb2617d8


In [11]:
#Fit cross validator to the 'train' dataset
model = cv.fit(training_df)

#Extract best model from the cv model above
best_model = model.bestModel

                                                                                

In [42]:
# Print best_model
print(type(best_model))

# Print the best ALS model parameters
print("**Best Model**")

# # # Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())

# # Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# # Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 15
  MaxIter: 10
  RegParam: 0.05


In [43]:
train_predictions = best_model.transform(training_df)
RMSE_train = evaluator.evaluate(train_predictions)
print("RMSE for training data is:", RMSE_train)

mae_train = evaluator.evaluate(train_predictions, {evaluator.metricName: "mae"})
print("MAE for training data is: %.3f" % mae_train)

                                                                                

RMSE for training data is: 0.767821286433335




MAE for training data is: 0.606


                                                                                

In [44]:
# View the predictions
test_predictions = best_model.transform(testing_df)
RMSE = evaluator.evaluate(test_predictions)
print("RMSE for testing data is:", RMSE)

mae = evaluator.evaluate(test_predictions, {evaluator.metricName: "mae"})
print("MAE for testing data is:: %.3f" % mae)

                                                                                

RMSE for testing data is: 0.8371780505884548




MAE for testing data is:: 0.660


                                                                                

In [45]:
#view top 10 test prediction movies
test_predictions.show(10)

+--------+-------+------+----------+
|movie_Id| userId|rating|prediction|
+--------+-------+------+----------+
|      28|2358799|   3.0| 3.9407175|
|     156| 973051|   5.0| 4.0387187|
|     851|1189060|   3.0| 3.4920712|
|    1100|2376892|   2.0| 2.2084737|
|    1123|1628484|   3.0| 3.4529614|
|    1289|1552084|   3.0| 3.5745318|
|    1744|2376892|   5.0| 3.7657683|
|    1851| 675056|   4.0| 3.5719743|
|    1983|2376892|   4.0|  3.174658|
|    1983|2629660|   3.0| 2.8910725|
+--------+-------+------+----------+
only showing top 10 rows



In [46]:
#  Generate n Recommendations for all users and list top 10
reco = best_model.recommendForAllUsers(10)
reco.limit(10).show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  3039|[[6991, 4.479667]...|
|  3694|[[12293, 4.332815...|
|  4247|[[12293, 4.847693...|
|  7601|[[11812, 4.940127...|
|  8095|[[6991, 5.582224]...|
|  8135|[[12952, 6.137531...|
|  9399|[[12952, 5.355542...|
| 10897|[[14283, 4.554458...|
| 11215|[[12952, 5.475445...|
| 11430|[[12952, 6.523889...|
+------+--------------------+



                                                                                

In [47]:
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.functions import col, avg, when, count

reco1 = reco\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movie_Id"), col("rec_exp.rating"))

reco1.limit(10).show()



+------+--------+---------+
|userId|movie_Id|   rating|
+------+--------+---------+
|   481|    6991| 5.736088|
|   481|   14648| 5.140416|
|   481|   10743| 5.124085|
|   481|   14361| 5.055481|
|   481|    7569|5.0513654|
|   481|   12952|5.0289598|
|   481|     634| 4.975964|
|   481|    4238|4.9639053|
|   481|   15567|  4.94801|
|   481|   10947| 4.940893|
+------+--------+---------+



                                                                                

In [48]:
# filter for userId 79
reco1.join(movies_titles_df, on='movie_Id').filter('userId = 79').show()

                                                                                

+--------+------+---------+----+--------------------+
|movie_Id|userId|   rating|year|               title|
+--------+------+---------+----+--------------------+
|   14185|    79| 4.634067|1964|        Mary Poppins|
|    6287|    79| 4.590269|1990|        Pretty Woman|
|    1256|    79|4.5632133|1994|The Best of Frien...|
|   14283|    79|4.5457993|1994|The Best of Frien...|
|   12952|    79|4.5401516|2005|The God Who Wasn'...|
|    4129|    79|4.5198183|1970|Santa Claus Is Co...|
|   14648|    79|4.4529114|2003|Finding Nemo (Ful...|
|   15183|    79|4.4301085|1975|      MASH: Season 4|
|   15836|    79|4.4262037|1979|      MASH: Season 8|
|    2290|    79|4.3734174|1992|Aladdin: Platinum...|
+--------+------+---------+----+--------------------+



In [49]:
training_df.filter('userId = 79').sort('rating', ascending=False).limit(10).show()

+--------+------+------+
|movie_Id|userId|rating|
+--------+------+------+
|    6971|    79|   5.0|
|    3538|    79|   5.0|
|    3541|    79|   5.0|
|    2660|    79|   5.0|
|    8512|    79|   5.0|
|   13489|    79|   5.0|
|   13748|    79|   5.0|
|   14185|    79|   5.0|
|    4569|    79|   5.0|
|    4640|    79|   5.0|
+--------+------+------+



In [50]:
testing_df.filter('userId = 79').sort('rating', ascending=False).limit(10).show()

+--------+------+------+
|movie_Id|userId|rating|
+--------+------+------+
|   14648|    79|   5.0|
|    2913|    79|   4.0|
|   12497|    79|   4.0|
|    8163|    79|   3.0|
+--------+------+------+



In [51]:
#Recommending Movies with ALS for the user 79
single_user = testing_df.filter(testing_df['userId']==79).select(['movie_Id','rating','userId'])
single_user.show()

+--------+------+------+
|movie_Id|rating|userId|
+--------+------+------+
|    2913|   4.0|    79|
|    8163|   3.0|    79|
|   12497|   4.0|    79|
|   14648|   5.0|    79|
+--------+------+------+



In [52]:
recomendations = model.transform(single_user)
recomendations.orderBy('prediction',ascending=False).show()

+--------+------+------+----------+
|movie_Id|rating|userId|prediction|
+--------+------+------+----------+
|   14648|   5.0|    79| 4.4529114|
|    2913|   4.0|    79| 3.9445221|
|   12497|   4.0|    79|  3.583714|
|    8163|   3.0|    79|  3.225082|
+--------+------+------+----------+



# Step 3: Does your approach work for your own preferences? 

Add myself as a new user to the data set by createing a new,unique user ID for yourself. 
Selecting some movies that I  have seen among those in the training set, and add my ratings for those.

In [53]:
from pyspark.sql import Row
my_user_id = 11111

my_rated_movies = [
     ( 2959, my_user_id, 3),
     ( 2571, my_user_id, 4),
     ( 1207, my_user_id,5),
     ( 296, my_user_id, 1),
     ( 2858, my_user_id, 5), 
     ( 1172, my_user_id, 5), 
     ( 593, my_user_id,1),
     ( 745, my_user_id,2), 
     ( 1198,my_user_id, 4),
     ( 6016, my_user_id, 1)    
]

my_ratings_df = sqlContext.createDataFrame(my_rated_movies, ['movie_Id', 'user_Id','rating'])
print ('My movie ratings:')
my_ratings_df.show(3)

My movie ratings:
+--------+-------+------+
|movie_Id|user_Id|rating|
+--------+-------+------+
|    2959|  11111|     3|
|    2571|  11111|     4|
|    1207|  11111|     5|
+--------+-------+------+
only showing top 3 rows



In [54]:
from pyspark.sql import SparkSession

training_with_my_ratings_df = training_df.union(my_ratings_df)

user_data = (training_with_my_ratings_df[training_with_my_ratings_df.userId == '11111'])
user_data.show(3)


+--------+------+------+
|movie_Id|userId|rating|
+--------+------+------+
|    2959| 11111|   3.0|
|    2571| 11111|   4.0|
|    1207| 11111|   5.0|
+--------+------+------+
only showing top 3 rows



In [55]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [5, 10, 15]) \
            .addGrid(als.regParam, [.05, 0.1]) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  6


In [56]:
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_fbb0e94f47ca


In [57]:
#Fit cross validator to the 'train' dataset
model_my_data = cv.fit(training_with_my_ratings_df)

#Extract best model from the cv model above for the new added data set
best_model_data = model_my_data.bestModel

                                                                                

In [60]:
#evaluate the new added data training set
train_predictions_data = best_model_data.transform(training_with_my_ratings_df)
RMSE_train_data = evaluator.evaluate(train_predictions_data)
print("RMSE for training data with added data is:", RMSE_train_data)

mae_train_data = evaluator.evaluate(train_predictions_data, {evaluator.metricName: "mae"})
print("MAE for training data with added data is: %.3f" % mae_train_data)

                                                                                

RMSE for training data with added data is: 0.7682813125220835




MAE for training data with added data is: 0.607


                                                                                

In [61]:
#evaluate test set from the model built on new added training data
test_predictions_data = best_model_data.transform(testing_df)
RMSE_data = evaluator.evaluate(test_predictions_data)
print("RMSE for testing data with model trained on added data is:", RMSE_data)

mae_data = evaluator.evaluate(test_predictions_data, {evaluator.metricName: "mae"})
print("MAE for testing data with model trained on added data is:: %.3f" % mae_data)

                                                                                

RMSE for testing data with model trained on added data is: 0.8375366920102414




MAE for testing data with model trained on added data is:: 0.661


                                                                                

In [62]:
#list to 10 predictions with added data
test_predictions_data.show(10)

+--------+-------+------+----------+
|movie_Id| userId|rating|prediction|
+--------+-------+------+----------+
|      28|2358799|   3.0|  3.798907|
|     156| 973051|   5.0|  4.303177|
|     851|1189060|   3.0|  3.399252|
|    1100|2376892|   2.0| 2.2637596|
|    1123|1628484|   3.0| 3.4634414|
|    1289|1552084|   3.0| 3.4370534|
|    1744|2376892|   5.0|  3.907899|
|    1851| 675056|   4.0| 3.6191602|
|    1983|2376892|   4.0| 3.1283734|
|    1983|2629660|   3.0|  2.908888|
+--------+-------+------+----------+
only showing top 10 rows



In [63]:
reco_data = best_model_data.recommendForAllUsers(10)
reco_data.limit(10).show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  3039|[[1172, 5.317329]...|
|  3694|[[1172, 6.6968307...|
|  4247|[[1172, 7.3059635...|
|  7601|[[1172, 6.402927]...|
|  8095|[[1207, 7.9479885...|
|  8135|[[2858, 11.406401...|
|  9399|[[1172, 7.9214673...|
| 10897|[[1207, 6.5118833...|
| 11215|[[1207, 7.7310195...|
| 11430|[[2858, 10.239397...|
+------+--------------------+



                                                                                

In [72]:
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.functions import col, avg, when, count

reco1_data = reco_data\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movie_Id"), col("rec_exp.rating"))

reco1_data.limit(5).show()



+------+--------+---------+
|userId|movie_Id|   rating|
+------+--------+---------+
|  3039|    1207| 5.317329|
|  3039|    1172| 5.317329|
|  3039|    2858| 5.317329|
|  3039|    7016|4.5633254|
|  3039|    3151| 4.405146|
+------+--------+---------+



                                                                                

In [70]:
# recommendations for me 
reco1_data.join(movies_titles_df, on='movie_Id').filter('userId = 11111').show(5)



+--------+------+---------+----+--------------------+
|movie_Id|userId|   rating|year|               title|
+--------+------+---------+----+--------------------+
|    1207| 11111|4.9714956|1962|Experiment in Terror|
|    2858| 11111|4.9714956|2000|Bounce: Bonus Mat...|
|    1172| 11111|4.9714956|1998| Krippendorf's Tribe|
|    1198| 11111|3.9771967|1971|The Cat O'Nine Tails|
|    2571| 11111|3.9771967|2002|Woodrow Wilson: A...|
+--------+------+---------+----+--------------------+
only showing top 5 rows



                                                                                

In [73]:
# Movies for myself in the training data
single_user_data= (training_with_my_ratings_df[training_with_my_ratings_df.userId == '11111'])
single_user_data.show(5)

+--------+------+------+
|movie_Id|userId|rating|
+--------+------+------+
|    2959| 11111|   3.0|
|    2571| 11111|   4.0|
|    1207| 11111|   5.0|
|     296| 11111|   1.0|
|    2858| 11111|   5.0|
+--------+------+------+
only showing top 5 rows



In [74]:
# predicted movies based on ALS for me
recomendations_data = model_my_data.transform(single_user_data)
recomendations_data.orderBy('prediction',ascending=False).show(5)

+--------+------+------+----------+
|movie_Id|userId|rating|prediction|
+--------+------+------+----------+
|    1172| 11111|   5.0| 4.9714956|
|    1207| 11111|   5.0| 4.9714956|
|    2858| 11111|   5.0| 4.9714956|
|    2571| 11111|   4.0| 3.9771967|
|    1198| 11111|   4.0| 3.9771967|
+--------+------+------+----------+
only showing top 5 rows



Top 5 movies titles recommended to me by ALS model

In [75]:
recomendations_data.join(movies_titles_df, on='movie_Id').show(5)

+--------+------+------+----------+----+--------------------+
|movie_Id|userId|rating|prediction|year|               title|
+--------+------+------+----------+----+--------------------+
|    2959| 11111|   3.0| 2.9828975|1991|Legend of the Dra...|
|    2571| 11111|   4.0| 3.9771967|2002|Woodrow Wilson: A...|
|    1207| 11111|   5.0| 4.9714956|1962|Experiment in Terror|
|     296| 11111|   1.0| 0.9942992|2000|In His Life: The ...|
|    2858| 11111|   5.0| 4.9714956|2000|Bounce: Bonus Mat...|
+--------+------+------+----------+----+--------------------+
only showing top 5 rows



In [38]:
# Get the learning curve

In [76]:
import math
def get_training_curve(train,rank,regular_param, iterations):
    errors = []
    for num_iters in range(1,iterations):
        model = ALS(rank = rank, maxIter = num_iters, regParam = regular_param, userCol = 'userId', itemCol = 'movie_Id', nonnegative = True ).fit(train)
        predictions = model.transform(train).fillna(0)
        condition = [train.userId == predictions.userId,  train.movie_Id == predictions.movie_Id]
        error = predictions.join(train,condition).select(predictions.prediction,train.rating).rdd.map(lambda x: (x[0]-x[1])**2).mean()
        errors.append(math.sqrt(error))
        predictions.select(predictions.userId,predictions.movie_Id,predictions.prediction).withColumnRenamed("prediction","rating")
    return errors

In [77]:
#use the best model parameters
errors = get_training_curve(training_df,rank=15,regular_param=0.5,iterations=10)

                                                                                

In [78]:
#get trainings set min error: Unable to plot learing curve as matpotlib is not working on pyspark
print("minimum training error", min(errors))

minimum training error 1.0025921288190505


In [79]:
# test_predictions error
test_error = math.sqrt(test_predictions.rdd.map(lambda x: (x.rating-x.prediction)**2).mean())
test_error

                                                                                

0.8371780505884567