In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Reuse the active Spark session if one exists, otherwise start one under
# this notebook's application name.
spark = (
    SparkSession.builder
    .appName("Chapter4-5")
    .getOrCreate()
)

In [5]:
# Load the ratings CSV (header row, inferred column types), keep only the
# three columns used for training, and cache the result because it is read
# again by the split/fit cells.
ratings = (
    spark.read
    .csv(
        "/home/jovyan/data-sets/ml-latest-small/ratings.csv",
        header=True,
        inferSchema=True,
    )
    .select("userId", "movieId", "rating")
    .cache()
)

In [34]:
# Optional whitespace followed by a four-digit year in parentheses.
# Written as a raw string: in a normal string literal "\s", "\(" and "\d" are
# invalid escape sequences, which Python warns about and will eventually reject.
year_in_title = r"\s?\((\d{4})\)"

# Load the movie catalogue and split the parenthesised year out of the title:
#   release_year - capture group 1 of the pattern; regexp_extract yields an
#                  empty string (not null) when the title has no such group.
#   title        - the original title with every match of the pattern removed.
# release_year must be derived first, because the second withColumn overwrites
# the title column it reads from.
movies = (
    spark.read.csv(
        '/home/jovyan/data-sets/ml-latest-small/movies.csv',
        inferSchema=True,
        header=True,
    )
    .withColumn('release_year', f.regexp_extract(f.col("title"), year_in_title, 1))
    .withColumn('title', f.regexp_replace(f.col("title"), year_in_title, ""))
)

In [35]:
# Preview the first 20 rows, with long cell values truncated for display.
movies.show(n=20, truncate=True)

+-------+--------------------+--------------------+------------+
|movieId|               title|              genres|release_year|
+-------+--------------------+--------------------+------------+
|      1|           Toy Story|Adventure|Animati...|        1995|
|      2|             Jumanji|Adventure|Childre...|        1995|
|      3|    Grumpier Old Men|      Comedy|Romance|        1995|
|      4|   Waiting to Exhale|Comedy|Drama|Romance|        1995|
|      5|Father of the Bri...|              Comedy|        1995|
|      6|                Heat|Action|Crime|Thri...|        1995|
|      7|             Sabrina|      Comedy|Romance|        1995|
|      8|        Tom and Huck|  Adventure|Children|        1995|
|      9|        Sudden Death|              Action|        1995|
|     10|           GoldenEye|Action|Adventure|...|        1995|
|     11|American Presiden...|Comedy|Drama|Romance|        1995|
|     12|Dracula: Dead and...|       Comedy|Horror|        1995|
|     13|               B

In [36]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
 

In [39]:
# ALS estimator over the (userId, movieId, rating) triples.
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    # A user or movie that only appears in held-out rows has no learned
    # factors, and ALS predicts NaN for it by default. One NaN prediction makes
    # the RMSE NaN, which is what happens on random splits such as the ones
    # CrossValidator builds from this same estimator. "drop" removes those
    # rows from the prediction output instead.
    coldStartStrategy='drop',
    # Fixed seed so the factor initialisation is repeatable across runs.
    seed=42,
)

# 70/30 hold-out split; seeded so the same rows land in each side on re-run.
train_data, test_data = ratings.randomSplit([.7, .3], seed=42)

# Baseline fit with ALS defaults for rank / maxIter / regParam.
model = als.fit(train_data)

# NOTE: the misspelled name `preditions` is kept on purpose; the evaluation
# cell further down reads it under this name.
preditions = model.transform(test_data)

# RMSE between the observed `rating` and the model's `prediction` column.
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)

In [43]:
# Score the hold-out predictions, discarding any row that contains a
# null/NaN value so it cannot poison the RMSE.
rmse = evaluator.evaluate(preditions.dropna())

In [44]:
# Report the RMSE that the evaluator computed on the held-out predictions.
print(rmse)

0.9008659942703514


In [45]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [58]:
# Hyper-parameter candidates for the ALS estimator: 3 ranks x 1 iteration
# budget x 2 regularisation strengths = 6 parameter maps.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(als.rank, [1, 5, 10])
grid_builder.addGrid(als.maxIter, [20])
grid_builder.addGrid(als.regParam, [0.05, 0.1])
parameter_grid = grid_builder.build()


In [60]:
# 2-fold cross-validation of the ALS estimator over every map in
# `parameter_grid`, scored with the RMSE evaluator.
crossvalidator = CrossValidator(
    estimator=als,
    estimatorParamMaps=parameter_grid,
    evaluator=evaluator,
    numFolds=2,
    # Fixed seed so rows are assigned to the same folds on every run; without
    # it the selected model can change between otherwise identical runs.
    seed=42,
)


In [62]:
# Run the grid search on the training split only; the test split stays
# untouched by model selection.
cvModel = crossvalidator.fit(dataset=train_data)

In [63]:
# Keep the best model found by cross-validation. This rebinds `model`, which
# until now referred to the single ALS fit from the earlier training cell.
model = cvModel.bestModel

In [67]:
# Ask which user to recommend for. Converting to int immediately means a
# non-numeric answer fails here with a clear ValueError, and the typed text is
# never spliced into a Spark SQL expression string.
userId = int(input("Enter userId :"))

# Top-5 recommendations for every user; cached so further look-ups against it
# do not recompute the recommendations.
rec_all_user = model.recommendForAllUsers(5).cache()

# Recommendations for the requested user, one row per recommended movie,
# joined to the movie catalogue and ordered from highest to lowest predicted
# rating. Only the catalogue columns are kept in the final result.
rec_for_user1 = (
    rec_all_user
    .filter(f.col("userId") == userId)
    # `recommendations` is an array of (movieId, rating) structs: one row each.
    .withColumn("rec", f.explode("recommendations"))
    .select(
        "userId",
        f.col("rec").movieId.alias("movieId"),
        f.col("rec").rating.alias("rating"),
    )
    .join(movies, "movieId")
    .orderBy('rating', ascending=False)
    .select('movieId', 'title', 'release_year')
)


Enter userId : 250


In [68]:
# Display up to five recommendations without truncating long titles.
rec_for_user1.show(n=5, truncate=False)

+-------+------------------------------------------------+------------+
|movieId|title                                           |release_year|
+-------+------------------------------------------------+------------+
|40491  |Match Factory Girl, The (Tulitikkutehtaan tyttö)|1990        |
|136850 |Villain                                         |1971        |
|156605 |Paterson                                        |            |
|68536  |Stanley Kubrick: A Life in Pictures             |2001        |
|8477   |Jetée, La                                       |1962        |
+-------+------------------------------------------------+------------+

