In [0]:
%pip install mlflow

In [0]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import desc
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import mlflow

In [0]:
mlflow.pyspark.ml.autolog()

In [0]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark
spark = init_spark()

In [0]:
ratings_df = spark.read.format("csv").option("header", "true").load("/FileStore/tables")
## Drop timestamp column, because it is not relevant to my calculations
ratings_df = ratings_df.drop('timestamp')
ratings_df = ratings_df.withColumn("rating",ratings_df.rating.cast('float'))
ratings_df = ratings_df.withColumn("userId",ratings_df.userId.cast('int'))
ratings_df = ratings_df.withColumn("movieId",ratings_df.movieId.cast('int'))
ratings_df.count()

In [0]:
(training_df, test_df) = ratings_df.randomSplit([0.8, 0.2], seed=42)


In [0]:
user_means = training_df.groupBy("userId").avg('rating').withColumnRenamed('avg(rating)', 'user_average')
movie_means = training_df.groupBy('movieId').avg('rating').withColumnRenamed('avg(rating)', 'movie_average')
display(user_means)

userId,user_average
148,4.018018018018018
463,3.348051948051948
471,3.646892655367232
496,3.883495145631068
243,3.3846153846153846
392,3.026315789473684
540,3.7058823529411766
623,4.04320987654321
31,4.169642857142857
516,3.4833333333333334


In [0]:
global_mean = training_df.agg({"rating": "avg"}).collect()[0][0]

In [0]:
training_with_user_means = training_df.join(user_means, 'userId')
training_with_movie_user_means = training_with_user_means.join(movie_means, 'movieId')
display(training_with_movie_user_means)

movieId,userId,rating,user_average,movie_average
31,1,2.5,2.3666666666666667,3.128571428571429
1029,1,3.0,2.3666666666666667,3.683333333333333
1129,1,2.0,2.3666666666666667,3.276315789473684
1172,1,4.0,2.3666666666666667,4.220588235294118
1263,1,2.0,2.3666666666666667,3.7906976744186047
1293,1,2.0,2.3666666666666667,3.8684210526315783
1343,1,2.0,2.3666666666666667,3.8620689655172415
1371,1,2.5,2.3666666666666667,3.027027027027027
1405,1,1.0,2.3666666666666667,2.986842105263158
1953,1,4.0,2.3666666666666667,4.0


In [0]:
i = training_with_movie_user_means.rating - (training_with_movie_user_means.user_average+training_with_movie_user_means.movie_average-global_mean)

df_everything_training = training_with_movie_user_means.withColumn('user_item_interaction', i)

display(df_everything_training)

movieId,userId,rating,user_average,movie_average,user_item_interaction
31,1,2.5,2.3666666666666667,3.128571428571429,0.546880670796571
1029,1,3.0,2.3666666666666667,3.683333333333333,0.4921187660346664
1129,1,2.0,2.3666666666666667,3.276315789473684,-0.100863690105684
1172,1,4.0,2.3666666666666667,4.220588235294118,0.9548638640738814
1263,1,2.0,2.3666666666666667,3.7906976744186047,-0.6152455750506052
1293,1,2.0,2.3666666666666667,3.8684210526315783,-0.6929689532635797
1343,1,2.0,2.3666666666666667,3.8620689655172415,-0.686616866149242
1371,1,2.5,2.3666666666666667,3.027027027027027,0.6484250723409728
1405,1,1.0,2.3666666666666667,2.986842105263158,-0.8113900058951584
1953,1,4.0,2.3666666666666667,4.0,1.175452099367999


In [0]:
als = ALS(maxIter=5, rank=70, regParam=0.01, coldStartStrategy='drop', userCol='userId', itemCol='movieId', 
    ratingCol='user_item_interaction')

In [0]:
als_model = als.fit(df_everything_training)

In [0]:
predictions = als_model.transform(test_df)
display(predictions)

userId,movieId,rating,prediction
1,1061,3.0,0.118493624
1,1287,2.0,0.11644205
1,1339,3.5,-0.5567111
1,2105,4.0,-0.07447565
1,3671,3.0,0.32592866
2,47,4.0,-1.023632
2,150,5.0,-0.7063101
2,186,3.0,-0.5331015
2,266,5.0,-0.2980757
2,272,3.0,-0.20773014


In [0]:
predictions_with_user_means = predictions.join(user_means, 'userId')
predictions_with_movie_user_means = predictions_with_user_means.join(movie_means, 'movieId')

In [0]:
i = predictions_with_movie_user_means.rating - (predictions_with_movie_user_means.user_average+predictions_with_movie_user_means.movie_average-global_mean)

df_everything_predictions = predictions_with_movie_user_means.withColumn('user_item_interaction', i)

display(df_everything_predictions)

movieId,userId,rating,prediction,user_average,movie_average,user_item_interaction
1061,1,3.0,0.118493624,2.3666666666666667,3.576923076923077,0.5985290224449225
1287,1,2.0,0.11644205,2.3666666666666667,4.057142857142857,-0.8816907577748578
1339,1,3.5,-0.5567111,2.3666666666666667,3.365853658536585,1.3095984408314143
2105,1,4.0,-0.07447565,2.3666666666666667,3.414285714285714,1.7611663850822854
3671,1,3.0,0.32592866,2.3666666666666667,3.9705882352941178,0.2048638640738813
47,2,4.0,-1.023632,3.421875,4.092948717948718,0.0272950480859481
150,2,5.0,-0.7063101,3.421875,3.936305732484077,1.1839380335505902
186,2,3.0,-0.5331015,3.421875,2.875,0.2452437660346662
266,2,5.0,-0.2980757,3.421875,3.508928571428572,1.611315194606095
272,2,3.0,-0.20773014,3.421875,4.09375,-0.9735062339653338


In [0]:
ratings_predictions = df_everything_predictions.withColumn("rating_prediction", df_everything_predictions.prediction + df_everything_predictions.user_average + 
    df_everything_predictions.movie_average - lit(global_mean))
display(ratings_predictions)

movieId,userId,rating,prediction,user_average,movie_average,user_item_interaction,rating_prediction
1061,1,3.0,0.118493624,2.3666666666666667,3.576923076923077,0.5985290224449225,2.5199646015866213
1287,1,2.0,0.11644205,2.3666666666666667,4.057142857142857,-0.8816907577748578,2.998132804834394
1339,1,3.5,-0.5567111,2.3666666666666667,3.365853658536585,1.3095984408314143,1.6336904814784612
2105,1,4.0,-0.07447565,2.3666666666666667,3.414285714285714,1.7611663850822854,2.164357961448152
3671,1,3.0,0.32592866,2.3666666666666667,3.9705882352941178,0.2048638640738813,3.1210647941731127
47,2,4.0,-1.023632,3.421875,4.092948717948718,0.0272950480859481,2.949072902353505
150,2,5.0,-0.7063101,3.421875,3.936305732484077,1.1839380335505902,3.109751873046547
186,2,3.0,-0.5331015,3.421875,2.875,0.2452437660346662,2.221654734884676
266,2,5.0,-0.2980757,3.421875,3.508928571428572,1.611315194606095,3.090609099627227
272,2,3.0,-0.20773014,3.421875,4.09375,-0.9735062339653338,3.76577608970302


In [0]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='rating_prediction')
rmse = evaluator.evaluate(ratings_predictions)
rmse