<a href="https://colab.research.google.com/github/boffett/paytm_test/blob/main/PaytmTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install pyspark



In [21]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.mllib.recommendation import ALS

**First, read the data from the following files (movies.dat, ratings.dat) and reorganize/clean the data to be used in the model.**

In [22]:
path = "/content/paytm_test/"
spark = SparkSession.builder.appName("PaytmTest").getOrCreate()

In [41]:
movie_df = spark.read.text(path + "movies.dat").select(
    split("value", "::").getItem(0).cast("integer").alias("MovieID"),
    split("value", "::").getItem(1).alias("Title"),
    split("value", "::").getItem(2).alias("Genres")
)
movie_df = movie_df.dropna()
#movie_df.cache()
movie_df.show(3)
movie_df.count()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



10686

In [43]:
rating_df = spark.read.text(path + "ratings.dat").select(
    split("value", "::").getItem(0).cast("integer").alias("UserID"),
    split("value", "::").getItem(1).cast("integer").alias("MovieID"),
    split("value", "::").getItem(2).cast("float").alias("Rating"),
    split("value", "::").getItem(3).alias("Timestamp")
)
rating_df = rating_df.dropna()
rating_df = rating_df.filter((rating_df.Rating >= 0) & (rating_df.Rating <= 5))
rating_df = rating_df.withColumn("Date", to_date(from_unixtime("Timestamp")))
rating_df = rating_df.drop("Timestamp")
#rating_df.cache()
rating_df.show(3)
rating_df.count()

+------+-------+------+----------+
|UserID|MovieID|Rating|      Date|
+------+-------+------+----------+
|     1|    122|   5.0|1996-08-02|
|     1|    185|   5.0|1996-08-02|
|     1|    231|   5.0|1996-08-02|
+------+-------+------+----------+
only showing top 3 rows



10000032

**(Q1) What are the titles of top 5 most popular movies i.e. have the most ranking in the whole dataset?**

In [44]:
movie_rating_count = rating_df.groupBy("MovieID").agg(
    count("*").alias("#Ratings")).orderBy("#Ratings", ascending=False)
movie_rating_count = movie_rating_count.withColumn("Rank", monotonically_increasing_id())
popular_movies = movie_rating_count.join(movie_df, "MovieID").select(
    "Rank", "#Ratings", "Title")
#popular_movies.cache()

In [45]:
top_n_movie = 5
popular_movies.show(top_n_movie)

+----+--------+--------------------+
|Rank|#Ratings|               Title|
+----+--------+--------------------+
|   0|   34864| Pulp Fiction (1994)|
|   1|   34457| Forrest Gump (1994)|
|   2|   33668|Silence of the La...|
|   3|   32631|Jurassic Park (1993)|
|   4|   31126|Shawshank Redempt...|
+----+--------+--------------------+
only showing top 5 rows



**(Q2) What are the top 5 ranked movie genres on average in the whole dataset?**

In [46]:
movie_avg_ratings = rating_df.groupBy("MovieID").agg(avg("Rating").alias('Avg_Rating'))
movie_avg_ratings.show(10)

+-------+------------------+
|MovieID|        Avg_Rating|
+-------+------------------+
|   1580| 3.563920531231442|
|   5300|3.7041884816753927|
|    471| 3.659111243662392|
|   1591| 2.591865858009276|
|   3175|3.6245300142616363|
|   3997| 2.072898032200358|
|   1959|3.6309438040345823|
|   2366|3.6127175743964064|
|   2866| 3.607728337236534|
|   1088|3.1912112010796223|
+-------+------------------+
only showing top 10 rows



In [49]:
movie_genres_rating = movie_avg_ratings.join(movie_df,"MovieID").select(
    "MovieID", "Avg_Rating", "Genres")
movie_genres_rating.show(10)
#movie_genres_rating.count()

+-------+------------------+--------------------+
|MovieID|        Avg_Rating|              Genres|
+-------+------------------+--------------------+
|   1580| 3.563920531231442|Action|Comedy|Sci-Fi|
|   5300|3.7041884816753927|Action|Adventure|...|
|    471| 3.659111243662392|Comedy|Drama|Fant...|
|   1591| 2.591865858009276|Action|Adventure|...|
|   3175|3.6245300142616363|Adventure|Comedy|...|
|   3997| 2.072898032200358|   Adventure|Fantasy|
|   1959|3.6309438040345823|       Drama|Romance|
|   2366|3.6127175743964064|Action|Adventure|...|
|   2866| 3.607728337236534|               Drama|
|   1088|3.1912112010796223|Drama|Musical|Rom...|
+-------+------------------+--------------------+
only showing top 10 rows



In [50]:
genre_ratings = movie_genres_rating.withColumn("Genre", explode(
    split("Genres", "\|"))).select("Genre", "Avg_Rating")
genre_ratings.show(10)
#genre_ratings.count()

+---------+------------------+
|    Genre|        Avg_Rating|
+---------+------------------+
|   Action| 3.563920531231442|
|   Comedy| 3.563920531231442|
|   Sci-Fi| 3.563920531231442|
|   Action|3.7041884816753927|
|Adventure|3.7041884816753927|
|    Drama|3.7041884816753927|
| Thriller|3.7041884816753927|
|  Western|3.7041884816753927|
|   Comedy| 3.659111243662392|
|    Drama| 3.659111243662392|
+---------+------------------+
only showing top 10 rows



In [51]:
genre_avg_rating = genre_ratings.groupBy("Genre").agg(
    avg("Avg_Rating").alias('Avg_Rating'))
genre_avg_rating = genre_avg_rating.orderBy('Avg_Rating', ascending = False)
#genre_ratings.cache()
#genre_ratings.count()

In [52]:
top_n_genre = 5
genre_avg_rating.show(top_n_genre)

+------------------+------------------+
|             Genre|        Avg_Rating|
+------------------+------------------+
|         Film-Noir|3.7118715983793593|
|(no genres listed)| 3.642857142857143|
|       Documentary|3.4621763397993477|
|               War| 3.454612791239219|
|             Drama|3.3498928844848557|
+------------------+------------------+
only showing top 5 rows



**(Q3) How many movies have been ranked the most consecutive days?**

In [53]:
movie_rating_date = rating_df.select(["MovieID","Date"]).drop_duplicates()
movie_rating_date.show(10)
#movie_rating_date.count()

+-------+----------+
|MovieID|      Date|
+-------+----------+
|    589|1996-08-02|
|   1212|2003-04-11|
|    208|2005-05-20|
|    288|2005-03-23|
|   1080|2005-05-12|
|   1396|2005-03-24|
|   2948|2005-05-12|
|   2997|2005-03-24|
|   3452|2005-03-24|
|   5810|2005-05-12|
+-------+----------+
only showing top 10 rows



In [54]:
winspec = Window.partitionBy("MovieID").orderBy("Date")
movie_rating_date = movie_rating_date.withColumn("date_diff",
    datediff("Date", lag("Date", 1).over(winspec)))
movie_rating_date = movie_rating_date.withColumn("winID",
    sum(when(col("date_diff") != 1, 1).otherwise(0)).over(
        winspec.rowsBetween(Window.unboundedPreceding, 0)))
movie_rating_date.show(10)

+-------+----------+---------+-----+
|MovieID|      Date|date_diff|winID|
+-------+----------+---------+-----+
|      1|1996-01-29|     NULL|    0|
|      1|1996-02-01|        3|    1|
|      1|1996-02-02|        1|    1|
|      1|1996-02-05|        3|    2|
|      1|1996-02-12|        7|    3|
|      1|1996-02-22|       10|    4|
|      1|1996-02-23|        1|    4|
|      1|1996-02-26|        3|    5|
|      1|1996-03-04|        7|    6|
|      1|1996-03-05|        1|    6|
+-------+----------+---------+-----+
only showing top 10 rows



In [55]:
movie_rated_days = movie_rating_date.groupBy("MovieID", "winID").count()
movie_rated_days = movie_rated_days.groupBy("MovieID").agg(max("count").alias("Max_Consecutive_Days"))
movie_rated_days = movie_rated_days.orderBy("Max_Consecutive_Days", ascending=False)
#movie_rated_days.cache()
movie_rated_days.show(10)

+-------+--------------------+
|MovieID|Max_Consecutive_Days|
+-------+--------------------+
|   5952|                 622|
|   2858|                 485|
|   4993|                 483|
|    356|                 425|
|   6377|                 425|
|   2571|                 364|
|   2762|                 362|
|   1270|                 327|
|    593|                 317|
|   3578|                 307|
+-------+--------------------+
only showing top 10 rows



**Second, split the data into test and training sets and create a recommender system.**

In [56]:
(train_df, test_df) = rating_df.randomSplit([0.8, 0.2])
train_df.show(5)
#train_df.count()

+------+-------+------+----------+
|UserID|MovieID|Rating|      Date|
+------+-------+------+----------+
|     1|    122|   5.0|1996-08-02|
|     1|    231|   5.0|1996-08-02|
|     1|    292|   5.0|1996-08-02|
|     1|    316|   5.0|1996-08-02|
|     1|    329|   5.0|1996-08-02|
+------+-------+------+----------+
only showing top 5 rows



8000861

In [58]:
from pyspark.ml.recommendation import ALS
als = ALS(
    userCol="UserID",
    itemCol="MovieID",
    ratingCol="Rating",
    maxIter=10,
    regParam=0.01,
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop"
)
model = als.fit(train_df)
model.save(path + "als.model")

**(Q4) What are the top 5 recommended movies made to one user, e.g. , UserID = 122 (any user can be selected)**

In [None]:
def recommend_movies_for_user(model, user_id, num_movies, movie_df):
    # 为指定的用户ID创建一个DataFrame
    users_df = spark.createDataFrame([(user_id,)], ["UserID"])

    # 使用模型为用户生成推荐
    user_recs = model.recommendForUserSubset(users_df, num_movies)

    # 展开推荐结果
    user_recs = user_recs.withColumn("rec_exp", explode("recommendations")).select('UserID', col("rec_exp.MovieID"), col("rec_exp.rating"))

    # 连接电影数据集以获取电影标题
    movie_details = user_recs.join(movie_df, user_recs.MovieID == movie_df.MovieID).select(user_recs.UserID, movie_df.Title, user_recs.rating)

    return movie_details

In [59]:
user_id = 122
top_n_rec = 5
user_recs = model.recommendForUserSubset(
    rating_df.filter(col("UserID") == user_id), top_n_rec)
user_recs.show(truncate=False)

+------+--------------------------------------------------------------------------------------------------+
|UserID|recommendations                                                                                   |
+------+--------------------------------------------------------------------------------------------------+
|122   |[{32657, 5.728113}, {4454, 5.4159226}, {33264, 5.3966208}, {64275, 5.3798594}, {53355, 5.2210445}]|
+------+--------------------------------------------------------------------------------------------------+



**(Q5) What are the top 5 movies that are most frequently recommended by your model? (use training set)**

In [63]:
user_recs = model.recommendForAllUsers(5)
movie_recs = user_recs.withColumn("recommendations",
    explode("recommendations")).select(col("recommendations.MovieID"))
movie_freq = movie_recs.groupBy("MovieID").count().orderBy(desc("count"))
top_5_movies = movie_freq.limit(5)
top_5_movies_with_titles = top_5_movies.join(movie_df, "MovieID").select("MovieID", "Title", "count")
top_5_movies_with_titles.orderBy(desc("count")).show(truncate=False)

+-------+---------------------------------------------------------------------+-----+
|MovieID|Title                                                                |count|
+-------+---------------------------------------------------------------------+-----+
|32657  |Man Who Planted Trees, The (Homme qui plantait des arbres, L') (1987)|63331|
|4454   |More (1998)                                                          |40566|
|33264  |Satan's Tango (Sátántangó) (1994)                                    |34676|
|64275  |Blue Light, The (Das Blaue Licht) (1932)                             |21981|
|61037  |Silent Light (Stellet licht) (2007)                                  |18564|
+-------+---------------------------------------------------------------------+-----+



**(Q6) Calculate the RMSE of your model for your test set.**

In [64]:
from pyspark.ml.evaluation import RegressionEvaluator
predict_test = model.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Rating", predictionCol="prediction")
rmse = evaluator.evaluate(predict_test)
print(f"Root Mean Square Error (RMSE) on test data = {rmse}")

Root Mean Square Error (RMSE) on test data = 0.8239066614293313
