<a href="https://colab.research.google.com/github/boffett/paytm_test/blob/main/PaytmTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.mllib.recommendation import ALS

**First, read the data from the following files (movies.dat, ratings.dat) and reorganize/clean the data to be used in the model.**

In [None]:
# Directory holding the input .dat files; other cells prepend it to file names.
path = "/content/paytm_test/"
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("PaytmTest")
    .getOrCreate()
)

In [None]:
# Each line of movies.dat is split on "::" into MovieID, Title and Genres.
movie_fields = split("value", "::")
movie_df = (
    spark.read.text(path + "movies.dat")
    .select(
        movie_fields.getItem(0).cast("integer").alias("MovieID"),
        movie_fields.getItem(1).alias("Title"),
        movie_fields.getItem(2).alias("Genres"),
    )
    # Discard rows where any field is null (e.g. a MovieID that failed the cast).
    .dropna()
)
#movie_df.cache()
movie_df.show(3)
movie_df.count()

In [None]:
# Each line of ratings.dat is split on "::" into UserID, MovieID, Rating, Timestamp.
rating_fields = split("value", "::")
rating_df = (
    spark.read.text(path + "ratings.dat")
    .select(
        rating_fields.getItem(0).cast("integer").alias("UserID"),
        rating_fields.getItem(1).cast("integer").alias("MovieID"),
        rating_fields.getItem(2).cast("float").alias("Rating"),
        rating_fields.getItem(3).alias("Timestamp"),
    )
    # Discard rows where any field is null (e.g. a value that failed its cast).
    .dropna()
    # Keep only ratings inside the inclusive range [0, 5].
    .filter(col("Rating").between(0, 5))
    # Convert the epoch-seconds timestamp into a calendar date.
    .withColumn("Date", to_date(from_unixtime("Timestamp")))
    .drop("Timestamp")
)
#rating_df.cache()
rating_df.show(3)
rating_df.count()

**(Q1) What are the titles of the top 5 most popular movies, i.e. the movies that have the most ratings in the whole dataset?**

In [None]:
# Number of ratings each movie received.
movie_rating_count = rating_df.groupBy("MovieID").agg(
    count("*").alias("#Ratings"))
# Assign a consecutive 1-based rank by descending rating count.
# monotonically_increasing_id() is not used here because its values are not
# consecutive. MovieID breaks ties so the ranking is deterministic.
# NOTE: a window without partitionBy gathers all rows into one partition; this
# is applied after aggregation, so there is only one row per movie.
rank_window = Window.orderBy(col("#Ratings").desc(), col("MovieID"))
movie_rating_count = movie_rating_count.withColumn(
    "Rank", row_number().over(rank_window))
# A join does not preserve row order, so sort by Rank again afterwards;
# otherwise show(n) is not guaranteed to display the top-n movies.
popular_movies = movie_rating_count.join(movie_df, "MovieID").select(
    "Rank", "#Ratings", "Title").orderBy("Rank")
#popular_movies.cache()

In [None]:
# Number of rows of popular_movies to display.
top_n_movie = 5
popular_movies.show(n=top_n_movie)

**(Q2) What are the top 5 ranked movie genres on average in the whole dataset?**

In [None]:
# Mean rating per movie.
movie_avg_ratings = (
    rating_df
    .groupBy("MovieID")
    .agg(avg("Rating").alias("Avg_Rating"))
)
movie_avg_ratings.show(n=10)

In [None]:
# Attach each movie's genre string to its average rating (inner join on MovieID).
movie_genres_rating = (
    movie_avg_ratings
    .join(movie_df, on="MovieID")
    .select("MovieID", "Avg_Rating", "Genres")
)
movie_genres_rating.show(n=10)
#movie_genres_rating.count()

In [None]:
# Split the pipe-delimited Genres string and emit one row per (genre, movie average).
# The pattern is a regex, so the pipe must be escaped; a raw string keeps Python
# from treating "\|" as an (invalid) string escape sequence.
genre_ratings = movie_genres_rating.withColumn("Genre", explode(
    split("Genres", r"\|"))).select("Genre", "Avg_Rating")
genre_ratings.show(10)
#genre_ratings.count()

In [None]:
# Average the per-movie averages within each genre, highest first.
genre_avg_rating = (
    genre_ratings
    .groupBy("Genre")
    .agg(avg("Avg_Rating").alias("Avg_Rating"))
    .orderBy(desc("Avg_Rating"))
)
#genre_ratings.cache()
#genre_ratings.count()

In [None]:
# Number of rows of genre_avg_rating to display.
top_n_genre = 5
genre_avg_rating.show(n=top_n_genre)

**(Q3) How many movies have been rated on the most consecutive days?**

In [None]:
# One row per (movie, calendar day) on which the movie received a rating.
movie_rating_date = rating_df.select("MovieID", "Date").dropDuplicates()
movie_rating_date.show(n=10)
#movie_rating_date.count()

In [None]:
# Per-movie window ordered chronologically.
winspec = Window.partitionBy("MovieID").orderBy("Date")

# Days elapsed since the movie's previous rated day (null on its first day).
movie_rating_date = movie_rating_date.withColumn(
    "date_diff",
    datediff("Date", lag("Date").over(winspec)),
)

# Running count of streak breaks: a row whose gap is not exactly one day adds 1.
# The first row's null gap falls through to otherwise(0). Rows sharing a winID
# therefore belong to the same run of consecutive days.
# `sum` here is pyspark.sql.functions.sum (from the star import), not the builtin.
movie_rating_date = movie_rating_date.withColumn(
    "winID",
    sum(when(col("date_diff") != 1, 1).otherwise(0)).over(
        winspec.rowsBetween(Window.unboundedPreceding, Window.currentRow)
    ),
)
movie_rating_date.show(n=10)

In [None]:
# Count the rows in each (movie, winID) group, keep each movie's largest count,
# and list the movies with the largest values first.
# `max` here is pyspark.sql.functions.max (from the star import), not the builtin.
movie_rated_days = (
    movie_rating_date
    .groupBy("MovieID", "winID")
    .count()
    .groupBy("MovieID")
    .agg(max("count").alias("Max_Consecutive_Days"))
    .orderBy(desc("Max_Consecutive_Days"))
)
#movie_rated_days.cache()
movie_rated_days.show(n=10)

**Second, split the data into test and training sets and create a recommender system.**

In [None]:
# 80/20 train/test split. A fixed seed makes the split (and therefore the
# trained model's evaluation) repeatable across notebook runs.
(train_df, test_df) = rating_df.randomSplit([0.8, 0.2], seed=42)
train_df.show(5)
#train_df.count()

In [None]:
# Use the DataFrame-based ALS; this import replaces the RDD-based mllib ALS
# bound to the same name by the imports at the top of the notebook.
from pyspark.ml.recommendation import ALS
als = ALS(
    userCol="UserID",
    itemCol="MovieID",
    ratingCol="Rating",
    maxIter=10,
    regParam=0.01,
    nonnegative=True,
    implicitPrefs=False,  # ratings are explicit feedback
    coldStartStrategy="drop"  # drop rows whose prediction would be NaN
)
model = als.fit(train_df)
# Overwrite any model saved by a previous run; a plain save() raises when the
# target path already exists, which breaks re-running this cell.
model.write().overwrite().save(path + "als.model")

**(Q4) What are the top 5 movies recommended to one user, e.g., UserID = 122 (any user can be selected)?**

In [None]:
def recommend_movies_for_user(model, user_id, num_movies, movie_df):
    """Return the top recommendations for one user, with movie titles.

    Relies on the notebook-level ``spark`` session to build the one-row user
    DataFrame.

    Args:
        model: Fitted ALS model whose user column is "UserID" and item column
            is "MovieID".
        user_id: ID of the user to generate recommendations for.
        num_movies: Maximum number of recommendations to request.
        movie_df: DataFrame with at least "MovieID" and "Title" columns.

    Returns:
        DataFrame with columns UserID, Title and rating (the predicted
        rating), sorted by rating in descending order.
    """
    # Build a DataFrame holding just the requested user ID.
    users_df = spark.createDataFrame([(user_id,)], ["UserID"])

    # Ask the model for recommendations for that user.
    user_recs = model.recommendForUserSubset(users_df, num_movies)

    # Flatten the array of (MovieID, rating) structs into one row per movie.
    user_recs = user_recs.withColumn(
        "rec_exp", explode("recommendations")
    ).select("UserID", col("rec_exp.MovieID"), col("rec_exp.rating"))

    # Join with the movie table to get titles. A join does not guarantee row
    # order, so sort by predicted rating to keep the best recommendation first.
    movie_details = (
        user_recs.join(movie_df, "MovieID")
        .select("UserID", "Title", "rating")
        .orderBy(desc("rating"))
    )

    return movie_details

In [None]:
# Show the top recommendations for one user with movie titles (not only raw
# MovieIDs inside a nested array) by using the helper defined above.
user_id = 122
top_n_rec = 5
user_recs = recommend_movies_for_user(model, user_id, top_n_rec, movie_df)
user_recs.show(truncate=False)

**(Q5) What are the top 5 movies that are most frequently recommended by your model? (use training set)**

In [None]:
# Top-5 recommendations for every user known to the model.
user_recs = model.recommendForAllUsers(5)
# One row per recommended MovieID across all users.
movie_recs = user_recs.select(
    explode("recommendations").alias("recommendations")
).select("recommendations.MovieID")
# How often each movie is recommended, most frequent first.
movie_freq = movie_recs.groupBy("MovieID").count().orderBy("count", ascending=False)
top_5_movies = movie_freq.limit(5)
top_5_movies_with_titles = (
    top_5_movies
    .join(movie_df, on="MovieID")
    .select("MovieID", "Title", "count")
)
# Sort again for display, after the join.
top_5_movies_with_titles.orderBy("count", ascending=False).show(truncate=False)

**(Q6) Calculate the RMSE of your model for your test set.**

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out ratings with the trained model.
predict_test = model.transform(test_df)
# Compare the "prediction" column against the true "Rating" column using RMSE.
evaluator = RegressionEvaluator(
    labelCol="Rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predict_test)
print(f"Root Mean Square Error (RMSE) on test data = {rmse}")