<a href="https://colab.research.google.com/github/boffett/paytm_test/blob/main/PaytmTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DateType, FloatType
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

**First, read the data from the following files (movies.dat, ratings.dat) and reorganize/clean the data to be used in the model.**

In [None]:
# Directory that holds movies.dat and ratings.dat. The trailing slash is required
# because file names are appended to this string directly.
path = "/content/paytm_test/"

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("PaytmTest")
    .getOrCreate()
)

In [None]:
# Every line of movies.dat is split on "::" into MovieID, Title and Genres.
movie_fields = split(col("value"), "::")

movie_df = (
    spark.read.text(path + "movies.dat")
    .select(
        movie_fields.getItem(0).alias("MovieID"),
        movie_fields.getItem(1).alias("Title"),
        movie_fields.getItem(2).alias("Genres"),
    )
    # Lines with fewer than three fields yield nulls and are discarded here.
    .dropna()
)
movie_df.cache()
movie_df.show(3)
movie_df.count()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



10690

In [None]:
# Every line of ratings.dat is split on "::"; fields 1, 2 and 3 are kept as
# MovieID, Rating and Timestamp (field 0 is not selected).
rating_fields = split(col("value"), "::")

rating_df = (
    spark.read.text(path + "ratings.dat")
    .select(
        rating_fields.getItem(1).alias("MovieID"),
        rating_fields.getItem(2).cast("float").alias("Rating"),
        rating_fields.getItem(3).alias("Timestamp"),
    )
    # Drops rows with a missing field or a rating that could not be cast to float.
    .dropna()
    # Keep only ratings inside the inclusive range [0, 5].
    .filter((col("Rating") >= 0) & (col("Rating") <= 5))
    # Calendar day of the rating, derived from the unix timestamp.
    .withColumn(
        "Date",
        to_date(from_unixtime("Timestamp", "yyyy-MM-dd HH:mm:ss")),
    )
)
rating_df.cache()
rating_df.show(3)
rating_df.count()

+-------+------+---------+----------+
|MovieID|Rating|Timestamp|      Date|
+-------+------+---------+----------+
|    122|   5.0|838985046|1996-08-02|
|    185|   5.0|838983525|1996-08-02|
|    231|   5.0|838983392|1996-08-02|
+-------+------+---------+----------+
only showing top 3 rows



10000035

**(Q1) What are the titles of the top 5 most popular movies, i.e. the movies with the most ratings in the whole dataset?**

In [None]:
# Count how many ratings each movie received (one output row per MovieID).
movie_rating_count = rating_df.groupBy("MovieID").agg(
    count("*").alias("#Ratings"))

# Assign a consecutive, 0-based rank by descending rating count.
# row_number() over an ordered window is used instead of monotonically_increasing_id(),
# which only guarantees increasing, unique ids and not consecutive ones.
# MovieID breaks ties so the ranking is deterministic.
# The window has no partitionBy, so all rows pass through a single partition; this is
# acceptable because the input holds just one row per movie.
rank_window = Window.orderBy(col("#Ratings").desc(), col("MovieID"))
movie_rating_count = movie_rating_count.withColumn(
    "Rank", (row_number().over(rank_window) - 1).cast("long"))

# A join does not guarantee row order, so sort by Rank again; otherwise show(n)
# on the result is not guaranteed to list the n most-rated movies.
popular_movies = (
    movie_rating_count.join(movie_df, "MovieID")
    .select("Rank", "#Ratings", "Title")
    .orderBy("Rank")
)
popular_movies.cache()

DataFrame[Rank: bigint, #Ratings: bigint, Title: string]

In [None]:
# Number of most-rated movies to display.
top_n_movie = 5
# popular_movies is the result of a join, which does not guarantee row order, so
# sort explicitly (most ratings first, Rank as tie-breaker) before taking the top rows.
popular_movies.orderBy(col("#Ratings").desc(), col("Rank")).show(top_n_movie)

+----+--------+--------------------+
|Rank|#Ratings|               Title|
+----+--------+--------------------+
|   0|   34864| Pulp Fiction (1994)|
|   1|   34457| Forrest Gump (1994)|
|   2|   33668|Silence of the La...|
|   3|   32631|Jurassic Park (1993)|
|   4|   31126|Shawshank Redempt...|
+----+--------+--------------------+
only showing top 5 rows



**(Q2) What are the top 5 highest-rated movie genres on average in the whole dataset?**

In [None]:
# Mean rating of every movie, computed over all of its ratings.
movie_avg_ratings = (
    rating_df
    .groupBy("MovieID")
    .agg(avg(col("Rating")).alias("Avg_Rating"))
)
movie_avg_ratings.show(n=10)

+-------+------------------+
|MovieID|        Avg_Rating|
+-------+------------------+
|   2294| 3.369998048018739|
|    296| 4.157425998164295|
|   1090| 3.925811260176585|
|   2136|2.9290281329923276|
|   3210|3.7045234639574263|
|    829|  2.66636197440585|
|   2162|  2.34106239460371|
|   2088|2.5516912815626487|
|   3959| 3.709309689677011|
|   2069| 3.789208633093525|
+-------+------------------+
only showing top 10 rows



In [None]:
# One row per (movie, genre): the Genres column is a "|"-separated list.
# split() takes a regular expression, so the pipe must be escaped; a raw string
# keeps the backslash without relying on an invalid "\|" string escape.
movie_genres = movie_df.withColumn(
    "Genre", explode(split("Genres", r"\|"))
).select("MovieID", "Genre")
movie_genres.show(10)
movie_genres.count()

+-------+---------+
|MovieID|    Genre|
+-------+---------+
|      1|Adventure|
|      1|Animation|
|      1| Children|
|      1|   Comedy|
|      1|  Fantasy|
|      2|Adventure|
|      2| Children|
|      2|  Fantasy|
|      3|   Comedy|
|      3|  Romance|
+-------+---------+
only showing top 10 rows



21588

In [None]:
# Attach each movie's average rating to every one of its genre rows.
# Inner join: movies that have no rating rows do not appear in the result.
movie_genres_rating = movie_genres.join(
    movie_avg_ratings,
    on="MovieID",
    how="inner",
)
movie_genres_rating.show(n=10)
movie_genres_rating.count()

+-------+---------+-----------------+
|MovieID|    Genre|       Avg_Rating|
+-------+---------+-----------------+
|   2294|  Fantasy|3.369998048018739|
|   2294|   Comedy|3.369998048018739|
|   2294| Children|3.369998048018739|
|   2294|Animation|3.369998048018739|
|   2294|Adventure|3.369998048018739|
|    296|    Drama|4.157425998164295|
|    296|    Crime|4.157425998164295|
|    296|   Comedy|4.157425998164295|
|   1090|      War|3.925811260176585|
|   1090|    Drama|3.925811260176585|
+-------+---------+-----------------+
only showing top 10 rows



21581

In [None]:
# Average rating per genre, best genres first.
# Note: this is the mean of the per-movie averages, so every movie counts equally
# regardless of how many ratings it received.
genre_ratings = (
    movie_genres_rating
    .groupBy("Genre")
    .agg(avg(col("Avg_Rating")).alias("Avg_Rating"))
    .orderBy(col("Avg_Rating").desc())
)
genre_ratings.cache()
genre_ratings.count()

20

In [None]:
# Number of best-rated genres to display.
top_n_genre = 5
genre_ratings.show(n=top_n_genre)

+------------------+------------------+
|             Genre|        Avg_Rating|
+------------------+------------------+
|         Film-Noir|3.7118715983793593|
|(no genres listed)| 3.642857142857143|
|       Documentary|3.4602546086567334|
|               War| 3.454612791239216|
|             Drama| 3.349892884484847|
+------------------+------------------+
only showing top 5 rows



**(Q3) How many movies have been rated on the most consecutive days?**

In [None]:
# Distinct (movie, day) pairs: several ratings of a movie on one day collapse to one row.
movie_rating_date = (
    rating_df
    .select("MovieID", "Date")
    .dropDuplicates()
)
movie_rating_date.show(n=10)
movie_rating_date.count()

+-------+----------+
|MovieID|      Date|
+-------+----------+
|   1409|2005-05-12|
|   1994|2005-05-20|
|   3973|2005-05-20|
|   6567|2005-05-20|
|   7395|2005-05-20|
|   7448|2005-05-20|
|  30822|2005-05-20|
|    165|2005-03-23|
|    344|2005-03-23|
|   2403|2005-03-23|
+-------+----------+
only showing top 10 rows



4574511

In [None]:
# Per-movie window over the rated days in chronological order.
winspec = Window.partitionBy("MovieID").orderBy("Date")
# Running frame: from the movie's first rated day up to and including the current row.
running_frame = winspec.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Days elapsed since the movie's previous rated day (NULL on its first rated day).
days_since_previous = datediff(col("Date"), lag(col("Date"), 1).over(winspec))
# 1 when the gap to the previous rated day is not exactly one day, i.e. a new streak
# begins; a NULL gap (first row) falls through to 0.
starts_new_streak = when(col("date_diff") != 1, 1).otherwise(0)

movie_rating_date = (
    movie_rating_date
    .withColumn("date_diff", days_since_previous)
    # The running total of streak starts gives every run of consecutive days
    # its own id within the movie.
    .withColumn("winID", sum(starts_new_streak).over(running_frame))
)
movie_rating_date.show(n=10)

+-------+----------+---------+-----+
|MovieID|      Date|date_diff|winID|
+-------+----------+---------+-----+
|    100|1996-02-15|     NULL|    0|
|    100|1996-03-12|       26|    1|
|    100|1996-03-28|       16|    2|
|    100|1996-03-30|        2|    3|
|    100|1996-03-31|        1|    3|
|    100|1996-04-01|        1|    3|
|    100|1996-04-02|        1|    3|
|    100|1996-04-04|        2|    4|
|    100|1996-04-07|        3|    5|
|    100|1996-04-08|        1|    5|
+-------+----------+---------+-----+
only showing top 10 rows



In [None]:
# Length of each streak (rows per movie and winID), then the longest streak of
# every movie, sorted with the longest streaks first.
movie_rated_days = (
    movie_rating_date
    .groupBy("MovieID", "winID")
    .count()
    .groupBy("MovieID")
    .agg(max(col("count")).alias("Max_Consecutive_Days"))
    .orderBy(col("Max_Consecutive_Days").desc())
)
movie_rated_days.cache()
movie_rated_days.show(n=10)