<a href="https://colab.research.google.com/github/boffett/paytm_test/blob/main/PaytmTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [155]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, monotonically_increasing_id, split, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DateType, FloatType
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [52]:
path = "/content/paytm_test/"
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [160]:
movie_schema = StructType([
    StructField("MovieID", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Genres", StringType(), True)
])

movie_df = spark.read.csv(path + "movies.dat", sep="::", header=False, schema=movie_schema, mode='DROPMALFORMED')
movie_df.show(3)
movie_df.count()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



10690

In [135]:
rating_schema = StructType([
    StructField("UserID", IntegerType(), True),
    StructField("MovieID", IntegerType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Timestamp", IntegerType(), True)
])

rating_df = spark.read.csv(path + "ratings.dat", sep="::", header=False, schema=rating_schema, mode='DROPMALFORMED')
rating_df.show(3)
rating_df.count()

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|    122|   5.0|838985046|
|     1|    185|   5.0|838983525|
|     1|    231|   5.0|838983392|
+------+-------+------+---------+
only showing top 3 rows



18525891

**(Q1) What are the titles of top 5 most popular movies i.e. have the most ranking in the whole dataset?**

In [145]:
movie_rating_count = rating_df.groupBy("MovieID").count().orderBy("count", ascending=False)
popular_movie_count = movie_rating_count.withColumn("Rank#", monotonically_increasing_id())
popular_movies = popular_movie_count.join(movie_df, "MovieID").select(
    "Rank#", "count", "Title")
popular_movies.persist()

DataFrame[Rank#: bigint, count: bigint, Title: string]

In [148]:
top_N = 5
popular_movies.show(top_N)

+-----+-----+--------------------+
|Rank#|count|               Title|
+-----+-----+--------------------+
|    0|64716| Pulp Fiction (1994)|
|    1|63998| Forrest Gump (1994)|
|    2|62286|Silence of the La...|
|    3|60661|Jurassic Park (1993)|
|    4|57814|Shawshank Redempt...|
+-----+-----+--------------------+
only showing top 5 rows



**(Q2) What are the top 5 ranked movie genres on average in the whole dataset?**

In [173]:
movie_avg_ratings = rating_df.groupBy("MovieID").agg({"Rating":"mean"})
movie_avg_ratings = movie_avg_ratings.withColumnRenamed('avg(Rating)', 'movie_avg_rating')
movie_avg_ratings.show(10)

+-------+------------------+
|MovieID|  movie_avg_rating|
+-------+------------------+
|   1580| 3.561991929352594|
|   5300|  3.68732782369146|
|    471|3.6608205953338695|
|   1591| 2.596472629144179|
|   3175|3.6237578726382087|
|   3997|2.0719424460431655|
|   1959|3.6305361305361306|
|   2366| 3.619745029594779|
|   2866|3.6053639846743293|
|   1088| 3.191771459814106|
+-------+------------------+
only showing top 10 rows



In [174]:
movie_genres = movie_df.withColumn("genre_list", split(col("Genres"), "\|"))
movie_genres = movie_genres.withColumn("genre", explode(col("genre_list"))).select(
    "MovieID", "genre")
movie_genres.show(10)
movie_genres.count()

+-------+---------+
|MovieID|    genre|
+-------+---------+
|      1|Adventure|
|      1|Animation|
|      1| Children|
|      1|   Comedy|
|      1|  Fantasy|
|      2|Adventure|
|      2| Children|
|      2|  Fantasy|
|      3|   Comedy|
|      3|  Romance|
+-------+---------+
only showing top 10 rows



21588

In [175]:
movie_genres_rating = movie_genres.join(movie_avg_ratings,"MovieID")
movie_genres_rating.show(10)
movie_genres_rating.count()

+-------+---------+------------------+
|MovieID|    genre|  movie_avg_rating|
+-------+---------+------------------+
|   1580|   Sci-Fi| 3.561991929352594|
|   1580|   Comedy| 3.561991929352594|
|   1580|   Action| 3.561991929352594|
|   5300|  Western|  3.68732782369146|
|   5300| Thriller|  3.68732782369146|
|   5300|    Drama|  3.68732782369146|
|   5300|Adventure|  3.68732782369146|
|   5300|   Action|  3.68732782369146|
|    471|  Romance|3.6608205953338695|
|    471|  Fantasy|3.6608205953338695|
+-------+---------+------------------+
only showing top 10 rows



21576

In [176]:
genre_rating = movie_genres_rating.groupBy("genre").agg({"movie_avg_rating":"mean"})
genre_rating.show(10)

+-----------+---------------------+
|      genre|avg(movie_avg_rating)|
+-----------+---------------------+
|      Crime|   12.079643166332636|
|    Romance|    3039.260323399675|
|   Thriller|    7.175836935662055|
|  Adventure|   1484.7369801194804|
|      Drama|    869.8180305309024|
|        War|   1886.1335649144946|
|Documentary|    3.457128262558261|
|    Fantasy|   1730.4554947929948|
|    Mystery|   268.90436148393985|
|    Musical|   3.2551734937213523|
+-----------+---------------------+
only showing top 10 rows

