<a href="https://colab.research.google.com/github/boffett/paytm_test/blob/main/PaytmTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [180]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, monotonically_increasing_id, split, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DateType, FloatType
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [181]:
path = "/content/paytm_test/"
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [182]:
movie_schema = StructType([
    StructField("MovieID", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Genres", StringType(), True)
])

movie_df = spark.read.csv(path + "movies.dat", sep="::", header=False, schema=movie_schema, mode='DROPMALFORMED')
movie_df = movie_df.dropna()
movie_df.show(3)
movie_df.count()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



10684

In [189]:
rating_schema = StructType([
    StructField("UserID", IntegerType(), True),
    StructField("MovieID", IntegerType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Timestamp", IntegerType(), True)
])

rating_df = spark.read.csv(path + "ratings.dat", sep="::", header=False, schema=rating_schema, mode='DROPMALFORMED')
rating_df = rating_df.dropna()
rating_df = rating_df.filter((rating_df.Rating >= 0) & (rating_df.Rating <= 5))
rating_df.show(3)
rating_df.count()

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|    122|   5.0|838985046|
|     1|    185|   5.0|838983525|
|     1|    231|   5.0|838983392|
+------+-------+------+---------+
only showing top 3 rows



18525500

**(Q1) What are the titles of top 5 most popular movies i.e. have the most ranking in the whole dataset?**

In [145]:
movie_rating_count = rating_df.groupBy("MovieID").count().orderBy("count", ascending=False)
popular_movie_count = movie_rating_count.withColumn("Rank#", monotonically_increasing_id())
popular_movies = popular_movie_count.join(movie_df, "MovieID").select(
    "Rank#", "count", "Title")
popular_movies.persist()

DataFrame[Rank#: bigint, count: bigint, Title: string]

In [148]:
top_n_movie = 5
popular_movies.show(top_n_movie)

+-----+-----+--------------------+
|Rank#|count|               Title|
+-----+-----+--------------------+
|    0|64716| Pulp Fiction (1994)|
|    1|63998| Forrest Gump (1994)|
|    2|62286|Silence of the La...|
|    3|60661|Jurassic Park (1993)|
|    4|57814|Shawshank Redempt...|
+-----+-----+--------------------+
only showing top 5 rows



**(Q2) What are the top 5 ranked movie genres on average in the whole dataset?**

In [191]:
movie_avg_ratings = rating_df.groupBy("MovieID").avg("Rating")
movie_avg_ratings = movie_avg_ratings.withColumnRenamed('avg(Rating)', 'movie_avg_rating')
movie_avg_ratings.show(10)

+-------+-----------------+
|MovieID| movie_avg_rating|
+-------+-----------------+
|  51209|              5.0|
|  53355|              5.0|
|  42783|              5.0|
|  33264|              5.0|
|  64275|              5.0|
|  65001|             4.75|
|   4454|             4.75|
|  26048|             4.75|
|  26073|             4.75|
|   5194|4.714285714285714|
|   5849|4.666666666666667|
|  63808|4.666666666666667|
|  32657|            4.625|
|  53883|              4.5|
|  60990|              4.5|
|   7452|              4.5|
|  63179|              4.5|
|  64418|              4.5|
|   3226|              4.5|
|  58185|              4.5|
+-------+-----------------+
only showing top 20 rows



In [192]:
movie_genres = movie_df.withColumn("genre_list", split(col("Genres"), "\|"))
movie_genres = movie_genres.withColumn("genre", explode(col("genre_list"))).select(
    "MovieID", "genre")
movie_genres.show(10)
movie_genres.count()

+-------+---------+
|MovieID|    genre|
+-------+---------+
|      1|Adventure|
|      1|Animation|
|      1| Children|
|      1|   Comedy|
|      1|  Fantasy|
|      2|Adventure|
|      2| Children|
|      2|  Fantasy|
|      3|   Comedy|
|      3|  Romance|
+-------+---------+
only showing top 10 rows



21580

In [194]:
movie_genres_rating = movie_genres.join(movie_avg_ratings,"MovieID")
movie_genres_rating.show(10)
movie_genres_rating.count()

+-------+---------+------------------+
|MovieID|    genre|  movie_avg_rating|
+-------+---------+------------------+
|   1580|   Sci-Fi| 3.561991929352594|
|   1580|   Comedy| 3.561991929352594|
|   1580|   Action| 3.561991929352594|
|   5300|  Western|  3.68732782369146|
|   5300| Thriller|  3.68732782369146|
|   5300|    Drama|  3.68732782369146|
|   5300|Adventure|  3.68732782369146|
|   5300|   Action|  3.68732782369146|
|    471|  Romance|3.6608205953338695|
|    471|  Fantasy|3.6608205953338695|
+-------+---------+------------------+
only showing top 10 rows



21573

In [196]:
genre_ratings = movie_genres_rating.groupBy("genre").avg("movie_avg_rating")
genre_ratings = genre_ratings.withColumnRenamed('avg(movie_avg_rating)', 'genre_avg_rating')
genre_ratings = genre_ratings.orderBy('genre_avg_rating', ascending = False)
genre_ratings.cache()
top_n_genre = 5
genre_ratings.show(top_n_genre)

+------------------+------------------+
|             genre|  genre_avg_rating|
+------------------+------------------+
|         Film-Noir|3.7145568991978926|
|(no genres listed)|3.4583333333333335|
|       Documentary|3.4571269547082286|
|               War|3.4529868830864836|
|             Drama|3.3490765797551654|
|           Mystery|3.3274023557458645|
|           Western| 3.309819232484588|
|           Romance|3.2996282169342153|
|             Crime|  3.29887958387637|
|              IMAX|3.2696978301234534|
+------------------+------------------+
only showing top 10 rows

