<a href="https://colab.research.google.com/github/boffett/paytm_test/blob/main/PaytmTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [125]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import udf, col, monotonically_increasing_id, row_number

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DateType, FloatType
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [52]:
# Folder holding the input .dat files. File names are appended to it by plain
# string concatenation, so the trailing slash is required.
path = "/content/paytm_test/"

# Attach to the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)
# SparkContext that backs the session above.
sc = spark.sparkContext

In [75]:
# Column layout of movies.dat: "::"-separated MovieID, Title and Genres fields.
movie_schema = (
    StructType()
    .add("MovieID", IntegerType(), True)
    .add("Title", StringType(), True)
    .add("Genres", StringType(), True)
)

# The file has no header row. DROPMALFORMED discards records that do not fit
# the schema instead of failing the read, so the count below only covers
# rows that parsed cleanly.
movie_df = (
    spark.read
    .schema(movie_schema)
    .options(sep="::", header=False, mode="DROPMALFORMED")
    .csv(path + "movies.dat")
)
movie_df.show(3)
movie_df.count()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



10690

In [135]:
# Column layout of ratings.dat: "::"-separated UserID, MovieID, Rating and
# Timestamp fields; every column is nullable.
rating_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("UserID", IntegerType()),
        ("MovieID", IntegerType()),
        ("Rating", FloatType()),
        ("Timestamp", IntegerType()),
    )
])

# The file has no header row. DROPMALFORMED discards records that do not fit
# the schema instead of failing the read, so the count below only covers
# rows that parsed cleanly.
rating_df = (
    spark.read
    .schema(rating_schema)
    .options(sep="::", header=False, mode="DROPMALFORMED")
    .csv(path + "ratings.dat")
)
rating_df.show(3)
rating_df.count()

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|    122|   5.0|838985046|
|     1|    185|   5.0|838983525|
|     1|    231|   5.0|838983392|
+------+-------+------+---------+
only showing top 3 rows



18525891

**(Q1) What are the titles of the top 5 most popular movies, i.e. the movies with the most ratings in the whole dataset?**

In [145]:
# Popularity = number of ratings a movie received. MovieID is a secondary sort
# key so that movies with equal counts always come out in the same order.
movie_rating_count = (
    rating_df
    .groupBy("MovieID")
    .count()
    .orderBy(col("count").desc(), col("MovieID").asc())
)

# Assign a real, consecutive rank. monotonically_increasing_id() only promises
# unique, increasing ids (the partition id is encoded in the upper bits), so
# its values jump as soon as the sorted data spans more than one partition.
# row_number() over an ordered window yields 1, 2, 3, ...; subtracting 1 and
# casting to long keeps the 0-based bigint "Rank#" column this cell has always
# produced. The un-partitioned window pulls all rows into a single partition,
# which is acceptable here because there is only one row per movie.
popularity_order = Window.orderBy(col("count").desc(), col("MovieID").asc())
popular_movie_count = movie_rating_count.withColumn(
    "Rank#", (row_number().over(popularity_order) - 1).cast("long")
)

# A join gives no guarantee about output row order, so sort by rank again
# afterwards; otherwise `show(n)` on the result is not guaranteed to be the
# top n. The inner join drops rated movies that are missing from movie_df,
# which can leave gaps in "Rank#".
popular_movies = (
    popular_movie_count
    .join(movie_df, "MovieID")
    .select("Rank#", "count", "Title")
    .orderBy("Rank#")
)
# Cache the result: it is displayed by the following cell(s).
popular_movies.persist()

DataFrame[Rank#: bigint, count: bigint, Title: string]

In [148]:
# How many of the most-rated movies to display.
top_N = 5

# Sort explicitly by rank so the first `top_N` rows are the top-ranked ones
# regardless of how the upstream join ordered its output. truncate=False
# prints full titles: the question asks for titles, and the default display
# cuts strings to 20 characters.
popular_movies.orderBy("Rank#").show(top_N, truncate=False)

+-----+-----+--------------------+
|Rank#|count|               Title|
+-----+-----+--------------------+
|    0|64716| Pulp Fiction (1994)|
|    1|63998| Forrest Gump (1994)|
|    2|62286|Silence of the La...|
|    3|60661|Jurassic Park (1993)|
|    4|57814|Shawshank Redempt...|
+-----+-----+--------------------+
only showing top 5 rows

