<a href="https://colab.research.google.com/github/afrahriyaz/PySpark/blob/main/PopularMovieSpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finding the most popular movies in the MovieLens 100k dataset

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

In [19]:
def loadMovieNames(path="/u.item"):
  """Build a movie ID -> movie title lookup from a pipe-delimited item file.

  Only the first two '|'-separated fields of each line are used: the first
  is converted to int and becomes the key, the second is stored as the title.

  Args:
    path: Location of the item file. Defaults to "/u.item", so existing
      zero-argument calls keep reading the same file as before.

  Returns:
    dict mapping int movie ID to str title.

  Raises:
    OSError: If the file cannot be opened.
    ValueError: If a line has at least two fields but its first field is
      not an integer.
  """
  movieNames = {}
  with open(path, encoding='latin-1') as f:
    for line in f:
      # Drop the line terminator so it can never leak into a title when the
      # title happens to be the last field on the line.
      fields = line.rstrip('\n').split('|')
      # Blank or truncated lines (e.g. a trailing empty line) carry no
      # id/title pair; skip them instead of failing on int('') or fields[1].
      if len(fields) < 2:
        continue
      movieNames[int(fields[0])] = fields[1]
  return movieNames

In [20]:
def parseInput(line):
  """Convert one whitespace-separated ratings record into a Row.

  The first four tokens of `line` are read in order as userId (int),
  movieId (int), rating (float) and timestamp (int).
  """
  tokens = line.split()
  return Row(
    userId=int(tokens[0]),
    movieId=int(tokens[1]),
    rating=float(tokens[2]),
    timestamp=int(tokens[3]),
  )

In [34]:
if __name__ == "__main__":
  # Tunable thresholds for the ranking below.
  MIN_RATINGS = 100  # a movie needs at least this many ratings to be ranked
  TOP_N = 10         # how many movies to print

  # Create (or reuse) a Spark session
  spark = SparkSession.builder.appName("PopularMovies").getOrCreate()
  try:
    sc = spark.sparkContext

    # Get raw data
    lines = sc.textFile("/u.data")

    # Load dictionary (movieId -> movieName) into movieNames
    movieNames = loadMovieNames()

    # Convert the raw data into an RDD of Row objects
    movies = lines.map(parseInput)

    # Convert that to a DataFrame
    movieDataset = spark.createDataFrame(movies)

    # Compute the average rating and the number of ratings for each movie in a
    # single aggregation instead of two groupBy passes followed by a join.
    # The resulting columns are still movieId, avg(rating), count.
    averageAndCounts = movieDataset.groupBy("movieId").agg(
      functions.avg("rating"),
      functions.count("*").alias("count"),
    )
    averageAndCounts.show()

    # Filter out movies with fewer than MIN_RATINGS ratings. `>=` keeps movies
    # that have exactly MIN_RATINGS ratings; only those with fewer are dropped.
    popularMovies = (
      averageAndCounts
      .filter(functions.col("count") >= MIN_RATINGS)
      .orderBy("avg(rating)", ascending=False)
      .take(TOP_N)
    )
    for movieId, avgRating, numRatings in popularMovies:
      print(movieNames[movieId], avgRating, numRatings)
  finally:
    # Always release the session, even if a step above raised, so a failed
    # run does not leave a live Spark session behind in the kernel.
    spark.stop()

+-------+------------------+-----+
|movieId|       avg(rating)|count|
+-------+------------------+-----+
|    474| 4.252577319587629|  194|
|     29|2.6666666666666665|  114|
|     26| 3.452054794520548|   73|
|    964|3.3333333333333335|    9|
|     65|3.5391304347826087|  115|
|    191| 4.163043478260869|  276|
|   1224|2.6666666666666665|   12|
|    558|3.6714285714285713|   70|
|   1010|              3.25|   44|
|    418|3.5813953488372094|  129|
|   1277|3.4210526315789473|   19|
|   1258|2.5217391304347827|   23|
|    541| 2.877551020408163|   49|
|   1360|               1.5|    2|
|    222|  3.66027397260274|  365|
|    938|              2.88|   25|
|    293| 3.802721088435374|  147|
|    270|3.5955882352941178|  136|
|   1127| 2.909090909090909|   11|
|   1371|               1.5|    2|
+-------+------------------+-----+
only showing top 20 rows

Close Shave, A (1995) 4.491071428571429 112
Schindler's List (1993) 4.466442953020135 298
Wrong Trousers, The (1993) 4.466101694915254