In [1]:
# Aim: find the most popular (most watched) movie based on MovieID and ratings.
# Therefore, we need to find the frequency of occurrence of each MovieID in the list.

In [2]:
from pyspark import SparkConf, SparkContext
import collections

In [3]:
# Spark configuration: run on the local machine under the app name "MostPopularMovie".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("MostPopularMovie")

# Obtain the Spark context object for this configuration (get-or-create).
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Load the ratings file as an RDD of text lines (one element per line).
movie = sc.textFile("/FileStore/tables/u.data")

# Look at the first 10 lines. take(10) fetches only the rows it needs instead of
# pulling the whole file to the driver the way collect() does.
for line in movie.take(10):
  print(line)

In [5]:
# MovieID is the 2nd whitespace-separated column of each line.
MovieID = movie.map(lambda line: line.split()[1])

# Preview the first 10 movie IDs; take() avoids collecting the whole RDD
# on the driver just to print a handful of rows.
for movie_id in MovieID.take(10):
  print(movie_id)

In [6]:
# countByValue() is an action: it returns to the driver a dict-like mapping of
# each distinct MovieID to the number of times it occurs in the RDD.
count = MovieID.countByValue()

# Print every (MovieID, occurrences) pair, tab-separated. Still unsorted at this point.
for movie_id, occurrences in count.items():
  print(movie_id, '\t', occurrences)

In [7]:
# Sort the (MovieID, count) pairs by the count, i.e. element 1 of each pair,
# largest first. sorted() returns a list of tuples.
result1 = sorted(count.items(), key=lambda pair: pair[1], reverse=True)

# The same ranking as an ordered dict. It is built from result1 so the sort
# runs only once; the ordering is identical to sorting count.items() again.
result2 = collections.OrderedDict(result1)

In [8]:
# result1 is a list of (MovieID, count) tuples sorted by count, highest first.
# Unpack each tuple and print it as a tab-separated row.
for movie_id, occurrences in result1:
  print(movie_id, '\t', occurrences)

In [9]:
# result2 is an ordered dict, so iterating over it yields the MovieIDs in the
# sorted order; look up each count by its key and print a tab-separated row.
for movie_id in result2:
  print(movie_id, '\t', result2[movie_id])

In [10]:
# Therefore, the most watched movie is the one with MovieID = 50.

In [11]:
# Form a (MovieID, 1) pair for every rating line, so the 1s can later be
# summed per MovieID.
pairs = movie.map(lambda line: (line.split()[1], 1))

# Preview the first 5 pairs; take(5) avoids collecting the whole RDD on the driver.
for pair in pairs.take(5):
  print(pair)

In [12]:
# Each element of `pairs` is a (KEY, VALUE) tuple: KEY is the MovieID, VALUE is 1.
# reduceByKey merges all VALUES that share the same KEY using the given function,
# so summing the 1s yields the number of ratings per MovieID.
# In the lambda, x and y are two values (partial counts) belonging to the same KEY;
# the KEY itself is never passed to the lambda.
#
# NOTE: this rebinds `count` (earlier the countByValue() result) to an RDD of
# (MovieID, count) pairs, so the notebook's cells must be run in order.
count = pairs.reduceByKey(lambda x, y: x + y)

# Preview the first 5 (MovieID, count) pairs without collecting the whole RDD.
for pair in count.take(5):
  print(pair)

In [13]:
# Rank the movies by their count.
# sortByKey would order by MovieID (the key), which is not what we want here,
# so sortBy is used with a key function that selects the value, pair[1].
result = count.sortBy(lambda pair: pair[1], ascending=False)

# Bring the sorted pairs back to the driver and print them as tab-separated rows.
for movie_id, occurrences in result.collect():
  print(movie_id, '\t', occurrences)