# Find the most popular movie from the dataset


## u.data   -- The full u data set, 100000 ratings by 943 users on 1682 items.
Each user has rated at least 20 movies.  Users and items are numbered consecutively from 1.  The data is randomly ordered. This is a tab separated list of  
user id | item id | rating | timestamp. 
The time stamps are unix seconds since 1/1/1970 UTC 
## u.item     -- Information about the items (movies); 
this is a tab separated list of
movie id | movie title | release date | video release date |
IMDb URL | unknown | Action | Adventure | Animation |
Children's | Comedy | Crime | Documentary | Drama | Fantasy |
Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
Thriller | War | Western |
The last 19 fields are the genres, a 1 indicates the movie
is of that genre, a 0 indicates it is not; movies can be in
several genres at once.
The movie ids are the ones used in the u.data data set.

In [37]:
# import modules
from pyspark import SparkConf,SparkContext

In [38]:
# remember the current working directory; the data file paths used
# below are built relative to it
import os

root = os.getcwd()

In [39]:
# define a helper that loads the movie id -> movie name lookup dictionary

def getMovieNames(data_location=None):
    """Load the movie id -> movie name lookup table from an item file.

    Every non-blank line is split on '|'. The first field is parsed as the
    integer movie id and the second field is used as the movie name.

    Args:
        data_location: optional path of the item file. When omitted, the
            file 'ml_100k/u.item' below the module-level ``root`` directory
            is read, exactly as before.

    Returns:
        dict mapping int movie id to str movie name.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line contains no '|' separator.
    """
    if data_location is None:
        data_location = os.path.join(root, 'ml_100k/u.item')
    movieNames = {}  # dictionary, key: movie id, value: movie name
    # Decode explicitly as ISO-8859-1: some titles in the item file contain
    # accented characters stored as single Latin-1 bytes, which a UTF-8
    # default cannot decode. Latin-1 maps every byte, so this never raises.
    with open(data_location, encoding='ISO-8859-1') as f:
        for line in f:
            # a blank (e.g. trailing) line has no id field to parse
            if not line.strip():
                continue
            elements = line.split('|')
            movieNames[int(elements[0])] = elements[1]
    return movieNames

In [40]:
# build the Spark configuration step by step: master "local" and
# application name 'PopMovies', then create the context from it
# NOTE(review): the original note about white space in the app name was
# unclear -- confirm whether spaces in the name actually cause errors
conf = SparkConf()
conf.setMaster("local")
conf.setAppName('PopMovies')
sc = SparkContext(conf=conf)

In [41]:
# broadcast the movie id -> movie name dictionary returned by
# getMovieNames(); it is read back later through nameDict.value
nameDict=sc.broadcast(getMovieNames())

In [43]:
# create an RDD of text lines from the ratings file ml_100k/u.data
# located under root
lines=sc.textFile(os.path.join(root,'ml_100k/u.data'))

In [47]:
# turn each rating row into a (movie id, 1) pair; the movie id is the
# second whitespace-separated field of the row
movies = lines.map(lambda row: (int(row.split()[1]), 1))

In [57]:
# sum the 1s per movie id to get the number of ratings of each movie,
# then attach the movie name looked up in the broadcast dictionary,
# giving (movie id, movie name, rating count) tuples
movieCounts = movies.reduceByKey(lambda left, right: left + right)
movieCounts2 = movieCounts.map(
    lambda pair: (pair[0], nameDict.value[pair[0]], pair[1])
)

In [58]:
# collect the (movie id, movie name, rating count) tuples into a local list
results=movieCounts2.collect()

In [61]:
# sort the collected tuples in place, ascending by their third element
# (the rating count)
results.sort(key=lambda entry: entry[2])

In [62]:
# list the ten most-rated movies; results is sorted ascending by rating
# count, so the most popular movie is the last entry
results[-10:]

[(121, 'Independence Day (ID4) (1996)', 429),
 (300, 'Air Force One (1997)', 431),
 (1, 'Toy Story (1995)', 452),
 (288, 'Scream (1996)', 478),
 (286, 'English Patient, The (1996)', 481),
 (294, 'Liar Liar (1997)', 485),
 (181, 'Return of the Jedi (1983)', 507),
 (100, 'Fargo (1996)', 508),
 (258, 'Contact (1997)', 509),
 (50, 'Star Wars (1977)', 583)]

In [63]:
# stop the SparkContext created above now that the job is finished
sc.stop()