In [1]:
from pyspark import SparkConf, SparkContext
import pyspark
# Disable Hadoop's output-spec validation so that saving into an output
# directory that already exists does not raise an error.
conf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")

# Stop whatever SparkContext is currently active (creating one first if none
# exists) so that a fresh context can be started with the configuration above.
SparkContext.getOrCreate().stop()
sc = SparkContext(appName='FindTopTenMoviesByRating', conf=conf)

In [2]:
sc # display the SparkContext to confirm that Spark is running

# Rating section

In [3]:
# Read the ratings file as an RDD of raw text lines, then preview five
# randomly sampled lines (sampling without replacement).
ratings_raw = sc.textFile("ml-10M100K/ratings_small.dat")
ratings_raw.takeSample(withReplacement=False, num=5)

['139::2109::3::974488133',
 '51::1929::5::982368105',
 '289::1093::3::948357538',
 '182::2687::3::998911740',
 '323::3175::5::1022309843']

In [4]:
def _parse_rating_line(line):
    """Convert one raw ratings line into a ``(movie_id, rating)`` pair.

    The line is split on ``'::'`` exactly once (the previous lambda split
    every line twice). Field 1 is kept as the movie id string and field 2
    is converted to ``float``.
    """
    fields = line.split('::')
    return (fields[1], float(fields[2]))


# (movie_id, rating) pairs, one per line of the ratings file
movies_ratings = ratings_raw.map(_parse_rating_line)
movies_ratings.takeSample(False, 5)

[('4066', 3.0), ('1222', 5.0), ('19', 4.0), ('3994', 4.0), ('3203', 3.0)]

In [5]:
# For every movie_id, accumulate a (sum of ratings, number of ratings) pair.
sum_count = (0, 0)  # starting accumulator: (rating total, rating count)


def _add_rating(acc, rating):
    """Fold a single rating into a (total, count) accumulator."""
    total, count = acc
    return (total + rating, count + 1)


def _merge_totals(left, right):
    """Combine two (total, count) accumulators belonging to the same movie."""
    return (left[0] + right[0], left[1] + right[1])


sum_movies_ratings = movies_ratings.aggregateByKey(sum_count, _add_rating, _merge_totals)
sum_movies_ratings.takeSample(False, 5)

[('5809', (2.5, 1)),
 ('53464', (3.0, 1)),
 ('3928', (3.0, 1)),
 ('381', (40.0, 11)),
 ('458', (11.0, 3))]

In [6]:
def _mean_rating(total_and_count):
    """Return the average rating, rounded to 3 decimals, from a (total, count) pair."""
    total, count = total_and_count
    return round(total / count, 3)


# (movie_id, avg_rating) for every movie, still distributed as an RDD
_avg_by_movie = sum_movies_ratings.mapValues(_mean_rating)

# The ten (movie_id, avg_rating) pairs with the highest average, as a local list;
# negating the average makes takeOrdered return the largest values first.
movie_avg_rating = _avg_by_movie.takeOrdered(10, key=lambda pair: -pair[1])
print(movie_avg_rating)

[('2931', 5.0), ('2938', 5.0), ('3885', 5.0), ('116', 5.0), ('1489', 5.0), ('8916', 5.0), ('47099', 5.0), ('47970', 5.0), ('602', 5.0), ('1152', 5.0)]


# Movie section

In [7]:
# Read the movies file as an RDD of raw text lines, then preview five
# randomly sampled lines (sampling without replacement).
movies_raw = sc.textFile("ml-10M100K/movies.dat")
movies_raw.takeSample(withReplacement=False, num=5)

['3713::Long Walk Home, The (1990)::Drama',
 '1898::Land Girls, The (1998)::Drama|War',
 '2873::Lulu on the Bridge (1998)::Drama|Mystery|Romance',
 '25766::Crowd, The (1928)::Drama',
 '400::Homage (1995)::Drama']

In [8]:
def _parse_movie_line(line):
    """Convert one raw movies line into a ``(movie_id, movie_name)`` pair.

    The line is split on ``'::'`` exactly once (the previous lambda split
    every line twice); fields 0 and 1 are returned as strings.
    """
    fields = line.split('::')
    return (fields[0], fields[1])


# (movie_id, movie_name) pairs, one per line of the movies file
movies = movies_raw.map(_parse_movie_line)
movies.takeSample(False, 5)

[('377', 'Speed (1994)'),
 ('6554', 'Garage Days (2002)'),
 ('2522', "Airport '77 (1977)"),
 ('2775', 'Head On (1998)'),
 ('1391', 'Mars Attacks! (1996)')]

In [9]:
# Bring the (movie_id, movie_name) pairs to the driver as a dict keyed by
# movie_id. collectAsMap() builds the dict directly, so the name is never
# bound to an intermediate list first.
movies_list = movies.collectAsMap()

In [10]:
# Pair each top-ten entry's movie name (looked up in movies_list by movie_id)
# with its average rating from movie_avg_rating. dict.get yields None when a
# movie_id has no entry in movies_list.
top_ten_movies = [
    (movies_list.get(movie_id), avg_rating)
    for movie_id, avg_rating in movie_avg_rating
]
print(top_ten_movies)

[('Time of the Gypsies (Dom za vesanje) (1989)', 5.0), ('Man Facing Southeast (Hombre mirando al sudeste) (1986)', 5.0), ('Love & Sex (2000)', 5.0), ('Anne Frank Remembered (1995)', 5.0), ("Cats Don't Dance (1997)", 5.0), ('Shall We Dance? (2004)', 5.0), ('Pursuit of Happyness, The (2006)', 5.0), ('Last Kiss, The (2006)', 5.0), ('Great Day in Harlem, A (1994)', 5.0), ('He Walked by Night (1948)', 5.0)]


# Output

In [11]:
# Write the top-ten list as text, using a single partition so the result lands
# in one part file under the output directory.
sc.parallelize(top_ten_movies, numSlices=1).saveAsTextFile('output/FindTopTenMoviesByRating/')