In [1]:
from pyspark import SparkConf, SparkContext
import pyspark
# Disable Hadoop's output-spec validation so that saving results into an
# output directory that already exists does not fail.
conf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")

# A context may already be running in this kernel (e.g. from a previous run);
# stop it so the fresh context below is created with `conf` applied.
SparkContext.getOrCreate().stop()
sc = SparkContext(conf=conf, appName='FindTopTenMoviesByRatingAndGenre')

In [2]:
sc # display the SparkContext to confirm that Spark is up and running

# Genre section

In [3]:
# Load the movie catalogue as an RDD of raw text lines. As the sample shows,
# each line is laid out as `movie_id::title::genre|genre|...`.
movies_raw = sc.textFile(name="ml-10M100K/movies.dat")
movies_raw.takeSample(withReplacement=False, num=5)

['841::Eyes Without a Face (Les Yeux sans visage) (1959)::Horror',
 '5047::Kung Pow: Enter the Fist (2002)::Action|Comedy',
 '558::Pagemaster, The (1994)::Action|Adventure|Animation|Children|Fantasy',
 '8070::Grill Point (Halbe Treppe) (2002)::Drama',
 '58347::Penelope (2006)::Children|Comedy|Fantasy|Romance']

In [4]:
# The third '::'-separated field of a movie line is a '|'-separated genre list.
# Extract that list per movie, flatten all lists, and keep each genre once.
genre_lists = movies_raw.map(lambda line: line.split('::')[2].split('|'))
genres = genre_lists.flatMap(lambda names: names).distinct()
genres.takeSample(withReplacement=False, num=5)

['Crime', 'Drama', 'Thriller', 'War', 'Children']

In [5]:
# Bring the distinct genre names to the driver once; later cells loop over
# this list. Print the collected list instead of calling collect() a second
# time, which would re-run the distinct() job just to display the same data.
total_genres = genres.collect()
print(total_genres)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical', 'Film-Noir', 'Western', '(no genres listed)']


# Rating section

In [6]:
# Load the ratings as an RDD of raw text lines. As the sample shows, each line
# has four '::'-separated fields; the second and third are the movie id and
# the rating.
ratings_raw = sc.textFile(name="ml-10M100K/ratings_small.dat")
ratings_raw.takeSample(withReplacement=False, num=5)

['295::2763::5::981595234',
 '103::1347::3::868266274',
 '174::2391::2::959913815',
 '262::3408::2.5::1157118105',
 '215::6264::4::1147232337']

In [7]:
def _parse_rating(line):
    """Convert one raw ratings line into a `(movie_id, rating)` pair.

    The line is split on '::' once; field 1 is kept as the movie id (string)
    and field 2 is converted to a float rating.
    """
    fields = line.split('::')  # split once rather than once per extracted field
    return fields[1], float(fields[2])


# (movie_id, rating) pairs, one per rating record
movies_ratings = ratings_raw.map(_parse_rating)
movies_ratings.takeSample(False, 5)

[('3571', 5.0), ('527', 4.0), ('1409', 3.0), ('2529', 4.0), ('1047', 4.5)]

In [8]:
# ---- Genre-independent work: done once, outside the loop ----

# Parse the catalogue once on the driver as (movie_id, title, [genre, ...]).
# Every (movie_id, title) pair was already being collected to the driver for
# the title lookup, so this only adds the genre lists to that collection.
movie_records = (
    movies_raw
    .map(lambda line: line.split('::'))
    .map(lambda fields: (fields[0], fields[1], fields[2].split('|')))
    .collect()
)

# movie_id -> title, used to turn the winning ids back into readable names
movie_titles = {movie_id: title for movie_id, title, _ in movie_records}

# Average rating per movie, rounded to 3 decimals.
# The accumulator is (sum of ratings, number of ratings). Averaging per movie
# does not depend on the genre, so it is computed once and cached instead of
# being re-aggregated for every genre.
movie_avg_ratings = (
    movies_ratings
    .aggregateByKey(
        (0, 0),
        lambda acc, rating: (acc[0] + rating, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .mapValues(lambda total: round(total[0] / total[1], 3))
    .cache()
)

# ---- Per-genre ranking ----

# Maps genre -> list of up to ten (title, average_rating) pairs, best first.
rdd = {}
for g in total_genres:
    # Ids of the movies tagged with this genre. Membership is tested against
    # the split genre list (exact match), not as a substring of the raw
    # "A|B|C" field, and a set keeps the lookup in the filter below O(1).
    genre_movie_ids = {
        movie_id for movie_id, _, movie_genres in movie_records if g in movie_genres
    }

    # Ten best-rated movies of the genre; negating the average makes
    # takeOrdered return the highest averages first. The id set is bound as a
    # default argument so the lambda keeps this iteration's set.
    top_rated = (
        movie_avg_ratings
        .filter(lambda mv, ids=genre_movie_ids: mv[0] in ids)
        .takeOrdered(10, key=lambda mv: -mv[1])
    )

    rdd[g] = [(movie_titles.get(movie_id), avg) for movie_id, avg in top_rated]

# Release the cached averages now that every genre has been ranked
movie_avg_ratings.unpersist()


In [9]:
# Show the complete {genre: [(title, average_rating), ...]} mapping built in
# the previous cell (printed as a single, very long line).
print(rdd)

{'Adventure': [('Rio Lobo (1970)', 5.0), ('Chronicles of Narnia: Prince Caspian, The (2008)', 5.0), ('Day of the Beast, The (El Día de la bestia) (1995)', 5.0), ('Days of Glory (Indigènes) (2006)', 5.0), ('Wages of Fear, The (Le Salaire de la peur) (1953)', 4.875), ('WALL·E (2008)', 4.833), ('Dersu Uzala (1975)', 4.6), ('Man Who Knew Too Much, The (1956)', 4.571), ('After the Sunset (2004)', 4.5), ('Horse Soldiers, The (1959)', 4.5)], 'Animation': [("Cats Don't Dance (1997)", 5.0), ('WALL·E (2008)', 4.833), ('Winnie the Pooh and the Blustery Day (1968)', 4.8), ('Last Unicorn, The (1982)', 4.75), ('Persepolis (2007)', 4.5), ('Wallace & Gromit: The Best of Aardman Animation (1996)', 4.471), ('Castle in the Sky (Tenkû no shiro Rapyuta) (1986)', 4.417), ('Wallace & Gromit: The Wrong Trousers (1993)', 4.381), ('Creature Comforts (1989)', 4.35), ("Howl's Moving Castle (Hauru no ugoku shiro) (2004)", 4.333)], 'Children': [("Cats Don't Dance (1997)", 5.0), ('Chronicles of Narnia: Prince Caspia

# Output

In [10]:
# Persist the (genre, top-ten list) pairs as text. Collapsing to one partition
# yields a single part file. The relative path resolves against Spark's
# default filesystem (HDFS per the original note -- confirm for your setup).
genre_rankings = sc.parallelize(rdd.items())
single_partition = genre_rankings.coalesce(1)
single_partition.saveAsTextFile('output/FindTopTenMoviesByRatingAndGenre/')