In [293]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [294]:
# Read each raw MovieLens file as an RDD of lines and split every line into a
# list of string fields: u.data on tabs, u.item and u.user on pipes.
ratings = (
    sc.textFile("ml-100k/u.data")
    .map(lambda row: row.split("\t"))
)
movies = (
    sc.textFile("ml-100k/u.item")
    .map(lambda row: row.split("|"))
)
users = (
    sc.textFile("ml-100k/u.user")
    .map(lambda row: row.split("|"))
)
# The genre file is currently not loaded; the line is left disabled.
#genres = sc.textFile("ml-100k/u.genre").map(lambda line: line.split("|"))

In [295]:
# Every column is declared as a nullable string: the RDDs these schemas are
# applied to hold the raw split() output, so all values are still text.
ratings_Schema = StructType([
    StructField(column, StringType(), True)
    for column in ("user id", "item id", "rating", "timestamp")
])

# NOTE(review): "IMBd URL" and "Childen's" look like typos of "IMDb URL" and
# "Children's". They are kept exactly as written because they are the column
# names other cells may select by — confirm before renaming.
movies_Schema = StructType([
    StructField(column, StringType(), True)
    for column in (
        "movie id",
        "movie title",
        "release date",
        "video release date",
        "IMBd URL",
        "unknown",
        "action",
        "adventure",
        "animation",
        "Childen's",
        "comedy",
        "crime",
        "documentary",
        "drama",
        "fantasy",
        "film-noir",
        "horror",
        "musical",
        "mystery",
        "romance",
        "sci-fi",
        "thriller",
        "war",
        "western",
    )
])

users_Schema = StructType([
    StructField(column, StringType(), True)
    for column in ("user id", "age", "gender", "occupation", "zip code")
])

In [296]:
def parseInput(line):
    """Convert one tab-separated ratings row into a keyed (rating, count) pair.

    Args:
        line: A text row whose second tab-separated field is an integer id
            and whose third field is a numeric rating.

    Returns:
        ``(id, (rating, 1.0))`` where ``id`` is the second field as an int and
        ``rating`` is the third field as a float. The constant ``1.0`` is a
        per-row count so that sums of these pairs yield (total, count).

    Raises:
        IndexError: If the row has fewer than three fields.
        ValueError: If the id or rating field is not numeric.
    """
    columns = line.split("\t")
    movie_id = int(columns[1])
    rating = float(columns[2])
    return (movie_id, (rating, 1.0))

def loadMovieNames(path="ml-100k/u.item", encoding="ISO-8859-1"):
    """Build a ``{movie id: movie title}`` lookup from a pipe-delimited file.

    Args:
        path: Location of the item file. Defaults to the path this notebook
            has always read, so existing ``loadMovieNames()`` calls behave
            the same.
        encoding: Text encoding used to decode the file. The default is
            explicit instead of relying on the platform default: the
            MovieLens 100k item file is commonly distributed as Latin-1 and
            contains accented titles, which fail to decode as UTF-8.
            NOTE(review): confirm against the copy of the dataset in use.

    Returns:
        dict mapping the integer in the first field of each row to the
        string in the second field.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first field of a non-blank row is not an integer.
    """
    movieNames = {}
    with open(path, encoding=encoding) as f:
        for line in f:
            # Drop the line terminator so a two-field row does not keep a
            # trailing newline in its title.
            fields = line.rstrip("\n").split("|")
            # Blank or delimiter-free lines carry no (id, title) pair.
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# 1-2) Print the top 10 movies by average rating

In [297]:
# Wrap each raw field-list RDD in a DataFrame using its all-string schema.
ratings_df = spark.createDataFrame(ratings, schema=ratings_Schema)
movies_df = spark.createDataFrame(movies, schema=movies_Schema)
users_df = spark.createDataFrame(users, schema=users_Schema)


In [298]:
# Average rating per movie id, computed with the RDD API.
lines = sc.textFile("ml-100k/u.data")

# (movie id, (rating, 1.0)) for every ratings row.
df = lines.map(parseInput)

# Add up ratings and counts per movie id -> (movie id, (total, count)).
df2 = df.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# total / count -> (movie id, average rating).
df3 = df2.mapValues(lambda sums: sums[0] / sums[1])

# Negating the average makes the ascending sort put the highest first.
df4 = df3.sortBy(lambda pair: -pair[1])
df5 = df4.take(10)

# Translate ids to titles for display.
movieNames = loadMovieNames()
for result in df5:
    movie_id, average = result
    print(movieNames[movie_id], average)

Santa with Muscles (1996) 5.0
Great Day in Harlem, A (1994) 5.0
Aiqing wansui (1994) 5.0
They Made Me a Criminal (1939) 5.0
Prefontaine (1997) 5.0
Star Kid (1997) 5.0
Someone Else's America (1995) 5.0
Entertaining Angels: The Dorothy Day Story (1996) 5.0
Saint of Fort Washington, The (1993) 5.0
Marlene Dietrich: Shadow and Light (1996)  5.0


In [289]:
# Display the full contents of df4 as (movie id, average rating) tuples, in
# df4's sorted order (highest average first).
# NOTE(review): this returns every element for display; a bounded df4.take(n)
# may be enough here — confirm before changing.
df4.collect()

[(1500, 5.0),
 (814, 5.0),
 (1536, 5.0),
 (1122, 5.0),
 (1189, 5.0),
 (1293, 5.0),
 (1599, 5.0),
 (1653, 5.0),
 (1467, 5.0),
 (1201, 5.0),
 (1449, 4.625),
 (1398, 4.5),
 (1642, 4.5),
 (1594, 4.5),
 (119, 4.5),
 (408, 4.491071428571429),
 (318, 4.466442953020135),
 (169, 4.466101694915254),
 (483, 4.45679012345679),
 (114, 4.447761194029851),
 (64, 4.445229681978798),
 (603, 4.3875598086124405),
 (12, 4.385767790262173),
 (50, 4.3584905660377355),
 (178, 4.344),
 (513, 4.333333333333333),
 (1191, 4.333333333333333),
 (1639, 4.333333333333333),
 (134, 4.292929292929293),
 (963, 4.2926829268292686),
 (427, 4.292237442922374),
 (357, 4.291666666666667),
 (98, 4.28974358974359),
 (480, 4.284916201117318),
 (127, 4.283292978208232),
 (285, 4.265432098765432),
 (272, 4.262626262626263),
 (251, 4.260869565217392),
 (657, 4.259541984732825),
 (474, 4.252577319587629),
 (174, 4.252380952380952),
 (479, 4.251396648044692),
 (1524, 4.25),
 (1064, 4.25),
 (1125, 4.25),
 (313, 4.2457142857142856),
 

In [198]:
# Preview the ratings DataFrame: first 20 rows, long cell values truncated.
ratings_df.show(n=20, truncate=True)

+-------+-------+------+---------+
|user id|item id|rating|timestamp|
+-------+-------+------+---------+
|    196|    242|     3|881250949|
|    186|    302|     3|891717742|
|     22|    377|     1|878887116|
|    244|     51|     2|880606923|
|    166|    346|     1|886397596|
|    298|    474|     4|884182806|
|    115|    265|     2|881171488|
|    253|    465|     5|891628467|
|    305|    451|     3|886324817|
|      6|     86|     3|883603013|
|     62|    257|     2|879372434|
|    286|   1014|     5|879781125|
|    200|    222|     5|876042340|
|    210|     40|     3|891035994|
|    224|     29|     3|888104457|
|    303|    785|     3|879485318|
|    122|    387|     5|879270459|
|    194|    274|     2|879539794|
|    291|   1042|     4|874834944|
|    234|   1184|     2|892079237|
+-------+-------+------+---------+
only showing top 20 rows

