In [19]:
from pyspark.sql import SparkSession, Row, functions
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import lit

In [20]:
def loadMovieNames():
    movieNames = {}
    with open("/Users/ayusman/migrate/hadooop/hadoop-basics/ml-100k/u.item", encoding = "ISO-8859-1") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [44]:
# Convert u.data lines into (userID, movieID, rating) rows
def parseInput(line):
    fields = line.split()
    return Row(userID = int(fields[0]), movieID = int(fields[1]), rating = float(fields[2]))

In [45]:
# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.appName("MovieRecs").getOrCreate()

# This line is necessary on HDP 2.6.5:
spark.conf.set("spark.sql.crossJoin.enabled", "true")

# Load up our movie ID -> name dictionary
movieNames = loadMovieNames()

# Get the raw data
lines = spark.sparkContext.textFile('/Users/ayusman/migrate/hadooop/hadoop-basics/ml-100k/u.data')

In [46]:
# Convert it to a RDD of Row objects with (userID, movieID, rating)
ratingsRDD = lines.map(parseInput)
ratingsRDD.take(2)

[Row(userID=196, movieID=242, rating=3.0),
 Row(userID=186, movieID=302, rating=3.0)]

In [47]:
# Convert to a DataFrame and cache it
ratings = spark.createDataFrame(ratingsRDD).cache()

In [49]:
# Create an ALS collaborative filtering model from the complete data set
als = ALS(maxIter=5, regParam=0.01, userCol="userID", itemCol="movieID", ratingCol="rating")
model = als.fit(ratings)

In [60]:
#Print out 10 ratings from user 1:
count = 0
print("\nRatings for user ID 1:")
userRatings = ratings.filter("userID = 1")
for rating in userRatings.collect():
    print(movieNames[rating['movieID']], rating['rating'])
    count += 1
    if count >10 :
        break


Ratings for user ID 1:
Three Colors: White (1994) 4.0
Grand Day Out, A (1992) 3.0
Desperado (1995) 4.0
Glengarry Glen Ross (1992) 4.0
Angels and Insects (1995) 4.0
Groundhog Day (1993) 5.0
Delicatessen (1991) 5.0
Hunt for Red October, The (1990) 4.0
Dirty Dancing (1987) 2.0
Rock, The (1996) 3.0
Ed Wood (1994) 4.0


In [65]:
# Find movies rated more than 100 times
ratingCounts = ratings.groupBy("movieID").count().filter("count > 100")
# Construct a "test" dataframe for user 1 with every movie rated more than 100 times
popularMovies = ratingCounts.select("movieID").withColumn('userID', lit(1))


Top 20 recommendations:


In [66]:
# Run our model on that list of popular movies for user ID 1
recommendations = model.transform(popularMovies)

In [68]:
# Get the top 20 movies with the highest predicted rating for this user
topRecommendations = recommendations.sort(recommendations.prediction.desc()).take(20)

In [70]:
print("\nTop 20 recommendations:")
for recommendation in topRecommendations:
    print (movieNames[recommendation['movieID']], recommendation['prediction'])



Top 20 recommendations:
12 Angry Men (1957) 4.923419952392578
Godfather, The (1972) 4.886929035186768
Philadelphia Story, The (1940) 4.861466884613037
Citizen Kane (1941) 4.832269668579102
Swingers (1996) 4.802018165588379
Casablanca (1942) 4.793004989624023
Chinatown (1974) 4.7650885581970215
Being There (1979) 4.757648468017578
Monty Python's Life of Brian (1979) 4.748608589172363
Clerks (1994) 4.732601642608643
One Flew Over the Cuckoo's Nest (1975) 4.73155403137207
Secrets & Lies (1996) 4.730564594268799
Big Night (1996) 4.722156524658203
Rear Window (1954) 4.710968971252441
Leaving Las Vegas (1995) 4.7042741775512695
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) 4.7017083168029785
Sling Blade (1996) 4.6992998123168945
Close Shave, A (1995) 4.693426132202148
Fantasia (1940) 4.6842851638793945
Monty Python and the Holy Grail (1974) 4.676304817199707


In [72]:
topRecommendations

[Row(movieID=178, userID=1, prediction=4.923419952392578),
 Row(movieID=127, userID=1, prediction=4.886929035186768),
 Row(movieID=478, userID=1, prediction=4.861466884613037),
 Row(movieID=134, userID=1, prediction=4.832269668579102),
 Row(movieID=150, userID=1, prediction=4.802018165588379),
 Row(movieID=483, userID=1, prediction=4.793004989624023),
 Row(movieID=654, userID=1, prediction=4.7650885581970215),
 Row(movieID=663, userID=1, prediction=4.757648468017578),
 Row(movieID=154, userID=1, prediction=4.748608589172363),
 Row(movieID=42, userID=1, prediction=4.732601642608643),
 Row(movieID=357, userID=1, prediction=4.73155403137207),
 Row(movieID=285, userID=1, prediction=4.730564594268799),
 Row(movieID=137, userID=1, prediction=4.722156524658203),
 Row(movieID=603, userID=1, prediction=4.710968971252441),
 Row(movieID=276, userID=1, prediction=4.7042741775512695),
 Row(movieID=474, userID=1, prediction=4.7017083168029785),
 Row(movieID=223, userID=1, prediction=4.69929981231689