In [21]:
from pyspark.sql import SparkSession, Row, functions
from pyspark.sql.functions import lit
from pyspark.ml.recommendation import ALS

# Create Spark Session

In [3]:
spark = SparkSession.builder.appName("MovieRecommendations").getOrCreate()
spark

23/11/29 11:25:19 WARN Utils: Your hostname, anbish resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp1s0)
23/11/29 11:25:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/29 11:25:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/29 11:25:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Read raw data

In [4]:
lines = spark.sparkContext.textFile("../data/ml-100k/u.data")
lines

../data/ml-100k/u.data MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

# Define parse input function to convert RDD to list of Row objects

In [8]:
def parse_input(line):
    fields = line.split()
    userID = int(fields[0])
    movieID = int(fields[1])
    rating = float(fields[2])
    return Row(UserID=userID, MovieID=movieID, Rating=rating)

rdd_ratings = lines.map(parse_input)
rdd_ratings.take(10)


[Row(UserID=196, MovieID=242, Rating=3.0),
 Row(UserID=186, MovieID=302, Rating=3.0),
 Row(UserID=22, MovieID=377, Rating=1.0),
 Row(UserID=244, MovieID=51, Rating=2.0),
 Row(UserID=166, MovieID=346, Rating=1.0),
 Row(UserID=298, MovieID=474, Rating=4.0),
 Row(UserID=115, MovieID=265, Rating=2.0),
 Row(UserID=253, MovieID=465, Rating=5.0),
 Row(UserID=305, MovieID=451, Rating=3.0),
 Row(UserID=6, MovieID=86, Rating=3.0)]

# Convert RDD to dataframe

In [9]:
movie_ratings = spark.createDataFrame(rdd_ratings)
movie_ratings.show(10)

+------+-------+------+
|UserID|MovieID|Rating|
+------+-------+------+
|   196|    242|   3.0|
|   186|    302|   3.0|
|    22|    377|   1.0|
|   244|     51|   2.0|
|   166|    346|   1.0|
|   298|    474|   4.0|
|   115|    265|   2.0|
|   253|    465|   5.0|
|   305|    451|   3.0|
|     6|     86|   3.0|
+------+-------+------+
only showing top 10 rows



# Show rated movies for user id 1

In [15]:
user1_movies = movie_ratings.filter("UserID = 1")
user1_movies

DataFrame[UserID: bigint, MovieID: bigint, Rating: double]

- Load movie names from u.item

In [16]:
def load_movie_names():
    movie_names = {}
    with open("../data/ml-100k/u.item", encoding="latin1") as file:
        for line in file:
            fields = line.split("|")
            movieID = int(fields[0])
            name = fields[1]

            movie_names[movieID] = name
    return movie_names

movie_names = load_movie_names()

- Names of rated movies for user 1

In [17]:
for rating in user1_movies.collect():
    movieID = rating['MovieID']
    print(f"{movie_names[movieID]} || rating: {rating['Rating']}")

Three Colors: White (1994) || rating: 4.0
Grand Day Out, A (1992) || rating: 3.0
Desperado (1995) || rating: 4.0
Glengarry Glen Ross (1992) || rating: 4.0
Angels and Insects (1995) || rating: 4.0
Groundhog Day (1993) || rating: 5.0
Delicatessen (1991) || rating: 5.0
Hunt for Red October, The (1990) || rating: 4.0
Dirty Dancing (1987) || rating: 2.0
Rock, The (1996) || rating: 3.0
Ed Wood (1994) || rating: 4.0
Star Trek: First Contact (1996) || rating: 4.0
Pillow Book, The (1995) || rating: 5.0
Horseman on the Roof, The (Hussard sur le toit, Le) (1995) || rating: 5.0
Star Trek VI: The Undiscovered Country (1991) || rating: 4.0
From Dusk Till Dawn (1996) || rating: 3.0
So I Married an Axe Murderer (1993) || rating: 4.0
Shawshank Redemption, The (1994) || rating: 5.0
True Romance (1993) || rating: 3.0
Star Trek: The Wrath of Khan (1982) || rating: 5.0
Kull the Conqueror (1997) || rating: 1.0
Independence Day (ID4) (1996) || rating: 4.0
Wallace & Gromit: The Best of Aardman Animation (1996

# Get movie that was rated over 100 times

In [20]:
rating_count_filter = movie_ratings.groupBy("movieID").count().filter("count >= 100")
rating_count_filter.show(10)

+-------+-----+
|movieID|count|
+-------+-----+
|    474|  194|
|     29|  114|
|     65|  115|
|    191|  276|
|    418|  129|
|    222|  365|
|    293|  147|
|    270|  136|
|    367|  170|
|    705|  137|
+-------+-----+
only showing top 10 rows



# ML

In [23]:
als=ALS(maxIter=5, regParam=0.01, userCol="UserID", itemCol="MovieID", ratingCol="Rating")
model = als.fit(movie_ratings)

In [24]:
popular_movies = rating_count_filter.select("MovieID").withColumn("UserID", lit(1))
popular_movies.show(10)

+-------+------+
|MovieID|UserID|
+-------+------+
|    474|     1|
|     29|     1|
|     65|     1|
|    191|     1|
|    418|     1|
|    222|     1|
|    293|     1|
|    270|     1|
|    367|     1|
|    705|     1|
+-------+------+
only showing top 10 rows



In [25]:
recommendations = model.transform(popular_movies)
recommendations.show(10)

+-------+------+----------+
|MovieID|UserID|prediction|
+-------+------+----------+
|    474|     1|  4.710671|
|     29|     1| 1.7873267|
|     65|     1| 3.7993805|
|    191|     1| 4.3726716|
|    418|     1| 2.5764577|
|    222|     1| 3.4987476|
|    293|     1|  4.269912|
|    270|     1|  3.401249|
|    367|     1|  3.486995|
|    705|     1| 3.5915065|
+-------+------+----------+
only showing top 10 rows



In [27]:
recommendations.orderBy("prediction", ascending=False).show(10)

+-------+------+----------+
|MovieID|UserID|prediction|
+-------+------+----------+
|     56|     1| 5.0769544|
|     12|     1|  5.020918|
|    100|     1| 5.0079856|
|    127|     1| 5.0022445|
|    156|     1|   4.99092|
|    357|     1|  4.962844|
|    169|     1| 4.9581566|
|     89|     1| 4.9482355|
|    475|     1|   4.85978|
|    179|     1|  4.843936|
+-------+------+----------+
only showing top 10 rows

