In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
# Initialize Spark session
spark = SparkSession.builder \
    .appName("MovieLensDataProcessing") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/11 12:19:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
ratings_df = spark.read.csv("data_mk/u.data", sep="\t", inferSchema=True, header=False) \
    .toDF("user_id", "movie_id", "rating", "timestamp")
# Show the first few rows of the data
ratings_df.show()

                                                                                

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
|    298|     474|     4|884182806|
|    115|     265|     2|881171488|
|    253|     465|     5|891628467|
|    305|     451|     3|886324817|
|      6|      86|     3|883603013|
|     62|     257|     2|879372434|
|    286|    1014|     5|879781125|
|    200|     222|     5|876042340|
|    210|      40|     3|891035994|
|    224|      29|     3|888104457|
|    303|     785|     3|879485318|
|    122|     387|     5|879270459|
|    194|     274|     2|879539794|
|    291|    1042|     4|874834944|
|    234|    1184|     2|892079237|
+-------+--------+------+---------+
only showing top 20 rows



In [14]:
films_df = spark.read.csv("data_mk/u.item", sep="|", inferSchema=True, header=False) \
    .toDF("movie_id", "title", "release_date", "video_release_date", "IMDb_url", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western")

In [16]:
users_df = spark.read.csv("data_mk/u.user", sep="|", inferSchema=True, header=False) \
    .toDF("user_id", "age", "gender", "occupation", "zip_code")
users_df.toPandas()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [None]:
# Drop the timestamp column since it's not needed for the collaborative filtering model
ratings_df = ratings_df.drop("timestamp")

In [22]:
ratings_with_movie_titles = ratings_df.join(films_df.select("movie_id", "title"), on="movie_id", how="left")
ratings_with_movie_titles.toPandas()

Unnamed: 0,movie_id,user_id,rating,timestamp,title
0,242,196,3,881250949,Kolya (1996)
1,302,186,3,891717742,L.A. Confidential (1997)
2,377,22,1,878887116,Heavyweights (1994)
3,51,244,2,880606923,Legends of the Fall (1994)
4,346,166,1,886397596,Jackie Brown (1997)
...,...,...,...,...,...
99995,476,880,3,880175444,"First Wives Club, The (1996)"
99996,204,716,5,879795543,Back to the Future (1985)
99997,1090,276,1,874795795,Sliver (1993)
99998,225,13,2,882399156,101 Dalmatians (1996)


In [24]:
films_df.toPandas()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
genre_columns = [col for col in films_df.columns if col not in ['movie_id', 'title', 'release_date', 
                                                                'video_release_date', 'IMDb_url']]

In [31]:
print(genre_columns)

['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [33]:
for genre in genre_columns:
    ratings_with_movie_titles = ratings_with_movie_titles.join(films_df.select("movie_id", genre), 
                                                               on="movie_id", how="left")

In [38]:
(training_data, test_data) = ratings_df.randomSplit([0.8, 0.2], seed=1234)

In [39]:
als = ALS(userCol="user_id", itemCol="movie_id", ratingCol="rating", coldStartStrategy="drop")

In [40]:
als_model = als.fit(training_data)

                                                                                

In [41]:
predictions = als_model.transform(test_data)

In [42]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)


In [46]:
print(rmse)

0.9240972528145989


In [48]:
user_recommendations = als_model.recommendForAllUsers(6)

# Example: Show recommendations for the first user
user_recommendations.show(truncate=False)



+-------+-----------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                  |
+-------+-----------------------------------------------------------------------------------------------------------------+
|1      |[{1589, 5.5392904}, {1449, 5.0326757}, {613, 5.031781}, {169, 4.9561973}, {408, 4.9103785}, {119, 4.893077}]     |
|2      |[{1463, 5.348398}, {1449, 4.9564934}, {1643, 4.951976}, {1398, 4.869051}, {1642, 4.817443}, {1122, 4.702278}]    |
|3      |[{838, 4.6144404}, {1142, 4.455568}, {320, 4.396119}, {205, 4.285173}, {1388, 4.1488476}, {50, 4.0550604}]       |
|4      |[{1251, 5.862391}, {1589, 5.7110167}, {793, 5.643307}, {958, 5.61551}, {1463, 5.599249}, {1150, 5.4867496}]      |
|5      |[{114, 4.8318996}, {793, 4.716633}, {838, 4.6926055}, {1589, 4.65665}, {169, 4.6106434}, {613, 4.6097617}]       |
|6      

                                                                                

In [49]:
user_recommendations.show(10, truncate=False)



+-------+-----------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                  |
+-------+-----------------------------------------------------------------------------------------------------------------+
|1      |[{1589, 5.5392904}, {1449, 5.0326757}, {613, 5.031781}, {169, 4.9561973}, {408, 4.9103785}, {119, 4.893077}]     |
|2      |[{1463, 5.348398}, {1449, 4.9564934}, {1643, 4.951976}, {1398, 4.869051}, {1642, 4.817443}, {1122, 4.702278}]    |
|3      |[{838, 4.6144404}, {1142, 4.455568}, {320, 4.396119}, {205, 4.285173}, {1388, 4.1488476}, {50, 4.0550604}]       |
|4      |[{1251, 5.862391}, {1589, 5.7110167}, {793, 5.643307}, {958, 5.61551}, {1463, 5.599249}, {1150, 5.4867496}]      |
|5      |[{114, 4.8318996}, {793, 4.716633}, {838, 4.6926055}, {1589, 4.65665}, {169, 4.6106434}, {613, 4.6097617}]       |
|6      

                                                                                

In [51]:
als_model.save("maching_learning_model/als_model")

from pyspark.ml.recommendation import ALSModel
loaded_model = ALSModel.load("data_mk/als_model")

25/03/11 14:43:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

Py4JJavaError: An error occurred while calling o532.load.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/opt/spark-data/data_mk/als_model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:210)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1471)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1465)
	at org.apache.spark.rdd.RDD.$anonfun$first$1(RDD.scala:1506)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1506)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:587)
	at org.apache.spark.ml.recommendation.ALSModel$ALSModelReader.load(ALS.scala:563)
	at org.apache.spark.ml.recommendation.ALSModel$ALSModelReader.load(ALS.scala:557)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: Input path does not exist: file:/opt/spark-data/data_mk/als_model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 35 more


In [52]:
# Stop Spark session
spark.stop()