<a href="https://colab.research.google.com/github/azhary86/bigdata2023/blob/main/Assignment-9_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%%capture
!sudo apt-get update --fix-missing

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

#!mv spark-3.0.0-bin-hadoop3.2.tgz sparkkk
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

spark

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [5]:
lines = spark.read.text("/content/gdrive/My Drive/bigdata2023/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])
ratings

userId,movieId,rating,timestamp
0,2,3.0,1424380312
0,3,1.0,1424380312
0,5,2.0,1424380312
0,9,4.0,1424380312
0,11,1.0,1424380312
0,12,2.0,1424380312
0,15,1.0,1424380312
0,17,1.0,1424380312
0,19,1.0,1424380312
0,21,1.0,1424380312


# **regParam 0.1**

Max iter 5

In [6]:
als = ALS(maxIter=5, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [7]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.1168409738974592

Max iter 10

In [8]:
als = ALS(maxIter=10, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [9]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.0520832510157423

Max iter 20

In [10]:
als = ALS(maxIter=20, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [11]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.0356696368758882

# **regParam 0.5**

Max iter 5

In [12]:
als = ALS(maxIter=5, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [13]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.3443237174800196

Max iter 10

In [14]:
als = ALS(maxIter=10, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [15]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.341413631157068

Max iter 20

In [16]:
als = ALS(maxIter=20, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [17]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.3415097125700757

# **regParam 1.0**

Max iter 5

In [18]:
als = ALS(maxIter=5, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [19]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.636369427587814

Max iter 10

In [20]:
als = ALS(maxIter=10, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [21]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.636369422267838

Max iter 20

In [22]:
als = ALS(maxIter=20, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [23]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
rmse

1.6363694247340612

# conclusion


*   the higher regParam value, the higher RMSE (Root Mean Square Error) value
*    



In [24]:
userRecs = model.recommendForAllUsers(10)

movieRecs = model.recommendForAllItems(10)


users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [25]:
print(userRecs,users,userSubsetRecs)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[[32, 1.5057156],...|
|    26|[[32, 1.5691745],...|
|    27|[[32, 1.2169762],...|
|    12|[[32, 1.4280378],...|
|    22|[[32, 1.653142], ...|
|     1|[[32, 1.1779101],...|
|    13|[[32, 1.1901962],...|
|     6|[[32, 1.0206982],...|
|    16|[[32, 1.4448189],...|
|     3|[[32, 1.3756318],...|
|    20|[[32, 1.414803], ...|
|     5|[[32, 1.3723788],...|
|    19|[[32, 1.1931331],...|
|    15|[[32, 1.0174483],...|
|    17|[[32, 1.5618776],...|
|     9|[[32, 1.3695327],...|
|     4|[[32, 1.1730655],...|
|     8|[[32, 1.4640737],...|
|    23|[[32, 1.6999125],...|
|     7|[[32, 1.2330371],...|
+------+--------------------+
only showing top 20 rows
 +------+
|userId|
+------+
|    26|
|    29|
|    19|
+------+
 +------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[[32, 1.5691745],...|
|    19|[[32, 1.1931331],...|
|    29|[[32, 1.4927351],...|
+------+--

In [27]:
print(movieRecs,movies,movieSubSetRecs)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     31|[[11, 1.004093], ...|
|     85|[[11, 1.2112008],...|
|     65|[[11, 1.1154374],...|
|     53|[[11, 1.2445621],...|
|     78|[[11, 0.59644693]...|
|     34|[[11, 0.9516681],...|
|     81|[[11, 1.4302608],...|
|     28|[[11, 0.85641456]...|
|     76|[[11, 1.2940328],...|
|     26|[[11, 0.691625], ...|
|     27|[[11, 1.4932251],...|
|     44|[[11, 0.80313367]...|
|     12|[[11, 1.0297291],...|
|     91|[[11, 1.0683824],...|
|     22|[[11, 1.1324949],...|
|     93|[[11, 1.4512936],...|
|     47|[[11, 0.95127094]...|
|      1|[[11, 0.68917364]...|
|     52|[[11, 1.4973824],...|
|     13|[[11, 1.1191458],...|
+-------+--------------------+
only showing top 20 rows
 +-------+
|movieId|
+-------+
|     26|
|     29|
|     65|
+-------+
 +-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[[11, 1.1154374],...|
|     26|[[11, 0.691625], ...|
|   