# Mengimport modul dan membuat session

In [1]:
# Import the ALS recommender from Spark ML, plus SparkSession, which the
# session builder below needs. SparkSession was not imported anywhere in the
# visible notebook, so on a fresh plain-Python kernel this cell would fail
# with a NameError.
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (getOrCreate reuses an existing
# session if one is already active).
appName = "Sistem Rekomender di Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Memuat data dari file

In [2]:
# Read the ratings and movies CSV files into DataFrames. The first row of
# each file is treated as the header and column types are inferred.
ratings = spark.read.csv(path='dataset/ratings.csv', header=True, inferSchema=True)
movies = spark.read.csv(path='dataset/movies.csv', header=True, inferSchema=True)
# Preview the ratings joined with their movie metadata on "movieId".
ratings.join(movies, on="movieId").show(n=3)

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating| timestamp|               title|              genres|
+-------+------+------+----------+--------------------+--------------------+
|     31|     1|   2.5|1260759144|Dangerous Minds (...|               Drama|
|   1029|     1|   3.0|1260759179|        Dumbo (1941)|Animation|Childre...|
|   1061|     1|   3.0|1260759182|     Sleepers (1996)|            Thriller|
+-------+------+------+----------+--------------------+--------------------+
only showing top 3 rows



# Menyiapkan data

In [3]:
# Keep only the columns the recommender uses: user, item, and rating.
data = ratings.select("userId", "movieId", "rating")
# Split 70% training / 30% testing. A fixed seed is passed so that
# Restart & Run All is intended to yield the same partition (and therefore
# comparable row counts and RMSE) instead of a new random split every run.
splits = data.randomSplit([0.7, 0.3], seed=42)
# The rating column is renamed per split: "label" is what ALS trains on,
# "trueLabel" is what the evaluator later compares predictions against.
train = splits[0].withColumnRenamed("rating", "label")
test = splits[1].withColumnRenamed("rating", "trueLabel")
# Count the rows in each split.
train_rows = train.count()
test_rows = test.count()
print("Jumlah baris data training:", train_rows,
      ", jumlah baris data testing:", test_rows)

Jumlah baris data training: 69842 , jumlah baris data testing: 30162


# Mendefinisikan model dan mentrainingnya

In [4]:
# Configure the ALS (alternating least squares) recommender. A fixed seed is
# set so the fitted model is intended to be the same on every run.
# NOTE(review): with ALS's default cold-start handling, users/movies that do
# not appear in the training split get NaN predictions; the notebook deals
# with those rows later by dropping them before computing RMSE.
als = ALS(
    maxIter=19,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="label",
    seed=42,
)
# Train the model on the training split.
model = als.fit(train)
print("Model telah selesai ditraining!")

Model telah selesai ditraining!


# Melakukan prediksi dengan model rekomender yang telah kita training

In [5]:
# Score the held-out test split with the trained model.
prediction = model.transform(test)
# Attach movie titles and show a few predictions next to the true ratings.
(
    prediction
    .join(movies, on="movieId")
    .select("userId", "title", "prediction", "trueLabel")
    .show(n=3, truncate=False)
)

+------+--------------------------------+----------+---------+
|userId|title                           |prediction|trueLabel|
+------+--------------------------------+----------+---------+
|575   |Awfully Big Adventure, An (1995)|NaN       |4.0      |
|232   |Guilty as Sin (1993)            |3.888484  |4.0      |
|452   |Guilty as Sin (1993)            |3.0628183 |2.0      |
+------+--------------------------------+----------+---------+
only showing top 3 rows



# Mengevaluasi seberapa akurat sistem rekomender kita

In [6]:
# RegressionEvaluator is used to compute RMSE; the formula is the same for a
# recommender as for any other regression problem.
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the model's "prediction" column against the held-out "trueLabel".
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="trueLabel",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(prediction)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): nan


In [7]:
# Count the predictions before cleaning. (The original cell also issued a
# bare prediction.count() first, which ran an extra Spark job and discarded
# the result; that redundant action is removed.)
a = prediction.count()
print("jumlah baris sebelum di hapus data kosong: ", a)
# Drop rows whose "prediction" is missing (null/NaN) so RMSE can be computed.
cleanPred = prediction.dropna(subset=["prediction"])
b = cleanPred.count()
print("jumlah baris setelah di hapus data kosong: ", b)
# Number of rows removed because they had no usable prediction.
print("jumlah baris data kosong: ", a - b)

jumlah baris sebelum di hapus data kosong:  30162
jumlah baris setelah di hapus data kosong:  28869
jumlah baris data kosong:  1293


In [8]:
# Recompute RMSE on the predictions that remain after the missing-prediction
# rows were dropped.
rmse = evaluator.evaluate(dataset=cleanPred)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 1.2417154440671327
