In [1]:

#importing the required pyspark library
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import IntegerType
import pandas as pd

#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

In [2]:
'''
df  = pd.read_csv('BX-Book-Ratings.csv', delimiter=';', index_col = False)
df.set_index('User-ID', inplace=True)
ds = df.sample(frac=0.1, random_state=1)
ds.to_csv('Shuffled-BX-Book-Ratings.csv')
'''

"\ndf  = pd.read_csv('BX-Book-Ratings.csv', delimiter=';', index_col = False)\ndf.set_index('User-ID', inplace=True)\nds = df.sample(frac=0.1, random_state=1)\nds.to_csv('Shuffled-BX-Book-Ratings.csv')\n"

In [3]:
data = spark.read.option("header", True).csv('Shuffled-BX-Book-Ratings.csv')
data.show(5)

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
|  76798|2070213951|          0|
|  22281|8401461170|         10|
| 249939|0441003745|          8|
| 113904|0670815616|          0|
| 194151|0399144315|          7|
+-------+----------+-----------+
only showing top 5 rows



In [4]:
data = data.filter(data['Book-Rating'] != 0)
#change the data type of the columns
data = data.withColumn("User-ID", data["User-ID"].cast(IntegerType()))
#data = data.withColumn("ISBN", data["ISBN"].cast(IntegerType()))
data = data.withColumn("Book-Rating", data["Book-Rating"].cast(IntegerType()))
data.show(5)

indexer = StringIndexer(inputCol="ISBN", outputCol="ISBN_Index")

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
|  22281|8401461170|         10|
| 249939|0441003745|          8|
| 194151|0399144315|          7|
|  16982|1857028570|          3|
|  73681|0684821303|          5|
+-------+----------+-----------+
only showing top 5 rows



In [5]:
data = indexer.fit(data).transform(data)
data.show(20)

+-------+----------+-----------+----------+
|User-ID|      ISBN|Book-Rating|ISBN_Index|
+-------+----------+-----------+----------+
|  22281|8401461170|         10|   29542.0|
| 249939|0441003745|          8|    3367.0|
| 194151|0399144315|          7|    1527.0|
|  16982|1857028570|          3|   26741.0|
|  73681|0684821303|          5|   19419.0|
| 239000|0441569595|         10|     938.0|
| 146386|0821770918|          9|   22880.0|
|  23872|1579901751|          9|   26171.0|
| 123883|0515128554|          5|     201.0|
| 222918|0449217493|          7|     969.0|
| 197659|0312064977|          9|    8396.0|
|  25981|0380405768|          5|    2904.0|
|  84479|0671687816|          9|   18420.0|
|  25122|0571169341|          7|    3931.0|
|  75501|0842335714|          8|    4611.0|
| 163358|0440220688|          7|    1598.0|
|  72352|0373834705|          8|   10842.0|
|  76499|2707301485|         10|   27803.0|
| 230030|1560252529|          7|   25442.0|
| 114368|0373240228|         10|

In [6]:
data.describe().show()

+-------+------------------+---------------+------------------+------------------+
|summary|           User-ID|           ISBN|       Book-Rating|        ISBN_Index|
+-------+------------------+---------------+------------------+------------------+
|  count|             43420|          43420|             43420|             43420|
|   mean|135926.74769691387|       Infinity| 7.591179180101336|11821.774297558728|
| stddev| 80564.87537415828|            NaN|1.8447544412534618|  10088.0865445309|
|    min|                 9|     0330299891|                 1|               0.0|
|    max|            278854|\8888809228""""|                10|           31520.0|
+-------+------------------+---------------+------------------+------------------+



In [7]:
train_data, test_data = data.randomSplit([0.8, 0.2])

In [9]:
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=20,
          regParam=0.01,
          userCol="User-ID",
          itemCol="ISBN_Index",
          ratingCol="Book-Rating",
          nonnegative=True)

#Fitting the model on the train_data
model = als.fit(train_data)

ConnectionRefusedError: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione

In [27]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test_data)

#Displaying predictions calculated by the model
predictions.show(20)

+-------+----------+-----------+----------+----------+
|User-ID|      ISBN|Book-Rating|ISBN_Index|prediction|
+-------+----------+-----------+----------+----------+
|    899|0971880107|          2|       2.0|0.92186326|
|    254|0439136350|          9|      39.0|  6.701047|
|    595|0446610399|          9|     165.0| 5.1245317|
|    424|0425156842|          7|    3208.0|       NaN|
|    507|0689711522|         10|    4292.0|       NaN|
|    709|015216250X|         10|    7744.0|       NaN|
|    507|0312986769|          9|    8973.0|       NaN|
|    900|0373057296|          5|   10181.0|       NaN|
|    507|0375815147|          9|   11193.0|       NaN|
|    345|067153484X|          6|   18228.0|       NaN|
|    882|0671641778|         10|   18330.0|       NaN|
|    273|0671649949|          6|   18341.0|       NaN|
|    901|1576832899|         10|   26104.0|       NaN|
|     12|1879384493|         10|   26986.0|       NaN|
|    625|2266033689|          9|   27674.0|       NaN|
|    850|3

In [8]:
#Printing and calculating RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


In [9]:
#Filtering user with user id "5461" with book id on which it has given the reviews
user1 = test_data.filter(test_data['user_id']==5461).select(['book_id','user_id'])

#Displaying user1 data
user1.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|      2|   5461|
|     35|   5461|
|     58|   5461|
|     60|   5461|
|    113|   5461|
|    116|   5461|
|    118|   5461|
|    186|   5461|
|    194|   5461|
|    212|   5461|
|    251|   5461|
|    261|   5461|
|    296|   5461|
|    311|   5461|
|    357|   5461|
|    396|   5461|
|    588|   5461|
|    595|   5461|
|    646|   5461|
|    693|   5461|
+-------+-------+
only showing top 20 rows



In [10]:
recommendations = model.transform(user1)

#Displaying the predictions of books for user1
recommendations.orderBy('prediction',ascending=False).show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|    113|   5461| 4.7132545|
|    396|   5461| 4.5312314|
|     58|   5461| 4.4338636|
|     60|   5461|  4.391992|
|    296|   5461| 4.3389835|
|    116|   5461|  4.246293|
|    733|   5461| 4.2270174|
|    847|   5461|  4.083642|
|      2|   5461| 4.0320635|
|   4994|   5461| 4.0164266|
|    194|   5461| 3.9682267|
|    588|   5461|  3.959906|
|    118|   5461| 3.9385495|
|    919|   5461| 3.9313989|
|    311|   5461|  3.901535|
|    646|   5461|  3.894922|
|   2854|   5461| 3.8758938|
|    357|   5461| 3.8325243|
|    212|   5461| 3.8228502|
|    595|   5461| 3.8119242|
+-------+-------+----------+
only showing top 20 rows



In [55]:

spark.stop()

ConnectionRefusedError: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione