In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('recommender_systems').getOrCreate()

#Content Based and Collaborative Filtering
Content based focuses on the attributes of the items and recommends based on the similarity between them

Collborative Filtering focuses on the knowledge of the users  

Collaborative filtering is widely used.
User-Item Association matrix

ALS--> Matrix Factorization approach to implement a recommendation algorithm you decompose your large user/item matrix into lower dimensioanl user factors and item factors

In [3]:
data = spark.read.csv('movielens_ratings.csv', header=True,inferSchema=True)

In [4]:
from pyspark.ml.recommendation import ALS

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
data.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [7]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [8]:
training,test = data.randomSplit([0.8,0.2])

In [9]:
#regParam is the regularization param
als = ALS(maxIter=5, regParam=0.02,userCol='userId',itemCol='movieId',ratingCol='rating')

In [10]:
model = als.fit(training)

In [11]:
predictions  = model.transform(test)

In [12]:
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   4.0|    12|-0.5615097|
|     31|   2.0|    25| 0.6532082|
|     31|   1.0|    29|0.78068507|
|     31|   1.0|     0|0.26625332|
|     31|   1.0|    18|0.65506935|
|     85|   2.0|    20|  3.240032|
|     85|   1.0|    23|0.11133571|
|     85|   3.0|    21| 1.9545287|
|     65|   1.0|    28| 0.7798416|
|     65|   1.0|    22| 0.4703888|
|     65|   1.0|     4|0.15480775|
|     53|   3.0|    13| 1.7604439|
|     53|   2.0|    19| 1.3335803|
|     53|   5.0|    21| 3.2247021|
|     53|   3.0|    14| 2.6046777|
|     78|   1.0|    19|  0.881061|
|     78|   1.0|    17| 1.1907868|
|     78|   1.0|     8|  1.098132|
|     34|   1.0|    14| 1.4342097|
|     81|   2.0|     5|-0.5812866|
+-------+------+------+----------+
only showing top 20 rows



In [13]:
evaluation = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')

In [14]:
rmse = evaluation.evaluate(predictions)

In [15]:
print(rmse)

1.387896983792885


In [17]:
single_user = test.filter(test['userId']==11).select(['movieId','userId'])

In [18]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     21|    11|
|     38|    11|
|     45|    11|
|     48|    11|
|     50|    11|
|     51|    11|
|     59|    11|
|     61|    11|
|     76|    11|
|     86|    11|
|     88|    11|
|     97|    11|
+-------+------+



In [19]:
recommndations_for_singleuser = model.transform(single_user)

In [21]:
recommndations_for_singleuser.orderBy('prediction',ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     48|    11|  3.2080991|
|     38|    11|  2.5373065|
|     51|    11|  2.4321537|
|     59|    11|  1.8368171|
|     21|    11|  1.4446455|
|     50|    11|  1.0983341|
|     76|    11| 0.84437716|
|     45|    11|  0.6249192|
|     97|    11|0.120393336|
|     86|    11|-0.04157394|
|     61|    11|-0.24410877|
|     88|    11| -1.7679291|
+-------+------+-----------+

