In [0]:
# Report the version of the interpreter this kernel is actually running.
# A shell call such as `!python3 --version` reports whichever python3 is
# first on PATH, which is not necessarily the kernel's interpreter.
import sys
print(sys.version)

Python 3.6.4 :: Anaconda, Inc.


## Ao longo da execução, documente o código com comentários referentes ao seu entendimento. Utilize também a estrutura de markdown do Jupyter para colocar informações textuais relevantes.

In [0]:
from __future__ import print_function

import sys

# Python 3 has no separate ``long`` type, so alias it to ``int``; the
# rating parser below can then call ``long(...)`` on either major version.
# The check uses the numeric version tuple: comparing the version *string*
# is lexicographic and would misorder a two-digit major version
# ('10...' < '3').
if sys.version_info[0] >= 3:
    long = int

from pyspark.sql import SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [0]:
def g(x):
    """Print ``x`` to standard output; returns ``None``."""
    print(x)

In [0]:
# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-running session if there is one, otherwise it starts a new one
# registered under the application name below.
spark = (
    SparkSession.builder
    .appName("Pratica ALS")
    .getOrCreate()
)

In [0]:
# Read the raw ratings file as an RDD with one Row per text line.
lines = spark.read.text("sample_movielens_ratings.txt").rdd

# Each line is split on "::" into four fields, read below as
# userId, movieId, rating, timestamp (in that order).
parts = lines.map(lambda line: line.value.split("::"))

# Convert every field list into a typed Row so that a DataFrame with named
# columns can be built from the RDD.
ratingsRDD = parts.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=long(fields[3]),
    )
)
ratings = spark.createDataFrame(ratingsRDD)

In [0]:
# Preview the parsed ratings; show() prints the first 20 rows by default.
ratings.show()

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      2|   3.0|1424380312|     0|
|      3|   1.0|1424380312|     0|
|      5|   2.0|1424380312|     0|
|      9|   4.0|1424380312|     0|
|     11|   1.0|1424380312|     0|
|     12|   2.0|1424380312|     0|
|     15|   1.0|1424380312|     0|
|     17|   1.0|1424380312|     0|
|     19|   1.0|1424380312|     0|
|     21|   1.0|1424380312|     0|
|     23|   1.0|1424380312|     0|
|     26|   3.0|1424380312|     0|
|     27|   1.0|1424380312|     0|
|     28|   1.0|1424380312|     0|
|     29|   1.0|1424380312|     0|
|     30|   1.0|1424380312|     0|
|     31|   1.0|1424380312|     0|
|     34|   1.0|1424380312|     0|
|     37|   1.0|1424380312|     0|
|     41|   2.0|1424380312|     0|
+-------+------+----------+------+
only showing top 20 rows



In [0]:
# Randomly split the ratings with weights 0.8 / 0.2: the larger part trains
# the model, the smaller part is held out for evaluation. A fixed seed keeps
# the split, and therefore every metric computed from it, repeatable across
# "Restart & Run All" executions.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Configure the ALS (alternating least squares) recommender on the
# userId / movieId / rating columns.
# - coldStartStrategy="drop" is passed so that test rows the model cannot
#   score do not end up as NaN predictions in the evaluation metric (see the
#   Spark ALS documentation).
# - seed pins the random initialisation so that re-running the notebook on
#   the same training data yields the same model.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
          ratingCol="rating", coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [0]:
# Score the held-out ratings with the trained model; transform() adds a
# "prediction" column next to the true "rating".
predictions = model.transform(test)

# Compare predicted and actual ratings using root-mean-square error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.6673391192012108


In [0]:
# Build the top-10 movie recommendations for every user. Each result row
# holds a userId plus a "recommendations" array of (movieId, score) entries.
userRecs = model.recommendForAllUsers(10)

In [0]:
# Preview the per-user recommendations (first 20 rows; long cells are
# truncated by the default display).
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[[93, 5.909292], ...|
|    26|[[51, 6.7328672],...|
|    27|[[18, 3.888737], ...|
|    12|[[17, 5.080094], ...|
|    22|[[75, 5.099725], ...|
|     1|[[83, 3.66217], [...|
|    13|[[62, 4.181654], ...|
|     6|[[52, 5.3958316],...|
|    16|[[22, 5.3041067],...|
|     3|[[51, 4.762616], ...|
|    20|[[90, 3.67034], [...|
|     5|[[55, 4.543656], ...|
|    19|[[90, 4.1464696],...|
|    15|[[46, 5.067572], ...|
|    17|[[46, 5.1394644],...|
|     9|[[10, 5.2341466],...|
|     4|[[31, 4.402969], ...|
|     8|[[52, 4.9064813],...|
|    23|[[30, 6.022075], ...|
|     7|[[25, 5.106661], ...|
+------+--------------------+
only showing top 20 rows



In [0]:
# Build the top-10 user recommendations for every movie, i.e. the users the
# model scores highest for each movieId.
movieRecs = model.recommendForAllItems(10)

In [0]:
# Preview the per-movie recommendations (first 20 rows; long cells are
# truncated by the default display).
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     31|[[4, 4.402969], [...|
|     85|[[8, 4.8677278], ...|
|     65|[[23, 4.8021784],...|
|     53|[[16, 5.1129117],...|
|     78|[[21, 1.3871652],...|
|     34|[[2, 4.0777], [26...|
|     81|[[28, 4.9508896],...|
|     28|[[18, 4.8938055],...|
|     76|[[14, 5.017491], ...|
|     26|[[15, 3.042717], ...|
|     27|[[12, 4.942835], ...|
|     44|[[12, 2.2516878],...|
|     12|[[26, 3.8531444],...|
|     91|[[7, 3.404726], [...|
|     22|[[16, 5.3041067],...|
|     93|[[28, 5.909292], ...|
|     47|[[7, 3.7464967], ...|
|      1|[[2, 4.3560557], ...|
|     52|[[21, 5.5879693],...|
|     13|[[23, 3.9412727],...|
+-------+--------------------+
only showing top 20 rows



In [0]:
# Collect the distinct user ids present in the ratings. The column name is
# read back from the ALS estimator instead of being hard-coded again.
users = ratings.select(als.getUserCol()).distinct()

In [0]:
# Preview the distinct user ids (first 20 rows).
users.show()

+------+
|userId|
+------+
|    26|
|    29|
|    19|
|     0|
|    22|
|     7|
|    25|
|     6|
|     9|
|    27|
|    17|
|    28|
|     5|
|     1|
|    10|
|     3|
|    12|
|     8|
|    11|
|     2|
+------+
only showing top 20 rows



In [0]:
# Take three of the distinct user ids (no ordering is imposed, so which
# three are taken is up to Spark) and request the top-10 movie
# recommendations for just those users.
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(users, numItems=10)

In [0]:
# Display only the recommendations column for the selected users.
userSubsetRecs.select('recommendations').show()

+--------------------+
|     recommendations|
+--------------------+
|[[51, 6.7328672],...|
|[[90, 4.1464696],...|
|[[23, 3.7985656],...|
+--------------------+



In [0]:
# Take three of the distinct movie ids (no ordering is imposed, so which
# three are taken is up to Spark) and request the top-2 user
# recommendations for just those movies.
movies = (
    ratings
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = model.recommendForItemSubset(movies, numUsers=2)

In [0]:
# Print a single row with truncation disabled so the whole recommendations
# array is visible.
movieSubSetRecs.show(1, truncate=False)

+-------+---------------------------------+
|movieId|recommendations                  |
+-------+---------------------------------+
|65     |[[23, 4.8021784], [9, 2.9555926]]|
+-------+---------------------------------+
only showing top 1 row



In [0]:
# Keep only the recommended movie ids for each user: indexing the
# "recommendations" array column by 'movieId' extracts that field from every
# element, producing one array of ids per user.
userRecsOnlyItemId = userRecs.select(
    userRecs['userId'],
    userRecs['recommendations']['movieId'],
)

In [0]:
# Print the first 10 users with truncation disabled so every recommended
# movie id is visible.
userRecsOnlyItemId.show(10, truncate=False)

+------+----------------------------------------+
|userId|recommendations.movieId                 |
+------+----------------------------------------+
|28    |[93, 81, 92, 53, 89, 82, 49, 2, 96, 19] |
|26    |[51, 92, 94, 88, 23, 24, 22, 83, 8, 89] |
|27    |[18, 35, 88, 74, 75, 51, 27, 80, 66, 83]|
|12    |[17, 27, 35, 64, 94, 55, 33, 31, 7, 23] |
|22    |[75, 74, 88, 51, 30, 68, 32, 62, 98, 83]|
|1     |[83, 68, 28, 75, 88, 62, 77, 32, 9, 80] |
|13    |[62, 93, 96, 89, 2, 53, 52, 83, 74, 92] |
|6     |[52, 40, 76, 25, 43, 72, 29, 4, 58, 31] |
|16    |[22, 53, 54, 90, 51, 29, 68, 73, 82, 24]|
|3     |[51, 23, 49, 73, 13, 18, 55, 36, 8, 7]  |
+------+----------------------------------------+
only showing top 10 rows



## Agora faça 50 recomendações para todos os usuários

## Recomende 50 usuários para os itens

## Como poderíamos armazenar as recomendações no MongoDB?

## Como podemos fazer isso em Python? Pesquise para utilização na próxima aula.

## Podemos utilizar outros datasets de teste, em específico do próprio MovieLens? Pesquise sobre esses datasets e sua estrutura básica.

## Uma vez tendo os dados no MongoDB, como podemos criar um pipeline online para consultar as recomendações?