# Curso Big Data #11 - Sistema de recomendacion

#### 1. Creamos una SparkSession

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Recommendation_System').getOrCreate()

#### 2. Importamos el data set

In [2]:
df = spark.read.csv('C:/Users/pc/pruebas/movielens_ratings.csv', inferSchema=True, header=True)

In [4]:
df.show(5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
+-------+------+------+
only showing top 5 rows



In [5]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



#### 3. Splitting el dataset

In [6]:
training_set, test_set = df.randomSplit([0.8, 0.2])

#### 4. Creamos el modelo

In [7]:
#Factorización de matrices de mínimos cuadrados alternos (ALS).

#ALS intenta estimar la matriz de calificaciones R como el producto de dos matrices de rango inferior, 
#X e Y, es decir, X * Yt = R. Por lo general, estas aproximaciones se denominan matrices de "factor". 
#El enfoque general es iterativo. Durante cada iteración, una de las matrices factoriales se mantiene constante, 
#mientras que la otra se resuelve utilizando mínimos cuadrados. 
#La matriz de factores recién resuelta se mantiene constante mientras se resuelve para la otra matriz de factores.

from pyspark.ml.recommendation import ALS

recommender = ALS(userCol='userId', ratingCol='rating', itemCol='movieId')
recommender = recommender.fit(training_set)

#### 5. Predecimos usando el test set

In [8]:
preds = recommender.transform(test_set)

In [10]:
preds.show(5)

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|      1|   1.0|     5| 1.7388971|
|      1|   1.0|     7| 1.3713032|
|      2|   3.0|     6| 1.1975499|
|      2|   2.0|     7| 1.7239631|
|      0|   3.0|    28|  2.312674|
+-------+------+------+----------+
only showing top 5 rows



#### 6. Evaluamos el model

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol='rating')
evaluator.evaluate(preds)

0.954289889758269

#### 7. Hacemos la recomendacion

In [17]:
test_set.show(5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|   1.0|    26|
|      0|   3.0|    28|
|      1|   1.0|     5|
|      1|   1.0|     7|
|      2|   2.0|     7|
+-------+------+------+
only showing top 5 rows



In [14]:
test_set.filter(test_set['userId'] == 23).show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      4|   1.0|    23|
|     25|   1.0|    23|
|     30|   4.0|    23|
|     32|   5.0|    23|
|     38|   1.0|    23|
|     50|   4.0|    23|
|     53|   1.0|    23|
|     55|   5.0|    23|
|     59|   1.0|    23|
|     69|   1.0|    23|
|     83|   1.0|    23|
+-------+------+------+



In [15]:
single_user = test_set.filter(test_set['userId'] == 23).select(['userId', 'movieId'])

In [19]:
single_user.show(5)

+------+-------+
|userId|movieId|
+------+-------+
|    23|      4|
|    23|     25|
|    23|     30|
|    23|     32|
|    23|     38|
+------+-------+
only showing top 5 rows



In [20]:
recommendations = recommender.transform(single_user)

In [21]:
recommendations.orderBy('prediction', ascending=False).show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    23|     32| 5.0406437|
|    23|     55| 3.9250937|
|    23|     50|  3.866125|
|    23|     38| 2.3600922|
|    23|     69| 2.3459716|
|    23|     30| 2.0719569|
|    23|     83| 1.3872185|
|    23|     59| 1.2001252|
|    23|      4| 1.1434418|
|    23|     53| 0.8110362|
|    23|     25|0.12990454|
+------+-------+----------+

