In [3]:
import pandas as pd

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API).
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Locations of the two CSV inputs under the Drive mount point.
ruta_books = "/content/drive/MyDrive/Datasets/books_limpio_def.csv"
ruta_ratings = "/content/drive/MyDrive/Datasets/ratings.csv"

# Read both tables, in the same order as the paths above.
books, ratings = (pd.read_csv(ruta) for ruta in (ruta_books, ruta_ratings))

In [9]:
# Show the column names of the books table.
books.columns

Index(['id', 'book_id', 'books_count', 'isbn', 'isbn13', 'authors',
       'original_publication_year', 'original_title', 'title', 'language_code',
       'average_rating', 'ratings_count', 'work_ratings_count',
       'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3',
       'ratings_4', 'ratings_5', 'pages', 'genre', 'genre_ordenado',
       'genero_1', 'genero_2', 'genero_3', 'genero_4', 'description_en',
       'title_en', 'texto', 'texto_limpio', 'texto_lemmatizado', 'Cluster'],
      dtype='object')

In [6]:
# Show the column names of the ratings table.
ratings.columns

Index(['book_id', 'user_id', 'rating'], dtype='object')

In [7]:
# Show the dtype of each ratings column.
ratings.dtypes

book_id    int64
user_id    int64
rating     int64
dtype: object

### Cálculo de la dispersión

In [8]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=248fa1e87bd437dfba1286197416d85dcb87850a88622d6c6ed3fac555f61d58
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [10]:
from pyspark.sql.functions import col, explode
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session first, then take the context from it.
# The previous `sc = SparkContext` bound the class itself, not a usable context.
spark = SparkSession.builder.appName('Recommendations').getOrCreate()
sc = spark.sparkContext

In [11]:
# Sparsity of the user x book rating matrix: the percentage of (user, book)
# combinations that have no rating.
# NOTE(review): this assumes each (user_id, book_id) pair appears at most once
# in `ratings`; duplicated pairs would make the matrix look denser than it is.
num_ratings = ratings["rating"].count()   # number of non-null ratings
num_users = ratings["user_id"].nunique()
num_books = ratings["book_id"].nunique()
num_movies = num_books  # old (misleading) name kept as an alias for later cells

denominator = num_users * num_books

sparsity = (1.0 - num_ratings / denominator) * 100
# Single f-string: the old two-argument print produced a double space.
print(f"The ratings dataframe is {sparsity:.2f}% empty.")

The ratings dataframe is  99.82% empty.


Una dispersión tan alta puede traducirse en un bajo rendimiento del modelo. Por lo tanto, es necesario elegir un modelo que pueda manejar dicha escasez de datos; por eso se eligió el modelo ALS (Alternating Least Squares). Otras ventajas de este modelo son:

*   Puede incorporar restricciones y regularizaciones para evitar el sobreajuste.
*   Es poco sensible a outliers.
*   Es más rápido que los métodos SVD y SGD.
*   Puede manejar eficientemente la escasez de interacciones entre usuarios e ítems.
*   Es escalable a grandes conjuntos de datos.

### Transformar el dataset en una matriz Book×User, que es la matriz que vamos a factorizar.

In [12]:
pip install implicit

Collecting implicit
  Downloading implicit-0.7.0-cp310-cp310-manylinux2014_x86_64.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.0


In [13]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares



In [14]:
# Build the sparse rating matrix: rows are indexed by book_id, columns by
# user_id, and each stored value is the corresponding rating.
_valores = ratings['rating']
_coordenadas = (ratings['book_id'], ratings['user_id'])
sparse_matrix = csr_matrix((_valores, _coordenadas))

### Modelo ALS

In [15]:
# ALS starts from randomly initialised factors; fixing random_state makes the
# learned factors (and therefore the recommendations) reproducible across runs.
model = AlternatingLeastSquares(factors=190, regularization=0.01, iterations=10, random_state=42)

In [16]:
# Flip book x user into user x book. Transposing a CSR matrix yields CSC, so
# convert back to CSR explicitly: that is the layout implicit's fit() works on,
# and it keeps per-user row slicing cheap.
transposed_matrix = sparse_matrix.T.tocsr()

In [17]:
# Train the ALS model on the transposed (user_id rows x book_id columns) matrix.
model.fit(transposed_matrix)



  0%|          | 0/10 [00:00<?, ?it/s]

In [18]:
import numpy as np

In [51]:
# Quick check for user 1: score every item against the user's latent vector
# and print the first genre of each of the 5 best-scoring items found in books.
user_vector = model.user_factors[1]
scores = model.item_factors.dot(user_vector)
top_indices = np.argsort(-scores)[:5]

for item_id in top_indices:
  # Look the genre up BEFORE testing its length; the previous version read
  # `genero1` before assigning it, which raises NameError on a fresh kernel.
  genero1 = books.loc[books["book_id"] == item_id, "genero_1"].values
  if len(genero1) > 0:
    print(genero1[0])

In [55]:
def recomendacion(user_id, n=5, id_col="book_id", als_model=None, catalog=None):
    """Print the ``n`` items the ALS model scores highest for one user.

    Each item's score is the dot product of its factor vector with the user's
    factor vector. For every top-scoring item index the function looks for a
    row in the catalog whose ``id_col`` equals that index and prints its title,
    its first three genres and the score; if no row matches, it prints a
    "not found" line instead.

    Parameters
    ----------
    user_id : int
        Row index into ``user_factors``. Must be within ``0 .. n_users - 1``.
    n : int, default 5
        Number of recommendations to print.
    id_col : str, default "book_id"
        Catalog column compared against the item index.
        NOTE(review): the item index is compared with ``catalog[id_col]``.
        If items often come back as "not found", confirm that this column
        carries the same identifiers that indexed the rating matrix (the
        books table also has an ``id`` column).
    als_model : object, optional
        Object exposing ``user_factors`` and ``item_factors`` arrays.
        Defaults to the notebook-level ``model``.
    catalog : pandas.DataFrame, optional
        Table with ``id_col``, ``title``, ``genero_1``, ``genero_2`` and
        ``genero_3`` columns. Defaults to the notebook-level ``books``.

    Raises
    ------
    IndexError
        If ``user_id`` is outside the range of known users. Negative values
        are rejected too, because they would silently wrap around to another
        user's factors.
    """
    # Fall back to the notebook globals only when nothing is passed explicitly.
    als_model = model if als_model is None else als_model
    catalog = books if catalog is None else catalog

    total_users = als_model.user_factors.shape[0]
    if not 0 <= user_id < total_users:
        raise IndexError(
            f"user_id {user_id} is out of range (expected 0..{total_users - 1})"
        )

    user_vector = als_model.user_factors[user_id]
    scores = als_model.item_factors.dot(user_vector)
    # Negate so that argsort's ascending order puts the best scores first.
    top_indices = np.argsort(-scores)[:n]

    genre_cols = ("genero_1", "genero_2", "genero_3")
    for item_id in top_indices:
        score = scores[item_id]
        # Filter the catalog once per item instead of once per column.
        matches = catalog.loc[catalog[id_col] == item_id]
        if len(matches) > 0:
            titulo = matches["title"].values[0]
            generos = [matches[c].values[0] for c in genre_cols]
            print(f"Book Title: {titulo}, Genres: {generos}, Score: {score}")
        else:
            print(f"Book with ID {item_id} not found, Score: {score}")

In [56]:
recomendacion(1)

Book with ID 6285 not found, Score: 0.04462588205933571
Book with ID 4893 not found, Score: 0.04259587079286575
Book Title: Atonement, Genres: ['Fiction', ' Romance', ' Historical'], Score: 0.04236472398042679
Book with ID 1180 not found, Score: 0.04235490784049034
Book with ID 5335 not found, Score: 0.04169480875134468
