In [1]:
import pandas as pd

In [2]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:

ruta_books = 'https://drive.google.com/file/d/1UbISK_Guvg7AjQ8BkRvd7GAx9i0OSKL2/view?usp=sharing'
ruta_ratings = "https://drive.google.com/file/d/1iOP_hMHC_WISo-BIVja-U1Ikvt3d5dul/view?usp=sharing"


id_books = ruta_books.split("/")[-2]
id_ratings = ruta_ratings.split("/")[-2]

downloaded_books = drive.CreateFile({'id':id_books})
downloaded_books.GetContentFile('books_limpio_def.csv')

downloaded_ratings = drive.CreateFile({'id':id_ratings})
downloaded_ratings.GetContentFile('ratings.csv')

books = pd.read_csv('books_limpio_def.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
books.columns

Index(['id', 'book_id', 'books_count', 'isbn', 'isbn13', 'authors',
       'original_publication_year', 'original_title', 'title', 'language_code',
       'average_rating', 'ratings_count', 'work_ratings_count',
       'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3',
       'ratings_4', 'ratings_5', 'pages', 'genre', 'genre_ordenado',
       'genero_1', 'genero_2', 'genero_3', 'genero_4', 'description_en',
       'title_en', 'texto', 'texto_limpio', 'texto_lemmatizado', 'Cluster'],
      dtype='object')

In [5]:
ratings.columns

Index(['book_id', 'user_id', 'rating'], dtype='object')

In [6]:
ratings.dtypes

book_id    int64
user_id    int64
rating     int64
dtype: object

# Modelo de recomendacion por similitud o contenido

In [8]:
books.title.sample()

6660    Sugar and Spice
Name: title, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_books_titulo(book_title, num_similar_books=3):
    books.reset_index(drop=True, inplace=True)
    book_index = books.loc[books['title'] == book_title].index[0]
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(books['texto_lemmatizado']+ ' ' + books['genero_1'])
    tfidf_matrix_weighted = tfidf_matrix.multiply(books['Cluster'].values[:, None])
    cosine_sim = cosine_similarity(tfidf_matrix_weighted, tfidf_matrix_weighted)
    book_similarities = cosine_sim[book_index]
    similar_books_indices = book_similarities.argsort()[::-1][1:num_similar_books+1]
    similar_books = books.loc[similar_books_indices, ['title', 'genero_1', 'genero_2', 'pages', 'average_rating']]
    return similar_books

# Verificar si el título ingresado existe en el dataset
try:
    similar_books = find_similar_books_titulo('Sugar and Spice', num_similar_books=3)
    print('Libros similares:')
    for i, book in similar_books.iterrows():
        print(f'Título: {book.title}')
        print(f'Género: {book.genero_1} - {book.genero_2}')
        print(f'Páginas: {book.pages}')
        print(f'Rating: {book.average_rating}')
        print('---')
except ValueError:
    print('Disculpas, no pudimos encontrar ese título. Por favor ingresa otro.')
except IndexError:
    print('Disculpas, no pudimos encontrar ese título. Por favor ingresa otro.')


Libros similares:
Título: Sweet Little Lies
Género: Fiction -  Young Adult
Páginas: 309
Rating: 3.53
---
Título: L.A. Candy
Género: Fiction -  Young Adult
Páginas: 326
Rating: 3.36
---
Título: Blood Cross
Género: Fantasy -  Fantasy
Páginas: 321
Rating: 4.13
---


###Cálculo de la Dispersión

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=fa467c3843fb1fc138baf4bd943f0108c96bf2d0498150be93d3fc3fa1876f69
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [None]:
from pyspark.sql.functions import col, explode
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext
spark = SparkSession.builder.appName('Recommendations').getOrCreate()

In [None]:
num_ratings = ratings["rating"].count()
num_users = ratings["user_id"].nunique()
num_movies = ratings["book_id"].nunique()

denominator = num_users * num_movies

sparsity = (1.0 - (num_ratings * 1.0) / denominator) * 100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  99.82% empty.


La dispersión puede traducirse en una mala performance del modelo. Por lo tanto, es necesario elegir un modelo que pueda manejar dicha escasez. Es por esto que se eligió el modelo ALS. Otras ventajas de este modelo son:

*   Puede incorporar restricciones y regularizaciones para evitar el sobreajuste.
*   Es poco sensible a outliers.
*   Es más rápido que los métodos SVD y SGD.
*   Puede manejar eficientemente la escasez de interacciones entre usuarios e ítems.
*   Es escalable a grandes conjuntos de datos.

###Transformar el Dataset en una Matrix BookXUser, que es la Matriz que vamos a factorizar.

In [None]:
pip install implicit

Collecting implicit
  Downloading implicit-0.7.0-cp310-cp310-manylinux2014_x86_64.whl (9.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/9.2 MB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/9.2 MB[0m [31m46.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m7.3/9.2 MB[0m [31m70.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m9.2/9.2 MB[0m [31m76.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.0


In [None]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares



In [None]:
sparse_matrix = csr_matrix((ratings['rating'], (ratings['book_id'], ratings['user_id'])))

###Modelo ALS

In [None]:
model = AlternatingLeastSquares(factors=190, regularization=0.01, iterations=10)

In [None]:
transposed_matrix = sparse_matrix.T

In [None]:
model.fit(transposed_matrix)



  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
import numpy as np

In [None]:
def recomendacion(user_id):
    user_vector = model.user_factors[user_id]
    scores = model.item_factors.dot(user_vector)
    top_indices = np.argsort(-scores)[:5]

    for item_id in top_indices:
        score = scores[item_id]
        resultado = books.loc[books["book_id"] == item_id, "title"].values
        genero1 = books.loc[books["book_id"] == item_id, "genero_1"].values
        genero2 = books.loc[books["book_id"] == item_id, "genero_2"].values
        genero3 = books.loc[books["book_id"] == item_id, "genero_3"].values
        genero4 = books.loc[books["book_id"] == item_id, "genero_4"].values
        if len(resultado) > 0:
            titulo = resultado[0]
            generos = []
            generos.append(genero1[0])
            generos.append(genero2[0])
            generos.append(genero3[0])
            print(f"Book Title: {titulo}, Genres: {generos}, Score: {score}")
        else:
            print(f"Book with ID {item_id} not found, Score: {score}")

In [None]:
recomendacion(1)

Book Title: Atonement, Genres: ['Fiction', ' Romance', ' Historical'], Score: 0.04483562335371971
Book with ID 4893 not found, Score: 0.04473603144288063
Book with ID 5335 not found, Score: 0.04451511427760124
Book with ID 8015 not found, Score: 0.04410182684659958
Book with ID 6285 not found, Score: 0.042941849678754807
