# **SISTEMAS DE RECOMENDAÇÃO - DOMÍNIO DE LIVROS**


---


## Importação das bibliotecas necessárias

In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from surprise import Dataset, Reader, accuracy, KNNWithMeans
from surprise.model_selection import train_test_split

## Importação dos dados


In [82]:
book_ratings = pd.read_csv('/content/drive/MyDrive/Datasets/BX-Book-Ratings.csv')
books = pd.read_csv('/content/drive/MyDrive/Datasets/BX-Books.csv')
users = pd.read_csv('/content/drive/MyDrive/Datasets/BX-Users.csv')

  books = pd.read_csv('/content/drive/MyDrive/Datasets/BX-Books.csv')


## Limpeza e Transformação dos dados

In [83]:
def split_country(x):
  return x.split(",")[-1].strip()

def split_state(x):
  try:
    return x.split(",")[1].strip()
  except:
    return x

### Books

In [84]:
# Exclui colunas inúteis 
books.drop(['Image-URL-L' , 'Image-URL-S', 'Image-URL-M', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Publisher'], axis=1, inplace=True)

# Remove todas as linhas que tiverem o campo de Autor nulo
books.dropna(subset=['Book-Author'], inplace=True)

# Deixa no campo ano de publicação só valores numéricos e de 1965 até 2023
books = books[books['Year-Of-Publication'].str.isnumeric() == True]
books = books[(books['Year-Of-Publication'] > '1965') & (books['Year-Of-Publication'] < '2023')]

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication
0,195153448,Classical Mythology,Mark P. O. Morford,2002
1,2005018,Clara Callan,Richard Bruce Wright,2001
2,60973129,Decision in Normandy,Carlo D'Este,1991
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999


### Users

In [85]:
# Filtrando idade dos usuários após análise em gráfico boxplot
users = users[(users['Age'] < 75) & (users['Age'] > 12) | (users['Age'].isna() == True)]

# Separando a Localização
users['Country'] = users['Location'].apply(split_country)
users['State'] = users['Location'].apply(split_state)
users.drop(['Location'], axis=1, inplace=True)

users

users.head()

Unnamed: 0,User-ID,Age,Country,State
0,1,,usa,new york
1,2,18.0,usa,california
2,3,,russia,yukon territory
3,4,17.0,portugal,v.n.gaia
4,5,,united kingdom,hants


In [86]:
book_ratings.rename(columns={'Book-Rating': 'rating'}, inplace = True)
book_ratings.isnull().sum()

User-ID    0
ISBN       0
rating     0
dtype: int64

## Modelo de recomendação baseado em Item

In [87]:
# Dataframe com livros com mais de 50 avaliações
new_df = book_ratings.groupby("ISBN").filter(lambda x:x['rating'].count() >= 50)

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(new_df, reader)

# Divisão entre treino e teste 70/30
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

# Criação de um modelo baseado em item 
algo = KNNWithMeans(k=5, sim_options={'user_based': False})
algo.fit(trainset)

# Teste do modelo
test_pred = algo.test(testset)

# RMSE
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

# Usando a função get_neighbors para obter as 10 recomendações para o produto no índice 1 do book_ratings
algo.get_neighbors(1, 10)

Computing the msd similarity matrix...
Done computing similarity matrix.
Item-based Model : Test Set
RMSE: 3.9226


[2, 6, 9, 14, 21, 24, 32, 33, 34, 35]

RMSE: 3.9637
[2, 14, 15, 21, 24, 30, 32, 33, 34, 35]

In [88]:
# Creating a new DataFrame with mean rating and number of ratings by product
ratings_df = pd.DataFrame(new_df.groupby('ISBN').rating.mean())
# Adding column with number of ratings by product
ratings_df['rating_counts'] = new_df.groupby('ISBN').rating.count()
# Visualizing the 5 top rated products (number of ratings)
ratings_df.sort_values(by='rating_counts', ascending=False).head(5)

Unnamed: 0_level_0,rating,rating_counts
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
971880107,1.01877,2504
316666343,4.468726,1295
385504209,4.662147,885
60928336,3.448087,732
312195516,4.339779,724


In [89]:
ratings_df.iloc[[2, 6, 9, 14, 21, 24, 32, 33, 34, 35]].index

Index(['002542730X', '006019491X', '006098824X', '006109921X', '014029628X',
       '014100018X', '034529873X', '034530988X', '034536676X', '034537536X'],
      dtype='object', name='ISBN')

In [90]:
#books.rename(columns={'Book-Author': 'Book_Author'}, inplace = True)
#books.iloc[itens].Book_Author


In [91]:
#books.rename(columns={'Book-Title': 'Book_Title'}, inplace = True)
#books.iloc[itens].Book_Title

## Modelo baseado em filtragem colaborativa


In [92]:
import os
from sklearn.decomposition import TruncatedSVD

In [93]:
ratings_matrix = new_df.pivot_table(values='rating', index='User-ID', columns='ISBN', fill_value=0)
ratings_matrix.head()

ISBN,000649840X,002026478X,002542730X,006000438X,006001203X,006016848X,006019491X,006091565X,006092988X,006098824X,...,894805770,894808249,91867770,929634063,971880107,99245027,99387913,99747200,99771519,99800403
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
ratings_matrix.shape

(43442, 2186)

In [95]:
# Transpondo a matriz
X = ratings_matrix.T
X.head()

User-ID,9,14,16,17,26,32,39,42,44,51,...,278813,278819,278828,278832,278836,278843,278844,278846,278851,278854
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002026478X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006000438X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006001203X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
X.shape

(2186, 43442)

In [97]:
# A função Truncated SVD realmente reduzirá a dimensão da matriz esparsa para o número de componentes solicitados
SVD_model = TruncatedSVD(n_components=10)
decomposed_matrix = SVD_model.fit_transform(X)
decomposed_matrix.shape

(2186, 10)

In [98]:
# Matriz de correlação
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(2186, 2186)

In [99]:
correlation_matrix

array([[1.        , 0.29663982, 0.85976606, ..., 0.87927222, 0.23488885,
        0.32904781],
       [0.29663982, 1.        , 0.6575346 , ..., 0.65676797, 0.85147257,
        0.79773765],
       [0.85976606, 0.6575346 , 1.        , ..., 0.95887176, 0.57844591,
        0.63100884],
       ...,
       [0.87927222, 0.65676797, 0.95887176, ..., 1.        , 0.48862925,
        0.57650646],
       [0.23488885, 0.85147257, 0.57844591, ..., 0.48862925, 1.        ,
        0.82204865],
       [0.32904781, 0.79773765, 0.63100884, ..., 0.57650646, 0.82204865,
        1.        ]])

In [100]:
# Pegando o livro do índice 1 do book_ratings, assim como no outro modelo
i = '002026478X'

product_names = list(X.index)
product_ID = product_names.index(i)
product_ID

1

In [101]:
# Correlação de todos os itens com o livro comprado por este cliente com base em itens avaliados por outros clientes 
# que compraram o mesmo livro
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(2186,)

In [102]:
# Os 10 principais produtos semelhantes ao produto i recomendados para este usuário.
Recommend = list(X.index[correlation_product_ID > 0.65])

# Remove o item já comprado pelo cliente
Recommend.remove(i) 

Recommend[0:10]

['002542730X',
 '006000438X',
 '006091565X',
 '006109868X',
 '014016930X',
 '014023313X',
 '014029628X',
 '015100692X',
 '031205436X',
 '031242227X']

In [105]:
item_based = ['002542730X', '006019491X', '006098824X', '006109921X', '014029628X','014100018X', '034529873X', '034530988X', '034536676X', '034537536X']
SVD_model_res = ['002542730X', '006000438X', '006091565X', '006109868X', '014016930X', '014023313X', '014029628X', '015100692X', '031205436X', '031242227X']

In [106]:
# Quantas livros foram recomendados nos dois modelos
c = 0
for i in item_based:
    if i in SVD_model_res:
        c +=1

c

2