## ALS

## 0. Setup

In [1]:
#---- Bibliotecas:

# Análise e manipulação de dados:

import pandas as pd
import numpy as np

# Visualização de dados:

import plotly.io as pio
import plotly.express as px

# Manuseio nos dados

from os import chdir

# ALS

from implicit import als

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#---- Mudando o diretório raiz para a pasta principal do projeto

chdir('../../')

#---- Template dos gráficos:

pio.templates.default = "plotly_white"

#---- Carregando as funções

from functions import *

## 1. Extração dos dados

In [3]:
#---- Dados de treino

train_data = load_sparse_csr('03-data/02-processed/04-ALS-train-data.npz')

train_data

<22569x158236 sparse matrix of type '<class 'numpy.int64'>'
	with 346011 stored elements in Compressed Sparse Row format>

In [4]:
#---- Dados de teste

df_test = pd.read_parquet('03-data/02-processed/02-test_data.parquet')

df_test.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN
0,276747,1885408226,7,The Golden Rule of Schmoozing,Aye Jaye,1998.0,Listen &amp; Live Audio,25.0,iowa city,iowa,usa,5
1,276762,N3453124715,4,,,,,25.0,duisburg,nordrhein-westfalen,germany,4
2,276772,3596151465,10,Henry der Held.,Roddy Doyle,2001.0,"Fischer (Tb.), Frankfurt",,bonn,nordrhein-westfalen,germany,3
3,276786,8478442588,6,El Elogio de La Sombra,Tanazaki,1998.0,Siruela,34.0,madrid,madrid,spain,4
4,276788,055310666X,10,False Memory,Dean R. Koontz,1999.0,Bantam Books,,mentone,california,usa,3


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22568 entries, 0 to 22567
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   User-ID              22568 non-null  int64  
 1   ISBN                 22568 non-null  object 
 2   Book-Rating          22568 non-null  int64  
 3   Book-Title           18886 non-null  object 
 4   Book-Author          18886 non-null  object 
 5   Year-Of-Publication  18886 non-null  object 
 6   Publisher            18886 non-null  object 
 7   Age                  15357 non-null  float64
 8   city                 22568 non-null  object 
 9   state                22568 non-null  object 
 10  country              22568 non-null  object 
 11  RN                   22568 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 2.1+ MB


In [6]:
#---- Dados de treino

df_train = pd.read_parquet('03-data/02-processed/05-ALS-df-train-data.parquet')

df_train.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN,USER_ID_CAT,IBSN_ID_CAT,USER_ID_INT,ISBN_ID_INT
0,276747,60517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,25.0,iowa city,iowa,usa,1,22404,3844,1,1
1,276747,671537458,9,Waiting to Exhale,Terry McMillan,1995,Pocket,25.0,iowa city,iowa,usa,2,22404,78067,1,2
2,276747,679776818,8,Birdsong: A Novel of Love and War,Sebastian Faulks,1997,Vintage Books USA,25.0,iowa city,iowa,usa,3,22404,83361,1,3
3,276747,943066433,7,How to Deal With Difficult People,Rick Brinkman,1995,Careertrack Inc.,25.0,iowa city,iowa,usa,4,22404,118092,1,4
4,276762,380711524,5,See Jane Run,Joy Fielding,1992,Avon,25.0,duisburg,nordrhein-westfalen,germany,1,22405,37575,2,5


In [7]:
# #---- De-para's:

# de_para_isbn = pd.read_parquet('03-data/02-processed/05-ALS-DE-PARA-ISBN.parquet')
# de_para_user = pd.read_parquet('03-data/02-processed/06-ALS-DE-PARA-USER.parquet')

In [8]:
# #---- Full data:

# df_full = pd.read_parquet('03-data/02-processed/03-full_data.parquet')

# df_full.head()

## 2. Modelagem

In [9]:
model = als.AlternatingLeastSquares(factors = 30)

model

  check_blas_config()


<implicit.cpu.als.AlternatingLeastSquares at 0x7d3538bb1f90>

In [10]:
model.fit(train_data)

100%|███████████████████████████████████████████| 15/15 [02:34<00:00, 10.32s/it]


## 3. Recomendações

### 3.1. Nível usuário

In [11]:
user_id_rec = 171011

de_para_user_id_rec = df_train.query(f'`User-ID` == {user_id_rec}')['USER_ID_INT'].iloc[0]

de_para_user_id_rec

14098

In [12]:
#---- TOP 10 livros mais bem avaliados pelo usuário

df_train\
    .query(f'`User-ID` == {user_id_rec}')\
    .query(f'`Book-Rating` != 0')\
    .sort_values(by = 'Book-Rating', ascending = False)[['ISBN', 'Book-Title', 'Book-Rating']]\
    .drop_duplicates()\
    .head(10)

Unnamed: 0,ISBN,Book-Title,Book-Rating
221525,0439064872,Harry Potter and the Chamber of Secrets (Book 2),10
221526,0439136369,Harry Potter and the Prisoner of Azkaban (Book 3),10
221527,0439139597,Harry Potter and the Goblet of Fire (Book 4),10
221528,043935806X,Harry Potter and the Order of the Phoenix (Boo...,10
221529,043936213X,Harry Potter and the Sorcerer's Stone (Book 1),10
221524,0140386335,The Neverending Story,9


In [13]:
#---- Lista de ISBN que o usuário já leu

readed_isbn_by_user = df_train\
    .query(f'`User-ID` == {user_id_rec}')['Book-Title']\
    .unique()\
    .tolist()

readed_isbn_by_user

['The Neverending Story',
 'Harry Potter and the Chamber of Secrets (Book 2)',
 'Harry Potter and the Prisoner of Azkaban (Book 3)',
 'Harry Potter and the Goblet of Fire (Book 4)',
 'Harry Potter and the Order of the Phoenix (Book 5)',
 "Harry Potter and the Sorcerer's Stone (Book 1)"]

In [14]:
not_recommended_list_to_user = df_train\
    .query(f'`Book-Title`.isin({readed_isbn_by_user})')['ISBN_ID_INT']\
    .unique()\
    .tolist()

In [15]:
#---- Top10 recomendações para o usuário

recommendations = model.recommend(de_para_user_id_rec,
                                  train_data[de_para_user_id_rec], 
                                  filter_already_liked_items = True,
                                  N = 10,
                                  filter_items = not_recommended_list_to_user)

print(f'Recomendações: {recommendations}')

list_recommendations = recommendations[0].tolist()

#---- De-para das recomendações para ISBN

isbn_recommendations = df_train\
    .query(f'ISBN_ID_INT.isin({list_recommendations})')['ISBN']\
    .unique()\
    .tolist()

df_train\
    .pipe(filter_list_ordered, 
          col = 'ISBN', 
          list_filter = isbn_recommendations)\
    [['ISBN', 'Book-Title']]\
    .drop_duplicates()['Book-Title'].unique()

Recomendações: (array([ 1623,   810,  1478,  2081,  2529, 19063,  1851,  5298,  5404,
         831], dtype=int32), array([0.7069525 , 0.32919478, 0.3209007 , 0.27279955, 0.2562572 ,
       0.2552646 , 0.21154246, 0.19130623, 0.1830472 , 0.17414919],
      dtype=float32))


array(['The Hobbit : The Enchanting Prelude to The Lord of the Rings',
       None, 'The Fellowship of the Ring (The Lord of the Rings, Part 1)',
       "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
       'American Gods', 'The Two Towers (The Lord of the Rings, Part 2)',
       'The Bad Beginning (A Series of Unfortunate Events, Book 1)',
       'Red Dragon',
       'The Reptile Room (A Series of Unfortunate Events, Book 2)',
       'The Return of the King (The Lord of the Rings, Part 3)'],
      dtype=object)

In [19]:
df_test\
    .query(f'`User-ID` == {user_id_rec}')

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN
14097,171011,590032747,9,Why I'm Afraid of Bees,R. L. Stine,1999,Apple,13.0,pipe creek,texas,usa,7


In [None]:
#---- Fazer a avaliação quantitativa, pois a amostra é viesada

### 3.2. Nível item (PAREI AQUI: ADAPTAR PARA O NOVO DATAFRAME)

In [16]:
isnb_rec = '043935806X'
de_para_isbn_rec = de_para_isbn.query(f'`ISBN` == "{isnb_rec}"')['ISBN_ID_INT'].iloc[0]
de_para_isbn_rec

NameError: name 'de_para_isbn' is not defined

In [None]:
#---- Vamos recomendar outros livros a partir desse livro:

df_full\
    .query(f'`ISBN` == "{isnb_rec}"')['Book-Title']\
    .unique()[0]

In [None]:
#---- Top10 recomendação para um livro:

recommendations = model.similar_items(itemid = de_para_isbn_rec)

list_recommendations = recommendations[0].tolist()

#---- De-para das recomendações para ISBN

isbn_recommendations = de_para_isbn\
    .query(f'ISBN_ID_INT.isin({list_recommendations})')['ISBN']\
    .tolist()

df_full\
    .pipe(filter_list_ordered, 
          col = 'ISBN', 
          list_filter = isbn_recommendations)\
    [['ISBN', 'Book-Title']]\
    .drop_duplicates()

In [None]:
test = pd.DataFrame.sparse.from_spmatrix(train_data)

test

In [None]:
#---------- PAREI AQUI:

# - Fazer a parte do item
# - Ler sobre o ALS

# TO-DO's

- Ler um pouco mais sobre a teoria do ALS
- Criar um código que exclua o primeiro item da recomendação do usuário

# Validações

In [None]:
df_train['FLAG_HP'] = np.where(df_train['Book-Title'].str.contains('Potter'), 1, 0)

In [None]:
test = df_train[['User-ID', 'FLAG_HP']]\
    .value_counts()\
    .reset_index()\
    .sort_values(by = ['User-ID', 'FLAG_HP'])

test['percent'] = test['count'] / test.groupby('User-ID')['count'].transform('sum') * 100

test\
    .sort_values(by = 'percent', ascending = False)\
    .query('FLAG_HP == 1 & count >= 3')\
    .head(20)

In [None]:
df_train\
    .query('`User-ID` == 252829')\
    .sort_values(by = 'Book-Title')