## ALS (Alternating Least Squares)

## 0. Setup

In [1]:
#---- Bibliotecas:

# Análise e manipulação de dados:

import pandas as pd
import numpy as np

# Visualização de dados:

import plotly.io as pio
import plotly.express as px

# Manuseio nos dados

from os import chdir

# ALS

from implicit import als

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#---- Switch the working directory to the project root folder
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel moves two more levels up each time — run it once per session.

chdir('../../')

#---- Default template for the plotly figures:

pio.templates.default = "plotly_white"

#---- Load the project helper functions
# NOTE(review): the wildcard import hides which names come from `functions`;
# presumably `load_sparse_csr` (used to load the training data) is one of them — confirm.

from functions import *

## 1. Extração dos dados

In [3]:
#---- Training data: sparse matrix read from its .npz file by the project helper

TRAIN_DATA_PATH = '03-data/02-processed/04-ALS-train-data.npz'
train_data = load_sparse_csr(TRAIN_DATA_PATH)

train_data

<22569x158236 sparse matrix of type '<class 'numpy.int64'>'
	with 346011 stored elements in Compressed Sparse Row format>

In [4]:
#---- Test data: parquet table; only the first rows are displayed

TEST_DATA_PATH = '03-data/02-processed/02-test_data.parquet'
test_data = pd.read_parquet(TEST_DATA_PATH)

test_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN
0,276747,1885408226,7,The Golden Rule of Schmoozing,Aye Jaye,1998.0,Listen &amp; Live Audio,25.0,iowa city,iowa,usa,5
1,276762,N3453124715,4,,,,,25.0,duisburg,nordrhein-westfalen,germany,4
2,276772,3596151465,10,Henry der Held.,Roddy Doyle,2001.0,"Fischer (Tb.), Frankfurt",,bonn,nordrhein-westfalen,germany,3
3,276786,8478442588,6,El Elogio de La Sombra,Tanazaki,1998.0,Siruela,34.0,madrid,madrid,spain,4
4,276788,055310666X,10,False Memory,Dean R. Koontz,1999.0,Bantam Books,,mentone,california,usa,3


In [5]:
# Summary of the test data columns: dtypes and non-null counts
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22568 entries, 0 to 22567
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   User-ID              22568 non-null  int64  
 1   ISBN                 22568 non-null  object 
 2   Book-Rating          22568 non-null  int64  
 3   Book-Title           18886 non-null  object 
 4   Book-Author          18886 non-null  object 
 5   Year-Of-Publication  18886 non-null  object 
 6   Publisher            18886 non-null  object 
 7   Age                  15357 non-null  float64
 8   city                 22568 non-null  object 
 9   state                22568 non-null  object 
 10  country              22568 non-null  object 
 11  RN                   22568 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 2.1+ MB


In [6]:
#---- "De-para" (from-to) lookup tables, loaded in the order ISBN, user:

de_para_isbn, de_para_user = (
    pd.read_parquet(lookup_path)
    for lookup_path in (
        '03-data/02-processed/05-ALS-DE-PARA-ISBN.parquet',
        '03-data/02-processed/06-ALS-DE-PARA-USER.parquet',
    )
)

## 2. Modelagem

In [7]:
#---- ALS model with 50 latent factors.
# The factors are initialised randomly; fixing `random_state` makes a fresh
# "Restart & Run All" produce the same model and the same recommendations.

model = als.AlternatingLeastSquares(factors = 50, random_state = 42)

model

  check_blas_config()


<implicit.cpu.als.AlternatingLeastSquares at 0x7f6492da8640>

In [8]:
# Fit the model on the training matrix loaded above
model.fit(train_data)

100%|███████████████████████████████████████████| 15/15 [00:36<00:00,  2.40s/it]


In [11]:
# Inspect the user lookup table (columns shown: User-ID, USER_ID_CAT, USER_ID_INT)
de_para_user

Unnamed: 0,User-ID,USER_ID_CAT,USER_ID_INT
0,276747,22404,1
1,276762,22405,2
2,276772,22406,3
3,276786,22407,4
4,276788,22408,5
...,...,...,...
22563,276680,22399,22564
22564,276681,22400,22565
22565,276683,22401,22566
22566,276688,22402,22567


In [52]:
# Original `User-ID` of the user to generate recommendations for
user_id_rec = 88733

# Take `USER_ID_INT` from the first lookup row matching that user
is_target_user = de_para_user['User-ID'] == user_id_rec
de_para_user_id_rec = de_para_user.loc[is_target_user, 'USER_ID_INT'].iloc[0]
de_para_user_id_rec

7481

In [53]:
#---- Recommendations for a single user:

# Row of the training matrix selected with the user's integer id
user_row = train_data[de_para_user_id_rec]

recommendations = model.recommend(
    de_para_user_id_rec,
    user_row,
    filter_already_liked_items = True,
)

recommendations

(array([ 436, 1267, 2393, 2308, 1851, 1871, 2228,  135, 1472,  981],
       dtype=int32),
 array([1.8541181, 1.8107033, 1.6902206, 1.457916 , 1.440654 , 1.3504819,
        1.2806928, 1.2370964, 1.1571505, 1.1270586], dtype=float32))

In [54]:
#---- Map the recommended item ids back to ISBNs.
# The id array is passed to `isin` directly: interpolating
# `list(recommendations[0])` into a query string depends on how NumPy prints
# its scalars (NumPy >= 2 renders them as `np.int32(436)`, which is not a
# plain literal inside the query string).
is_recommended = de_para_isbn['ISBN_ID_INT'].isin(recommendations[0])
rec_list = de_para_isbn.loc[is_recommended, 'ISBN'].tolist()

#---- Titles of the recommended books.
# The two needed columns are read here so this cell does not rely on the
# `test` frame, which is only created in a later cell (NameError on a fresh
# top-to-bottom run).
book_catalogue = pd.read_parquet(
    '03-data/02-processed/03-full_data.parquet',
    columns = ['ISBN', 'Book-Title'],
)

book_catalogue\
    .loc[book_catalogue['ISBN'].isin(rec_list)]\
    .drop_duplicates()

Unnamed: 0,ISBN,Book-Title
429,0385504209,The Da Vinci Code
806,0671003755,She's Come Undone (Oprah's Book Club (Paperback))
1142,0671510053,SHIPPING NEWS
1290,0316601950,The Pilot's Wife : A Novel
1388,0312195516,The Red Tent (Bestselling Backlist)
2385,0618002219,The Hobbit: or There and Back Again
2978,067976402X,Snow Falling on Cedars
3526,0446605239,The Notebook
5916,0380789035,American Gods
11326,0440219078,The Giver (21st Century Reference)


In [13]:
# Value matched against the `ISBN` column of the item lookup table
# NOTE(review): `isnb_rec` looks like a typo of `isbn_rec`, and this lookup reads
# the `INDEX` column while the user flow filters on `ISBN_ID_INT` — confirm both.
isnb_rec = 47405

is_target_isbn = de_para_isbn['ISBN'] == isnb_rec
de_para_isbn_rec = de_para_isbn.loc[is_target_isbn, 'INDEX'].iloc[0]
de_para_isbn_rec

158230

In [14]:
#---- Items similar to the selected item:
# NOTE: this rebinds `recommendations`, replacing the user-level result.

recommendations = model.similar_items(
    itemid = de_para_isbn_rec,
)

recommendations

(array([141338, 136833, 102820,  98982,  93536,  73339,  67517,  30689,
         10031,   3401], dtype=int32),
 array([1.0000001, 1.0000001, 1.0000001, 1.0000001, 1.0000001, 1.0000001,
        1.0000001, 1.0000001, 1.0000001, 1.0000001], dtype=float32))

## TO-DOs

- Ler um pouco mais sobre a teoria do ALS
- Criar um código que exclua o primeiro item da recomendação do usuário

In [12]:
# Full data table; kept under the name `test` because the exploration cells
# below refer to it by that name
FULL_DATA_PATH = '03-data/02-processed/03-full_data.parquet'
test = pd.read_parquet(FULL_DATA_PATH)

test

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,,tyler,texas,usa
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,,seattle,washington,usa
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,16.0,h,new south wales,australia
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,16.0,rijeka,,croatia
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,16.0,rijeka,,croatia
...,...,...,...,...,...,...,...,...,...,...,...
1149775,276704,1563526298,9,Get Clark Smart : The Ultimate Guide for the S...,Clark Howard,2000,Longstreet Press,,cedar park,texas,usa
1149776,276706,0679447156,0,Eight Weeks to Optimum Health: A Proven Progra...,Andrew Weil,1997,Alfred A. Knopf,18.0,quebec,quebec,canada
1149777,276709,0515107662,10,The Sherbrooke Bride (Bride Trilogy (Paperback)),Catherine Coulter,1996,Jove Books,38.0,mannington,west virginia,usa
1149778,276721,0590442449,10,Fourth Grade Rats,Jerry Spinelli,1996,Scholastic,14.0,providence,rhode island,usa


In [41]:
# Per-user count of rows with a non-null title containing "Potter" and a
# non-zero rating
has_title = test['Book-Title'].notnull()
was_rated = test['Book-Rating'] != 0
mentions_potter = test['Book-Title'].str.contains("Potter", na = False)

test.loc[has_title & was_rated & mentions_potter, 'User-ID'].value_counts()

User-ID
11676     17
252829    10
88733      8
267830     7
254859     7
          ..
112931     1
113821     1
113830     1
113944     1
276231     1
Name: count, Length: 1189, dtype: int64

In [51]:
# Distinct (rating, title) pairs recorded for user 88733, reduced to the titles
is_user = test['User-ID'] == 88733
user_books = test.loc[is_user, ['Book-Rating', 'Book-Title']].drop_duplicates()

user_books['Book-Title'].tolist()

Unnamed: 0,Book-Rating,Book-Title
369366,7,The Best of Rock: The Essential Cd Guide (The ...
369367,0,Joshua
369368,0,Joshua In the Holy Land
369369,8,Horse and His Boy
369370,0,"Lion, the Witch and the Wardrobe"
...,...,...
370518,8,Daybreak
370519,0,Black Box Voting: Vote Tampering in the 21st C...
370520,0,Skinwalker
370521,0,The Spy's Guide: Office Espionage


In [55]:
# Disabled scratch query: distinct book titles containing "Potter".
# test\
#     .query('`Book-Title`.notnull()')\
#     .query('`Book-Title`.str.contains("Potter")')['Book-Title'].unique()