## ALS

## 0. Setup

In [3]:
#---- Bibliotecas:

# Análise e manipulação de dados:

import pandas as pd
import numpy as np

# Visualização de dados:

import plotly.io as pio
import plotly.express as px

# Manuseio nos dados

from os import chdir

# ALS

from implicit import als

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#---- Mudando o diretório raiz para a pasta principal do projeto

chdir('../../')

#---- Template dos gráficos:

pio.templates.default = "plotly_white"

#---- Carregando as funções

from functions import *

## 1. Extração dos dados

In [5]:
#---- Dados de treino

train_data = load_sparse_csr('03-data/02-processed/04-ALS-train-data.npz')

train_data

<22568x158235 sparse matrix of type '<class 'numpy.int64'>'
	with 346011 stored elements in Compressed Sparse Row format>

In [6]:
#---- Dados de teste

test_data = pd.read_parquet('03-data/02-processed/02-test_data.parquet')

test_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN
0,276747,1885408226,7,The Golden Rule of Schmoozing,Aye Jaye,1998.0,Listen &amp; Live Audio,25.0,iowa city,iowa,usa,5
1,276762,N3453124715,4,,,,,25.0,duisburg,nordrhein-westfalen,germany,4
2,276772,3596151465,10,Henry der Held.,Roddy Doyle,2001.0,"Fischer (Tb.), Frankfurt",,bonn,nordrhein-westfalen,germany,3
3,276786,8478442588,6,El Elogio de La Sombra,Tanazaki,1998.0,Siruela,34.0,madrid,madrid,spain,4
4,276788,055310666X,10,False Memory,Dean R. Koontz,1999.0,Bantam Books,,mentone,california,usa,3


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22568 entries, 0 to 22567
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   User-ID              22568 non-null  int64  
 1   ISBN                 22568 non-null  object 
 2   Book-Rating          22568 non-null  int64  
 3   Book-Title           18886 non-null  object 
 4   Book-Author          18886 non-null  object 
 5   Year-Of-Publication  18886 non-null  object 
 6   Publisher            18886 non-null  object 
 7   Age                  15357 non-null  float64
 8   city                 22568 non-null  object 
 9   state                22568 non-null  object 
 10  country              22568 non-null  object 
 11  RN                   22568 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 2.1+ MB


In [8]:
#---- De-para's:

de_para_isbn = pd.read_parquet('03-data/02-processed/05-ALS-DE-PARA-ISBN.parquet')
de_para_user = pd.read_parquet('03-data/02-processed/06-ALS-DE-PARA-USER.parquet')

## 2. Modelagem

In [9]:
model = als.AlternatingLeastSquares(factors = 50)

model

  check_blas_config()


<implicit.cpu.als.AlternatingLeastSquares at 0x7f045e4fc190>

In [10]:
model.fit(train_data)

100%|███████████████████████████████████████████| 15/15 [00:36<00:00,  2.46s/it]


In [11]:
user_id_rec = 22406

de_para_user_id_rec = de_para_user.query(f'`User-ID` == {user_id_rec}')['INDEX'].iloc[0]
de_para_user_id_rec

2

In [12]:
#---- Recomendação para um usuário:

recommendations = model.similar_users(userid = de_para_user_id_rec, N = 10)

recommendations

(array([    2, 14342, 15528, 19734,  2393,  2389, 16812,  8079, 11357,
         5468], dtype=int32),
 array([1.0000001 , 0.7422485 , 0.5969736 , 0.5844441 , 0.55926967,
        0.5444118 , 0.5401609 , 0.5090132 , 0.50190246, 0.49890992],
       dtype=float32))

In [13]:
isnb_rec = 47405
de_para_isbn_rec = de_para_isbn.query(f'`ISBN` == {isnb_rec}')['INDEX'].iloc[0]
de_para_isbn_rec

158230

In [14]:
#---- Recomendação para um item:

recommendations = model.similar_items(itemid = de_para_isbn_rec)

recommendations

(array([141338, 136833, 102820,  98982,  93536,  73339,  67517,  30689,
         10031,   3401], dtype=int32),
 array([1.0000001, 1.0000001, 1.0000001, 1.0000001, 1.0000001, 1.0000001,
        1.0000001, 1.0000001, 1.0000001, 1.0000001], dtype=float32))

# TO-DO's

- Ler um pouco mais sobre a teoria do ALS
- Criar um código que exclua o primeiro item da recomendação do usuário

In [15]:
test = pd.read_parquet('03-data/02-processed/03-full_data.parquet')

test

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,,tyler,texas,usa
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,,seattle,washington,usa
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,16.0,h,new south wales,australia
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,16.0,rijeka,,croatia
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,16.0,rijeka,,croatia
...,...,...,...,...,...,...,...,...,...,...,...
1149775,276704,1563526298,9,Get Clark Smart : The Ultimate Guide for the S...,Clark Howard,2000,Longstreet Press,,cedar park,texas,usa
1149776,276706,0679447156,0,Eight Weeks to Optimum Health: A Proven Progra...,Andrew Weil,1997,Alfred A. Knopf,18.0,quebec,quebec,canada
1149777,276709,0515107662,10,The Sherbrooke Bride (Bride Trilogy (Paperback)),Catherine Coulter,1996,Jove Books,38.0,mannington,west virginia,usa
1149778,276721,0590442449,10,Fourth Grade Rats,Jerry Spinelli,1996,Scholastic,14.0,providence,rhode island,usa


In [16]:
de_para_isbn

Unnamed: 0,INDEX,ISBN
0,0,3844
1,1,78067
2,2,83361
3,3,118092
4,4,37575
...,...,...
158230,158230,47405
158231,158231,66042
158232,158232,78240
158233,158233,38863


In [17]:
test\
    .query('`Book-Title`.notnull()')\
    .query('`Book-Title`.str.contains("Potter")')

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country
84,276788,043935806X,7,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,,mentone,california,usa
210,276847,3551551677,10,Harry Potter und der Stein der Weisen,Joanne K. Rowling,1999,Carlsen Verlag GmbH,27.0,köln,nordrhein-westfalen,germany
211,276847,3551551685,10,Harry Potter und die Kammer des Schreckens,Joanne K. Rowling,2000,Carlsen Verlag GmbH,27.0,köln,nordrhein-westfalen,germany
212,276847,3551551693,10,Harry Potter und der Gefangene von Azkaban,J. K. Rowling,1999,Carlsen Verlag GmbH,27.0,köln,nordrhein-westfalen,germany
213,276847,3551551936,10,Harry Potter Und Der Feuerkelch,Joanne K. Rowling,1999,Carlsen Verlag GmbH,27.0,köln,nordrhein-westfalen,germany
...,...,...,...,...,...,...,...,...,...,...,...
1148155,276165,0439139597,10,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,20.0,las vegas,nevada,usa
1148156,276165,043935806X,10,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,20.0,las vegas,nevada,usa
1148185,276165,0590353403,10,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,20.0,las vegas,nevada,usa
1148334,276231,0425154092,7,From Potter's Field,Patricia Daniels Cornwell,1996,Berkley Publishing Group,,livingston,texas,usa


In [25]:
test\
    .query('`Year-Of-Publication`.isin(["2003", "2004"])')\
    .groupby('Book-Title')\
    .agg(
        media_avaliacao_2003 = ('Book-Rating', lambda x: x[test['Year-Of-Publication'] == "2003"].mean()),
        media_avaliacao_2004 = ('Book-Rating', lambda x: x[test['Year-Of-Publication'] == "2004"].mean())
    )

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f04f8221180>>
Traceback (most recent call last):
  File "/home/rafa/Documentos/github/book-recommendation/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
from pyspark.sql.functions import *

df\
	.groupBy('id')\ # Agrupamento por ID
    .agg(mean\ # Média da variável 
         (when(col('idade') >= 18, col('renda')) # Filtro dentro do agg para trazer somente ID's com idade maior que 18
         ).alias('renda_idade_maior_18')) # Escolha do nome da variável

In [None]:
library(dplyr)

df %>% 
  group_by(id) %>% # Agrupamento por ID
  summarise(media_renda_idade_maior_18 = mean(ifelse(idade >= 18, renda, NA), na.rm = TRUE), # Média da variável de idade, somente para os IDs com idade maior que 18
            mediana_renda_idade_maior_18 = median(ifelse(idade >= 18, renda, NA), na.rm = TRUE) # Mediana da variável de idade, somente para os IDs com idade maior que 18
           ) 