# Criação dos dados para o ALS

## 0. Setup

In [2]:
#---- Bibliotecas:

# Análise e manipulação de dados:

import pandas as pd
import numpy as np

# Visualização de dados:

import plotly.io as pio
import plotly.express as px

# Manuseio nos dados

from os import chdir

In [3]:
#---- Mudando o diretório raiz para a pasta principal do projeto

chdir('../../')

#---- Template dos gráficos:

pio.templates.default = "plotly_white"

#---- Carregando as funções

from functions import *

## 1. Extração dos dados

In [4]:
#---- Dados de treino:

df_train = pd.read_parquet('03-data/02-processed/01-train_data.parquet')

df_train.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN
0,276747,60517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,25.0,iowa city,iowa,usa,1
1,276747,671537458,9,Waiting to Exhale,Terry McMillan,1995,Pocket,25.0,iowa city,iowa,usa,2
2,276747,679776818,8,Birdsong: A Novel of Love and War,Sebastian Faulks,1997,Vintage Books USA,25.0,iowa city,iowa,usa,3
3,276747,943066433,7,How to Deal With Difficult People,Rick Brinkman,1995,Careertrack Inc.,25.0,iowa city,iowa,usa,4
4,276762,380711524,5,See Jane Run,Joy Fielding,1992,Avon,25.0,duisburg,nordrhein-westfalen,germany,1


## 2. Manuseio nos dados

In [5]:
#---- Criando a matriz de usuário x item

sparse_matrix = csr_matrix(
    (
        df_train['Book-Rating'], 
        (df_train['User-ID'].astype('category').cat.codes, df_train['ISBN'].astype('category').cat.codes)
    )
)

sparse_matrix

<22568x158235 sparse matrix of type '<class 'numpy.int64'>'
	with 346011 stored elements in Compressed Sparse Row format>

In [6]:
# pd.DataFrame.sparse.from_spmatrix(sparse_matrix)

In [9]:
df_train['ISBN']\
    .astype('category')\
    .cat\
    .codes

0           3844
1          78067
2          83361
3         118092
4          37575
           ...  
346006    121371
346007     28227
346008     38863
346009     89828
346010    101701
Length: 346011, dtype: int32

In [6]:
#---- DE-PARA de ISBN para index

df_de_para_isbn = df_train['ISBN']\
    .astype('category')\
    .cat\
    .codes\
    .drop_duplicates()\
    .to_frame('ISBN')\
    .reset_index(drop = True)\
    .reset_index()\
    .rename(columns = {'index': 'INDEX'})

df_de_para_isbn

Unnamed: 0,INDEX,ISBN
0,0,3844
1,1,78067
2,2,83361
3,3,118092
4,4,37575
...,...,...
158230,158230,47405
158231,158231,66042
158232,158232,78240
158233,158233,38863


In [7]:
#---- DE-PARA de USER_ID para index

df_de_para_user = df_train['User-ID']\
    .astype('category')\
    .cat\
    .codes\
    .drop_duplicates()\
    .to_frame('User-ID')\
    .reset_index(drop = True)\
    .reset_index()\
    .rename(columns = {'index': 'INDEX'})

df_de_para_user

Unnamed: 0,INDEX,User-ID
0,0,22404
1,1,22405
2,2,22406
3,3,22407
4,4,22408
...,...,...
22563,22563,22399
22564,22564,22400
22565,22565,22401
22566,22566,22402


## 3. Salvando os dados

In [8]:
#---- Matriz usuário e item

save_sparse_csr(array = sparse_matrix, filename = '03-data/02-processed/04-ALS-train-data.npz')

In [9]:
#---- DE-PARA

df_de_para_isbn\
    .to_parquet('03-data/02-processed/05-ALS-DE-PARA-ISBN.parquet', index = False)

df_de_para_user\
    .to_parquet('03-data/02-processed/06-ALS-DE-PARA-USER.parquet', index = False)