# Criação dos dados para o ALS

## 0. Setup

In [1]:
#---- Libraries:

# Standard library (working directory / filesystem handling):

from os import chdir
from pathlib import Path

# Data analysis and manipulation:

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Data visualization:

import plotly.express as px
import plotly.io as pio

In [2]:
#---- Move the working directory to the project root

# Guarded so that re-running this cell does not climb two more levels:
# '03-data' is the folder the cells below read from and write to, so if it is
# already visible the working directory is taken to be the project root.
if not Path('03-data').is_dir():
    chdir('../../')

#---- Default template for the plots:

pio.templates.default = "plotly_white"

#---- Project helper functions

# Imported by name (instead of `from functions import *`) so the origin of
# every helper used in this notebook is explicit. The import stays after the
# chdir above — presumably `functions` lives in the project root; verify.
from functions import save_sparse_csr

## 1. Extração dos dados

In [3]:
#---- Training data:

# Path is relative to the working directory set in the setup cell.
train_path = '03-data/02-processed/01-train_data.parquet'
df_train = pd.read_parquet(train_path)

df_train.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN
0,276747,60517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,25.0,iowa city,iowa,usa,1
1,276747,671537458,9,Waiting to Exhale,Terry McMillan,1995,Pocket,25.0,iowa city,iowa,usa,2
2,276747,679776818,8,Birdsong: A Novel of Love and War,Sebastian Faulks,1997,Vintage Books USA,25.0,iowa city,iowa,usa,3
3,276747,943066433,7,How to Deal With Difficult People,Rick Brinkman,1995,Careertrack Inc.,25.0,iowa city,iowa,usa,4
4,276762,380711524,5,See Jane Run,Joy Fielding,1992,Avon,25.0,duisburg,nordrhein-westfalen,germany,1


## 2. Manipulação dos dados

### 2.1. Criação das variáveis de ID para entrar na matriz usuário X item

In [4]:
# Encode each raw identifier as the integer code of its pandas category.
# NOTE: 'IBSN_ID_CAT' (sic) is a misspelling of ISBN; it is kept as-is because
# the cells below and the saved training parquet use this exact column name.
for raw_col, code_col in (('User-ID', 'USER_ID_CAT'), ('ISBN', 'IBSN_ID_CAT')):
    df_train[code_col] = df_train[raw_col].astype('category').cat.codes

In [5]:
# User dimension table: one row per distinct (User-ID, USER_ID_CAT) pair,
# numbered 1..N in order of first appearance in df_train.
df_dim_user = (
    df_train[['User-ID', 'USER_ID_CAT']]
    .drop_duplicates()
    .reset_index(drop = True)
    # After the reset the index is 0..N-1, so index + 1 gives the 1-based ID.
    .assign(USER_ID_INT = lambda dim: dim.index + 1)
)

df_dim_user

Unnamed: 0,User-ID,USER_ID_CAT,USER_ID_INT
0,276747,22404,1
1,276762,22405,2
2,276772,22406,3
3,276786,22407,4
4,276788,22408,5
...,...,...,...
22563,276680,22399,22564
22564,276681,22400,22565
22565,276683,22401,22566
22566,276688,22402,22567


In [6]:
# Book dimension table: one row per distinct (ISBN, IBSN_ID_CAT) pair,
# numbered 1..N in order of first appearance in df_train.
df_dim_isbn = (
    df_train[['ISBN', 'IBSN_ID_CAT']]
    .drop_duplicates()
    .reset_index(drop = True)
    # After the reset the index is 0..N-1, so index + 1 gives the 1-based ID.
    .assign(ISBN_ID_INT = lambda dim: dim.index + 1)
)

df_dim_isbn.head()

Unnamed: 0,ISBN,IBSN_ID_CAT,ISBN_ID_INT
0,60517794,3844,1
1,671537458,78067,2
2,679776818,83361,3
3,943066433,118092,4
4,380711524,37575,5


In [7]:
#---- Attach the 1-based integer IDs from the dimension tables

# Dropping any existing USER_ID_INT / ISBN_ID_INT first keeps this cell
# idempotent: df_train is overwritten here, so without the drop a second run
# would merge onto a frame that already has those columns and pandas would
# produce *_x / *_y suffixed copies instead.
# validate = 'many_to_one' makes pandas raise a MergeError if a dimension
# table ever holds repeated keys, rather than silently duplicating rating rows.
df_train = (
    df_train
    .drop(columns = ['USER_ID_INT', 'ISBN_ID_INT'], errors = 'ignore')
    .merge(df_dim_user, on = ['User-ID', 'USER_ID_CAT'], how = 'left', validate = 'many_to_one')
    .merge(df_dim_isbn, on = ['ISBN', 'IBSN_ID_CAT'], how = 'left', validate = 'many_to_one')
)

df_train.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age,city,state,country,RN,USER_ID_CAT,IBSN_ID_CAT,USER_ID_INT,ISBN_ID_INT
0,276747,60517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,25.0,iowa city,iowa,usa,1,22404,3844,1,1
1,276747,671537458,9,Waiting to Exhale,Terry McMillan,1995,Pocket,25.0,iowa city,iowa,usa,2,22404,78067,1,2
2,276747,679776818,8,Birdsong: A Novel of Love and War,Sebastian Faulks,1997,Vintage Books USA,25.0,iowa city,iowa,usa,3,22404,83361,1,3
3,276747,943066433,7,How to Deal With Difficult People,Rick Brinkman,1995,Careertrack Inc.,25.0,iowa city,iowa,usa,4,22404,118092,1,4
4,276762,380711524,5,See Jane Run,Joy Fielding,1992,Avon,25.0,duisburg,nordrhein-westfalen,germany,1,22405,37575,2,5


In [8]:
#---- Building the user x item matrix

# Values are the ratings; rows are users and columns are books.
# USER_ID_INT / ISBN_ID_INT start at 1, so row 0 and column 0 receive no
# entries. No explicit shape is passed: scipy infers it from the largest index.
# NOTE(review): per the scipy docs, repeated (row, col) pairs are summed when
# building a CSR matrix — confirm each user/book pair occurs once in df_train.
ratings = df_train['Book-Rating']
user_rows = df_train['USER_ID_INT']
item_cols = df_train['ISBN_ID_INT']

sparse_matrix = csr_matrix((ratings, (user_rows, item_cols)))

sparse_matrix

<22569x158236 sparse matrix of type '<class 'numpy.int64'>'
	with 346011 stored elements in Compressed Sparse Row format>

## 3. Salvando os dados

In [18]:
#---- User x item matrix

# save_sparse_csr is a project helper (from `functions`); presumably it writes
# the CSR matrix to the given .npz file — verify against its definition.
als_matrix_path = '03-data/02-processed/04-ALS-train-data.npz'
save_sparse_csr(filename = als_matrix_path, array = sparse_matrix)

In [16]:
#---- Training data

# Written without the DataFrame index; df_train at this point also carries the
# ID columns created in section 2.1.
df_train.to_parquet(
    '03-data/02-processed/05-ALS-df-train-data.parquet',
    index = False,
)

In [17]:
# #---- Lookup (DE-PARA) tables: original ID -> generated IDs

# NOTE(review): this export is disabled. The same mappings remain available in
# the saved training parquet, because df_train carries both the original and
# the generated ID columns. The first file name below reuses the '05-' prefix
# of the training parquet — confirm the intended numbering before re-enabling.

# df_dim_isbn\
#     .to_parquet('03-data/02-processed/05-ALS-DE-PARA-ISBN.parquet', index = False)

# df_dim_user\
#     .to_parquet('03-data/02-processed/06-ALS-DE-PARA-USER.parquet', index = False)