### Importar base de dados e instalar framework de recomendação

In [None]:
! wget https://github.com/caserec/4EABDARecSys/blob/main/dataset/AmazonMusicDataset.tar.xz
! tar -xf AmazonMusic.tar.xz
! pip install caserecommender

### Importar bibliotecas

In [None]:
import pandas as pd
import numpy as np

### Explorar Dados

In [None]:
# Read the item metadata CSV from the extracted archive and preview its first rows.
dataset_metadata = pd.read_csv(
    filepath_or_buffer='./AmazonMusic/amazon_music_metadata.csv',
)
dataset_metadata.head(n=5)

In [None]:
# Read the reviews file; lines=True parses it as one JSON object per line.
dataset = pd.read_json(
    path_or_buf='./AmazonMusic/Digital_Music_5.json',
    lines=True,
)
dataset.head(n=5)

In [None]:
# Bar chart of how many reviews carry each `overall` value, one colour per bar.
dataset['overall'].value_counts().plot.bar(color=['r', 'g', 'y', 'c', 'b']);

In [None]:
# Keep only the columns that describe an interaction: who, which item, which rating.
df = dataset.loc[:, ['reviewerID', 'asin', 'overall']]
df.tail(n=5)

In [None]:
# Attach each item's title to its interactions. No `on=` is given, so pandas
# joins on the columns the two frames share.
# NOTE(review): if the metadata holds several rows for the same asin, this
# inner join will repeat the matching interactions — confirm against the data.
df = pd.merge(df, dataset_metadata[['asin', 'title']])
df.tail(n=5)

### Número de usuários, itens e interações

In [None]:
# Report the number of distinct users, distinct items, and total rows (interactions).
print(
    """
Número de usuários: {}
Número de itens: {}
Número de interações: {}
""".format(
        df['reviewerID'].nunique(),
        df['asin'].nunique(),
        len(df),
    )
)

### Mapeamento em idx

In [None]:
# Assign every distinct raw identifier a consecutive integer index, numbered in
# order of first appearance in df.
map_users = {
    raw_user: position
    for position, raw_user in enumerate(pd.unique(df['reviewerID']))
}
map_items = {
    raw_item: position
    for position, raw_item in enumerate(pd.unique(df['asin']))
}

In [None]:
# Replace the raw user and item identifiers with their integer indices.
for column, id_map in (('reviewerID', map_users), ('asin', map_items)):
    df[column] = df[column].map(id_map)
df

In [None]:
# Lookup table from item index (the mapped `asin`) to the item's title.
# Built column-wise instead of with a row-by-row iterrows() loop: it gives the
# same mapping (a later row overwrites an earlier one for the same asin) and
# avoids attribute-style access to the 'title' column.
map_title = dict(zip(df['asin'], df['title']))

In [None]:
# A notebook cell only echoes its last expression, so the size is printed
# explicitly; otherwise it would be computed and silently discarded.
print(len(map_title))
# Title stored for item index 100.
map_title[100]

In [None]:
# Number of interactions per user: non-null count of each remaining column,
# grouped by reviewerID.
df.groupby(by='reviewerID').count()

### Divisão do dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the interactions as the test set; the fixed random_state
# makes the split reproducible across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Persist the splits as headerless, tab-separated files for the recommender.
# Only the user / item / rating columns are written: the free-text `title`
# column is not part of the interaction triple, and a title containing a tab
# or a line break would corrupt a tab-separated, line-oriented file.
interaction_columns = ['reviewerID', 'asin', 'overall']
train.to_csv('train.txt', columns=interaction_columns, index=False, header=False, sep='\t')
test.to_csv('test.txt', columns=interaction_columns, index=False, header=False, sep='\t')

In [None]:
# Number of interactions held out in the test split.
len(test)

### Recomendadores

### Prever notas

In [None]:
from caserec.recommenders.rating_prediction.most_popular import MostPopular

In [None]:
# Run the rating-prediction MostPopular baseline on the saved splits.
# The third argument is the output file that the next cell reads back.
MostPopular(
    'train.txt',
    'test.txt',
    'out_mp_pred.txt',
).compute()

In [None]:
# Load the predictions file (tab-separated, no header row) with explicit column names.
df_pred = pd.read_table(
    'out_mp_pred.txt',
    names=['reviewerID', 'asin', 'pred'],
)
df_pred.head(n=5)

In [None]:
# Test-set interactions that belong to the user with index 0.
test.loc[test['reviewerID'] == 0]

In [None]:
# Join the predictions with the held-out interactions so that each prediction
# sits next to the true `overall` rating. No `on=` is given, so pandas joins on
# the columns the two frames share.
df_pred = pd.merge(df_pred, test)

In [None]:
# Distribution of the true ratings among the merged test interactions.
df_pred['overall'].value_counts().plot.bar()

In [None]:
# Round each predicted rating to the nearest integer with the built-in round(),
# then plot the distribution of the rounded predictions.
df_pred['pred'] = df_pred['pred'].map(round)
df_pred['pred'].value_counts().plot.bar();

### Top N

In [None]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular as MPR

In [None]:
MPR('train.txt', 'test.txt', 'out_mp_pred.txt').compute()

In [None]:
MPR('train.txt', 'test.txt', 'out_mp_pred_binary.txt', as_binary=True).compute(as_table=True, metrics=['PREC'])

In [None]:
ranking = pd.read_csv('out_mp_pred.txt', sep='\t', names=['reviewerID', 'asin', 'score'])
ranking['title'] = ranking.asin.map(map_title)
ranking.head(15)

In [None]:
# Load the ranking from the as_binary run (tab-separated, no header row) and
# attach item titles via the index -> title lookup.
ranking = pd.read_table(
    'out_mp_pred_binary.txt',
    names=['reviewerID', 'asin', 'score'],
)
ranking['title'] = ranking['asin'].map(map_title)
ranking.head(n=15)