# Aula 05 - Exemplos

In [1]:
import numpy as np
import pandas as pd

## Fazendo download da base

In [1]:
# The imported module is not referenced by name in this cell; the download
# itself is executed through the shell line below.
import wget
# Fetch the compressed dataset archive into the current working directory.
# NOTE(review): `python3` is resolved by the shell and may not be the kernel's
# interpreter — confirm wget is installed for that interpreter too.
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
# Unpack the archive; the listing below shows the files land under ./dataset/.
!tar -xvzf ml-20m-compact.tar.gz

100% [....................................................] 65019041 / 65019041
Saved under ml-20m-compact.tar.gz
x dataset/
x dataset/tags_sample.csv
x dataset/.DS_Store
x dataset/movies_sample.csv
x dataset/genome-tags.csv
x dataset/ml-youtube.csv
x dataset/genome-scores.csv
x dataset/ratings_sample.csv


## Ler e preparar dados (vide notebooks anteriores)


In [2]:
# Load the movie catalogue and the ratings sample extracted from the archive.
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')

# Keep only the (user, item, rating) triple and attach each movie's title.
# The join key is spelled out so the merge can never silently pick up another
# column the two frames might come to share.
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']], on='movieId')

# Re-index users and items to contiguous 0-based ids, numbered in order of
# first appearance in the merged frame.
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)

# Lookup from the re-indexed item id to its title. Built column-wise instead of
# with a row-by-row iterrows() loop, which creates one Series per rating row;
# the resulting mapping is the same (the last title seen for an id wins).
map_title = dict(zip(df['movieId'], df['title']))

## Avaliação no cenário de predição de notas (rating prediction)

### Cross-Validation

In [3]:
from caserec.utils.cross_validation import CrossValidation
from caserec.recommenders.rating_prediction.itemknn import ItemKNN

# Dump the prepared frame as a tab-separated file with no header row and no
# index column. Every column of `df` is written, not only the rating triple.
df.to_csv('ratings.dat', index=False, header=False, sep='\t')

# Run 5-fold cross-validation of the rating-prediction ItemKNN on that file,
# with the fold files placed in the current directory.
# NOTE(review): the file above is written with header=False, yet header=1 is
# passed here — confirm this does not make the library skip the first rating.
recommender = ItemKNN()
CrossValidation(input_file='ratings.dat', recommender=recommender, dir_folds='./', header=1, k_folds=5).compute()

[Case Recommender: Cross Validation]

Database:: ratings.dat 
Recommender Algorithm:: ItemKNN Algorithm | K Folds: 5

Eval:: MAE: 0.80243 RMSE: 1.068182 
Eval:: MAE: 0.801585 RMSE: 1.065781 
Eval:: MAE: 0.799916 RMSE: 1.06408 
Eval:: MAE: 0.801509 RMSE: 1.064542 
Eval:: MAE: 0.800297 RMSE: 1.06173 
Mean:: MAE: 0.801147 RMSE: 1.064863 
STD:: MAE: 0.000917 RMSE: 0.002116 


### Hold-Out

In [4]:
from sklearn.model_selection import train_test_split

# Random 80/20 hold-out split of the rating rows; random_state is fixed so the
# same split is produced on every run.
train, test = train_test_split(df, test_size=.2, random_state=2)
# Persist both partitions as header-less, tab-separated files for the library.
train.to_csv('train.dat', index=False, header=False, sep='\t')
test.to_csv('test.dat', index=False, header=False, sep='\t')


# Train on train.dat and evaluate on test.dat. `ItemKNN` is whatever class the
# name is bound to when this cell runs (the rating-prediction one on a
# top-to-bottom run). The third argument is presumably the output file for the
# predictions — a later cell reads './rp_iknn.dat'.
# NOTE(review): the effect of as_similar_first=True is not visible here — confirm in the library docs.
ItemKNN('train.dat', 'test.dat', 'rp_iknn.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 3.721757 sec
prediction_time:: 2.289772 sec
Eval:: MAE: 0.665964 RMSE: 0.876739 


### Explorando as predições

In [5]:
# Read the tab-separated predictions file, naming its three columns explicitly
# because the file carries no header row.
preds = pd.read_csv(
    './rp_iknn.dat',
    sep='\t',
    names=['userId', 'movieId', 'rating'],
)
# Predicted ratings for the user with (re-indexed) id 0, in file order.
preds_user = preds['rating'][preds['userId'] == 0].tolist()
print(preds_user)

[4.26384, 3.824934]


In [6]:
# True ratings of user 0, listed in the same order as that user's rows in the
# predictions frame. The previous version took them in `test` row order, which
# is the shuffled split order and is not guaranteed to match the prediction
# file; since the error metric pairs values by position, the two lists must be
# aligned on (userId, movieId). A left merge keeps the order of the left frame.
# Assumes each (userId, movieId) pair occurs at most once in `test`.
user_pairs = preds.loc[preds.userId == 0, ['userId', 'movieId']]
ratings_user = user_pairs.merge(
    test[['userId', 'movieId', 'rating']],
    on=['userId', 'movieId'],
    how='left',
)['rating'].tolist()
print(ratings_user)

[5.0, 5.0]


In [7]:
from math import sqrt

def rmse_user(preds, ratings):
    """Root-mean-square error between paired predictions and true ratings.

    Args:
        preds: sequence of predicted ratings.
        ratings: sequence of true ratings, paired with ``preds`` by position.

    Returns:
        The RMSE as a float, or -1 when the inputs cannot be scored: the two
        sequences differ in length, or they are empty (the mean of zero
        squared errors is undefined and previously raised ZeroDivisionError).
    """
    if len(preds) != len(ratings) or len(preds) == 0:
        return -1
    # Accumulate in a local that does not shadow the builtin `sum`.
    squared_error = 0.0
    for predicted, actual in zip(preds, ratings):
        squared_error += (predicted - actual) ** 2
    return sqrt(squared_error / len(preds))

# Error for the single user inspected above: predictions and true ratings are
# paired by position.
print(rmse_user(preds_user, ratings_user))

0.9804875445297611


## Avaliação no cenário de recomendação de itens (item recommendation)

In [8]:
from caserec.recommenders.item_recommendation.bprmf import BprMF

# Fit BPR-MF with 3 latent factors on train.dat and evaluate against test.dat.
# The third argument is presumably the output file for the rankings — a later
# cell reads './ir_bprmf.dat'.
# NOTE(review): no seed is passed here, so results may differ between runs —
# confirm whether the class accepts one.
BprMF('train.dat', 'test.dat', 'ir_bprmf.dat', factors=3).compute()

[Case Recommender: Item Recommendation > BPRMF]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 105.370493 sec
prediction_time:: 2.090727 sec


Eval:: PREC@1: 0.37475 PREC@3: 0.263956 PREC@5: 0.223784 PREC@10: 0.172322 RECALL@1: 0.118124 RECALL@3: 0.23802 RECALL@5: 0.329635 RECALL@10: 0.500156 MAP@1: 0.37475 MAP@3: 0.459123 MAP@5: 0.463995 MAP@10: 0.439807 NDCG@1: 0.37475 NDCG@3: 0.540146 NDCG@5: 0.561823 NDCG@10: 0.562711 


In [9]:
# Import under an alias. The rating-prediction cells earlier in the notebook
# use the bare name `ItemKNN` for a different class; re-binding that name here
# would make those cells pick up the item-recommendation class (and fail or
# behave differently) if they are re-run after this one.
from caserec.recommenders.item_recommendation.itemknn import ItemKNN as ItemKNNRanking

# Fit the item-recommendation ItemKNN on train.dat and evaluate against
# test.dat; the third argument is presumably the output file for the rankings.
ItemKNNRanking('train.dat', 'test.dat', 'ir_itemknn.dat').compute()

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 11090 users and 403 items (152496 interactions) | sparsity:: 96.59%
test data:: 10503 users and 340 items (38125 interactions) | sparsity:: 98.93%

training_time:: 1.059788 sec
prediction_time:: 37.244149 sec


Eval:: PREC@1: 0.419023 PREC@3: 0.307975 PREC@5: 0.254384 PREC@10: 0.187261 RECALL@1: 0.134438 RECALL@3: 0.281617 RECALL@5: 0.378529 RECALL@10: 0.546924 MAP@1: 0.419023 MAP@3: 0.513599 MAP@5: 0.516788 MAP@10: 0.487732 NDCG@1: 0.419023 NDCG@3: 0.603503 NDCG@5: 0.620406 NDCG@10: 0.613585 


### Explorando as recomendações

In [10]:
# Read the tab-separated ranking file, naming its three columns explicitly
# because the file carries no header row.
recs = pd.read_csv(
    './ir_bprmf.dat',
    sep='\t',
    names=['userId', 'movieId', 'score'],
)
# Item ids recommended to the user with (re-indexed) id 1, in file order.
recs_user = recs['movieId'][recs['userId'] == 1].tolist()
print(recs_user)

[12, 22, 10, 4, 17, 21, 30, 33, 13, 28]


In [11]:
# Item ids that user 1 actually rated in the held-out test partition.
ground_truth = test['movieId'][test['userId'] == 1].tolist()
print(ground_truth)

[6, 106, 21, 30, 12]


In [12]:
# Items that were both recommended and present in the user's test set.
intersec = list(set(recs_user) & set(ground_truth))

# Precision = hits / number of recommendations; recall = hits / number of
# relevant items. Each ratio falls back to 0.0 when its denominator is empty
# (a user with no recommendations or no test items) instead of raising
# ZeroDivisionError.
precision = len(intersec) / len(recs_user) if recs_user else 0.0
recall = len(intersec) / len(ground_truth) if ground_truth else 0.0
print('Precisão: ' + str(precision))
print('Revocação: ' + str(recall))

Precisão: 0.3
Revocação: 0.6
