# Aula 06 - Filtragem Híbrida - Exemplos

## Criando dataframe do exemplo da aula

In [1]:
import pandas as pd
import numpy as np

In [2]:


data = [['Jessica', 'Oceans Eleven', 2], 
        ['Jessica', 'The Lion King', 4], 
        ['Jessica', 'Braveheart', 3], 
        ['Jessica', 'Independence Day', 2],
        ['Jessica', 'Deby&Loide', 3],
        ['Marta', 'The Princess Diary', 4], 
        ['Marta', 'Oceans Eleven', 3], 
        ['Marta', 'Braveheart', 4], 
        ['Marta', 'Independence Day', 3],
        ['Marta', 'Deby&Loide', 2],
        ['Jose', 'The Princess Diary', 1], 
        ['Jose', 'Oceans Eleven', 5], 
        ['Jose', 'The Lion King', 3], 
        ['Jose', 'Braveheart', 4], 
        ['Jose', 'Deby&Loide', 5],
        ['Dave', 'The Princess Diary', 1], 
        ['Dave', 'The Lion King', 2], 
        ['Dave', 'Braveheart', 3], 
        ['Dave', 'Independence Day', 4]]
        
genres = [['The Princess Diary', 'Drama|Comedia'],
          ['Oceans Eleven', 'Acao|Comedia'],
          ['The Lion King', 'Drama|Sci-Fi'],
          ['Braveheart', 'Drama|Acao'],
          ['Independence Day', 'Acao|Sci-Fi'],
          ['Deby&Loide', 'Comedia']]

df = pd.DataFrame(data, columns=['user','item', 'rating'])
movies = pd.DataFrame(genres, columns=['item','genres'])
df

Unnamed: 0,user,item,rating
0,Jessica,Oceans Eleven,2
1,Jessica,The Lion King,4
2,Jessica,Braveheart,3
3,Jessica,Independence Day,2
4,Jessica,Deby&Loide,3
5,Marta,The Princess Diary,4
6,Marta,Oceans Eleven,3
7,Marta,Braveheart,4
8,Marta,Independence Day,3
9,Marta,Deby&Loide,2


In [3]:
movies

Unnamed: 0,item,genres
0,The Princess Diary,Drama|Comedia
1,Oceans Eleven,Acao|Comedia
2,The Lion King,Drama|Sci-Fi
3,Braveheart,Drama|Acao
4,Independence Day,Acao|Sci-Fi
5,Deby&Loide,Comedia


## Mapeando usuários e itens para ids

In [4]:
map_users = {user: idx for idx, user in enumerate(df.user.unique())}
map_items = {item: idx for idx, item in enumerate(df.item.unique())}
df['userId'] = df['user'].map(map_users)
df['itemId'] = df['item'].map(map_items)
df

Unnamed: 0,user,item,rating,userId,itemId
0,Jessica,Oceans Eleven,2,0,0
1,Jessica,The Lion King,4,0,1
2,Jessica,Braveheart,3,0,2
3,Jessica,Independence Day,2,0,3
4,Jessica,Deby&Loide,3,0,4
5,Marta,The Princess Diary,4,1,5
6,Marta,Oceans Eleven,3,1,0
7,Marta,Braveheart,4,1,2
8,Marta,Independence Day,3,1,3
9,Marta,Deby&Loide,2,1,4


In [5]:
movies['itemId'] = movies['item'].map(map_items)
movies

Unnamed: 0,item,genres,itemId
0,The Princess Diary,Drama|Comedia,5
1,Oceans Eleven,Acao|Comedia,0
2,The Lion King,Drama|Sci-Fi,1
3,Braveheart,Drama|Acao,2
4,Independence Day,Acao|Sci-Fi,3
5,Deby&Loide,Comedia,4


In [6]:
movies_genres = movies.drop('genres', axis=1).join(movies.genres.str.split('|', expand=True)
             .stack().reset_index(drop=True, level=1).rename('genre'))
movies_genres

Unnamed: 0,item,itemId,genre
0,The Princess Diary,5,Drama
0,The Princess Diary,5,Comedia
1,Oceans Eleven,0,Acao
1,Oceans Eleven,0,Comedia
2,The Lion King,1,Drama
2,The Lion King,1,Sci-Fi
3,Braveheart,2,Drama
3,Braveheart,2,Acao
4,Independence Day,3,Acao
4,Independence Day,3,Sci-Fi


### Funções para obter informações específicas do DataFrame

In [7]:
# Obter a nota que um usuário deu para um item.
def get_rating(userId,itemId):
    if len(df[(df['userId']==userId)&(df['itemId']==itemId)]) == 0:
        return 0
    return (df.loc[(df.userId==userId) & (df.itemId == itemId),'rating'].iloc[0])

get_rating(1, 5)

np.int64(4)

In [8]:
# Obter a lista de todos os itens que um usuário avaliou.
def get_item_ids(userId):
    if userId not in df['userId'].values:
        return []
    return (df.loc[(df.userId==userId),'itemId'].tolist())

get_item_ids(0)

[0, 1, 2, 3, 4]

In [9]:
def get_user_ids(itemId):
    if itemId not in df['itemId'].values:
        return []
    return (df.loc[(df.itemId==itemId),'userId'].tolist())

get_user_ids(2)

[0, 1, 2, 3]

In [10]:
# Obter o título do item dado o seu id.
def get_item_title(itemId):
    if itemId not in df['itemId'].values:
        return ''
    return (df.loc[(df.itemId == itemId),'item'].iloc[0])

get_item_title(0)

'Oceans Eleven'

In [11]:
# Obter a lista de ratings de um usuário.
def get_user_ratings(userId):
    if userId not in df['userId'].values:
        return []
    return (df.loc[(df.userId==userId),'rating'].tolist())

get_user_ratings(0)

[2, 4, 3, 2, 3]

In [12]:
# Obter a lista de ratings de um usuário.
def get_item_ratings(itemId):
    if itemId not in df['itemId'].values:
        return []
    return (df.loc[(df.itemId==itemId),'rating'].tolist())

get_item_ratings(0)

[2, 3, 5]

In [13]:
# Obter a média de ratings de um usuário
def get_user_mean(userId):
    return np.mean(get_user_ratings(userId))

get_user_mean(1)

np.float64(3.2)

In [14]:
# Obter a média de ratings de um item
def get_item_mean(itemId):
    return np.mean(get_item_ratings(itemId))

get_item_mean(1)

np.float64(3.0)

In [15]:
# Obter a lista de gêneros de um item
def get_genres(itemId):
    if itemId not in movies_genres['itemId'].values:
        return []
    return movies_genres.loc[(movies_genres.itemId==itemId),'genre'].tolist()

get_genres(0)

['Acao', 'Comedia']

## Hibridização monolítica - combinação

### Computar a similaridade de itens usando gêneros e notas

In [16]:
from math import sqrt

def item_sim_pearson_rating(itemId1, itemId2):
    '''
    itemId1 & itemId2 : ids dos dois itens cuja similaridade será computada
    '''
    # Contar quantos usuários avaliaram os mesmos itens.
    item_list1 = get_user_ids(itemId1)
    item_list2 = get_user_ids(itemId2)
    common_users = list(set(item_list1) & set(item_list2))
    if len(common_users) == 0:
        return 0
    
    # Calcular a média de cada item
    item1_mean = get_item_mean(itemId1)
    item2_mean = get_item_mean(itemId2)
    
    # Cálculo da similaridade.
    sim = []
    norm1 = []
    norm2 = []
    for user in common_users:
        rating1 = get_rating(user, itemId1)
        rating2 = get_rating(user, itemId2)
        sim.append((rating1 - item1_mean)*(rating2 - item2_mean))
        norm1.append(pow(rating1 - item1_mean, 2))
        norm2.append(pow(rating2 - item2_mean, 2))
    
    return sum(sim) / (sqrt(sum(norm1)) * sqrt(sum(norm2)))

item_sim_pearson_rating(4, 5)

np.float64(-0.9079593845004517)

In [17]:
from math import pow, sqrt

def item_sim_jaccard_genre(itemId1, itemId2):
    '''
    itemId1 & itemId2 : ids dos dois itens cuja similaridade será computada
    '''
    # Similaridade baseada em gêneros (Jaccard).
    genre_list1 = get_genres(itemId1)
    genre_list2 = get_genres(itemId2)
    common_genres = list(set(genre_list1) & set(genre_list2))
    return len(common_genres) / len(set(genre_list1 + genre_list2))
        
item_sim_jaccard_genre(4, 5)

0.5

In [18]:
def hybrid_similarity_score(itemId1, itemId2, tradeoff):
    return tradeoff * item_sim_pearson_rating(itemId1, itemId2) + (1 - tradeoff) * item_sim_jaccard_genre(itemId1, itemId2)

hybrid_similarity_score(4, 5, 0.5)

np.float64(-0.20397969225022583)

### Obter os itens mais similares

In [19]:
def most_similar_items(itemId, k):
    '''
    itemId : Targeted item
    k : qtde de vizinhos
    '''
    # Obter lista de itens.
    item_ids = df.itemId.unique().tolist()
    
    # Obter a similaridade entre o item alvo e os demais itens
    sim = [(hybrid_similarity_score(itemId, i, 0.9), i) for i in item_ids if i != itemId]
    
    # Ordenação inversa.
    sim.sort()
    sim.reverse()
    
    # Retornando os usuários mais similares. 
    return sim[:k]

print(most_similar_items(0, 2))

[(np.float64(0.9064615834641321), 3), (np.float64(0.7571428571428569), 4)]


### Calcular a nota

In [20]:
def get_prediction(userId, itemId, k):
    item_mean = get_item_mean(itemId)
    similar_items = most_similar_items(itemId, k)
    num = []
    den = []
    
    for s, j in similar_items:
        rj = get_rating(userId, j)
        if rj == 0:
            continue
        num.append(s * (rj - get_item_mean(j)))
        den.append(s)
    
    return item_mean + sum(num) / sum(den)

get_prediction(3, 4, 2)

np.float64(4.333333333333334)

## Hibridização monolítica - combinação - usando base de dados e CaseRecommender

### Importar base de dados

In [21]:
import wget
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
!tar -xvzf ml-20m-compact.tar.gz


Saved under ml-20m-compact.tar (29).gz


x dataset/
x dataset/tags_sample.csv
x dataset/._.DS_Store
x dataset/.DS_Store
x dataset/movies_sample.csv
x dataset/._genome-tags.csv
x dataset/genome-tags.csv
x dataset/._ml-youtube.csv
x dataset/ml-youtube.csv
x dataset/._genome-scores.csv
x dataset/genome-scores.csv
x dataset/ratings_sample.csv


In [22]:
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']])
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)
movies['movieId'] = movies['movieId'].map(map_items)
map_title = {}

for _, row in df.iterrows():
    map_title[row.movieId] = row.title
    
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2, random_state=2)

### Calcular similaridade baseada em notas

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

n_users = train['userId'].max()
n_items = train['movieId'].max()

A = np.zeros((n_users+1,n_items+1))
for line in train.itertuples():
    A[line[1],line[2]] = line[3]

print("Original rating matrix shape: ",A.shape)
print("Matrix :\n",A)

Original rating matrix shape:  (11090, 417)
Matrix :
 [[0.  4.5 4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  4.  ... 0.  0.  0. ]]


In [24]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

#Valid metrics are ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', 'wminkowski', 'nan_euclidean', 'haversine'], or 'precomputed', or a callable
sim_matrix = pairwise_distances(A.T, metric="cosine")
print("Shape: ", sim_matrix.shape)
print(sim_matrix)

Shape:  (417, 417)
[[0.         0.9722365  0.89060962 ... 1.         1.         1.        ]
 [0.9722365  0.         0.91093714 ... 1.         1.         0.94611664]
 [0.89060962 0.91093714 0.         ... 1.         1.         1.        ]
 ...
 [1.         1.         1.         ... 0.         0.         1.        ]
 [1.         1.         1.         ... 0.         0.         1.        ]
 [1.         0.94611664 1.         ... 1.         1.         0.        ]]


In [25]:
with open('sim_r_matrix.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_matrix)):
        for j in range(len(sim_matrix)):
            if i < j:
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_matrix[i][j]) + '\n')

### Testando só com similaridade baseada em ratings (equivalente ao ItemKNN)

In [26]:
from caserec.recommenders.rating_prediction.item_attribute_knn import ItemAttributeKNN

ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_r_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 6.862049 sec
prediction_time:: 0.262534 sec
Eval:: MAE: 0.662461 RMSE: 0.863259 


### Calcular similaridade baseada em gêneros

In [43]:
genres_unique = pd.DataFrame(movies.genres.str.split('|').tolist()).stack().unique()
genres_unique = pd.DataFrame(genres_unique, columns=['genre'])
print(genres_unique)
movies_exp = movies.join(movies.genres.str.get_dummies().astype(bool))
movies_exp.drop('genres', inplace=True, axis=1)
movies_exp.dropna(inplace=True)
B = movies_exp.iloc[:,2:].to_numpy(dtype=bool)

sim_g_matrix = pairwise_distances(B, metric="jaccard")
sim_g_matrix

                 genre
0                Crime
1                Drama
2          Documentary
3                 IMAX
4             Thriller
5                  War
6               Action
7               Comedy
8              Romance
9            Animation
10            Children
11             Mystery
12              Sci-Fi
13           Adventure
14             Fantasy
15           Film-Noir
16              Horror
17             Western
18             Musical
19  (no genres listed)


array([[0.        , 0.5       , 1.        , ..., 1.        , 1.        ,
        0.66666667],
       [0.5       , 0.        , 1.        , ..., 1.        , 1.        ,
        0.5       ],
       [1.        , 1.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.66666667, 0.5       , 1.        , ..., 1.        , 1.        ,
        0.        ]])

In [28]:
with open('sim_g_matrix.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_g_matrix)):
        for j in range(len(sim_g_matrix)):
            if i < j:
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_g_matrix[i][j]) + '\n')

### Testando só com similaridade baseada em gêneros (equivalente a FBC)

In [29]:
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_g_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.703032 sec
prediction_time:: 0.770478 sec
Eval:: MAE: 0.725912 RMSE: 0.955302 


### Hibridização

In [30]:
hib_rating_sim = pd.read_csv('sim_r_matrix.dat', sep='\t', names=['movieId1', 'movieId2', 'similarity'])
hib_rating_sim.sort_values(['movieId1', 'movieId2']).head()

Unnamed: 0,movieId1,movieId2,similarity
0,0,1,0.972237
1,0,2,0.89061
2,0,3,0.844219
3,0,4,0.832586
4,0,5,0.754534


In [31]:
hib_genre_sim = pd.read_csv('sim_g_matrix.dat', sep='\t', names=['movieId1', 'movieId2', 'similarity'])
hib_genre_sim.sort_values(['movieId1', 'movieId2']).head()

Unnamed: 0,movieId1,movieId2,similarity
0,0,1,0.5
1,0,2,1.0
2,0,3,0.75
3,0,4,0.5
4,0,5,0.5


In [32]:
hib_sim = hib_rating_sim.merge(hib_genre_sim, on=['movieId1', 'movieId2'], how='outer')
hib_sim.head()

Unnamed: 0,movieId1,movieId2,similarity_x,similarity_y
0,0,1,0.972237,0.5
1,0,2,0.89061,1.0
2,0,3,0.844219,0.75
3,0,4,0.832586,0.5
4,0,5,0.754534,0.5


In [33]:
hib_sim[hib_sim['similarity_x'].isna()==True]

Unnamed: 0,movieId1,movieId2,similarity_x,similarity_y


In [34]:
import math

def choose_sim(row):
    if math.isnan(row['similarity_x']):
        return row['similarity_y']        
    else:
        return 0.7*row['similarity_x'] + 0.3*row['similarity_y']

hib_sim['similarity'] = hib_sim.apply(lambda row: choose_sim(row), axis=1)
hib_sim

Unnamed: 0,movieId1,movieId2,similarity_x,similarity_y,similarity
0,0,1,0.972237,0.500000,0.830566
1,0,2,0.890610,1.000000,0.923427
2,0,3,0.844219,0.750000,0.815954
3,0,4,0.832586,0.500000,0.732810
4,0,5,0.754534,0.500000,0.678174
...,...,...,...,...,...
86731,413,415,0.000000,1.000000,0.300000
86732,413,416,1.000000,0.666667,0.900000
86733,414,415,0.000000,1.000000,0.300000
86734,414,416,1.000000,1.000000,1.000000


In [35]:
hib_sim[['movieId1', 'movieId2', 'similarity']].to_csv('sim_h_matrix.dat', index=False, header=False, sep='\t')

In [36]:
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_h_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 7.801001 sec
prediction_time:: 0.673973 sec
Eval:: MAE: 0.661219 RMSE: 0.86049 


## Hibridização canalizada - meta-nível (MF + KNN)


In [37]:
from math import sqrt

def train_svdopt(train, n_factors, lr=0.05, reg=0.02, miter=10):
    global_mean = train['rating'].mean()
    n_users = df['userId'].max()+1
    n_items = df['movieId'].max()+1
    bu = np.zeros(n_users)
    bi = np.zeros(n_items)
    p = np.random.normal(0.1, 0.1, (n_users, n_factors))
    q = np.random.normal(0.1, 0.1, (n_items, n_factors))
    error = []
    for t in range(miter):
        sq_error = 0
        for index, row in train.iterrows():
            u = row['userId']
            i = row['movieId']
            r_ui = row['rating']
            pred = global_mean + bu[u] + bi[i] + np.dot(p[u], q[i])
            e_ui = r_ui - pred
            sq_error = sq_error + pow(e_ui, 2)
            bu[u] = bu[u] + lr * e_ui
            bi[i] = bi[i] + lr * e_ui
            for f in range(n_factors):
                temp_uf = p[u][f]
                p[u][f] = p[u][f] + lr * (e_ui * q[i][f] - reg * p[u][f])
                q[i][f] = q[i][f] + lr * (e_ui * temp_uf - reg * q[i][f])
        error.append(sqrt(sq_error/len(train)))
    
    return global_mean, bu, bi, p, q, error

In [38]:
gl, bu, bi, p, q, error = train_svdopt(train, 10, miter=30)

In [39]:
print('Items factors matrix shape: ', q.shape)

Items factors matrix shape:  (417, 10)


In [40]:
sim_f_matrix = pairwise_distances(q, metric="cosine")
print("Shape: ", sim_f_matrix.shape)
print(sim_f_matrix)

Shape:  (417, 417)
[[0.         1.1941483  1.62711386 ... 1.14695206 0.8685363  1.24268945]
 [1.1941483  0.         0.93004272 ... 0.83043361 0.84494341 1.62047542]
 [1.62711386 0.93004272 0.         ... 0.80879002 1.30855934 0.65908884]
 ...
 [1.14695206 0.83043361 0.80879002 ... 0.         1.44242327 0.85119935]
 [0.8685363  0.84494341 1.30855934 ... 1.44242327 0.         0.84791173]
 [1.24268945 1.62047542 0.65908884 ... 0.85119935 0.84791173 0.        ]]


In [41]:
with open('sim_f_matrix.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_f_matrix)):
        for j in range(len(sim_f_matrix)):
            if i < j:
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_f_matrix[i][j]) + '\n')

In [42]:
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_f_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.770686 sec
prediction_time:: 0.626801 sec
Eval:: MAE: 0.738577 RMSE: 0.972871 
