## Aula 06 - Filtragem Híbrida - Exercícios

In [2]:
import pandas as pd
import numpy as np
from math import pow, sqrt
from sklearn.metrics import pairwise_distances
import pickle
from caserec.recommenders.rating_prediction.item_attribute_knn import ItemAttributeKNN
from caserec.recommenders.item_recommendation.bprmf import BprMF
from caserec.recommenders.item_recommendation.itemknn import ItemKNN

### Importar base de dados

In [4]:
import wget
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
!tar -xvzf ml-20m-compact.tar.gz


Saved under ml-20m-compact.tar (32).gz


x dataset/
x dataset/tags_sample.csv
x dataset/._.DS_Store
x dataset/.DS_Store
x dataset/movies_sample.csv
x dataset/._genome-tags.csv
x dataset/genome-tags.csv
x dataset/._ml-youtube.csv
x dataset/ml-youtube.csv
x dataset/._genome-scores.csv
x dataset/genome-scores.csv
x dataset/ratings_sample.csv


In [5]:
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-features.tar.gz
! tar -xvzf ml-20m-features.tar.gz


Saved under ml-20m-features.tar (18).gz


x features/
x features/._m4infus_max_histogram_300_sn.arq
x features/m4infus_max_histogram_300_sn.arq
x features/._mm_avg_histogram_100_sn.arq
x features/mm_avg_histogram_100_sn.arq
x features/._visual_histogram_100_sn.arq
x features/visual_histogram_100_sn.arq
x features/._visual_histogram_50_sn.arq
x features/visual_histogram_50_sn.arq
x features/._aural_histogram_50.arq
x features/aural_histogram_50.arq
x features/._mm_max_histogram_300.arq
x features/mm_max_histogram_300.arq
x features/._m4infus_max_histogram_50.arq
x features/m4infus_max_histogram_50.arq
x features/._mm_max_histogram_100.arq
x features/mm_max_histogram_100.arq
x features/._mm_max_histogram_50_sn.arq
x features/mm_max_histogram_50_sn.arq
x features/._visual_histogram_100.arq
x features/visual_histogram_100.arq
x features/._visual_histogram_300.arq
x features/visual_histogram_300.arq
x features/._aural_histogram_100_sn.arq
x features/aural_histogram_100_sn.arq
x features/._mm_avg_histogram_100.arq
x features/mm_avg_

***Exercício 01:*** Implemente uma hibridização monolítica/combinação usando a seguinte heurística:
- Uso do algoritmo ItemAtributeKNN, sendo a hibridização feita no cálculo das similaridades entre os itens.
- Se a quantidade de usuários que avaliaram ambos os itens for maior que um limiar L1, calcule a similaridade entre esses itens usando cosseno aplicado à representação baseada em notas.
- Caso contrário, calcule a similaridade entre os itens usando tags, características visuais e características aurais. Pondere cada uma das modalidades via pesos passados por parâmetro. 

Compare os resultados do algoritmo híbrido com as versões isoladas do mesmo algoritmo.

In [7]:
# TODO
from sklearn.model_selection import train_test_split

# Read the csv files
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')
tags = pd.read_csv('./dataset/tags_sample.csv')

# Merge movies and ratings df
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']])

# Map the userId and movieId 
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)
tags['userId'] = tags['userId'].map(map_users)
tags['movieId'] = tags['movieId'].map(map_items)

# Split in train and test
train, test = train_test_split(df, test_size=0.20, random_state=2)
train.to_csv('train.dat', index=False, header=False, sep='\t')
test.to_csv('test.dat', index=False, header=False, sep='\t')

map_title = {}
for _, row in df.iterrows():
    map_title[row.movieId] = row.title

In [8]:
# Auxiliary functions that will be used ahead

# Get the tags of the movie
def get_tags(movieId):
    if movieId not in tags['movieId'].values:
        return []
    return tags.loc[(tags.movieId==movieId),'tag'].tolist()

# Get the common users that rated both movies
def get_common_raters_between_movies(movieId1, movieId2):
    df_movie1 = df[df['movieId'] == movieId1][['userId']]
    df_movie2 = df[df['movieId'] == movieId2][['userId']]
    
    merged_users = pd.merge(df_movie1, df_movie2, on='userId')
    return merged_users['userId'].tolist()

# Calculate the Jaccard similarity by tags between 2 movies 
def item_sim_jaccard_tag(movieId1, movieId2):
    tag_list1 = get_tags(movieId1)
    tag_list2 = get_tags(movieId2)
    common_tags = list(set(tag_list1) & set(tag_list2))
    if len(common_tags) == 0:
        return 0
    return len(common_tags) / len(set(tag_list1 + tag_list2))

# Generate the similarity files
def generate_dat_file_sim(sim_df, filename):
    with open(filename + ".dat", 'w') as arq_sim_matrix:
        for i in range(len(sim_df)):
            for j in range(len(sim_df)):
                if i < j:
                    arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_df[i][j]) + '\n')

In [9]:
# Rating-based cosine similarity calculation

n_users = train['userId'].max()
n_items = train['movieId'].max()

A = np.zeros((n_users+1,n_items+1))
for line in train.itertuples():
    A[line[1],line[2]] = line[3]

# Similarity matrix of all movies, based on ratings
sim_matrix_rating = pairwise_distances(A.T, metric="cosine")

with open('sim_r_matrix.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_matrix_rating)):
        for j in range(len(sim_matrix_rating)):
            if i < j:
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_matrix_rating[i][j]) + '\n')

In [10]:
# Tags-based Jaccard similarity calculation

n_items = train['movieId'].max()

sim_matrix_tag = np.zeros((n_items+1,n_items+1))


# Similarity matrix of all movies, based on tags
sim_matrix_tag = pairwise_distances(A.T, metric="cosine")

with open('sim_matrix_tag.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_matrix_tag)):
        for j in range(len(sim_matrix_tag)):
            if i < j:
                sim_matrix_tag[i][j] = item_sim_jaccard_tag(i, j)
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_matrix_tag[i][j]) + '\n')

In [11]:
# Visual_Histogram-based cosine similarity calculation

with open('./features/visual_histogram_50.arq', 'rb') as arq_visualHistograms:
    visualHistograms = pickle.load(arq_visualHistograms)

# Similarity matrix of all movies, based on visual histograms
sim_matrix_visual_hist = pairwise_distances(visualHistograms, metric="cosine")

with open('sim_matrix_visual_hist.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_matrix_visual_hist)):
        for j in range(len(sim_matrix_visual_hist)):
            if i < j:
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_matrix_visual_hist[i][j]) + '\n')

In [12]:
# Aural_Histogram-based cosine similarity calculation

with open('./features/aural_histogram_50_sn.arq', 'rb') as arq_auralHistograms:
    auralHistograms = pickle.load(arq_auralHistograms)

# Similarity matrix of all movies, based on aural histograms
sim_matrix_aural_hist = pairwise_distances(auralHistograms, metric="cosine")

with open('sim_matrix_aural_hist.dat', 'w') as arq_sim_matrix:
    for i in range(len(sim_matrix_aural_hist)):
        for j in range(len(sim_matrix_aural_hist)):
            if i < j:
                arq_sim_matrix.write(str(i) + '\t' + str(j) + '\t' + str(sim_matrix_aural_hist[i][j]) + '\n')

In [13]:
# Hybrid matrix using sim_matrix_tag, sim_matrix_visual_hist and sim_matrix_aural_hist
def hybrid_sim(L1, w_tag, w_visual, w_aural):
    sim_matrix_hybrid = sim_matrix_rating.copy()
    
    # Normalize the weights if their sum is not 1
    w_total = w_tag + w_visual + w_aural
    w_tag /= w_total
    w_visual /= w_total
    w_aural /= w_total

    # Iterate through the upper triangle of the matrix (i < j)
    for i in range(len(sim_matrix_hybrid)):
        for j in range(i+1, len(sim_matrix_hybrid)):  # Start from i+1 to avoid checking i < j
            # If the number of common raters is below the threshold L1
            if len(get_common_raters_between_movies(i, j)) <= L1:
                # Weighted sum of tag, visual, and aural similarities
                sim_matrix_hybrid[i][j] = (
                    w_tag * sim_matrix_tag[i, j] +
                    w_visual * sim_matrix_visual_hist[i, j] +
                    w_aural * sim_matrix_aural_hist[i, j]
                )
                # Maintain symmetry
                sim_matrix_hybrid[j][i] = sim_matrix_hybrid[i][j]
    
    return sim_matrix_hybrid

In [14]:
# Test 1 - Rating-Based Similarity Only
# (L1, w_tag, w_visual, w_aural) = (0, 1/3, 1/3, 1/3)
# Since L1 = 0, the hybrid similarity matrix will only use sim_matrix_rating, 
# ignoring the tag, visual, and aural similarity matrices. 

sim_hib_matrix = hybrid_sim(0, 1/3, 1/3, 1/3)
generate_dat_file_sim(sim_hib_matrix, "sim_hib_matrix")
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_hib_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.682458 sec
prediction_time:: 0.434747 sec
Eval:: MAE: 0.664563 RMSE: 0.866486 


In [15]:
# Test 2 - Tag-Based Similarity Only 
# (L1 > 0, w_tag, w_visual, w_aural) = (5, 1, 0, 0)
# This test will only use the tag-based similarity matrix (sim_matrix_tag) 
# by setting w_tag = 1, and w_visual = w_aural = 0. It uses tag-based similarity 
# only when there are 5 or fewer common raters (L1 = 5).

sim_tag_matrix = hybrid_sim(5, 1, 0, 0)
generate_dat_file_sim(sim_tag_matrix, "sim_tag_matrix")
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_tag_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.532275 sec
prediction_time:: 0.383004 sec
Eval:: MAE: 0.69465 RMSE: 0.909092 


In [16]:
# Test 3 - Visual-Based Similarity Only 
# (L1 > 0, w_tag, w_visual, w_aural) = (5, 0, 1, 0)
# This test will only use the visual-based similarity matrix (sim_matrix_visual_hist) 
# by setting w_visual = 1, and w_tag = w_aural = 0. It uses visual-based similarity 
# only when there are 5 or fewer common raters (L1 = 5).

sim_visual_matrix = hybrid_sim(5, 0, 1, 0)
generate_dat_file_sim(sim_visual_matrix, "sim_visual_matrix")
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_visual_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.557132 sec
prediction_time:: 0.260340 sec
Eval:: MAE: 0.664174 RMSE: 0.865856 


In [17]:
# Test 4 - Aural-Based Similarity Only 
# (L1 > 0, w_tag, w_visual, w_aural) = (5, 0, 0, 1)
# This test will only use the aural-based similarity matrix (sim_matrix_aural_hist) 
# by setting w_aural = 1, and w_tag = w_visual = 0. It uses aural-based similarity 
# only when there are 5 or fewer common raters (L1 = 5).

sim_aural_matrix = hybrid_sim(5, 0, 0, 1)
generate_dat_file_sim(sim_aural_matrix, "sim_aural_matrix")
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_aural_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.563418 sec
prediction_time:: 0.447876 sec
Eval:: MAE: 0.672189 RMSE: 0.877902 


In [18]:
# Test 5 - Hybrid Similarity with Equal Weights 
# (L1 > 0, w_tag, w_visual, w_aural) = (5, 1/3, 1/3, 1/3)
# This test will use a hybrid similarity matrix combining tag, visual, and aural similarities
# with equal weights (1/3 each). The hybrid similarity will be applied only when there 
# are 5 or fewer common raters (L1 = 5).

sim_hybrid_matrix = hybrid_sim(5, 1/3, 1/3, 1/3)
generate_dat_file_sim(sim_hybrid_matrix, "sim_hybrid_matrix")
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_hybrid_matrix.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.561417 sec
prediction_time:: 0.304187 sec
Eval:: MAE: 0.682064 RMSE: 0.891725 


In [19]:
# Test 6 - Hybrid Similarity with L1 = 100 and Equal Weights 
# (L1 = 100, w_tag, w_visual, w_aural) = (100, 1/3, 1/3, 1/3)
# This test will use a hybrid similarity matrix combining tag, visual, and aural similarities
# with equal weights (1/3 each). The hybrid similarity will be applied only when there 
# are 100 or fewer common raters (L1 = 100).

sim_hybrid_matrix_100 = hybrid_sim(100, 1/3, 1/3, 1/3)
generate_dat_file_sim(sim_hybrid_matrix_100, "sim_hybrid_matrix_100")
ItemAttributeKNN('train.dat', 'test.dat', similarity_file='sim_hybrid_matrix_100.dat', as_similar_first=True).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 5.092627 sec
prediction_time:: 0.833756 sec
Eval:: MAE: 0.732889 RMSE: 0.964418 


***Exercício 02:*** Vamos implementar um recomendador híbrido canalizado em cascata, no cenário de ranqueamento. A ideia é que um primeiro algoritmo gere uma lista C1 de N=50 itens candidatos à recomendação para cada usuário. Em seguida, um outro recomendador irá gerar uma outra lista C2 também de N=50 itens candidatos à rcomendação para cada usuário. Por fim, o ranking final será a intersecção entre C1 e C2, sendo o score de cada itens formado pela média aritmética dos scores de cada lista. Avalie o desempenho.

Dica 1: utilize o parâmetro rank_length disponível nos algoritmos de ranqueamento do CaseRecommender para especificar o tamanho N de recomendações para cada usuário.

Dica 2: você pode gravar num arquivo os rankings gerados por um algoritmo para cada usuário especificando o nome do arquivo no parâmetro output_file.

Dica 3: consulte a Aula 04 que contém algumas métricas de avaliação de ranqueamento. Como você irá gerar o ranking final externamente ao CaseRecommender, será necessário avaliá-lo usando funções próprias.

In [65]:
# TODO

# The models that will be used are ItemKNN and BbrMF
ItemKNN('train.dat', 'test.dat', 'ir_itemknn.dat', rank_length=50).compute()
BprMF('train.dat', 'test.dat', 'ir_bprmf.dat', factors=3, rank_length=50).compute()

[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 1.314007 sec
prediction_time:: 48.040426 sec


Eval:: PREC@1: 0.422193 PREC@3: 0.307003 PREC@5: 0.252521 PREC@10: 0.187201 RECALL@1: 0.138504 RECALL@3: 0.285647 RECALL@5: 0.382874 RECALL@10: 0.552056 MAP@1: 0.422193 MAP@3: 0.514284 MAP@5: 0.517983 MAP@10: 0.489237 NDCG@1: 0.422193 NDCG@3: 0.602418 NDCG@5: 0.620641 NDCG@10: 0.614089 
[Case Recommender: Item Recommendation > BPRMF]

train data:: 11090 users and 405 items (152496 interactions) | sparsity:: 96.60%
test data:: 10571 users and 331 items (38125 interactions) | sparsity:: 98.91%

training_time:: 131.841009 sec
prediction_time:: 2.876345 sec


Eval:: PREC@1: 0.367515 PREC@3: 0.260366 PREC@5: 0.220263 PREC@10: 0.169284 RECALL@1: 0.118391 RECALL@3: 0.238812 RECALL@5: 0.330738 RECALL@10: 0.495048 

In [71]:
# Candidates movies to be recommended to each user
C1 = pd.read_csv('./ir_itemknn.dat', sep='\t', names=['userId', 'movieId', 'score'])
C2 = pd.read_csv('./ir_bprmf.dat', sep='\t', names=['userId', 'movieId', 'score'])

def recommended_movies(user_id, list):
    if list == 'C1':
        # Filter C1 in order to get the recommendations for a specific user
        return C1.loc[C1.userId == user_id, ['movieId', 'score']].set_index('movieId')['score'].to_dict()
    elif list == 'C2':
        # Filter C2 in order to get the recommendations for a specific user
        return C2.loc[C2.userId == user_id, ['movieId', 'score']].set_index('movieId')['score'].to_dict()

# Combine the recommendations of C1 and C2
def combine_recommendations(user_id):
    rec_c1 = recommended_movies(user_id, 'C1')
    rec_c2 = recommended_movies(user_id, 'C2')
    
    # Find the intersection of the recommended movies
    common_movies = set(rec_c1.keys()).intersection(set(rec_c2.keys()))
    
    # Calculate the scores mean
    combined_scores = {}
    for movie in common_movies:
        combined_scores[movie] = (rec_c1[movie] + rec_c2[movie]) / 2
    
    return combined_scores

# Create a dataframe for the combined recommendations
user_ids = C1['userId'].unique()
combined_recommendations = []

for user_id in user_ids:
    combined_rec = combine_recommendations(user_id)
    for movie_id, score in combined_rec.items():
        combined_recommendations.append({'userId': user_id, 'movieId': movie_id, 'score': score})

combined_recommendations_df = pd.DataFrame(combined_recommendations)
combined_recommendations_df = combined_recommendations_df.sort_values(by='score', ascending=False)

# Calculate the average precision for a user
def average_precision(user_id, recommendations, test_set, limit): 
    # Get the recommendations for the specific user
    recs_user = recommendations.loc[(recommendations.userId == user_id), 'movieId'].tolist()
    # Get the relevant movies for the specific user
    relevant_movies = test_set.loc[(test_set.userId == user_id), 'movieId'].tolist()
    
    n_relevant_movies = 0
    cumulative_precision = 0.0
    total = 0
    # Iterate over the recommendations and calculate the precision
    for i, movie in enumerate(recs_user):
        total += 1
        if movie in relevant_movies:
            n_relevant_movies += 1
            # Precision at index i
            precision_at_i = n_relevant_movies / (i + 1)
            cumulative_precision += precision_at_i
        if total == limit:
            break
    if n_relevant_movies == 0:
        return 0.0
    
    ap = cumulative_precision / n_relevant_movies
    return ap

# Calculate the mean average precision
def mean_average_precision(recommendations, test_set, limit=10):
    average_precisions = []
    
    # Get all unique users Ids
    unique_users = recommendations['userId'].unique()
    
    for user_id in unique_users:
        ap = average_precision(user_id, recommendations, test_set, limit)
        average_precisions.append(ap)
    
    return np.mean(average_precisions)
    
# Calculate the MAP score
map_score = mean_average_precision(combined_recommendations_df, test)
print(f'MAP Score: {map_score}')

MAP Score: 0.4612911381539892


O modelo híbrido conseguiu melhorar o desempenho em relação ao BPRMF, mas não foi capaz de superar o ItemKNN, apesar dos resultados relativamente semelhantes da metrica MAP@10. Isso sugere que o ganho ao combinar esses dois métodos foi limitado, provavelmente porque o ItemKNN já captura a maioria dos padrões relevantes do conjunto de dados.