## Aula 06 - Filtragem Híbrida - Exercícios

In [1]:
import pandas as pd
import numpy as np
import pickle
from caserec.recommenders.rating_prediction.item_attribute_knn import ItemAttributeKNN
import wget
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix

In [2]:
# ! rm -fr ./dataset/ ./features/ ./outputs/ ./precomputed/ ./ml-20m-compact.tar.gz ./ml-20m-features.tar.gz

### Importar base de dados

In [3]:
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
!tar -xvzf ml-20m-compact.tar.gz

100% [....................................................] 65019041 / 65019041
Saved under ml-20m-compact.tar.gz
dataset/
dataset/tags_sample.csv
dataset/._.DS_Store
dataset/.DS_Store
dataset/movies_sample.csv
dataset/._genome-tags.csv
dataset/genome-tags.csv
dataset/._ml-youtube.csv
dataset/ml-youtube.csv
dataset/._genome-scores.csv
dataset/genome-scores.csv
dataset/ratings_sample.csv


In [4]:
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-features.tar.gz
! tar -xvzf ml-20m-features.tar.gz

100% [......................................................] 5996435 / 5996435
Saved under ml-20m-features.tar.gz
features/
features/._m4infus_max_histogram_300_sn.arq
features/m4infus_max_histogram_300_sn.arq
features/._mm_avg_histogram_100_sn.arq
features/mm_avg_histogram_100_sn.arq
features/._visual_histogram_100_sn.arq
features/visual_histogram_100_sn.arq
features/._visual_histogram_50_sn.arq
features/visual_histogram_50_sn.arq
features/._aural_histogram_50.arq
features/aural_histogram_50.arq
features/._mm_max_histogram_300.arq
features/mm_max_histogram_300.arq
features/._m4infus_max_histogram_50.arq
features/m4infus_max_histogram_50.arq
features/._mm_max_histogram_100.arq
features/mm_max_histogram_100.arq
features/._mm_max_histogram_50_sn.arq
features/mm_max_histogram_50_sn.arq
features/._visual_histogram_100.arq
features/visual_histogram_100.arq
features/._visual_histogram_300.arq
features/visual_histogram_300.arq
features/._aural_histogram_100_sn.arq
features/aural_histogram_10

In [5]:
# Reading
movies = pd.read_csv('dataset/movies_sample.csv')
ratings = pd.read_csv('dataset/ratings_sample.csv')
tags = pd.read_csv('dataset/tags_sample.csv')


# Merging
df = ratings[['userId', 'movieId', 'rating']].merge(movies[['movieId', 'title']], on='movieId')


# Mapping
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {movie: idx for idx, movie in enumerate(df.movieId.unique())}

df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)

ratings['userId'] = ratings['userId'].map(map_users)
ratings['movieId'] = ratings['movieId'].map(map_items)

movies['movieId'] = movies['movieId'].map(map_items)

tags['userId'] = tags['userId'].map(map_users)
tags['movieId'] = tags['movieId'].map(map_items)

map_title = {movie: title for movie, title in zip(movies['movieId'], movies['title'])}


# Display
df.head()


Unnamed: 0,userId,movieId,rating,title
0,0,0,5.0,Enemy Mine (1985)
1,0,1,4.5,Beautiful Thing (1996)
2,0,2,4.0,"Aristocats, The (1970)"
3,0,3,2.0,American Psycho (2000)
4,0,4,5.0,Bowling for Columbine (2002)


In [6]:
# Reading features
with open('./features/visual_histogram_50.arq', 'rb') as arq_visual_features:
    visual_features = pickle.load(arq_visual_features)

with open('./features/aural_histogram_50.arq', 'rb') as arq_aural_features:
    aural_features = pickle.load(arq_aural_features)

## Funções Auxiliares

In [7]:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

def compute_item_sim(df):
    # Pivot the dataframe to have movies as columns and users as rows, with ratings as values
    user_movie_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    
    # Compute the cosine similarity
    cosine_sim_matrix = cosine_similarity(user_movie_matrix.T)  # Transpose to have movies as rows
    
    # Get movie ids
    movie_ids = user_movie_matrix.columns
    
    # Create a sparse matrix representation using coo_matrix
    sim_sparse = coo_matrix(cosine_sim_matrix)
    
    # Create a dataframe from the sparse matrix
    sim_df = pd.DataFrame({
        'movieId1': movie_ids[sim_sparse.row],
        'movieId2': movie_ids[sim_sparse.col],
        'similarity': sim_sparse.data
    })
    
    # Remove self-similarities (where movieId1 == movieId2)
    sim_df = sim_df[sim_df['movieId1'] != sim_df['movieId2']].reset_index(drop=True)
    
    # Now, add missing pairs with zero similarity
    all_movie_pairs = pd.MultiIndex.from_product([movie_ids, movie_ids], names=["movieId1", "movieId2"]).to_frame(index=False)
    
    # Merge the existing similarities with all pairs
    final_sim_df = pd.merge(all_movie_pairs, sim_df, on=["movieId1", "movieId2"], how="left").fillna(0)
    
    # Remove self-similarities again if needed
    final_sim_df = final_sim_df[final_sim_df['movieId1'] != final_sim_df['movieId2']].reset_index(drop=True)
    
    return final_sim_df

def np_compute_feature_sim(features):
    # Convert to numpy array
    data = np.array(features)

    # Calculate norms (magnitude of vectors)
    norms = np.linalg.norm(data, axis=1, keepdims=True)

    # Filter out rows with all zeros
    mask = (norms != 0).flatten()  # Flatten the mask to make it 1D

    data = data[mask]
    norms = norms[mask]

    # Normalize data
    normalized_data = data / norms

    # Calculate cosine similarity matrix (dot product of normalized vectors)
    cosine_similarity_matrix = np.dot(normalized_data, normalized_data.T)

    return cosine_similarity_matrix


def np_to_pd_matrix(matrix):
        # Create row and column indices for all elements
    rows, cols = np.indices(matrix.shape)
    
    # Flatten the indices and values to create the DataFrame
    df = pd.DataFrame({
        'i': rows.flatten(),
        'j': cols.flatten(),
        'value': matrix.flatten()
    })
    
    return df.fillna(0)


def compute_feature_sim(features):
    return (
        np_to_pd_matrix(np_compute_feature_sim(features))
        .rename(columns={
            'i': 'movieId1', 
            'j': 'movieId2', 
            'value': 'similarity'
        })
    )

    

# returns the number of users that have rated both items i1 and i2
def coincidence(i1, i2):
    return len(set(df[df['movieId'] == i1]['userId']).intersection(set(df[df['movieId'] == i2]['userId'])))
               
def gen_coincidence_mat(df):
    n_items = df['movieId'].nunique()
    coincidence_mat = np.zeros((n_items, n_items))
    for i in range(n_items):
        for j in range(i+1, n_items):
            coincidence_mat[i, j] = coincidence(i, j)
            coincidence_mat[j, i] = coincidence_mat[i, j]
            
    return np_to_pd_matrix(coincidence_mat)


def gen_coincidence_mat(df):
    # Get unique users and movies
    user_ids = df['userId'].astype('category').cat.codes
    movie_ids = df['movieId'].astype('category').cat.codes
    num_users = user_ids.max() + 1
    num_movies = movie_ids.max() + 1
    
    # Create a sparse binary matrix where rows are users and columns are movies
    ratings_sparse = coo_matrix((df['rating'], (user_ids, movie_ids)), shape=(num_users, num_movies))
    
    # Multiply the sparse matrix by its transpose to get the coincidence matrix
    coincidence_matrix = ratings_sparse.T @ ratings_sparse
    
    # Extract non-zero values (coincidence counts)
    coincidence_matrix_coo = coo_matrix(coincidence_matrix)
    
    # Create the resulting dataframe with movieId pairs and their coincidence count
    coincidence_df = pd.DataFrame({
        'movieId1': coincidence_matrix_coo.row,
        'movieId2': coincidence_matrix_coo.col,
        'coincidence': coincidence_matrix_coo.data
    })
    
    # Map movieId1 and movieId2 back to original movie IDs
    movie_id_map = df[['movieId']].drop_duplicates().reset_index(drop=True)
    coincidence_df['movieId1'] = movie_id_map.iloc[coincidence_df['movieId1'].values]['movieId'].values
    coincidence_df['movieId2'] = movie_id_map.iloc[coincidence_df['movieId2'].values]['movieId'].values
    
    return coincidence_df


In [8]:
import pandas as pd
import numpy as np

def gen_hybrid_similarity_matrix(df, L1, coincidence, similarities, weights):
    # Create an empty list to store the results
    result = []

    # Create a dictionary for quick lookup of coincidence values
    coincidence_dict = {(row['movieId1'], row['movieId2']): row['coincidence'] for _, row in coincidence.iterrows()}

    # Create a list of dictionaries for quick lookup of similarities
    similarity_dicts = [
        {(row['movieId1'], row['movieId2']): row['similarity'] for _, row in sim_df.iterrows()} 
        for sim_df in similarities
    ]

    # Iterate over unique movie pairs in the df
    unique_movies = df['movieId'].unique()
    for i, movieId1 in enumerate(unique_movies):
        for movieId2 in unique_movies[i+1:]:
            # Default similarity to None
            similarity = None
            
            # Check for coincidence level
            coincidence_value = coincidence_dict.get((movieId1, movieId2), 0)

            if coincidence_value > L1:
                # If coincidence is greater than L1, use the first similarity matrix
                similarity = similarity_dicts[0].get((movieId1, movieId2), 0)
            else:
                # If coincidence is less than or equal to L1, calculate weighted average similarity
                weighted_sum = 0
                total_weight = 0
                for idx, sim_dict in enumerate(similarity_dicts):
                    sim_value = sim_dict.get((movieId1, movieId2), 0)
                    weighted_sum += sim_value * weights[idx]
                    total_weight += weights[idx]
                
                # Calculate the weighted average similarity
                if total_weight > 0:
                    similarity = weighted_sum / total_weight
                else:
                    similarity = 0

            # Store the result as a tuple (movieId1, movieId2, similarity)
            result.append((movieId1, movieId2, similarity))

    # Convert the result list into a DataFrame
    similarity_matrix = pd.DataFrame(result, columns=['movieId1', 'movieId2', 'similarity'])

    return similarity_matrix


In [9]:
def compare_similarity_matrices(df1, df2, tol=1e-6):
    """
    Compares two similarity matrices (pandas DataFrames) to see if they are the same.
    
    Parameters:
    df1, df2: DataFrames with columns ['movieId1', 'movieId2', 'similarity'].
    tol: tolerance for comparing floating-point similarity values.

    Returns:
    bool: True if the two DataFrames are equal within the given tolerance, False otherwise.
    """
    # First, check if they have the same shape
    if df1.shape != df2.shape:
        print("DataFrames have different shapes.")
        return False

    # Then, sort both DataFrames by movieId1 and movieId2 to ensure the comparison order is the same
    df1_sorted = df1.sort_values(by=['movieId1', 'movieId2']).reset_index(drop=True)
    df2_sorted = df2.sort_values(by=['movieId1', 'movieId2']).reset_index(drop=True)

    # Now, compare the movieId1 and movieId2 columns
    if not df1_sorted[['movieId1', 'movieId2']].equals(df2_sorted[['movieId1', 'movieId2']]):
        print("Movie ID pairs do not match.")
        return False

    # Compare the similarity values using numpy's isclose function with the given tolerance
    if not np.allclose(df1_sorted['similarity'], df2_sorted['similarity'], atol=tol):
        print("Similarity values do not match within tolerance.")
        return False

    # If all checks pass, the DataFrames are considered equal
    return True

In [10]:
import pandas as pd
import numpy as np

def fast_gen_hybrid_similarity_matrix(df, L1, coincidence, similarities, weights):
    # Step 1: Merge the coincidence DataFrame with the first similarity DataFrame
    merged_df = coincidence.merge(similarities[0], on=['movieId1', 'movieId2'], how='left', suffixes=('', '_sim1'))

    # Step 2: Check if the coincidence level is greater than L1
    mask = merged_df['coincidence'] > L1

    # Step 3: For movie pairs where coincidence > L1, use the first similarity matrix directly
    merged_df['similarity'] = np.where(mask, merged_df['similarity'], np.nan)
    
    # If there is only one similarity matrix, return the result immediately
    if len(similarities) <= 1:
        return merged_df[['movieId1', 'movieId2', 'similarity']].dropna()

    # Merge additional similarity matrices
    for i, sim_df in enumerate(similarities[1:], start=1):
        merged_df = merged_df.merge(
            sim_df[['movieId1', 'movieId2', 'similarity']].rename(columns={'similarity': f'sim_{i}'}),
            on=['movieId1', 'movieId2'], 
            how='left'
        )

    # Now we need to handle missing similarity values individually
    # We'll calculate the weighted sum for pairs where coincidence <= L1
    weighted_sums = np.zeros(len(merged_df))
    total_weight = np.zeros(len(merged_df))

    for i in range(len(similarities)):
        sim_column = f'sim_{i}' if i > 0 else 'similarity'

        # Handle NaN values manually during the summation, like in the first function
        sim_values = merged_df[sim_column].fillna(0)  # Replace NaNs with 0 for the sum
        weighted_sums += sim_values * weights[i]
        total_weight += (merged_df[sim_column].notna()) * weights[i]  # Count non-NaN similarities for the total weight

    # Assign weighted average where necessary (for pairs where coincidence <= L1)
    non_masked_indices = ~mask
    valid_indices = total_weight[non_masked_indices] > 0

    # Ensure proper calculation where there is valid data
    merged_df.loc[non_masked_indices, 'similarity'] = np.where(
        valid_indices,
        weighted_sums[non_masked_indices] / total_weight[non_masked_indices],
        0  # If total_weight is 0, the result is 0 (no valid similarities)
    )

    # Step 5: Return the final similarity DataFrame
    return merged_df[['movieId1', 'movieId2', 'similarity']].dropna()


## Pre-computando valores intermédiários

In [11]:
# Creating the folder structure
! mkdir -p precomputed
! mkdir -p outputs

In [12]:
bin_df = df.copy()
bin_df['rating'] = bin_df['rating'].apply(lambda x: 1 if x > 0 else 0)

In [13]:
# Train test split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Save split to disk 
train.to_csv('precomputed/train.csv', index=False, sep='\t', header=False)
test.to_csv('precomputed/test.csv', index=False, sep='\t', header=False)

In [14]:
# Save tags to disk
tags[['movieId', 'tag']].to_csv('precomputed/tags.csv', index=False, sep='\t', header=False)

In [15]:
# Map ratings to one if rating >= 4, zero otherwise
train['rating'] = train['rating'].apply(lambda x: 1 if x > 0 else 0)

In [16]:
# Compute item-item cosine similarity based of ratings
rating_sim = compute_item_sim(df)

# Save rating similarity to disk
rating_sim.to_csv('precomputed/item_cos_rating_sim.csv', index=False, sep='\t', header=False)

rating_sim.head()

Unnamed: 0,movieId1,movieId2,similarity
0,0,1,0.04085
1,0,2,0.141544
2,0,3,0.183813
3,0,4,0.210518
4,0,5,0.306905


In [17]:
# Compute item-item cosine similarity based on visual features
visual_sim =  compute_feature_sim(visual_features)

# Drop values above df.movieId.max()
# visual_sim = visual_sim[(visual_sim['movieId1'] <= df['movieId'].max()) & (visual_sim['movieId2'] <= df['movieId'].max())]

# Save visual similarity to disk
visual_sim.to_csv('precomputed/item_cos_visual_sim.csv', index=False, sep='\t', header=False)

visual_sim.head()

Unnamed: 0,movieId1,movieId2,similarity
0,0,0,1.0
1,0,1,0.711958
2,0,2,0.487788
3,0,3,0.247959
4,0,4,0.736261


In [18]:
# Compute item-item cosine similarity based on aural features
aural_sim = compute_feature_sim(aural_features)

# Drop values above df.movieId.max()
aural_sim = aural_sim[(aural_sim['movieId1'] <= df['movieId'].max()) & (aural_sim['movieId2'] <= df['movieId'].max())]

# Save aural similarity to disk
aural_sim.to_csv('precomputed/item_cos_aural_sim.csv', index=False, sep='\t', header=False)

aural_sim.head()

Unnamed: 0,movieId1,movieId2,similarity
0,0,0,1.0
1,0,1,0.06512
2,0,2,0.025738
3,0,3,0.006057
4,0,4,6.4e-05


In [19]:
# Compute item-item coincidence matrix
coincidence_mat = gen_coincidence_mat(bin_df)

# Save coincidence matrix to disk
coincidence_mat.to_csv('precomputed/item_coincidence_mat.csv', index=False, sep='\t', header=False)

coincidence_mat.head()

Unnamed: 0,movieId1,movieId2,coincidence
0,0,348,1
1,0,302,1
2,0,225,1
3,0,372,1
4,0,409,1


In [20]:
# Compute hybrid similarity matrix
hybrid_mat = gen_hybrid_similarity_matrix(bin_df, 10, coincidence_mat, [rating_sim, aural_sim, visual_sim], [1, 2, 3])

# Save hybrid similarity matrix to disk
hybrid_mat.to_csv('precomputed/item_hybrid_sim.csv', index=False, sep='\t', header=False)

hybrid_mat.head()

Unnamed: 0,movieId1,movieId2,similarity
0,0,1,0.04085
1,0,2,0.141544
2,0,3,0.183813
3,0,4,0.210518
4,0,5,0.306905


In [21]:
hybrid_mat.movieId1.max()

415

***Exercício 01:*** Implemente uma hibridização monolítica/combinação usando a seguinte heurística:
- Uso do algoritmo ItemAtributeKNN, sendo a hibridização feita no cálculo das similaridades entre os itens.
- Se a quantidade de usuários que avaliaram ambos os itens for maior que um limiar L1, calcule a similaridade entre esses itens usando cosseno aplicado à representação baseada em notas.
- Caso contrário, calcule a similaridade entre os itens usando tags, características visuais e características aurais. Pondere cada uma das modalidades via pesos passados por parâmetro. 

Compare os resultados do algoritmo híbrido com as versões isoladas do mesmo algoritmo.

In [22]:
ItemAttributeKNN(
    train_file='precomputed/train.csv',
    test_file='precomputed/test.csv',
    output_file='outputs/base_predictions.csv',
    # metadata_file='precomputed/tags.csv', 
    as_similar_first=True,
    k_neighbors=5,
    similarity_file='precomputed/item_cos_rating_sim.csv',
).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 413 items (152496 interactions) | sparsity:: 96.67%
test data:: 10551 users and 333 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.198962 sec
prediction_time:: 0.270922 sec
Eval:: MAE: 0.732208 RMSE: 0.970678 


In [23]:
ItemAttributeKNN(
    train_file='precomputed/train.csv',
    test_file='precomputed/test.csv',
    output_file='outputs/aural_predictions.csv',
    as_similar_first=True,
    k_neighbors=5,
    similarity_file='precomputed/item_cos_aural_sim.csv',
).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 413 items (152496 interactions) | sparsity:: 96.67%
test data:: 10551 users and 333 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.115911 sec
prediction_time:: 0.150105 sec
Eval:: MAE: 0.685754 RMSE: 0.900496 


In [24]:
ItemAttributeKNN(
    train_file='precomputed/train.csv',
    test_file='precomputed/test.csv',
    output_file='outputs/visual_predictions.csv',
    as_similar_first=True,
    k_neighbors=5,
    similarity_file='precomputed/item_cos_visual_sim.csv',
).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 413 items (152496 interactions) | sparsity:: 96.67%
test data:: 10551 users and 333 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.456880 sec
prediction_time:: 0.159817 sec
Eval:: MAE: 0.703411 RMSE: 0.924584 


In [25]:
ItemAttributeKNN(
    train_file='precomputed/train.csv',
    test_file='precomputed/test.csv',
    output_file='outputs/hybrid_predictions.csv',
    as_similar_first=True,
    k_neighbors=5,
    similarity_file='precomputed/item_hybrid_sim.csv',
    metadata_file='precomputed/tags.csv', 
).compute()

[Case Recommender: Rating Prediction > Item Attribute KNN Algorithm]

train data:: 11090 users and 413 items (152496 interactions) | sparsity:: 96.67%
test data:: 10551 users and 333 items (38125 interactions) | sparsity:: 98.91%

training_time:: 4.299709 sec
>> metadata:: 231 items and 1979 metadata (6274 interactions) | sparsity:: 98.63%
prediction_time:: 0.285116 sec
Eval:: MAE: 0.72457 RMSE: 0.953339 


O desempenho do modelo híbrido é similar aos demais modelos. Seu desempenho foi bom, porém não é o melhor modelo. Talvez, mais testes com os parâmetros revelem uma configuração em que o modelo híbrido se sai melhor.

***Exercício 02:*** Vamos implementar um recomendador híbrido canalizado em cascata, no cenário de ranqueamento. A ideia é que um primeiro algoritmo gere uma lista C1 de N=50 itens candidatos à recomendação para cada usuário. Em seguida, um outro recomendador irá gerar uma outra lista C2 também de N=50 itens candidatos à rcomendação para cada usuário. Por fim, o ranking final será a intersecção entre C1 e C2, sendo o score de cada itens formado pela média aritmética dos scores de cada lista. Avalie o desempenho.

Dica 1: utilize o parâmetro rank_length disponível nos algoritmos de ranqueamento do CaseRecommender para especificar o tamanho N de recomendações para cada usuário.

Dica 2: você pode gravar num arquivo os rankings gerados por um algoritmo para cada usuário especificando o nome do arquivo no parâmetro output_file.

Dica 3: consulte a Aula 04 que contém algumas métricas de avaliação de ranqueamento. Como você irá gerar o ranking final externamente ao CaseRecommender, será necessário avaliá-lo usando funções próprias.

In [26]:
# def recommend(df, pred, userId, n=10):
#     # Get the top N recommendations
#     top_n = pred[pred['userId'] == userId].sort_values(by='prediction', ascending=False).head(n)
    
#     return top_n.drop(columns='userId')

# def ordered_intersection(*lists):
#     # Start with the first list
#     if not lists:
#         return []
    
#     # Create a set of all elements that appear in every list
#     all_sets = [set(lst) for lst in lists]
#     common_elements = set.intersection(*all_sets)
    
#     # Preserve the order of elements as they appear in the first list
#     return [item for item in lists[0] if item in common_elements]

# def hybrid_recommend(df, predictions, userId, n=10):
#     recommendations = [recommend(df, pred, userId, n)['movieId'].to_list() for pred in predictions]
#     return ordered_intersection(*recommendations)

# Helper function to calculate Precision@K
def precision_at_k(df, k):
    relevant_items = df[df['rating'] > 0]
    top_k_items = df.groupby('userId').head(k)
    precision = top_k_items[top_k_items['rating'] > 0].shape[0] / top_k_items.shape[0]
    return precision
    

In [27]:
from caserec.recommenders.item_recommendation.userknn import UserKNN
from caserec.recommenders.item_recommendation.itemknn import ItemKNN

# Definindo o tamanho do ranking
rank_length = 50

# Gerando C1 usando UserKNN
user_knn = UserKNN(
    train_file='precomputed/train.csv',
    test_file='precomputed/test.csv',
    output_file='outputs/rankings_1.csv',
    rank_length=rank_length
)

user_knn.compute()


# Gerando C2 usando ItemKNN
item_knn = ItemKNN(
    train_file='precomputed/train.csv',
    test_file='precomputed/test.csv',
    output_file='outputs/rankings_2.csv',
    rank_length=rank_length
)

item_knn.compute()

[Case Recommender: Item Recommendation > UserKNN Algorithm]

train data:: 11090 users and 413 items (152496 interactions) | sparsity:: 96.67%
test data:: 10551 users and 333 items (38125 interactions) | sparsity:: 98.91%

training_time:: 29.029569 sec
prediction_time:: 96.665291 sec


Eval:: PREC@1: 0.440337 PREC@3: 0.320507 PREC@5: 0.263937 PREC@10: 0.19183 RECALL@1: 0.143184 RECALL@3: 0.295854 RECALL@5: 0.395645 RECALL@10: 0.560795 MAP@1: 0.440337 MAP@3: 0.531213 MAP@5: 0.53265 MAP@10: 0.501229 NDCG@1: 0.440337 NDCG@3: 0.617644 NDCG@5: 0.633825 NDCG@10: 0.623731 
[Case Recommender: Item Recommendation > ItemKNN Algorithm]

train data:: 11090 users and 413 items (152496 interactions) | sparsity:: 96.67%
test data:: 10551 users and 333 items (38125 interactions) | sparsity:: 98.91%

training_time:: 1.245062 sec
prediction_time:: 25.802659 sec


Eval:: PREC@1: 0.419012 PREC@3: 0.305437 PREC@5: 0.252829 PREC@10: 0.187148 RECALL@1: 0.135091 RECALL@3: 0.28026 RECALL@5: 0.378824 RECALL@10: 

In [28]:
# Reading the rankings
rankings_1 = pd.read_csv('outputs/rankings_1.csv', sep='\t', names=['userId', 'movieId', 'pred_1'])
rankings_2 = pd.read_csv('outputs/rankings_2.csv', sep='\t', names=['userId', 'movieId', 'pred_2'])

# Merging the rankings
hybrid_rankings = pd.merge(rankings_1, rankings_2, on=['userId', 'movieId'])
hybrid_rankings = hybrid_rankings.fillna(0)

# Get average score
hybrid_rankings['pred'] = (hybrid_rankings['pred_1'] + hybrid_rankings['pred_2']) / 2

# Filtering for intersection
hybrid_rankings = hybrid_rankings[
    (hybrid_rankings['pred_1'] > 0)
    & (hybrid_rankings['pred_2'] > 0)
]

# Sorting the final rankings
final_rankings = hybrid_rankings.sort_values(by=['userId', 'pred'], ascending=[True, False])

final_rankings

Unnamed: 0,userId,movieId,pred_1,pred_2,pred
0,0,12,28.469562,2.403147,15.436355
1,0,19,25.500807,2.320786,13.910797
2,0,22,17.919954,1.986250,9.953102
3,0,57,15.556132,1.782764,8.669448
4,0,30,15.153696,1.977851,8.565774
...,...,...,...,...,...
457788,11089,0,2.220316,0.930920,1.575618
457790,11089,50,2.201432,0.879181,1.540307
457789,11089,9,2.211722,0.721707,1.466714
457786,11089,103,2.257446,0.639067,1.448256


In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Reading the ground truth ratings
test_ratings = pd.read_csv('precomputed/test.csv', sep='\t', names=['userId', 'movieId', 'rating', 'title'])

# Merging
eval_data = pd.merge(final_rankings, test_ratings, on=['userId', 'movieId'])

eval_data.head()

Unnamed: 0,userId,movieId,pred_1,pred_2,pred,rating,title
0,0,12,28.469562,2.403147,15.436355,5.0,Star Wars: Episode V - The Empire Strikes Back...
1,0,11,7.402471,0.750946,4.076709,3.5,Masters of the Universe (1987)
2,0,6,5.280536,1.314397,3.297466,5.0,Apocalypto (2006)
3,0,0,4.275248,0.946687,2.610968,5.0,Enemy Mine (1985)
4,1,20,20.382487,2.885046,11.633767,3.0,While You Were Sleeping (1995)


In [30]:
mse = mean_squared_error(eval_data['rating'], eval_data['pred'])

f"Mean Squared Error (MSE): {mse}"

'Mean Squared Error (MSE): 50.60468100644268'

In [31]:
mae = mean_absolute_error(eval_data['rating'], eval_data['pred'])

f"Mean Absolute Error (MAE): {mae}"

'Mean Absolute Error (MAE): 5.559783997003632'

In [32]:
# Calculate Precision@K for K=5
precision_k = precision_at_k(eval_data, 5)

f"Precision@5: {precision_k}"

'Precision@5: 1.0'

In [33]:
eval_data_filtered = eval_data.copy()
eval_data_filtered.loc[eval_data_filtered['pred_1'] > 5, 'pred'] = eval_data_filtered['pred_2']
eval_data_filtered.head()

Unnamed: 0,userId,movieId,pred_1,pred_2,pred,rating,title
0,0,12,28.469562,2.403147,2.403147,5.0,Star Wars: Episode V - The Empire Strikes Back...
1,0,11,7.402471,0.750946,0.750946,3.5,Masters of the Universe (1987)
2,0,6,5.280536,1.314397,1.314397,5.0,Apocalypto (2006)
3,0,0,4.275248,0.946687,2.610968,5.0,Enemy Mine (1985)
4,1,20,20.382487,2.885046,2.885046,3.0,While You Were Sleeping (1995)


In [34]:
mse = mean_squared_error(eval_data_filtered['rating'], eval_data_filtered['pred'])

f"Mean Squared Error (MSE): {mse}"

'Mean Squared Error (MSE): 2.7831567810143687'

In [35]:
mae = mean_absolute_error(eval_data_filtered['rating'], eval_data_filtered['pred'])

f"Mean Absolute Error (MAE): {mae}"

'Mean Absolute Error (MAE): 1.3841018758171912'

In [36]:
# Calculate Precision@K for K=5
precision_k = precision_at_k(eval_data_filtered, 5)

f"Precision@5: {precision_k}"

'Precision@5: 1.0'

O desempenho é ruim segundo as métricas baseadas em médias. Isso provavelmente ocorre porque a predição 1 está repleta de outliers, o que distorce a média. Ao remover esses outliers, a qualidade da predição melhora significativamente. Além disso, devido à sua alta precisão, o modelo híbrido apresenta uma grande proporção de itens relevantes entre as recomendações feitas.