In [3]:
import pandas as pd

# Lire les fichiers CSV
movies = pd.read_csv('../src/data/raw/movies.csv')
ratings = pd.read_csv('../src/data/raw/ratings.csv')

# Calculer la note moyenne pour chaque film
average_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
average_ratings.columns = ['movieId', 'average_rating']

# Fusionner les films avec leurs notes moyennes
movies_ratings = pd.merge(movies, average_ratings, on='movieId')

# Séparer les genres
movies_ratings['genres'] = movies_ratings['genres'].str.split('|')
movies_ratings = movies_ratings.explode('genres')

# Trouver les 3 films les mieux notés pour chaque genre
top_movies_per_genre = movies_ratings.sort_values(['genres', 'average_rating'], ascending=[True, False])
top_movies_per_genre = top_movies_per_genre.groupby('genres').head(3)

# Afficher les résultats
top_movies_per_genre[['genres', 'title', 'average_rating']]


Unnamed: 0,genres,title,average_rating
18934,(no genres listed),"Ella Lola, a la Trilby (1898)",5.0
18964,(no genres listed),"Turkish Dance, Ella Lola (1898)",5.0
25658,(no genres listed),Always for Pleasure (1978),5.0
19971,Action,Shaolin Temple 2: Kids from Shaolin (Shao Lin ...,5.0
24606,Action,Deewaar (1975),4.666667
9112,Action,Meltdown (Shu dan long wei) (1995),4.5
9005,Adventure,Life On A String (Bian chang Bian Zou) (1991),5.0
21318,Adventure,Giorgino (1994),5.0
22818,Adventure,Into the Middle of Nowhere (2010),5.0
24821,Animation,Rock the Boat (1944),5.0


In [4]:
def get_user_preferences(user_id, user_matrix_filename):
    """
    Description:
    Get the top 3 movie genres based on user preferences.

    Args:
    - user_id (int): The ID of the user for whom preferences are to be retrieved.
    - user_matrix_filename (str): The filename of the CSV file containing user preferences data.

    Returns:
    - list[str]: A list containing the top 3 movie genres based on the user's preferences.
    """
    # Load user matrix
    user_matrix = pd.read_csv(user_matrix_filename)

    # Filter user preferences by user_id, then select all columns (genres) except the column user_id (column 0)
    user_preferences = user_matrix[user_matrix["userId"] == user_id].iloc[:, 1:]

    # Get top 3 genres based on user preferences
    # Convert the DataFrame to a numpy array, sort the preference values in ascending order then select the last 3 indices
    top_3_genres = user_preferences.columns[user_preferences.values.squeeze().argsort()[-3:]].tolist()

    return top_3_genres


In [41]:
preferences = get_user_preferences(89, "../src/data/processed/user_matrix.csv")

In [42]:
preferences

['Romance', 'Comedy', 'Drama']

In [43]:
import pandas as pd
import numpy as np

def get_top_movies_combining_genres(genres, movies_filename='../src/data/raw/movies.csv', ratings_filename='../src/data/raw/ratings.csv', top_n=5):
    """
    Get the top N movies that combine the specified genres, using Bayesian score.

    Args:
    - genres (list[str]): The genres to combine.
    - movies_filename (str): Path to the movies CSV file.
    - ratings_filename (str): Path to the ratings CSV file.
    - top_n (int): The number of top movies to return.

    Returns:
    - list[dict]: A list of dictionaries containing movie titles and their Bayesian scores.
    """
    # Load movies and ratings data
    movies = pd.read_csv(movies_filename)
    ratings = pd.read_csv(ratings_filename)
    
    # Calculate average rating and number of votes for each movie
    ratings_count = ratings.groupby('movieId')['rating'].count().reset_index()
    ratings_count.columns = ['movieId', 'num_votes']
    
    average_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
    average_ratings.columns = ['movieId', 'average_rating']
    
    # Merge movies with their average ratings and number of votes
    movies_ratings = pd.merge(movies, average_ratings, on='movieId')
    movies_ratings = pd.merge(movies_ratings, ratings_count, on='movieId')
    
    # Filter for movies that match all specified genres
    def genre_combination(row_genres):
        return all(genre in row_genres for genre in genres)
    
    matching_movies = movies_ratings[movies_ratings['genres'].apply(lambda g: genre_combination(g.split('|')))]
    
    # Calculate Bayesian score
    C = movies_ratings['num_votes'].mean()  # Choose C as the average number of votes in the dataset
    matching_movies['bayesian_score'] = (C * average_ratings['average_rating'] + matching_movies['average_rating'] * matching_movies['num_votes']) / (C + matching_movies['num_votes'])
    
    # Sort by Bayesian score and select the top N
    top_movies = matching_movies.sort_values(by='bayesian_score', ascending=False).head(top_n)
    
    # Prepare the result
    result = top_movies[['title']].to_dict(orient='records')
    
    return result


In [44]:
get_top_movies_combining_genres(preferences, top_n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_movies['bayesian_score'] = (C * average_ratings['average_rating'] + matching_movies['average_rating'] * matching_movies['num_votes']) / (C + matching_movies['num_votes'])


[{'title': 'Lady of Chance, A (1928)'},
 {'title': 'This Thing With Sarah (2013)'},
 {'title': 'Going Down in LA-LA Land (2011)'},
 {'title': 'Echoes of the Rainbow (Sui yuet san tau) (2010)'},
 {'title': "Impudent Girl (L'effrontÃ©e) (1985)"}]