In [1]:
# Preprocessing data - Sampling

import pandas as pd

# Load the raw MovieLens ratings into a pandas DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows path; the cell
# only runs where this exact directory exists.
ratings_csv = pd.read_csv('C:\\Users\\user\\Desktop\\KDD_PP2\\ml-latest-small\\ratings.csv')

# Draw a 40% sample of the rows without replacement; random_state=42 makes the
# sample reproducible.
sampled_df = ratings_csv.sample(frac=0.4, replace=False, random_state=42)

print("Number of rows in the original DataFrame:", ratings_csv.shape[0])
print("Number of rows in the sampled DataFrame:", sampled_df.shape[0])


# Preprocessing data - Remove users and movies that have fewer than 12 ratings.
# Dropping sparse movies can push a user below 12 ratings (and vice versa), so the
# filter is repeated until one pass leaves every remaining user and movie with at
# least 12 ratings.

while True:
    # Count the ratings per user and per movie in the current working set
    user_ratings_count = sampled_df.groupby('userId').size()
    movie_ratings_count = sampled_df.groupby('movieId').size()

    # Keep only the rows whose user AND movie each have at least 12 ratings
    filtered_users = user_ratings_count[user_ratings_count >= 12].index
    filtered_movies = movie_ratings_count[movie_ratings_count >= 12].index
    filtered_ratings = sampled_df[(sampled_df['userId'].isin(filtered_users)) & (sampled_df['movieId'].isin(filtered_movies))]

    # Recount on the filtered rows: removing rows may have created new sparse users/movies
    f_user_ratings_count = filtered_ratings.groupby('userId').size()
    f_movie_ratings_count = filtered_ratings.groupby('movieId').size()

    # Users and movies that fell below 12 ratings as a result of this pass
    users_with_less_than_12_ratings = f_user_ratings_count[f_user_ratings_count < 12].index
    movies_with_less_than_12_ratings = f_movie_ratings_count[f_movie_ratings_count < 12].index

    # If any remain, drop them and run another pass on the reduced data
    if not users_with_less_than_12_ratings.empty or not movies_with_less_than_12_ratings.empty:
        sampled_df = filtered_ratings[~filtered_ratings['userId'].isin(users_with_less_than_12_ratings)]
        sampled_df = sampled_df[~sampled_df['movieId'].isin(movies_with_less_than_12_ratings)]
    else:
        break  # Fixed point reached: filtered_ratings holds only users/movies with >= 12 ratings

print("Number of rows in the filtered DataFrame:", filtered_ratings.shape[0])
# Remove the column 'timestamp'; the later cells only read userId, movieId and rating.
# filtered_ratings stays a module-level name that the experiments cell reads.
filtered_ratings = filtered_ratings.drop(columns=['timestamp'])
# Persist the filtered ratings next to the input file (same hard-coded directory).
filtered_ratings.to_csv('C:\\Users\\user\\Desktop\\KDD_PP2\\ml-latest-small\\filtered_ratings.csv', index=False)

Number of rows in the original DataFrame: 100836
Number of rows in the sampled DataFrame: 40334
Number of rows in the filtered DataFrame: 21928


In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Auxiliary functions to find N similar movies

def get_user_rated_movies(userId, data):
    """Return the movieIds of every rating row that belongs to ``userId``.

    Parameters:
        userId: value matched against the 'userId' column of ``data``.
        data: DataFrame with at least the columns 'userId' and 'movieId'.

    Returns:
        A plain Python list of movieIds in the row order of ``data``; empty
        when the user has no rows.
    """
    belongs_to_user = data['userId'] == userId
    return data.loc[belongs_to_user, 'movieId'].tolist()


def get_user_movie_matrix(user_rated_movies, data, extra_movie_id=None):
    """Build a user x movie rating matrix restricted to ``user_rated_movies``.

    Rows are the users who rated at least one movie in ``user_rated_movies``;
    columns are those movies; missing ratings are filled with 0.

    Parameters:
        user_rated_movies: iterable of movieIds that become the matrix columns.
        data: DataFrame with the columns 'userId', 'movieId' and 'rating'.
        extra_movie_id: optional movieId appended as one more column (the movie
            whose rating is being predicted).

    Returns:
        DataFrame indexed by userId with one column per movie.
    """
    # Keep only the ratings of the movies that define the matrix columns
    rated_rows = data[data['movieId'].isin(user_rated_movies)]

    # Pivot to users x movies; users without a rating for a movie get 0
    user_movie_matrix = rated_rows.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    if extra_movie_id is not None:
        # The extra movie's ratings must come from the FULL ratings frame. The
        # subset above only contains `user_rated_movies`, so looking the extra
        # movie up there yields an all-zero column whenever it is not in that list.
        extra_movie_ratings = data.loc[data['movieId'] == extra_movie_id].set_index('userId')['rating']

        # Align on the matrix's users; users who did not rate the extra movie get 0
        user_movie_matrix[extra_movie_id] = extra_movie_ratings.reindex(user_movie_matrix.index, fill_value=0)

    return user_movie_matrix


def adjusted_cosine_similarity_for_movie(matrix, movie_id):
    """Score every column (movie) of ``matrix`` against the column ``movie_id``.

    Each row is centred by subtracting that row's mean over all columns of
    ``matrix`` as given, so if the caller zero-filled missing ratings those
    zeros count toward the mean. Cosine similarity is then computed between
    the centred target column and every centred column.

    Parameters:
        matrix: DataFrame with one row per user and one column per movie.
        movie_id: label of the column to compare against; must be in ``matrix``.

    Returns:
        DataFrame with the columns 'movieId' and 'similarity', one row per
        column of ``matrix`` (including ``movie_id`` itself).
    """
    # Centre every row on its own mean
    row_means = matrix.mean(axis=1)
    centred = matrix.sub(row_means, axis=0).fillna(0)

    # The target movie as a single-row 2-D array, the shape cosine_similarity expects
    target_vector = centred[movie_id].values.reshape(1, -1)

    # One score per movie: the transposed frame has one row per movie column
    scores = cosine_similarity(target_vector, centred.T)

    return pd.DataFrame({
        'movieId': centred.columns,
        'similarity': scores.flatten()
    })


def find_similar_movies(N, movie_id, movie_similarity_df):
    """Return the ``N`` rows with the highest similarity, excluding ``movie_id``.

    Parameters:
        N: maximum number of rows to return.
        movie_id: movieId to leave out (the movie being compared against).
        movie_similarity_df: DataFrame with the columns 'movieId' and 'similarity'.

    Returns:
        A new DataFrame (a copy) with at most ``N`` rows, ordered by
        'similarity' from highest to lowest.
    """
    is_other_movie = movie_similarity_df['movieId'] != movie_id
    ranked = movie_similarity_df[is_other_movie].sort_values(by='similarity', ascending=False)

    # Copy so callers can add columns without touching the input frame
    return ranked.head(N).copy()


def find_user_ratings(similar_movies_df, data, userId):
    """Attach the ratings ``userId`` gave to the movies in ``similar_movies_df``.

    Parameters:
        similar_movies_df: DataFrame with a 'movieId' column (plus similarity scores).
        data: DataFrame with the columns 'userId', 'movieId' and 'rating'.
        userId: user whose ratings are looked up.

    Returns:
        The inner join of ``similar_movies_df`` with the user's rows on
        'movieId', without the 'userId' column. Movies the user has not rated
        are dropped by the join.
    """
    user_rows = data[data['userId'] == userId]
    joined = similar_movies_df.merge(user_rows, on='movieId')
    return joined.drop(columns='userId')

In [3]:
# Auxiliary functions to predict a rating

def calculate_user_mean_rating(userId, data):
    """Return the mean of the 'rating' column over the rows of ``userId``.

    Parameters:
        userId: value matched against the 'userId' column.
        data: DataFrame with the columns 'userId' and 'rating'.

    Returns:
        The user's mean rating; NaN when the user has no rows in ``data``.
    """
    return data.loc[data['userId'] == userId, 'rating'].mean()


def calculate_movie_mean_rating(movieId, data):
    """Return the mean of the 'rating' column over the rows of ``movieId``.

    Parameters:
        movieId: value matched against the 'movieId' column.
        data: DataFrame with the columns 'movieId' and 'rating'.

    Returns:
        The movie's mean rating; NaN when the movie has no rows in ``data``.
    """
    return data.loc[data['movieId'] == movieId, 'rating'].mean()


def add_bias_removal_column(similarity_df, data):
    """Return a copy of ``similarity_df`` with 'mean_rating' and 'bias_removal' added.

    Parameters:
        similarity_df: DataFrame with the columns 'movieId' and 'rating'.
        data: ratings DataFrame handed to ``calculate_movie_mean_rating``.

    Returns:
        A new DataFrame where 'mean_rating' is each movie's mean rating in
        ``data`` and 'bias_removal' is 'rating' minus 'mean_rating'. The input
        frame is left unchanged.
    """
    # Mean rating of every listed movie, looked up one movie at a time
    movie_means = similarity_df['movieId'].apply(lambda movie: calculate_movie_mean_rating(movie, data))

    # How far the rating sits above or below the movie's mean
    deviation = similarity_df['rating'] - movie_means

    return similarity_df.assign(mean_rating=movie_means, bias_removal=deviation)
    

def weighted_mean_rating_2(data, similar_movies_df, userId, movieId):
    """Predict a rating as the user's mean plus a similarity-weighted bias term.

    Parameters:
        data: ratings DataFrame handed to ``calculate_user_mean_rating``.
        similar_movies_df: DataFrame with the columns 'similarity' and 'bias_removal'.
        userId: user whose mean rating is the baseline.
        movieId: not used by the computation; kept for the callers' signature.

    Returns:
        user mean + sum(similarity * bias_removal) / sum(similarity).
        There is no guard against a zero similarity sum.
    """
    baseline = calculate_user_mean_rating(userId, data)

    similarities = similar_movies_df['similarity']
    weighted_bias = (similarities * similar_movies_df['bias_removal']).sum()

    return baseline + (weighted_bias / similarities.sum())


# Adds a 'common_users_weight' column to the similarity dataframe, representing the weight based on the number
# of common users between the provided movie_id and each row in similarity dataframe. Common users are adjusted
# by adding 1 to ensure a minimum weight of 1 for all movies.
#def add_weighted_common_users_column(movie_id, similarity_df, data):
    # Extract the users who have rated the provided movie_id
#    users_movie_id = data[data['movieId'] == movie_id]['userId'].tolist()

    # Make a copy of the input DataFrame
#    similar_movies_copy = similarity_df.copy()
    
    # Calculate common users and assign weights based on the number of common users
#    similar_movies_copy['common_users'] = similar_movies_copy.apply(
#        lambda row: len(set(users_movie_id) & set(data[data['movieId'] == row['movieId']]['userId'].tolist())),
#        axis=1
#    )

    # Add 1 to all common users and calculate weights based on the adjusted common users
#    similar_movies_copy['common_users_weight'] = (similar_movies_copy['common_users'] + 1)\
#    / (similar_movies_copy['common_users'].max() + 1)

#    return similar_movies_copy


# [0.9,1]

def add_weighted_common_users_column(movie_id, similarity_df, data, min_weight=0.9, max_weight=1):
    """Return a copy of ``similarity_df`` with 'common_users' and 'common_users_weight'.

    'common_users' is the number of distinct users who rated both ``movie_id``
    and the row's movie. The weight is (common_users + 1) / (max common_users + 1),
    rescaled linearly so it ends up in (min_weight, max_weight]. The +1 keeps
    the weight positive for movies with no common users and avoids a zero
    denominator when no movie shares any user.

    Parameters:
        movie_id: the movie whose raters define the common-user sets.
        similarity_df: DataFrame with a 'movieId' column; may be empty.
        data: ratings DataFrame with the columns 'userId' and 'movieId'.
        min_weight: lower bound of the rescaled weight (default 0.9).
        max_weight: upper bound of the rescaled weight (default 1).

    Returns:
        A new DataFrame; the input frame is left unchanged.
    """
    similar_movies_copy = similarity_df.copy()

    # Users who rated the target movie: computed once, not once per row
    target_users = data.loc[data['movieId'] == movie_id, 'userId'].unique()

    # Ratings of the listed movies made by those users, counted per movie in one pass
    shared_rows = data[data['movieId'].isin(similar_movies_copy['movieId'])
                       & data['userId'].isin(target_users)]
    common_counts = shared_rows.groupby('movieId')['userId'].nunique()

    # Movies absent from the counts share no users with the target movie
    similar_movies_copy['common_users'] = (
        similar_movies_copy['movieId'].map(common_counts).fillna(0).astype('int64')
    )

    # Smooth by +1 and normalise by the largest (smoothed) count
    max_common_users_weight = similar_movies_copy['common_users'].max() + 1
    raw_weight = (similar_movies_copy['common_users'] + 1) / max_common_users_weight

    # Rescale into the requested band
    similar_movies_copy['common_users_weight'] = raw_weight * (max_weight - min_weight) + min_weight

    return similar_movies_copy


def weighted_mean_rating_common_users(similarity_df):
    """Weighted mean of 'rating' with weights similarity * common_users_weight.

    Parameters:
        similarity_df: DataFrame with the columns 'similarity', 'rating' and
            'common_users_weight'.

    Returns:
        The weighted mean rating, or None when the weights sum to exactly 0.
    """
    similarity = similarity_df['similarity']
    users_weight = similarity_df['common_users_weight']

    total_weight = (similarity * users_weight).sum()

    # Nothing to average over: signal it instead of dividing by zero
    if total_weight == 0:
        return None

    weighted_sum = (similarity * similarity_df['rating'] * users_weight).sum()
    return weighted_sum / total_weight


# Adds a 'variance' column to the similarity dataframe, representing the variance of ratings for each movie
#def calculate_movie_variances(similarity_df,data):
    # Make a copy of the input DataFrame
#    similar_movies_copy = similarity_df.copy()
    
    # Group ratings by movieId and calculate variance for each movie
#    movie_variances = data.groupby('movieId')['rating'].var().reset_index()
#    movie_variances.columns = ['movieId', 'variance']
    
    # Merge with the similarity_df
#    similar_movies_copy = pd.merge(similar_movies_copy, movie_variances, on='movieId', how='left')
   
#    return similar_movies_copy


# [0.8, 1]

def calculate_movie_variances(similarity_df, data):
    """Return ``similarity_df`` with a 'variance_weight' column in [0.8, 1].

    For every movie in ``data`` the sample variance of its ratings is taken,
    passed through log(variance + 1), min-max normalised over all movies in
    ``data`` and mapped linearly onto [0.8, 1]. The weights are then
    left-joined onto ``similarity_df`` by 'movieId'.

    Parameters:
        similarity_df: DataFrame with a 'movieId' column.
        data: ratings DataFrame with the columns 'movieId' and 'rating'.

    Returns:
        A new DataFrame with the extra column 'variance_weight'. The weight is
        NaN for movies missing from ``data`` or whose variance is undefined
        (a single rating).
    """
    # Per-movie rating variance, compressed with a logarithm
    log_variance = np.log(data.groupby('movieId')['rating'].var() + 1)

    # Min-max normalise. When every movie has the same variance the span is 0
    # and the plain formula would turn every weight into NaN (0 / 0); in that
    # case all movies sit at the minimum, i.e. normalised value 0.
    lowest = log_variance.min()
    span = log_variance.max() - lowest
    if span > 0:
        normalised = (log_variance - lowest) / span
    else:
        normalised = log_variance - lowest

    # Map [0, 1] onto [0.8, 1]
    variance_weight = (0.2 * normalised + 0.8).rename('variance_weight')

    # Left join keeps every row of similarity_df, in its original order
    return pd.merge(similarity_df, variance_weight.reset_index(), on='movieId', how='left')


def weighted_mean_rating_variance(similarity_df):
    """Weighted mean of 'rating' with weights similarity * variance_weight.

    Parameters:
        similarity_df: DataFrame with the columns 'similarity',
            'variance_weight' and 'rating'.

    Returns:
        sum(similarity * variance_weight * rating) / sum(similarity * variance_weight).
        There is no guard against a zero weight sum.
    """
    similarity = similarity_df['similarity']
    variance_weight = similarity_df['variance_weight']

    # Denominator: total weight of the neighbours
    total_weight = (similarity * variance_weight).sum()

    # Numerator: the neighbours' ratings, each scaled by its weight
    weighted_ratings = (similarity * variance_weight * similarity_df['rating']).sum()

    return weighted_ratings / total_weight

In [4]:
def find_similar_movies_for_movie(data, userId, movieId, N):
    """Find the ``N`` movies rated by ``userId`` that are most similar to ``movieId``.

    Parameters:
        data: ratings DataFrame with the columns 'userId', 'movieId' and 'rating'.
        userId: user for whom the neighbourhood is built.
        movieId: target movie whose rating is to be predicted.
        N: number of neighbours to keep.

    Returns:
        DataFrame with the columns 'movieId', 'similarity' and 'rating' (the
        rating ``userId`` gave to each neighbour).
    """
    # Candidate neighbours are the movies the user has rated; the target movie
    # is appended to the matrix as an extra column
    rated_by_user = get_user_rated_movies(userId, data)
    ratings_matrix = get_user_movie_matrix(rated_by_user, data, movieId)

    # Similarity of every matrix column to the target movie
    similarity_scores = adjusted_cosine_similarity_for_movie(ratings_matrix, movieId)

    # Keep the N best neighbours, then attach the user's own ratings for them
    neighbours = find_similar_movies(N, movieId, similarity_scores)
    return find_user_ratings(neighbours, data, userId)


def adjust_rating(rating):
    """Clamp ``rating`` to the valid rating range [0.5, 5.0].

    A diagnostic line is printed when the raw value is far outside the range
    (below -1 or above 6.5); the clamped value is still returned.

    Parameters:
        rating: the raw predicted rating.

    Returns:
        5.0 if ``rating`` > 5.0, 0.5 if ``rating`` < 0.5, otherwise ``rating``
        unchanged.
    """
    # Flag predictions that are wildly off before clamping hides them
    if (rating > 6.5) or (rating < -1):
        print("rating out of bounds: ", rating)

    if rating > 5.0:
        return 5.0
    if rating < 0.5:
        return 0.5
    return rating


# Prediction Function 1

# The following function calculates the weighted mean rating for a target movie using its N nearest neighbors.
#def predict_weighted_mean_rating(similar_movies_df):
#    # Calculate the weighted mean rating
#    weighted_mean = (similar_movies_df['similarity'] * similar_movies_df['rating']).sum()\
#    / similar_movies_df['similarity'].sum()
    
#    weighted_mean = adjust_rating(weighted_mean)
#    return weighted_mean


# Prediction Function 1 (sim>0)

def predict_weighted_mean_rating(similar_movies_df):
    """Predict a rating as the similarity-weighted mean of the neighbours' ratings.

    Only neighbours with a strictly positive similarity are used.

    Parameters:
        similar_movies_df: DataFrame with the columns 'similarity' and 'rating'.

    Returns:
        The weighted mean passed through ``adjust_rating``, or 0 (after
        printing a diagnostic line) when no neighbour has positive similarity.
    """
    # Keep only neighbours with similarity > 0
    positive = similar_movies_df[similar_movies_df['similarity'] > 0]

    # No usable neighbour: report it and fall back to 0
    if positive.empty:
        print("rating out of bounds: 0")
        return 0

    weights = positive['similarity']
    weighted_mean = (weights * positive['rating']).sum() / weights.sum()

    # Clamp to the valid rating range
    return adjust_rating(weighted_mean)


# Prediction Function 2 (sim>0)

def predict_weighted_mean_rating_2(data, similar_movies, userId, movieId):
    """Predict a rating as the user's mean plus a similarity-weighted bias term.

    Only neighbours with a strictly positive similarity are used.

    Parameters:
        data: ratings DataFrame used for the user and movie means.
        similar_movies: DataFrame with the columns 'movieId', 'similarity' and 'rating'.
        userId: user whose mean rating is the baseline.
        movieId: target movie; forwarded to ``weighted_mean_rating_2``.

    Returns:
        The prediction passed through ``adjust_rating``, or 0 (after printing
        a diagnostic line) when no neighbour has positive similarity.
    """
    # Keep only neighbours with similarity > 0
    positive = similar_movies[similar_movies['similarity'] > 0]

    # No usable neighbour: report it and fall back to 0
    if positive.empty:
        print("rating out of bounds: 0")
        return 0

    # Express each neighbour rating as a deviation from that movie's mean
    with_bias = add_bias_removal_column(positive, data)
    raw_prediction = weighted_mean_rating_2(data, with_bias, userId, movieId)

    # Clamp to the valid rating range
    return adjust_rating(raw_prediction)


# Prediction Function 3 (sim>0)

def predict_weighted_mean_rating_common_users(data, similar_movies, movieId):
    """Predict a rating with neighbours weighted by similarity and common users.

    Only neighbours with a strictly positive similarity are used.

    Parameters:
        data: ratings DataFrame used to count common users.
        similar_movies: DataFrame with the columns 'movieId', 'similarity' and 'rating'.
        movieId: target movie whose raters define the common-user counts.

    Returns:
        The prediction passed through ``adjust_rating``, or 0 (after printing
        a diagnostic line) when no neighbour has positive similarity.
    """
    # Keep only neighbours with similarity > 0
    positive = similar_movies[similar_movies['similarity'] > 0]

    # No usable neighbour: report it and fall back to 0
    if positive.empty:
        print("rating out of bounds: 0")
        return 0

    # Add the common-user weight, then average with similarity * that weight
    with_common_users = add_weighted_common_users_column(movieId, positive, data)
    raw_prediction = weighted_mean_rating_common_users(with_common_users)

    # Clamp to the valid rating range
    return adjust_rating(raw_prediction)


# Prediction Function 4 (sim>0)

def predict_weighted_mean_rating_variance(data, similar_movies):
    """Predict a rating with neighbours weighted by similarity and rating variance.

    Only neighbours with a strictly positive similarity are used.

    Parameters:
        data: ratings DataFrame used to compute the per-movie variance weights.
        similar_movies: DataFrame with the columns 'movieId', 'similarity' and 'rating'.

    Returns:
        The prediction passed through ``adjust_rating``, or 0 (after printing
        a diagnostic line) when no neighbour has positive similarity.
    """
    # Keep only neighbours with similarity > 0
    positive = similar_movies[similar_movies['similarity'] > 0]

    # No usable neighbour: report it and fall back to 0
    if positive.empty:
        print("rating out of bounds: 0")
        return 0

    # Add the variance weight, then average with similarity * that weight
    with_variance = calculate_movie_variances(positive, data)
    raw_prediction = weighted_mean_rating_variance(with_variance)

    # Clamp to the valid rating range
    return adjust_rating(raw_prediction)

In [5]:
def predict_ratings(data, userId, movieId, N, method):
    """Predict the rating ``userId`` would give ``movieId``.

    The top ``N`` similar movies are found first; the prediction is then made
    by one of four methods:
        1. Weighted mean rating
        2. Weighted mean rating with user-mean baseline and bias removal
        3. Weighted mean rating with common-users weight
        4. Weighted mean rating with variance weight

    Parameters:
        data: ratings DataFrame with the columns 'userId', 'movieId' and 'rating'.
        userId: user the prediction is for.
        movieId: movie whose rating is predicted.
        N: number of similar movies to use.
        method: 1, 2, 3 or 4 (see the list above).

    Returns:
        The predicted rating produced by the selected method.

    Raises:
        ValueError: if ``method`` is not one of 1, 2, 3, 4.
    """
    # Reject an unknown method before the expensive neighbour search; previously
    # it surfaced only afterwards, as an unbound local variable.
    if method not in (1, 2, 3, 4):
        raise ValueError(f"Unknown prediction method {method!r}; expected 1, 2, 3 or 4")

    # Step 1: find the top N movies similar to movieId
    similar = find_similar_movies_for_movie(data, userId, movieId, N)

    # Step 2: predict the rating with the requested method
    if method == 1:
        return predict_weighted_mean_rating(similar)
    if method == 2:
        return predict_weighted_mean_rating_2(data, similar, userId, movieId)
    if method == 3:
        return predict_weighted_mean_rating_common_users(data, similar, movieId)
    return predict_weighted_mean_rating_variance(data, similar)

In [6]:
# Experiments

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, precision_score, recall_score, confusion_matrix

def run_experiments(data=None, N_values=(3, 5, 7, 9, 12), methods=range(1, 5), n_splits=5, random_state=42):
    """Cross-validate every (prediction method, N) combination.

    Parameters:
        data: ratings DataFrame to evaluate on. Defaults to the module-level
            ``filtered_ratings`` produced by the preprocessing cell.
        N_values: neighbourhood sizes to try.
        methods: prediction method numbers accepted by ``predict_ratings``.
        n_splits: number of cross-validation folds.
        random_state: seed for the fold shuffling.

    Returns:
        DataFrame with one row per (method, N) and the columns
        'Prediction Method', 'N', 'MAE', 'Precision', 'Recall' (fold averages)
        and 'Confusion Matrix' (element-wise sum over the folds).
    """
    if data is None:
        data = filtered_ratings

    # Build the folds ONCE with a fixed seed: results are reproducible and every
    # (method, N) configuration is scored on exactly the same train/test splits.
    # An unseeded KFold re-created per configuration would mix split noise into
    # the comparison between configurations.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(kf.split(data))

    # Column-oriented accumulator for the results table
    evaluation_results = {'Prediction Method': [], 'N': [], 'MAE': [], 'Precision': [], 'Recall': [], 'Confusion Matrix': []}

    for method in methods:
        for N in N_values:
            mae_scores = []
            precision_scores = []
            recall_scores = []
            confusion_matrices = []

            for train_index, test_index in folds:
                # Positional split of the ratings into train and test rows
                train_data = data.iloc[train_index]
                test_data = data.iloc[test_index]

                mae, precision, recall, fold_confusion = evaluate_predictions(train_data, test_data, N, method)

                mae_scores.append(mae)
                precision_scores.append(precision)
                recall_scores.append(recall)
                confusion_matrices.append(fold_confusion)

            # Average the scalar metrics over the folds; add up the confusion
            # matrices. The local is not called `confusion_matrix`, so it does
            # not shadow the sklearn function of that name.
            evaluation_results['Prediction Method'].append(method)
            evaluation_results['N'].append(N)
            evaluation_results['MAE'].append(np.mean(mae_scores))
            evaluation_results['Precision'].append(np.mean(precision_scores))
            evaluation_results['Recall'].append(np.mean(recall_scores))
            evaluation_results['Confusion Matrix'].append(np.sum(confusion_matrices, axis=0))

    return pd.DataFrame(evaluation_results)


def evaluate_predictions(train_data, test_data, N, method, like_threshold=3):
    """Predict every rating in ``test_data`` from ``train_data`` and score the result.

    Parameters:
        train_data: ratings DataFrame the predictions are computed from.
        test_data: ratings DataFrame with the columns 'userId', 'movieId' and
            'rating'; one prediction is made per row.
        N: number of similar movies used per prediction.
        method: prediction method number passed to ``predict_ratings``.
        like_threshold: ratings >= this value count as the positive class
            (default 3).

    Returns:
        (mae, precision, recall, confusion_matrix_values): mean absolute error
        of the predicted ratings, macro-averaged precision and recall of the
        binarised predictions, and a 2x2 count matrix indexed
        [actual class, predicted class].
    """
    absolute_errors = []
    actual_labels = []
    predicted_labels = []

    # Rows = actual class, columns = predicted class
    confusion_matrix_values = np.zeros((2, 2))

    # itertuples keeps each column's dtype; iterrows would upcast the integer
    # ids to float because 'rating' is a float column.
    for row in test_data.itertuples(index=False):
        prediction = predict_ratings(train_data, row.userId, row.movieId, N, method)

        actual_rating = row.rating
        absolute_errors.append(abs(actual_rating - prediction))

        # Binarise both the truth and the prediction with the same threshold
        binary_actual = 1 if actual_rating >= like_threshold else 0
        binary_prediction = 1 if prediction >= like_threshold else 0

        actual_labels.append(binary_actual)
        predicted_labels.append(binary_prediction)
        confusion_matrix_values[binary_actual, binary_prediction] += 1

    mae = np.mean(absolute_errors)

    # sklearn expects (y_true, y_pred). The earlier code passed per-row
    # "prediction was correct" flags as y_true and the actual labels as y_pred,
    # so the reported precision/recall did not measure the predictions at all.
    precision = precision_score(actual_labels, predicted_labels, average='macro')
    recall = recall_score(actual_labels, predicted_labels, average='macro')

    return mae, precision, recall, confusion_matrix_values


# Run the full experiment grid (every prediction method x every N, cross-validated).
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# recorded run was stopped before it finished.
results = run_experiments()

# Show the results table (one row per method / N combination)
print(results)

KeyboardInterrupt: 