In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

In [18]:

# Prediction columns to score; each entry names one model's column in the test CSVs.
models = [
    #'simulated_ratings',  (currently excluded from the evaluation)
    'knn_predicted_ratings',
    'svd_predicted_ratings',
    'nmf_predicted_ratings',
    'als_predicted_ratings'
]

# One metrics dict per evaluated test split is accumulated here.
results = []

# Evaluate the five test splits u1 .. u5.
for i in range(1, 6):
    filename = f'outputs/rating_test_df_u{i}.csv'
    df = pd.read_csv(filename)

    # Show which columns were loaded for this split.
    print(df.columns.tolist())

    # Keep only the rows for which every model has a prediction, so that
    # all models are scored on the same set of rows.
    df_clean = df.dropna(subset=models)

    # Observed ratings used as ground truth.
    y_true = df_clean['rating'].values

    # Nothing to score if every row was dropped; report it and move on.
    if not len(df_clean):
        print(f"No data available for evaluation in {filename} after removing rows with missing predictions.")
        continue

    # Metrics for this split, keyed '<model>_<metric>', plus the split label.
    file_metrics = {'File': f'U{i}'}

    for model in models:
        y_pred = df_clean[model].values

        # RMSE is the square root of the mean squared error.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        # Pearson's r is only computed when both series actually vary;
        # if either has zero standard deviation the result is NaN.
        if np.std(y_pred) != 0 and np.std(y_true) != 0:
            pearson_corr, _ = pearsonr(y_true, y_pred)
        else:
            pearson_corr = np.nan

        file_metrics.update({
            f'{model}_RMSE': rmse,
            f'{model}_MAE': mae,
            f'{model}_Pearson': pearson_corr,
        })

    results.append(file_metrics)




['user_id', 'movie_id', 'rating', 'timestamp', 'knn_predicted_ratings', 'svd_predicted_ratings', 'nmf_predicted_ratings', 'als_predicted_ratings']
['user_id', 'movie_id', 'rating', 'timestamp', 'knn_predicted_ratings', 'svd_predicted_ratings', 'nmf_predicted_ratings', 'als_predicted_ratings']
['user_id', 'movie_id', 'rating', 'timestamp', 'knn_predicted_ratings', 'svd_predicted_ratings', 'nmf_predicted_ratings', 'als_predicted_ratings']
['Unnamed: 0', 'user_id', 'movie_id', 'rating', 'timestamp', 'knn_predicted_ratings', 'svd_predicted_ratings', 'nmf_predicted_ratings', 'als_predicted_ratings']
['Unnamed: 0', 'user_id', 'movie_id', 'rating', 'timestamp', 'knn_predicted_ratings', 'svd_predicted_ratings', 'nmf_predicted_ratings', 'als_predicted_ratings']


In [19]:
# Turn the per-split metric dicts into a table with one row per split,
# indexed by the split label stored under 'File'.
results_df = pd.DataFrame(results).set_index('File')

# Show the metrics table.
print(results_df)

      knn_predicted_ratings_RMSE  knn_predicted_ratings_MAE  \
File                                                          
U1                      0.984937                   0.713374   
U2                      0.985216                   0.712734   
U3                      0.986664                   0.714379   
U4                      0.985734                   0.713204   
U5                      0.988415                   0.716794   

      knn_predicted_ratings_Pearson  svd_predicted_ratings_RMSE  \
File                                                              
U1                         0.492494                    0.996387   
U2                         0.491923                    0.995611   
U3                         0.493142                    0.997818   
U4                         0.493238                    0.996254   
U5                         0.491966                    0.998957   

      svd_predicted_ratings_MAE  svd_predicted_ratings_Pearson  \
File                  

In [20]:
# Save the metrics table to "evaluation_result.csv" (a relative path, so it
# is resolved against the current working directory). No `index` argument is
# passed, so pandas' default applies and the 'File' index is written as well.
results_df.to_csv("evaluation_result.csv")