In [6]:
import pandas as pd
from sklearn.metrics import root_mean_squared_error, precision_score, recall_score, accuracy_score

In [7]:
# Folder containing the MovieLens 100K files.
# NOTE(review): this is an absolute, machine-specific location; the notebook will
# only run elsewhere after this value is edited. The name `Path` is kept because
# other cells may reference it.
Path = "D:/Canada/Danial/UoW/Dataset/MovieLens/100K/ml-100k"

# A user is treated as "cold start" when they have at most this many ratings in
# the training split.
# NOTE(review): the previous comment said "fewer than 5" while the code filtered
# on `< 6` (i.e. at most 5). The executed behaviour is kept here; confirm which
# threshold was intended.
COLD_START_MAX_INTERACTIONS = 5


# Load the training split (u1.base): tab-separated, no header row.
rating_df = pd.read_csv(
    f'{Path}/u1.base',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    encoding='latin-1',
)


# Number of training ratings per user (index: user_id, value: row count).
user_interaction_counts = rating_df['user_id'].value_counts()

# Cold-start users: ids whose training rating count does not exceed the threshold.
cold_start_users = user_interaction_counts[
    user_interaction_counts <= COLD_START_MAX_INTERACTIONS
].index.tolist()




In [8]:
processed_ratings_file = "../Samples/Data/100k/rating_test_df_u1.csv"

# Read the processed test ratings and keep only the rows that belong to the
# cold-start users found in the training split.
rating_test_df = pd.read_csv(processed_ratings_file)
is_cold_start_row = rating_test_df['user_id'].isin(cold_start_users)
rating_test_df = rating_test_df[is_cold_start_row]

# Rows without a simulated rating cannot be evaluated, so drop them.
cleaned_df = rating_test_df.dropna(subset=['simulated_ratings'])
cleaned_df.head(100)

Unnamed: 0,user_id,movie_id,rating,timestamp,simulated_ratings,knn_predicted_ratings,svd_predicted_ratings,nmf_predicted_ratings,als_predicted_ratings
14851,310,14,5,879436268,4.0,4,4,3,3
14852,310,24,4,879436242,3.0,3,3,4,4
14853,310,116,5,879436104,2.0,4,3,3,4
14854,310,181,4,879436104,5.0,4,4,4,4
14855,310,222,3,879436062,5.0,4,4,3,3
14856,310,251,5,879436035,2.0,4,4,3,3
14857,310,257,5,879436576,4.0,4,4,3,4
14858,310,258,3,879435606,5.0,4,4,3,3
14859,310,274,3,879436534,2.0,4,3,3,2
14860,310,275,5,879436137,4.0,4,4,2,4


In [9]:

# Compare the simulated ratings with the true ratings in `cleaned_df` and report
# RMSE, exact / off-by-one / larger-miss shares, and a weighted accuracy.

# Credit given to a prediction that misses the true rating by exactly one level
# (an exact match counts as 1, a larger miss as 0).
OFF_BY_1_WEIGHT = 0.8

# Cast both rating columns to integers. `assign` returns a new frame, so nothing
# is written column-by-column into a frame that was derived by filtering another
# one (the pattern that can raise pandas' SettingWithCopyWarning).
# NOTE(review): astype(int) truncates any fractional part; this assumes the
# simulated ratings are whole numbers, as in the preview above - confirm.
cleaned_df = cleaned_df.assign(
    rating=cleaned_df['rating'].astype(int),
    simulated_ratings=cleaned_df['simulated_ratings'].astype(int),
)

# Every percentage below divides by the row count, so fail early with a clear
# message when there is nothing to evaluate.
n_ratings = len(cleaned_df)
if n_ratings == 0:
    raise ValueError("cleaned_df is empty: no cold-start rows with simulated ratings to evaluate")

# RMSE
rmse = root_mean_squared_error(cleaned_df['rating'], cleaned_df['simulated_ratings'])

# Absolute gap between true and simulated rating, computed once and reused for
# all three buckets.
abs_error = (cleaned_df['rating'] - cleaned_df['simulated_ratings']).abs()

# Exact match
exact_matches = abs_error == 0
exact_match_count = exact_matches.sum()
exact_match_percentage = exact_match_count / n_ratings * 100

# Off by 1 level
off_by_1 = abs_error == 1
off_by_1_count = off_by_1.sum()
off_by_1_percentage = off_by_1_count / n_ratings * 100

# Off by more than 1 level
off_by_more_than_1 = abs_error > 1
off_by_more_than_1_count = off_by_more_than_1.sum()
off_by_more_than_1_percentage = off_by_more_than_1_count / n_ratings * 100

# Weighted accuracy: full credit for exact matches, partial credit for misses
# by one level, none otherwise.
weighted_accuracy = (exact_matches * 1 + off_by_1 * OFF_BY_1_WEIGHT).sum() / n_ratings

# Output the results
print(f"RMSE: {rmse}")
print(f"Exact match count: {exact_match_count}")
print(f"Exact match percentage: {exact_match_percentage:.2f}%")
print(f"Off by 1 level count: {off_by_1_count}")
print(f"Off by 1 level percentage: {off_by_1_percentage:.2f}%")
print(f"Off by more than 1 level count: {off_by_more_than_1_count}")
print(f"Off by more than 1 level percentage: {off_by_more_than_1_percentage:.2f}%")
print(f"Weighted Accuracy: {weighted_accuracy:.2f}")


RMSE: 1.7489492643904123
Exact match count: 3
Exact match percentage: 17.65%
Off by 1 level count: 8
Off by 1 level percentage: 47.06%
Off by more than 1 level count: 6
Off by more than 1 level percentage: 35.29%
Weighted Accuracy: 0.55
