# Data Cleaning

In [1]:
import kagglehub
import os
import pandas as pd

# Fetch the dataset via kagglehub and locate the folder holding the CSV files
path = kagglehub.dataset_download("abhikjha/movielens-100k")
subdir_path = os.path.join(path, "ml-latest-small")

# Read the three CSV files into DataFrames
movies, ratings, tags = (
    pd.read_csv(os.path.join(subdir_path, file_name))
    for file_name in ("movies.csv", "ratings.csv", "tags.csv")
)

# Discard the 'timestamp' column from both frames that carry it
ratings, tags = (frame.drop(columns=['timestamp']) for frame in (ratings, tags))




# Feature engineering

In [2]:
# One-hot encode the pipe-separated genre strings and append the indicator columns to movies
genre_strings = movies['genres']
genres = genre_strings.str.get_dummies('|')
movies = pd.concat((movies, genres), axis='columns')

In [3]:
# Remove the '(no genres listed)' indicator column; a missing column is silently skipped
movies = movies.drop(columns=['(no genres listed)'], errors='ignore')

In [4]:
# Attach movie columns (including the genre indicators) to every rating row
user_genre_data = ratings.merge(movies, on='movieId')

In [5]:
# Keep only the genre indicator names that are still present after the merge
genres_cols = []
for col in genres.columns:
    if col in user_genre_data.columns:
        genres_cols.append(col)

In [6]:
# Calculate average rating per genre for each user.
# A plain mean over the 0/1 genre indicator columns yields the share of a
# user's rated movies that belong to each genre, not a rating. To get an actual
# average rating, weight every indicator by the row's rating and divide by the
# number of the user's rated movies that carry the genre.
genre_flags = user_genre_data[genres_cols]
user_keys = user_genre_data['userId']
genre_rating_sums = genre_flags.mul(user_genre_data['rating'], axis=0).groupby(user_keys).sum()
genre_rating_counts = genre_flags.groupby(user_keys).sum()
# Genres a user never rated come out as NaN (0 / 0); a later `> 0` test treats
# NaN as False, so those genres are still left out of the text profile.
user_genre_avg = (genre_rating_sums / genre_rating_counts).reset_index()

In [7]:
# Attach movie columns to every tag row
user_tags_data = tags.merge(movies, on='movieId')

In [8]:
# Join all of a user's tags into one space-separated string per user
tags_by_user = user_tags_data.groupby('userId')['tag']
user_tags_agg = tags_by_user.apply(' '.join).reset_index()

In [9]:
# Left-join the aggregated tags onto the per-user genre table;
# users without any tag get an empty string instead of NaN
user_profiles = user_genre_avg.merge(user_tags_agg, how='left', on='userId')
user_profiles = user_profiles.assign(tag=user_profiles['tag'].fillna(''))

In [10]:
# Build one text field per user: the names of the genre columns whose value
# is greater than 0, followed by a space and the user's tag string
positive_genre_names = user_profiles[genres_cols].apply(
    lambda row: ' '.join(row.index[row > 0]),
    axis=1,
)
user_profiles['user_combined'] = positive_genre_names + ' ' + user_profiles['tag']

In [11]:
# Keep the first row for each distinct 'user_combined' text.
# NOTE(review): rows are compared on the text only, so different users whose
# combined text is identical are reduced to a single row — confirm this is intended.
is_repeated_text = user_profiles.duplicated(subset=['user_combined'])
user_profiles = user_profiles[~is_repeated_text]

In [12]:
# Item profile text: currently just a copy of the raw 'genres' string
movies = movies.assign(item_combined=movies['genres'])

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF model (unigrams and bigrams, English stop words removed,
# at most 5000 features) on the item text and encode every movie
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)
item_text = movies['item_combined']
item_vectors = vectorizer.fit_transform(item_text)

In [14]:
# Pairwise cosine similarity of every item vector with every other item vector
# (with the second argument omitted, the rows of the input are compared with each other)
item_cosine_sim = cosine_similarity(item_vectors)

In [15]:
# Encode the user text with the vectorizer already fitted on the item text
user_text = user_profiles['user_combined']
user_vectors = vectorizer.transform(user_text)

# Similarity-based recommender system

In [16]:
# Create a recommendation function based on item similarity
def recommend_similar_items(movie_title, top_n=10, exclude_self=False):
    """Return the titles of the movies most similar to ``movie_title``.

    Similarity is read from the precomputed ``item_cosine_sim`` matrix, using
    the row that belongs to the first movie in ``movies`` whose title equals
    ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``.
    top_n : int, default 10
        Maximum number of titles to return. Values of 0 or below give an
        empty list.
    exclude_self : bool, default False
        When True, the looked-up movie itself is removed from the result.
        The default keeps it, so the query movie may appear in its own
        recommendations.

    Returns
    -------
    list of str
        Titles ordered from most to least similar.

    Raises
    ------
    IndexError
        If no movie carries the requested title.
    """
    # Work with row positions rather than index labels, because the
    # similarity matrix is addressed by position.
    matches = (movies['title'] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in `movies`")

    # Guard: slicing with `[-0:]` would otherwise return every movie.
    if top_n <= 0:
        return []

    idx = matches[0]
    ranked = item_cosine_sim[idx].argsort()[::-1]  # most similar first
    if exclude_self:
        ranked = ranked[ranked != idx]

    return movies['title'].iloc[ranked[:top_n]].tolist()

In [17]:
# Example query: movies similar to 'Toy Story (1995)'
recommended_movies = recommend_similar_items(movie_title='Toy Story (1995)')

In [18]:
# Display the list of recommended titles as the cell output
recommended_movies

['Toy Story (1995)',
 'Tale of Despereaux, The (2008)',
 'Antz (1998)',
 'Toy Story 2 (1999)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 "Emperor's New Groove, The (2000)",
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'The Good Dinosaur (2015)',
 'Monsters, Inc. (2001)',
 'Moana (2016)']

In [19]:
# Example query: movies similar to 'Toy Story (1995)'
recommended_movies = recommend_similar_items('Toy Story (1995)')

# Show the titles as a single-column table
recommendations_df = pd.DataFrame({'Recommended Movies': recommended_movies})
recommendations_df

Unnamed: 0,Recommended Movies
0,Toy Story (1995)
1,"Tale of Despereaux, The (2008)"
2,Antz (1998)
3,Toy Story 2 (1999)
4,"Adventures of Rocky and Bullwinkle, The (2000)"
5,"Emperor's New Groove, The (2000)"
6,Asterix and the Vikings (Astérix et les Viking...
7,The Good Dinosaur (2015)
8,"Monsters, Inc. (2001)"
9,Moana (2016)


# Evaluation

In [20]:
# Collect, for every user, the set of titles that appear in their rating rows
user_seen_movies = (
    user_genre_data.groupby('userId')['title']
    .apply(set)
    .rename('Seen Movies')
    .reset_index()
)

# Attach each user's set of seen titles to their profile row
user_profiles = pd.merge(user_profiles, user_seen_movies, how='left', on='userId')


In [21]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

def evaluate_movie(movie_title, top_n=10, default_rating=2.5):
    """Score the recommendations for one movie against every user profile.

    The ``top_n`` titles returned by ``recommend_similar_items`` are treated
    as the predicted positives. For each row of ``user_profiles`` the titles in
    that user's 'Seen Movies' set are the true positives, and precision, recall
    and F1 are computed over all titles in ``movies``. An RMSE is also computed
    between the movie's cosine-similarity row and the user's ratings, with
    unrated movies filled with ``default_rating``.

    Parameters
    ----------
    movie_title : str
        Title of the movie whose recommendations are evaluated.
    top_n : int, default 10
        Number of recommendations requested.
    default_rating : float, default 2.5
        Rating substituted for movies the user has not rated.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1, rmse)``, each averaged over all users in
        ``user_profiles``.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``movies['title']``.
    """
    recommendations = recommend_similar_items(movie_title, top_n)

    titles = movies['title']

    # Everything below up to the loop depends only on the movie, not on the
    # user, so it is computed once instead of once per user.
    y_pred = titles.isin(recommendations).astype(int).to_numpy()
    movie_idx = movies[titles == movie_title].index[0]
    similarity_scores = np.asarray(item_cosine_sim[movie_idx])[:len(movies)]
    movie_keys = movies[['movieId', 'title']]

    # Split the ratings by user once rather than filtering the whole frame
    # inside the loop.
    ratings_by_user = {uid: group for uid, group in ratings.groupby('userId')}
    no_ratings = ratings.iloc[0:0]

    all_precisions, all_recalls, all_f1_scores, all_rmses = [], [], [], []

    for user_id, seen_movies in zip(user_profiles['userId'], user_profiles['Seen Movies']):
        y_true = titles.isin(seen_movies).astype(int).to_numpy()

        # Classification-style metrics: recommended vs. seen
        all_precisions.append(precision_score(y_true, y_pred))
        all_recalls.append(recall_score(y_true, y_pred))
        all_f1_scores.append(f1_score(y_true, y_pred))

        # RMSE calculation.
        # NOTE(review): this compares cosine similarities with star ratings,
        # which are on different scales — confirm this is the intended metric.
        user_ratings = ratings_by_user.get(user_id, no_ratings)
        merged = pd.merge(movie_keys, user_ratings, on='movieId', how='left')
        y_true_rmse = merged['rating'].fillna(default_rating).values
        y_pred_rmse = similarity_scores[:len(y_true_rmse)]
        all_rmses.append(np.sqrt(mean_squared_error(y_true_rmse, y_pred_rmse)))

    average_cluster_precision = np.mean(all_precisions)
    average_cluster_recall = np.mean(all_recalls)
    average_cluster_f1 = np.mean(all_f1_scores)
    average_cluster_rmse = np.mean(all_rmses)

    return average_cluster_precision, average_cluster_recall, average_cluster_f1, average_cluster_rmse


### Evaluate for 1 example "Toy Story (1995)"

In [22]:
# Evaluate the recommender for a single title and report the averaged metrics
movie_title = "Toy Story (1995)"
(precision, recall, f1, rmse) = evaluate_movie(movie_title=movie_title)

print(f'Similarity-based System for {movie_title}:')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'RMSE: {rmse}')


Similarity-based System for Toy Story (1995):
Precision: 0.06232558139534884
Recall: 0.005166273345234001
F1-Score: 0.008592430775734254
RMSE: 2.4365549333487886


### Evaluate all recommendations

In [23]:
import time

# Time one evaluation run, using a sample movie title
start_time = time.time()
_ = evaluate_movie('Toy Story (1995)')
single_evaluation_time = time.time() - start_time
print(f"Time for single evaluation: {single_evaluation_time:.2f} seconds")

# Extrapolate linearly to one evaluation per movie title
total_movies = movies['title'].size
estimated_total_time = total_movies * single_evaluation_time
print(f"Estimated total time: {estimated_total_time/60:.2f} minutes")


Time for single evaluation: 12.40 seconds
Estimated total time: 2013.31 minutes


Evaluating every movie would take far too long, so we evaluate a random sample of movies instead.

In [31]:
import random
import time

# Evaluate a reproducible random sample of movies
sample_size = 610  # Sample size
sample_seed = 42  # Fixed seed so the same titles are drawn on every run
all_titles = list(movies['title'])
# A dedicated Random instance keeps the draw independent of the global RNG
# state; the size is clamped because random.sample raises ValueError when
# asked for more items than the population holds.
sampled_movies = random.Random(sample_seed).sample(all_titles, min(sample_size, len(all_titles)))

start_time = time.time()

top_n = 10
all_precisions, all_recalls, all_f1_scores, all_rmses = [], [], [], []

for movie_title in sampled_movies:
    precision, recall, f1, rmse = evaluate_movie(movie_title, top_n)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1_scores.append(f1)
    all_rmses.append(rmse)

# Average the results
average_precision = np.mean(all_precisions)
average_recall = np.mean(all_recalls)
average_f1 = np.mean(all_f1_scores)
average_rmse = np.mean(all_rmses)

# Calculate total execution time
total_time = time.time() - start_time

# Display metrics
print("Similarity-based System for sampled movies:")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1}")
print(f"Average RMSE: {average_rmse}")

print(f"Total execution time: {total_time/60:.2f} minutes")

Similarity-based System for sampled movies:
Average Precision: 0.011745607042595226
Average Recall: 0.0007486621494222052
Average F1-Score: 0.001242082814169579
Average RMSE: 2.4122537452770247
Total execution time: 117.72 minutes


In [30]:
# NOTE: this whole cell is disabled (commented out). It is a joblib-based
# variant that would call evaluate_movie for every title in movies['title']
# in parallel and then average and print the four metrics.
# from joblib import Parallel, delayed

# # Evaluate all recommendations in parallel
# top_n = 10
# results = Parallel(n_jobs=-1)(delayed(evaluate_movie)(movie_title, top_n) for movie_title in movies['title'])

# # Extract metrics from results
# all_precisions = [res[0] for res in results]
# all_recalls = [res[1] for res in results]
# all_f1_scores = [res[2] for res in results]
# all_rmses = [res[3] for res in results]

# # Average the results
# average_precision = np.mean(all_precisions)
# average_recall = np.mean(all_recalls)
# average_f1 = np.mean(all_f1_scores)
# average_rmse = np.mean(all_rmses)

# # Calculate metrics for all movies
# print(f"Similarity-based System for all movies:")
# print(f"Average Precision: {average_precision}")
# print(f"Average Recall: {average_recall}")
# print(f"Average F1-Score: {average_f1}")
# print(f"Average RMSE: {average_rmse}")


# Average Number of New/Unseen Movies Recommended Per User

In [26]:
def count_new_recommendations(user_id, top_n=10):
    """Count the distinct recommended titles a user has not already seen.

    For every title in the user's 'Seen Movies' entry, the ``top_n`` similar
    titles are requested from ``recommend_similar_items``. The results are
    pooled without duplicates, and titles the user has already seen are left
    out of the count.

    Parameters
    ----------
    user_id
        Value matched against ``user_profiles['userId']``; the first matching
        row is used.
    top_n : int, default 10
        Number of similar titles requested per seen movie.

    Returns
    -------
    int
        Number of distinct recommended titles that are not in the user's
        seen set.
    """
    seen_column = user_profiles.loc[user_profiles['userId'] == user_id, 'Seen Movies']
    seen_movies = seen_column.values[0]

    # A set comprehension pools the recommendations and removes repeats
    candidates = {
        title
        for movie in seen_movies
        for title in recommend_similar_items(movie, top_n)
    }
    return len(candidates.difference(seen_movies))

# Calculate new/unseen recommendations for each user
user_profiles['new_recommendations'] = user_profiles['userId'].apply(count_new_recommendations)
average_new_recommendations = user_profiles['new_recommendations'].mean()
# The items being recommended are movies, so the message says "movies" rather than "courses"
print(f"Average number of new/unseen movies recommended per user: {average_new_recommendations:.2f}")


Average number of new/unseen courses recommended per user: 470.53


# Top 10 Most Frequently Recommended

In [27]:
from collections import Counter

# Generate recommendations for each user and collect them.
# The recommendations for a title do not depend on the user, so they are
# computed once per distinct title and reused; the collected list keeps the
# same contents and order as recomputing them for every (user, movie) pair.
recommendation_cache = {}
all_recommendations = []
# One row per userId (the first one), in order of first appearance
seen_by_user = user_profiles.drop_duplicates(subset='userId')['Seen Movies']
for seen_movies in seen_by_user:
    for movie in seen_movies:
        if movie not in recommendation_cache:
            recommendation_cache[movie] = recommend_similar_items(movie, top_n=10)
        all_recommendations.extend(recommendation_cache[movie])

# Count frequencies of each recommended movie
recommendation_counts = Counter(all_recommendations)

# Get the top 10 most commonly recommended movies
top_10_recommendations = recommendation_counts.most_common(10)
top_10_recommendations_df = pd.DataFrame(top_10_recommendations, columns=['Movie', 'Frequency'])

# Display the top 10 recommendations as a table
top_10_recommendations_df


Unnamed: 0,Movie,Frequency
0,Monsieur Ibrahim (Monsieur Ibrahim et les fleu...,2131
1,"Mudge Boy, The (2003)",2131
2,Sarafina! (1992),2131
3,"Man Who Cried, The (2000)",2131
4,Love & Mercy (2014),2131
5,Ned Kelly (2003),2131
6,Last Dance (1996),2131
7,"Way Back, The (2010)",2131
8,All or Nothing (2002),2131
9,Ice Castles (1978),2131
