# Data Cleaning

In [1]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("abhikjha/movielens-100k")
subdir_path = os.path.join(path, "ml-latest-small")

# Load the data into DataFrames
movies = pd.read_csv(os.path.join(subdir_path, "movies.csv"))
ratings = pd.read_csv(os.path.join(subdir_path, "ratings.csv"))
tags = pd.read_csv(os.path.join(subdir_path, "tags.csv"))

# Drop the 'timestamp' column
ratings = ratings.drop(columns=['timestamp'])
tags = tags.drop(columns=['timestamp'])




# Feature engineering

In [2]:
# Split genres into individual columns
genres = movies['genres'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres], axis=1)

In [3]:
# Drop the '(no genres listed)' column if it exists
if '(no genres listed)' in movies.columns:
    movies = movies.drop(columns=['(no genres listed)'])

In [4]:
# Merge ratings with movies to include genres
user_genre_data = pd.merge(ratings, movies, on='movieId')

In [5]:
# Ensure columns exist for genres
genres_cols = [col for col in genres.columns if col in user_genre_data.columns]

In [6]:
# Calculate average rating per genre for each user
user_genre_avg = user_genre_data.groupby('userId')[genres_cols].mean().reset_index()

In [7]:
# Merge tags with movies
user_tags_data = pd.merge(tags, movies, on='movieId')

In [8]:
# Aggregate tags for each user
user_tags_agg = user_tags_data.groupby('userId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

In [9]:
# Merge average ratings with aggregated tags
user_profiles = pd.merge(user_genre_avg, user_tags_agg, on='userId', how='left')
user_profiles['tag'] = user_profiles['tag'].fillna('')

In [10]:
# Combine genre averages and tags into a single text field
user_profiles['user_combined'] = user_profiles[genres_cols].apply(lambda x: ' '.join(x.index[x > 0]), axis=1) + ' ' + user_profiles['tag']

In [11]:
# Remove duplicates
user_profiles = user_profiles.drop_duplicates(subset=['user_combined'])

In [12]:
# Create item profiles by combining relevant features
movies['item_combined'] = movies['genres']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize item profiles
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
item_vectors = vectorizer.fit_transform(movies['item_combined'])

In [14]:
# # Calculate cosine similarity between item profiles
# item_cosine_sim = cosine_similarity(item_vectors, item_vectors)

In [15]:
# Vectorize user profiles
user_vectors = vectorizer.transform(user_profiles['user_combined'])

# Clustering-based recommender system

In [16]:
from sklearn.cluster import KMeans
import pandas as pd

# Use the user vectors for clustering
kmeans = KMeans(n_clusters=10, random_state=42)
user_clusters = kmeans.fit_predict(user_vectors)
user_profiles['cluster'] = user_clusters

In [17]:
# Map recommendations back to movie titles
movie_id_map = {idx: row['title'] for idx, row in movies.iterrows()}

In [18]:
# Recommend top movies for each cluster
def recommend_for_cluster(cluster_id, top_n=10):
    cluster_users = user_profiles[user_profiles['cluster'] == cluster_id]
    cluster_profiles = cluster_users['user_combined'].apply(lambda x: x).tolist()
    
    # Create a combined profile for the cluster
    combined_profile = ' '.join(cluster_profiles)
    
    # Vectorize the combined profile
    combined_vector = vectorizer.transform([combined_profile])
    
    # Calculate similarity between the combined profile and item profiles
    cosine_sim = cosine_similarity(combined_vector, item_vectors)
    
    # Get top N recommendations
    top_indices = cosine_sim[0].argsort()[-top_n:][::-1]
    recommendations = [movie_id_map[i] for i in top_indices]
    
    return recommendations

In [19]:
# Example usage
cluster_id = 9
top_movies = recommend_for_cluster(cluster_id)

In [20]:
top_movies

['Rubber (2010)',
 'Angel Heart (1987)',
 'Lost Highway (1997)',
 'Gilda (1946)',
 'In a Lonely Place (1950)',
 'This World, Then the Fireworks (1997)',
 'Born to Kill (1947)',
 'Blood Simple (1984)',
 'Grifters, The (1990)',
 'Limits of Control, The (2009)']

In [21]:
# Display the recommendations in a table
recommendations_df = pd.DataFrame(top_movies, columns=['Recommended Movies'])
print(f"Top movies for cluster {cluster_id}:")
recommendations_df

Top movies for cluster 9:


Unnamed: 0,Recommended Movies
0,Rubber (2010)
1,Angel Heart (1987)
2,Lost Highway (1997)
3,Gilda (1946)
4,In a Lonely Place (1950)
5,"This World, Then the Fireworks (1997)"
6,Born to Kill (1947)
7,Blood Simple (1984)
8,"Grifters, The (1990)"
9,"Limits of Control, The (2009)"


# Evaluation

In [22]:
# Merge seen movies with user_profiles
user_seen_movies = user_genre_data.groupby('userId')['title'].apply(set).reset_index()
user_seen_movies.rename(columns={'title': 'Seen Movies'}, inplace=True)

# Append recommendations to the existing dataframe for each user
user_profiles = user_profiles.merge(user_seen_movies, on='userId', how='left')


In [23]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_cluster(cluster_id, top_n=10, default_rating=2.5):
    cluster_users = user_profiles[user_profiles['cluster'] == cluster_id]
    recommendations = recommend_for_cluster(cluster_id, top_n)
    
    all_precisions, all_recalls, all_f1_scores, all_rmses = [], [], [], []
    
    for user_id in cluster_users['userId']:
        seen_movies = user_profiles[user_profiles['userId'] == user_id]['Seen Movies'].values[0]
        y_true = [1 if movie in seen_movies else 0 for movie in movies['title']]
        y_pred = [1 if movie in recommendations else 0 for movie in movies['title']]
        
        # Calculate precision, recall, and f1 score
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        
        # Append to lists
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1_scores.append(f1)
        
        # Calculate cosine similarity for RMSE
        combined_vector = vectorizer.transform([user_profiles[user_profiles['userId'] == user_id]['user_combined'].values[0]])
        cosine_sim = cosine_similarity(combined_vector, item_vectors)
        
        # RMSE calculation
        user_ratings_pred = [1 - cosine_sim[0][i] for i in range(len(movies))]
        actual_ratings = ratings[ratings['userId'] == user_id]
        merged = pd.merge(movies[['movieId', 'title']], actual_ratings, on='movieId', how='left')
        merged = merged.copy()
        merged['rating'] = merged['rating'].fillna(default_rating)
        
        y_true_rmse = merged['rating'].values
        y_pred_rmse = user_ratings_pred[:len(y_true_rmse)]
        rmse = np.sqrt(mean_squared_error(y_true_rmse, y_pred_rmse))
        
        all_rmses.append(rmse)
    
    average_cluster_precision = np.mean(all_precisions)
    average_cluster_recall = np.mean(all_recalls)
    average_cluster_f1 = np.mean(all_f1_scores)
    average_cluster_rmse = np.mean(all_rmses)
    
    return average_cluster_precision, average_cluster_recall, average_cluster_f1, average_cluster_rmse

### Evaluate for 1 cluster

In [24]:
# Calculate metrics for cluster_id
cluster_id = 9
precision, recall, f1, rmse = evaluate_cluster(cluster_id)
print(f"Clustering-based System for cluster {cluster_id}:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"RMSE: {rmse}")

Clustering-based System for cluster 9:
Precision: 0.011111111111111112
Recall: 0.0011947431302270013
F1-Score: 0.002157497303128371
RMSE: 1.6199738082142554


### Evaluate for all clusters

In [25]:
# Calculate metrics for all users
top_n=10
all_precisions, all_recalls, all_f1_scores, all_rmses = [], [], [], []
for cluster_id in np.unique(user_profiles['cluster']):
    precision, recall, f1, rmse = evaluate_cluster(cluster_id, top_n)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1_scores.append(f1)
    all_rmses.append(rmse)

# Average the results
average_precision = np.mean(all_precisions)
average_recall = np.mean(all_recalls)
average_f1 = np.mean(all_f1_scores)
average_rmse = np.mean(all_rmses)
    
print(f"Clustering-based System for all users:")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1}")
print(f"Average RMSE: {average_rmse}")


Clustering-based System for all users:
Average Precision: 0.017161408528928813
Average Recall: 0.0011230808545965945
Average F1-Score: 0.0019731171360007887
Average RMSE: 1.6425146678659022


# Average Number of New/Unseen Courses Recommended Per User

In [26]:
def count_new_recommendations(row):
    seen_movies = row['Seen Movies']
    recommended_movies = set(recommend_for_cluster(row['userId'], top_n=10))
    new_recommendations = recommended_movies - seen_movies
    return len(new_recommendations)

# Calculate new recommendations for each user
user_profiles['new_recommendations'] = user_profiles.apply(count_new_recommendations, axis=1)

# Compute the average number of new/unseen recommendations per user
average_new_recommendations = user_profiles['new_recommendations'].mean()
print(f"Average number of new/unseen movies recommended per user: {average_new_recommendations}")


Average number of new/unseen movies recommended per user: 9.897674418604652


# Top 10 Most Frequently Recommended

In [27]:
from collections import Counter

# Generate recommendations for each user and collect them
all_recommendations = []
for user_id in user_profiles['userId'].unique():
    all_recommendations.extend(recommend_for_cluster(user_id, top_n=10))

# Count frequencies of each recommended movie
recommendation_counts = Counter(all_recommendations)

# Get the top 10 most commonly recommended movies
top_10_recommendations = recommendation_counts.most_common(10)
top_10_recommendations_df = pd.DataFrame(top_10_recommendations, columns=['Movie', 'Frequency'])

# Display the top 10 recommendations as a table
top_10_recommendations_df


Unnamed: 0,Movie,Frequency
0,Andrew Dice Clay: Dice Rules (1991),208
1,Cats & Dogs (2001),208
2,Beach Blanket Bingo (1965),208
3,Another Woman (1988),208
4,Alice (1990),208
5,Rape Me (Baise-moi) (2000),208
6,Lost and Delirious (2001),208
7,Scary Movie 2 (2001),208
8,Kiss of the Dragon (2001),208
9,Lumumba (2000),208
