# Data Cleaning

In [1]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("abhikjha/movielens-100k")
subdir_path = os.path.join(path, "ml-latest-small")

# Load the data into DataFrames
movies = pd.read_csv(os.path.join(subdir_path, "movies.csv"))
ratings = pd.read_csv(os.path.join(subdir_path, "ratings.csv"))
tags = pd.read_csv(os.path.join(subdir_path, "tags.csv"))

# Drop the 'timestamp' column
ratings = ratings.drop(columns=['timestamp'])
tags = tags.drop(columns=['timestamp'])




# Feature engineering

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Split genres into individual columns
genres = movies['genres'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres], axis=1)

# Drop the '(no genres listed)' column if it exists
if '(no genres listed)' in movies.columns:
    movies = movies.drop(columns=['(no genres listed)'])

# Merge ratings with movies to include genres
user_genre_data = pd.merge(ratings, movies, on='movieId')

# Ensure columns exist for genres
genres_cols = [col for col in genres.columns if col in user_genre_data.columns]

# Calculate average rating per genre for each user
user_genre_avg = user_genre_data.groupby('userId')[genres_cols].mean().reset_index()

# Merge tags with movies
user_tags_data = pd.merge(tags, movies, on='movieId')

# Aggregate tags for each user
user_tags_agg = user_tags_data.groupby('userId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge average ratings with aggregated tags
user_profiles = pd.merge(user_genre_avg, user_tags_agg, on='userId', how='left')
user_profiles['tag'] = user_profiles['tag'].fillna('')

# Combine genre averages and tags into a single text field
user_profiles['user_combined'] = user_profiles[genres_cols].apply(lambda x: ' '.join(x.index[x > 0]), axis=1) + ' ' + user_profiles['tag']

# Remove duplicates
user_profiles = user_profiles.drop_duplicates(subset=['user_combined'])

# Create item profiles by combining relevant features
movies['item_combined'] = movies['genres']

# Vectorize item profiles
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
item_vectors = vectorizer.fit_transform(movies['item_combined'])

# Vectorize user profiles
user_vectors = vectorizer.transform(user_profiles['user_combined'])


# User-based recommender system

In [3]:
def recommend_for_user(user_id, top_n=10):
    user_idx = user_profiles[user_profiles['userId'] == user_id].index
    if user_idx.empty:
        return []
    
    user_vector = user_vectors[user_idx[0]]
    
    # Calculate the cosine similarity between the user vector and item profiles
    cosine_sim = cosine_similarity(user_vector, item_vectors)
    
    # Get the top N recommendations
    top_indices = cosine_sim[0].argsort()[-top_n:][::-1]
    recommended_movies = [movies.iloc[i]['title'] for i in top_indices]
    
    return recommended_movies


In [4]:
# Example: Recommend movies for a specific user
user_id = 1
recommended_movies = recommend_for_user(user_id)


In [5]:
recommended_movies

['Lost Highway (1997)',
 'Rubber (2010)',
 'Angel Heart (1987)',
 'Song of the Thin Man (1947)',
 'Vanilla Sky (2001)',
 'Alphaville (Alphaville, une étrange aventure de Lemmy Caution) (1965)',
 'Aelita: The Queen of Mars (Aelita) (1924)',
 'Inside Out (2015)',
 'Chicken Little (2005)',
 'Ratchet & Clank (2016)']

In [6]:
# Display the recommendations in a table
recommendations_df = pd.DataFrame(recommended_movies, columns=['Recommended Movies'])
recommendations_df


Unnamed: 0,Recommended Movies
0,Lost Highway (1997)
1,Rubber (2010)
2,Angel Heart (1987)
3,Song of the Thin Man (1947)
4,Vanilla Sky (2001)
5,"Alphaville (Alphaville, une étrange aventure d..."
6,Aelita: The Queen of Mars (Aelita) (1924)
7,Inside Out (2015)
8,Chicken Little (2005)
9,Ratchet & Clank (2016)


# Evaluation

In [7]:
# Merge seen movies with user_profiles
user_seen_movies = user_genre_data.groupby('userId')['title'].apply(set).reset_index()
user_seen_movies.rename(columns={'title': 'Seen Movies'}, inplace=True)

# Append recommendations to the existing dataframe for each user
user_profiles = user_profiles.merge(user_seen_movies, on='userId', how='left')


In [8]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

def evaluate_user(user_id, top_n=10, default_rating=2.5):
    if user_id not in user_profiles['userId'].values:
        raise ValueError(f"User ID {user_id} not found in user profiles.")
    
    recommended_movies = recommend_for_user(user_id, top_n=top_n)
    seen_movies = user_profiles[user_profiles['userId'] == user_id]['Seen Movies'].values[0]
    
    y_true = [1 if movie in seen_movies else 0 for movie in movies['title']]
    y_pred = [1 if movie in recommended_movies else 0 for movie in movies['title']]
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    # Calculate RMSE
    user_vector = user_vectors[user_profiles[user_profiles['userId'] == user_id].index[0]]
    cosine_sim = cosine_similarity(user_vector, item_vectors)
    user_ratings_pred = cosine_sim.flatten()
    actual_ratings = ratings[ratings['userId'] == user_id]
    
    merged = pd.merge(movies[['movieId', 'title']], actual_ratings, on='movieId', how='left')
    merged = merged.copy()
    merged['rating'] = merged['rating'].fillna(default_rating)
    
    y_true_rmse = merged['rating'].values
    y_pred_rmse = user_ratings_pred[:len(y_true_rmse)]
    rmse = np.sqrt(mean_squared_error(y_true_rmse, y_pred_rmse))
    
    return precision, recall, f1, rmse


### Evaluate for 1 user

In [9]:
precision, recall, f1, rmse = evaluate_user(user_id=1)
print(f"User profile-based System for user {user_id}:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"RMSE: {rmse}")


User profile-based System for user 1:
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
RMSE: 2.4544170799563503


### Evaluate for all users

In [10]:
# Calculate metrics for all users
user_ids = user_profiles['userId'].unique()
metrics = [evaluate_user(user_id) for user_id in user_ids]
rmses = [evaluate_user(user_id) for user_id in user_profiles['userId'].unique()]

# Average the results
average_precision = np.mean([m[0] for m in metrics])
average_recall = np.mean([m[1] for m in metrics])
average_f1 = np.mean([m[2] for m in metrics])
average_rmse = np.mean(rmses)

print(f"User profile-based System for all users:")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1}")
print(f"Average RMSE: {average_rmse}")


User profile-based System for all users:
Average Precision: 0.02418604651162791
Average Recall: 0.0023377598758864446
Average F1-Score: 0.0037201350594944847
Average RMSE: 0.6073388644628674


# Average number of new/unseen courses recommended per user

In [11]:
def count_new_recommendations(row):
    seen_movies = row['Seen Movies']
    recommended_movies = set(recommend_for_user(row['userId'], top_n=10))
    new_recommendations = recommended_movies - seen_movies
    return len(new_recommendations)

# Calculate new recommendations for each user
user_profiles['new_recommendations'] = user_profiles.apply(count_new_recommendations, axis=1)

# Compute the average number of new/unseen recommendations per user
average_new_recommendations = user_profiles['new_recommendations'].mean()
print(f"Average number of new/unseen movies recommended per user: {average_new_recommendations}")


Average number of new/unseen movies recommended per user: 9.758139534883721


# Top 10 Most Frequently Recommended

In [12]:
from collections import Counter

# Generate recommendations for each user and collect them
all_recommendations = []
for user_id in user_profiles['userId'].unique():
    all_recommendations.extend(recommend_for_user(user_id, top_n=10))

# Count frequencies of each recommended movie
recommendation_counts = Counter(all_recommendations)

# Get the top 10 most commonly recommended movies
top_10_recommendations = recommendation_counts.most_common(10)
top_10_recommendations_df = pd.DataFrame(top_10_recommendations, columns=['Movie', 'Frequency'])

# Display the top 10 recommendations as a table
top_10_recommendations_df


Unnamed: 0,Movie,Frequency
0,Vanilla Sky (2001),169
1,Aelita: The Queen of Mars (Aelita) (1924),167
2,"Alphaville (Alphaville, une étrange aventure d...",165
3,Inside Out (2015),105
4,Ratchet & Clank (2016),100
5,Song of the Thin Man (1947),94
6,Chicken Little (2005),92
7,Meet the Robinsons (2007),90
8,Lost Highway (1997),74
9,X-Men: First Class (2011),70
