# Data Cleaning

In [1]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("abhikjha/movielens-100k")
subdir_path = os.path.join(path, "ml-latest-small")

# Load the data into DataFrames
movies = pd.read_csv(os.path.join(subdir_path, "movies.csv"))
ratings = pd.read_csv(os.path.join(subdir_path, "ratings.csv"))
tags = pd.read_csv(os.path.join(subdir_path, "tags.csv"))

# Drop the 'timestamp' column
ratings = ratings.drop(columns=['timestamp'])
tags = tags.drop(columns=['timestamp'])




# Feature Engineering

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Split genres into individual columns
genres = movies['genres'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres], axis=1)

# Drop the '(no genres listed)' column if it exists
if '(no genres listed)' in movies.columns:
    movies = movies.drop(columns=['(no genres listed)'])

# Merge ratings with movies to include genres
user_genre_data = pd.merge(ratings, movies, on='movieId')

# Ensure columns exist for genres
genres_cols = [col for col in genres.columns if col in user_genre_data.columns]

# Calculate average rating per genre for each user
user_genre_avg = user_genre_data.groupby('userId')[genres_cols].mean().reset_index()

# Merge tags with movies
user_tags_data = pd.merge(tags, movies, on='movieId')

# Aggregate tags for each user
user_tags_agg = user_tags_data.groupby('userId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge average ratings with aggregated tags
user_profiles = pd.merge(user_genre_avg, user_tags_agg, on='userId', how='left')
user_profiles['tag'] = user_profiles['tag'].fillna('')

# Combine genre averages and tags into a single text field
user_profiles['user_combined'] = user_profiles[genres_cols].apply(lambda x: ' '.join(x.index[x > 0]), axis=1) + ' ' + user_profiles['tag']

# Remove duplicates
user_profiles = user_profiles.drop_duplicates(subset=['user_combined'])

# Create item profiles by combining relevant features
movies['item_combined'] = movies['genres']

# Vectorize item profiles
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
item_vectors = vectorizer.fit_transform(movies['item_combined'])

# Vectorize user profiles
user_vectors = vectorizer.transform(user_profiles['user_combined'])


# KNN-based recommender system

In [3]:
from sklearn.neighbors import NearestNeighbors

# Fit KNN model
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(user_vectors)

# Function to recommend items based on KNN
def knn_recommend(user_id, top_n=10):
    user_vector = user_vectors[user_profiles[user_profiles['userId'] == user_id].index[0]]
    distances, indices = knn.kneighbors(user_vector, n_neighbors=top_n)
    recommended_movies = movies.iloc[indices[0]]['title'].tolist()
    return recommended_movies


In [4]:
knn_recommend(user_id=5, top_n=10)

['First Knight (1995)',
 'Cure, The (1995)',
 'Father of the Bride Part II (1995)',
 'Drop Zone (1994)',
 'Desperado (1995)',
 'Dumb & Dumber (Dumb and Dumber) (1994)',
 "Mr. Holland's Opus (1995)",
 'Dracula: Dead and Loving It (1995)',
 'Tom and Huck (1995)',
 'Indian in the Cupboard, The (1995)']

In [5]:
# Recommend movies for a specific user using Neural Network Embeddings
user_id = 1
recommended_movies = knn_recommend(user_id, top_n=10)

# Create a DataFrame to display the recommendations as a table
recommendations_df = pd.DataFrame(recommended_movies, columns=['Recommended Movies'])
recommendations_df.index = range(1, len(recommendations_df) + 1)  # Add ranking

# Display the DataFrame
recommendations_df


Unnamed: 0,Recommended Movies
1,Toy Story (1995)
2,Assassins (1995)
3,Don't Be a Menace to South Central While Drink...
4,"Misérables, Les (1995)"
5,Jade (1995)
6,Dunston Checks In (1996)
7,Sabrina (1995)
8,Heat (1995)
9,Beauty of the Day (Belle de jour) (1967)
10,Cutthroat Island (1995)


# Evaluation

In [6]:
# Merge seen movies with user_profiles
user_seen_movies = user_genre_data.groupby('userId')['title'].apply(set).reset_index()
user_seen_movies.rename(columns={'title': 'Seen Movies'}, inplace=True)

# Append recommendations to the existing dataframe for each user
user_profiles = user_profiles.merge(user_seen_movies, on='userId', how='left')


In [7]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

def evaluate_knn(user_id, top_n=10):
    # Check if user_id is valid
    if user_id not in user_profiles['userId'].values:
        raise ValueError(f"User ID {user_id} not found in user profiles.")

    # Get the recommended movies
    recommended_movies = knn_recommend(user_id, top_n=top_n)
    
    # Get the movies the user has already seen
    seen_movies = user_profiles[user_profiles['userId'] == user_id]['Seen Movies'].values[0]
    
    # Convert to binary arrays
    y_true = [1 if movie in seen_movies else 0 for movie in movies['title']]
    y_pred = [1 if movie in recommended_movies else 0 for movie in movies['title']]
    
    # Calculate precision, recall, and f1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Get the recommended movies and their distances
    user_vector = user_vectors[user_profiles[user_profiles['userId'] == user_id].index[0]]
    distances, indices = knn.kneighbors(user_vector, n_neighbors=top_n)
    
    # Create a DataFrame for the recommended movies
    recommended_movies = movies.iloc[indices[0]][['movieId', 'title']]
    
    # Get the actual ratings
    actual_ratings = ratings[ratings['userId'] == user_id]
    
    # Merge to get actual ratings for the recommended movies
    merged = pd.merge(recommended_movies, actual_ratings, on='movieId', how='left')
    
    # Assign default ratings to missing values
    default_rating = 2.5
    merged = merged.copy()
    merged['rating'] = merged['rating'].fillna(default_rating)
    
    # Get the predicted ratings
    predicted_ratings = [1 - dist for dist in distances[0][:len(merged)]]  # Match length of actual ratings
    
    # Calculate RMSE
    y_true = merged['rating'].values
    y_pred = predicted_ratings
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    
    return precision, recall, f1, rmse

### Evaluate for 1 user

In [8]:
rmse, precision, recall, f1 = evaluate_knn(user_id=1)
print(f"KNN-based System for user {user_id}:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"RMSE: {rmse}")

KNN-based System for user 1:
Precision: 0.008620689655172414
Recall: 0.01652892561983471
F1-Score: 1.9563803667008415
RMSE: 0.2


### Evaluate for all users

In [9]:
# Calculate metrics for all users
user_ids = user_profiles['userId'].unique()
metrics = [evaluate_knn(user_id) for user_id in user_ids]
rmses = [evaluate_knn(user_id) for user_id in user_profiles['userId'].unique()]

# Average the results
average_precision = np.mean([m[0] for m in metrics])
average_recall = np.mean([m[1] for m in metrics])
average_f1 = np.mean([m[2] for m in metrics])
average_rmse = np.mean(rmses)

print(f"KNN-based System for all users:")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1}")
print(f"Average RMSE: {average_rmse}")


KNN-based System for all users:
Average Precision: 0.0386046511627907
Average Recall: 0.0045157466313646985
Average F1-Score: 0.007432272590011486
Average RMSE: 0.43038819564701514


# Average number of new/unseen courses recommended per user

In [10]:
def count_new_recommendations(row):
    seen_movies = row['Seen Movies']
    recommended_movies = set(knn_recommend(row['userId'], top_n=10))
    new_recommendations = recommended_movies - seen_movies
    return len(new_recommendations)

# Calculate new recommendations for each user
user_profiles['new_recommendations'] = user_profiles.apply(count_new_recommendations, axis=1)

# Compute the average number of new/unseen recommendations per user
average_new_recommendations = user_profiles['new_recommendations'].mean()
print(f"Average number of new/unseen movies recommended per user: {average_new_recommendations}")


Average number of new/unseen movies recommended per user: 9.613953488372093


# Top 10 most commonly recommended

In [11]:
from collections import Counter

# Generate recommendations for each user and collect them
all_recommendations = []
for user_id in user_profiles['userId'].unique():
    all_recommendations.extend(knn_recommend(user_id, top_n=10))

# Count frequencies of each recommended movie
recommendation_counts = Counter(all_recommendations)

# Get the top 10 most commonly recommended movies
top_10_recommendations = recommendation_counts.most_common(10)
top_10_recommendations_df = pd.DataFrame(top_10_recommendations, columns=['Movie', 'Frequency'])

# Display the top 10 recommendations as a table
top_10_recommendations_df


Unnamed: 0,Movie,Frequency
0,Mary Reilly (1996),34
1,"Brothers McMullen, The (1995)",33
2,Down Periscope (1996),31
3,Living in Oblivion (1995),30
4,Clerks (1994),29
5,White Squall (1996),29
6,Home for the Holidays (1995),27
7,Heavyweights (Heavy Weights) (1995),26
8,"Indian in the Cupboard, The (1995)",26
9,Drop Zone (1994),25
