# Data Cleaning

In [1]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("abhikjha/movielens-100k")
subdir_path = os.path.join(path, "ml-latest-small")

# Load the data into DataFrames
movies = pd.read_csv(os.path.join(subdir_path, "movies.csv"))
ratings = pd.read_csv(os.path.join(subdir_path, "ratings.csv"))
tags = pd.read_csv(os.path.join(subdir_path, "tags.csv"))

# Drop the 'timestamp' column
ratings = ratings.drop(columns=['timestamp'])
tags = tags.drop(columns=['timestamp'])




# Feature Engineering

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Split genres into individual columns
genres = movies['genres'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres], axis=1)

# Drop the '(no genres listed)' column if it exists
if '(no genres listed)' in movies.columns:
    movies = movies.drop(columns=['(no genres listed)'])

# Merge ratings with movies to include genres
user_genre_data = pd.merge(ratings, movies, on='movieId')

# Ensure columns exist for genres
genres_cols = [col for col in genres.columns if col in user_genre_data.columns]

# Calculate average rating per genre for each user
user_genre_avg = user_genre_data.groupby('userId')[genres_cols].mean().reset_index()

# Merge tags with movies
user_tags_data = pd.merge(tags, movies, on='movieId')

# Aggregate tags for each user
user_tags_agg = user_tags_data.groupby('userId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge average ratings with aggregated tags
user_profiles = pd.merge(user_genre_avg, user_tags_agg, on='userId', how='left')
user_profiles['tag'] = user_profiles['tag'].fillna('')

# Combine genre averages and tags into a single text field
user_profiles['user_combined'] = user_profiles[genres_cols].apply(lambda x: ' '.join(x.index[x > 0]), axis=1) + ' ' + user_profiles['tag']

# Remove duplicates
user_profiles = user_profiles.drop_duplicates(subset=['user_combined'])

# Create item profiles by combining relevant features
movies['item_combined'] = movies['genres']

# Vectorize item profiles
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
item_vectors = vectorizer.fit_transform(movies['item_combined'])

# Vectorize user profiles
user_vectors = vectorizer.transform(user_profiles['user_combined'])


# NMF-based recommender system

In [3]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

# Create a pivot table (user-item matrix)
ratings_pivot = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Normalize data before applying NMF
scaler = MinMaxScaler()
ratings_normalized = scaler.fit_transform(ratings_pivot)

# Apply NMF
nmf = NMF(n_components=20, random_state=42)
W = nmf.fit_transform(ratings_normalized)
H = nmf.components_

# Create user-item matrix
user_item_matrix = user_genre_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Function to recommend items based on NMF
def nmf_recommend(user_id, top_n=10):
    user_index = user_profiles[user_profiles['userId'] == user_id].index[0]
    user_ratings_pred = np.dot(W[user_index], H)
    
    # Get top N recommendations
    movie_indices = user_ratings_pred.argsort()[-top_n:][::-1]
    recommended_movies = [movies.iloc[i]['title'] for i in movie_indices]
    
    return recommended_movies


In [4]:
nmf_recommend(user_id=1, top_n=10)

["Cheech and Chong's Up in Smoke (1978)",
 'Star Wars: Episode IV - A New Hope (1977)',
 "Once Upon a Time in the West (C'era una volta il West) (1968)",
 'Princess Bride, The (1987)',
 'Local Hero (1983)',
 'Terminator 2: Judgment Day (1991)',
 'Walk on the Moon, A (1999)',
 "Jane Austen's Mafia! (1998)",
 'Brazil (1985)',
 'Some Kind of Wonderful (1987)']

In [5]:
# Recommend movies for a specific user using Neural Network Embeddings
user_id = 1
recommended_movies = nmf_recommend(user_id, top_n=10)

# Create a DataFrame to display the recommendations as a table
recommendations_df = pd.DataFrame(recommended_movies, columns=['Recommended Movies'])
recommendations_df.index = range(1, len(recommendations_df) + 1)  # Add ranking

# Display the DataFrame
recommendations_df


Unnamed: 0,Recommended Movies
1,Cheech and Chong's Up in Smoke (1978)
2,Star Wars: Episode IV - A New Hope (1977)
3,Once Upon a Time in the West (C'era una volta ...
4,"Princess Bride, The (1987)"
5,Local Hero (1983)
6,Terminator 2: Judgment Day (1991)
7,"Walk on the Moon, A (1999)"
8,Jane Austen's Mafia! (1998)
9,Brazil (1985)
10,Some Kind of Wonderful (1987)


# Evaluation

In [6]:
# Merge seen movies with user_profiles
user_seen_movies = user_genre_data.groupby('userId')['title'].apply(set).reset_index()
user_seen_movies.rename(columns={'title': 'Seen Movies'}, inplace=True)

# Append recommendations to the existing dataframe for each user
user_profiles = user_profiles.merge(user_seen_movies, on='userId', how='left')


In [7]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

recommended_movies

def evaluate_nmf(user_id, top_n=10):
    if user_id not in user_profiles['userId'].values:
        raise ValueError(f"User ID {user_id} not found in user profiles.")
    
    recommended_movies = nmf_recommend(user_id, top_n=top_n)
    seen_movies = user_profiles[user_profiles['userId'] == user_id]['Seen Movies'].values[0]
    
    # Convert to binary arrays
    y_true = [1 if movie in seen_movies else 0 for movie in movies['title']]
    y_pred = [1 if movie in recommended_movies else 0 for movie in movies['title']]
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    # Get actual and predicted ratings
    user_index = user_profiles[user_profiles['userId'] == user_id].index[0]
    user_ratings_pred = np.dot(W[user_index], H)
    actual_ratings = ratings_pivot.iloc[user_index].values
    
    # Only consider items that the user has rated
    mask = actual_ratings > 0
    y_true = actual_ratings[mask]
    y_pred = user_ratings_pred[mask]
    
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    
    return precision, recall, f1, rmse
    

### Evaluate 1 user

In [8]:
rmse, precision, recall, f1 = evaluate_nmf(user_id=1)
print(f"NMF-based System for user {user_id}:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"RMSE: {rmse}")

NMF-based System for user 1:
Precision: 0.008620689655172414
Recall: 0.01652892561983471
F1-Score: 4.169699666937647
RMSE: 0.2


### Evaluate all users

In [9]:
# Calculate metrics for all users
user_ids = user_profiles['userId'].unique()
metrics = [evaluate_nmf(user_id) for user_id in user_ids]
rmses = [evaluate_nmf(user_id) for user_id in user_profiles['userId'].unique()]

# Average the results
average_precision = np.mean([m[0] for m in metrics])
average_recall = np.mean([m[1] for m in metrics])
average_f1 = np.mean([m[2] for m in metrics])
average_rmse = np.mean(rmses)

print(f"NMF-based System for all users:")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1}")
print(f"Average RMSE: {average_rmse}")


NMF-based System for all users:
Average Precision: 0.16558139534883723
Average Recall: 0.025731552352016275
Average F1-Score: 0.04018293551149663
Average RMSE: 0.967050864075967


# Average number of new/unseen courses recommended per user

In [10]:
def count_new_recommendations(row):
    seen_movies = row['Seen Movies']
    recommended_movies = set(nmf_recommend(row['userId'], top_n=10))
    new_recommendations = recommended_movies - seen_movies
    return len(new_recommendations)

# Calculate new recommendations for each user
user_profiles['new_recommendations'] = user_profiles.apply(count_new_recommendations, axis=1)

# Compute the average number of new/unseen recommendations per user
average_new_recommendations = user_profiles['new_recommendations'].mean()
print(f"Average number of new/unseen movies recommended per user: {average_new_recommendations}")


Average number of new/unseen movies recommended per user: 8.344186046511627


# Top 10 most commonly recommended

In [11]:
from collections import Counter

# Generate recommendations for each user and collect them
all_recommendations = []
for user_id in user_profiles['userId'].unique():
    all_recommendations.extend(nmf_recommend(user_id, top_n=10))

# Count frequencies of each recommended movie
recommendation_counts = Counter(all_recommendations)

# Get the top 10 most commonly recommended movies
top_10_recommendations = recommendation_counts.most_common(10)
top_10_recommendations_df = pd.DataFrame(top_10_recommendations, columns=['Movie', 'Frequency'])

# Display the top 10 recommendations as a table
top_10_recommendations_df


Unnamed: 0,Movie,Frequency
0,Forrest Gump (1994),145
1,"Shawshank Redemption, The (1994)",136
2,Pulp Fiction (1994),135
3,"Walk on the Moon, A (1999)",103
4,Home Alone 2: Lost in New York (1992),81
5,"Silence of the Lambs, The (1991)",76
6,Jurassic Park (1993),69
7,Braveheart (1995),67
8,White Water Summer (1987),63
9,"Cooler, The (2003)",62
