# Data Cleaning

In [1]:
import kagglehub
import os
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("abhikjha/movielens-100k")
subdir_path = os.path.join(path, "ml-latest-small")

# Load the data into DataFrames
movies = pd.read_csv(os.path.join(subdir_path, "movies.csv"))
ratings = pd.read_csv(os.path.join(subdir_path, "ratings.csv"))
tags = pd.read_csv(os.path.join(subdir_path, "tags.csv"))

# Drop the 'timestamp' column
ratings = ratings.drop(columns=['timestamp'])
tags = tags.drop(columns=['timestamp'])




# Feature Engineering

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Split genres into individual columns
genres = movies['genres'].str.get_dummies(sep='|')
movies = pd.concat([movies, genres], axis=1)

# Drop the '(no genres listed)' column if it exists
if '(no genres listed)' in movies.columns:
    movies = movies.drop(columns=['(no genres listed)'])

# Merge ratings with movies to include genres
user_genre_data = pd.merge(ratings, movies, on='movieId')

# Ensure columns exist for genres
genres_cols = [col for col in genres.columns if col in user_genre_data.columns]

# Calculate average rating per genre for each user
user_genre_avg = user_genre_data.groupby('userId')[genres_cols].mean().reset_index()

# Merge tags with movies
user_tags_data = pd.merge(tags, movies, on='movieId')

# Aggregate tags for each user
user_tags_agg = user_tags_data.groupby('userId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Merge average ratings with aggregated tags
user_profiles = pd.merge(user_genre_avg, user_tags_agg, on='userId', how='left')
user_profiles['tag'] = user_profiles['tag'].fillna('')

# Combine genre averages and tags into a single text field
user_profiles['user_combined'] = user_profiles[genres_cols].apply(lambda x: ' '.join(x.index[x > 0]), axis=1) + ' ' + user_profiles['tag']

# Remove duplicates
user_profiles = user_profiles.drop_duplicates(subset=['user_combined'])

# Create item profiles by combining relevant features
movies['item_combined'] = movies['genres']

# Vectorize item profiles
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
item_vectors = vectorizer.fit_transform(movies['item_combined'])

# Vectorize user profiles
user_vectors = vectorizer.transform(user_profiles['user_combined'])


# Neural Network Embedding-based recommender system

In [3]:
import tensorflow as tf
import numpy as np
import random

seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
random.seed(seed_value)


In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot

# Prepare data for embedding model
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()

user_id_map = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_id_map = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

ratings['userId_mapped'] = ratings['userId'].map(user_id_map)
ratings['movieId_mapped'] = ratings['movieId'].map(movie_id_map)

# Build embedding model
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))

user_embedding = Embedding(len(user_ids), 20)(user_input)
movie_embedding = Embedding(len(movie_ids), 20)(movie_input)

user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)

dot_product = Dot(axes=1)([user_vec, movie_vec])

model = Model(inputs=[user_input, movie_input], outputs=dot_product)
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit([ratings['userId_mapped'], ratings['movieId_mapped']], ratings['rating'], epochs=5, verbose=1)

# Function to recommend items based on neural network embeddings
def nn_recommend(user_id, top_n=10):
    user_idx = user_id_map[user_id]
    user_vector = model.get_layer('embedding').get_weights()[0][user_idx]
    scores = user_vector.dot(model.get_layer('embedding_1').get_weights()[0].T)
    top_indices = scores.argsort()[-top_n:][::-1]
    recommended_movies = movies.iloc[top_indices]['title'].tolist()
    return recommended_movies


Epoch 1/5
[1m3152/3152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 11.6982
Epoch 2/5
[1m3152/3152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 2.4568
Epoch 3/5
[1m3152/3152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 1.3300
Epoch 4/5
[1m3152/3152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 1.0129
Epoch 5/5
[1m3152/3152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.8732


In [5]:
nn_recommend(user_id=1, top_n=10)

['Affair of the Necklace, The (2001)',
 'Harry and the Hendersons (1987)',
 'Hiding Out (1987)',
 'Safety of Objects, The (2001)',
 'Julien Donkey-Boy (1999)',
 'Lonely Are the Brave (1962)',
 'Pajama Game, The (1957)',
 'JFK (1991)',
 'Acid House, The (1998)',
 'Pumpkinhead (1988)']

In [6]:
# Recommend movies for a specific user using Neural Network Embeddings
user_id = 1
recommended_movies = nn_recommend(user_id, top_n=10)

# Create a DataFrame to display the recommendations as a table
recommendations_df = pd.DataFrame(recommended_movies, columns=['Recommended Movies'])
recommendations_df.index = range(1, len(recommendations_df) + 1)  # Add ranking

# Display the DataFrame
recommendations_df


Unnamed: 0,Recommended Movies
1,"Affair of the Necklace, The (2001)"
2,Harry and the Hendersons (1987)
3,Hiding Out (1987)
4,"Safety of Objects, The (2001)"
5,Julien Donkey-Boy (1999)
6,Lonely Are the Brave (1962)
7,"Pajama Game, The (1957)"
8,JFK (1991)
9,"Acid House, The (1998)"
10,Pumpkinhead (1988)


# Evaluation

In [7]:
# Merge seen movies with user_profiles
user_seen_movies = user_genre_data.groupby('userId')['title'].apply(set).reset_index()
user_seen_movies.rename(columns={'title': 'Seen Movies'}, inplace=True)

# Append recommendations to the existing dataframe for each user
user_profiles = user_profiles.merge(user_seen_movies, on='userId', how='left')


In [8]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

# Function to calculate evaluation metrics
def evaluate_nn(user_id, top_n=10):
    recommended_movies = nn_recommend(user_id, top_n)
    seen_movies = user_profiles[user_profiles['userId'] == user_id]['Seen Movies'].values[0]
    
    # Flatten lists
    y_true = list(seen_movies)
    y_pred = recommended_movies
    
    # Convert lists to sets to find common elements
    common_elements = set(y_true).intersection(set(y_pred))
    y_true_flat = list(common_elements)
    y_pred_flat = y_true_flat  # aligning lengths for a fair comparison

    # Calculate precision, recall, and F1-score
    precision = precision_score([1]*len(y_true_flat), [1]*len(y_pred_flat), zero_division=0)
    recall = recall_score([1]*len(y_true_flat), [1]*len(y_pred_flat), zero_division=0)
    f1 = f1_score([1]*len(y_true_flat), [1]*len(y_pred_flat), zero_division=0)

    # Actual ratings
    actual_ratings = ratings[ratings['userId'] == user_id]
    
    # Predicted ratings
    user_idx = user_id_map[user_id]
    user_vector = model.get_layer('embedding').get_weights()[0][user_idx]
    predictions = user_vector.dot(model.get_layer('embedding_1').get_weights()[0].T)
    
    # Align predictions with actual ratings
    actual_ratings = actual_ratings.set_index('movieId')
    actual_ratings['predicted_rating'] = actual_ratings.index.map(lambda x: predictions[movie_id_map[x]] if x in movie_id_map else np.nan)
    actual_ratings = actual_ratings.dropna()
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(actual_ratings['rating'], actual_ratings['predicted_rating']))
    
    return precision, recall, f1, rmse


### Evaluate for 1 user

In [9]:
rmse, precision, recall, f1 = evaluate_nn(user_id=1)
print(f"Neural Network Embedding-based System for user {user_id}:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"RMSE: {rmse}")

Neural Network Embedding-based System for user 1:
Precision: 1.0
Recall: 1.0
F1-Score: 0.7361418801992113
RMSE: 1.0


### Evaluate for all users

In [10]:
# Calculate metrics for all users
user_ids = user_profiles['userId'].unique()
metrics = [evaluate_nn(user_id) for user_id in user_ids]
rmses = [evaluate_nn(user_id) for user_id in user_profiles['userId'].unique()]

# Average the results
average_precision = np.mean([m[0] for m in metrics])
average_recall = np.mean([m[1] for m in metrics])
average_f1 = np.mean([m[2] for m in metrics])
average_rmse = np.mean(rmses)

print(f"Neural Network Embedding-based System for all users:")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1}")
print(f"Average RMSE: {average_rmse}")


Neural Network Embedding-based System for all users:
Average Precision: 0.07906976744186046
Average Recall: 0.07906976744186046
Average F1-Score: 0.07906976744186046
Average RMSE: 0.27439262776076284


# Average number of new/unseen courses recommended per user

In [11]:
def count_new_recommendations(row):
    seen_movies = row['Seen Movies']
    recommended_movies = set(nn_recommend(row['userId'], top_n=10))
    new_recommendations = recommended_movies - seen_movies
    return len(new_recommendations)

# Calculate new recommendations for each user
user_profiles['new_recommendations'] = user_profiles.apply(count_new_recommendations, axis=1)

# Compute the average number of new/unseen recommendations per user
average_new_recommendations = user_profiles['new_recommendations'].mean()
print(f"Average number of new/unseen movies recommended per user: {average_new_recommendations}")


Average number of new/unseen movies recommended per user: 9.902325581395349


# Top 10 most commonly recommended

In [12]:
from collections import Counter

# Generate recommendations for each user and collect them
all_recommendations = []
for user_id in user_profiles['userId'].unique():
    all_recommendations.extend(nn_recommend(user_id, top_n=10))

# Count frequencies of each recommended movie
recommendation_counts = Counter(all_recommendations)

# Get the top 10 most commonly recommended movies
top_10_recommendations = recommendation_counts.most_common(10)
top_10_recommendations_df = pd.DataFrame(top_10_recommendations, columns=['Movie', 'Frequency'])

# Display the top 10 recommendations as a table
top_10_recommendations_df


Unnamed: 0,Movie,Frequency
0,"Pajama Game, The (1957)",208
1,"Safety of Objects, The (2001)",207
2,Lonely Are the Brave (1962),207
3,Harry and the Hendersons (1987),204
4,"Affair of the Necklace, The (2001)",203
5,JFK (1991),202
6,Julien Donkey-Boy (1999),195
7,"Acid House, The (1998)",193
8,Hiding Out (1987),178
9,Love Affair (1994),106
