# User-based Collaborative Filtering

Core idea:
"If Alice and Bob have rated movies similarly in the past, then we can recommend to Alice the movies that Bob liked (but Alice hasn’t seen yet)."

How it works:

Find users similar to the target user (e.g., Alice).

Look at what those similar users liked.

Recommend those movies to Alice.

Example:
If Alice and Bob both liked Inception and The Matrix, and Bob also liked Interstellar, then Interstellar might be recommended to Alice.

In [6]:
import pandas as pd
import os

# Define the base path to your data
base_path = r'C:\Users\Sara\Documents\DS Masters\Semester 2\Solution Engineering\Python\movies-database\ml-25m'
print(os.listdir(base_path))
# Load all CSV files using full paths
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))
tags = pd.read_csv(os.path.join(base_path, 'tags.csv'))
links = pd.read_csv(os.path.join(base_path, 'links.csv'))
genome_scores = pd.read_csv(os.path.join(base_path, 'genome-scores.csv'))
genome_tags = pd.read_csv(os.path.join(base_path, 'genome-tags.csv'))

# Merge genome data
genome = pd.merge(genome_scores, genome_tags, on='tagId', how='left')

# Merge all on movieId
movies_merged = movies \
    .merge(links, on='movieId', how='left') \
    .merge(ratings, on='movieId', how='left') \
    .merge(tags, on=['movieId', 'userId'], how='left') \
    .merge(genome, on='movieId', how='left')

# Save the merged dataset to the same directory
movies_merged.to_csv(os.path.join(base_path, 'movies_merged.csv'), index=False)


['EDA.ipynb', 'genome-scores.csv', 'genome-tags.csv', 'links.csv', 'movies.csv', 'ratings.csv', 'README.txt', 'tags.csv', 'uas-soe-python-ss-2025']


MemoryError: Unable to allocate 212. GiB for an array with shape (28498437227,) and data type int64

In [7]:
import pandas as pd
import os

# Base path to your local data
base_path = r'C:\Users\Sara\Documents\DS Masters\Semester 2\Solution Engineering\Python\movies-database\ml-25m'

# Load base datasets
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
links = pd.read_csv(os.path.join(base_path, 'links.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))

# Sample a small subset (10k ratings)
ratings_sample = ratings.sample(n=10000, random_state=42)
movie_ids_sample = ratings_sample['movieId'].unique()

# Filter other datasets based on sampled movieIds
movies_sample = movies[movies['movieId'].isin(movie_ids_sample)]
links_sample = links[links['movieId'].isin(movie_ids_sample)]

# Optionally add tags, genome if desired
tags = pd.read_csv(os.path.join(base_path, 'tags.csv'))
tags_sample = tags[tags['movieId'].isin(movie_ids_sample)]

genome_scores = pd.read_csv(os.path.join(base_path, 'genome-scores.csv'))
genome_tags = pd.read_csv(os.path.join(base_path, 'genome-tags.csv'))
genome = pd.merge(genome_scores, genome_tags, on='tagId', how='left')
genome_sample = genome[genome['movieId'].isin(movie_ids_sample)]

# Merge datasets
df = ratings_sample \
    .merge(movies_sample, on='movieId', how='left') \
    .merge(links_sample, on='movieId', how='left') \
    .merge(tags_sample, on=['userId', 'movieId'], how='left') \
    .merge(genome_sample, on='movieId', how='left')

# Save to file
df.to_csv(os.path.join(base_path, 'movies_sampled_merged.csv'), index=False)


In [10]:
movies_merged  = pd.read_csv(os.path.join(base_path, 'movies_sampled_merged.csv'))


  movies_merged  = pd.read_csv(os.path.join(base_path, 'movies_sampled_merged.csv'))


In [11]:
movies_merged.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,imdbId,tmdbId,tag_x,timestamp_y,tagId,relevance,tag_y
0,99476,104374,3.5,1467897440,About Time (2013),Drama|Fantasy|Romance,2194499,122906.0,,,1.0,0.02975,007
1,99476,104374,3.5,1467897440,About Time (2013),Drama|Fantasy|Romance,2194499,122906.0,,,2.0,0.02875,007 (series)
2,99476,104374,3.5,1467897440,About Time (2013),Drama|Fantasy|Romance,2194499,122906.0,,,3.0,0.0475,18th century
3,99476,104374,3.5,1467897440,About Time (2013),Drama|Fantasy|Romance,2194499,122906.0,,,4.0,0.06125,1920s
4,99476,104374,3.5,1467897440,About Time (2013),Drama|Fantasy|Romance,2194499,122906.0,,,5.0,0.0415,1930s


In [12]:
len(movies_merged)

11354603

In [18]:
movies_merged.movieId.nunique()

3658

User-based Collaborative Filtering

In [13]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
import pandas as pd

# Ensure correct columns exist
data_subset = movies_merged[['userId', 'movieId', 'rating']].dropna()

# Create a Surprise reader for 0.5 to 5.0 star ratings
reader = Reader(rating_scale=(0.5, 5.0))

# Load the DataFrame into Surprise
data = Dataset.load_from_df(data_subset, reader)

# Split into training and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Configure UBCF with cosine similarity
sim_options = {
    'name': 'cosine',
    'user_based': True  # Set to False for item-based CF
}

# Train the model
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Evaluate with RMSE
predictions = algo.test(testset)
print("Test RMSE:", rmse(predictions))

ModuleNotFoundError: No module named 'surprise'

In [14]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Step 1: Create a user-item ratings matrix
user_item_matrix = movies_merged.pivot_table(index='userId', columns='movieId', values='rating')

# Step 2: Normalize (center) ratings by subtracting user mean
user_means = user_item_matrix.mean(axis=1)
user_item_centered = user_item_matrix.sub(user_means, axis=0)

# Step 3: Fill NaNs with 0 for cosine similarity
user_item_filled = user_item_centered.fillna(0)

# Step 4: Compute user-user similarity matrix
user_sim = cosine_similarity(user_item_filled)
user_sim_df = pd.DataFrame(user_sim, index=user_item_filled.index, columns=user_item_filled.index)

In [15]:
target_user = 99476
target_movie = 104374

# Find users who rated movie 104374
users_who_rated = user_item_matrix[target_movie].dropna()

# Similarities of those users to target user
similarities = user_sim_df.loc[target_user, users_who_rated.index]

# Ratings by similar users for the movie
ratings = users_who_rated

# Weighted average
predicted_rating = (similarities * ratings).sum() / similarities.sum()
print(f"Predicted rating for user {target_user} and movie {target_movie}: {predicted_rating:.2f}")

Predicted rating for user 99476 and movie 104374: nan


  predicted_rating = (similarities * ratings).sum() / similarities.sum()


In [16]:
def predict_rating(user_id, movie_id, user_item_matrix, user_sim_df, fallback='user_mean'):
    if movie_id not in user_item_matrix.columns or user_id not in user_item_matrix.index:
        return None  # movie or user not in data

    # Users who rated this movie
    users_rated = user_item_matrix[movie_id].dropna()

    if users_rated.empty:
        return None  # no one rated this movie

    # Similarities between target user and others who rated the movie
    sims = user_sim_df.loc[user_id, users_rated.index]
    ratings = users_rated

    # Compute weighted average
    weighted_sum = (sims * ratings).sum()
    sim_sum = sims.sum()

    if sim_sum == 0:
        if fallback == 'user_mean':
            return user_item_matrix.loc[user_id].mean()
        elif fallback == 'global_mean':
            return user_item_matrix.stack().mean()
        else:
            return None

    return weighted_sum / sim_sum

In [17]:
pred = predict_rating(99476, 104374, user_item_matrix, user_sim_df)
print(f"Predicted rating: {pred:.2f}" if pred else "Prediction unavailable")

Predicted rating: 3.50


In [None]:
# Load ratings and movie info
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))

# Merge to get movie titles (optional, for display)
ratings_merged = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

# Drop missing values (just in case)
ratings_merged.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)



   userId  movieId  rating   timestamp  \
0       1      296     5.0  1147880044   
1       1      306     3.5  1147868817   
2       1      307     5.0  1147868828   
3       1      665     5.0  1147878820   
4       1      899     3.5  1147868510   

                                              title  
0                               Pulp Fiction (1994)  
1  Three Colors: Red (Trois couleurs: Rouge) (1994)  
2  Three Colors: Blue (Trois couleurs: Bleu) (1993)  
3                                Underground (1995)  
4                        Singin' in the Rain (1952)  


In [20]:
# Check the structure
ratings_merged.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,5.0,1147878820,Underground (1995)
4,1,899,3.5,1147868510,Singin' in the Rain (1952)


In [21]:
len(ratings_merged)

25000095

In [22]:
# Create user-item matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')

# Center ratings by subtracting user mean (optional but improves similarity quality)
user_item_centered = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0)

# Fill NaNs with 0 to compute cosine similarity
user_item_filled = user_item_centered.fillna(0)

# Compute user-user cosine similarity matrix
user_sim_matrix = cosine_similarity(user_item_filled)
user_sim_df = pd.DataFrame(user_sim_matrix, index=user_item_filled.index, columns=user_item_filled.index)

  user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')


MemoryError: Unable to allocate 71.5 GiB for an array with shape (162541, 59047) and data type float64

In [None]:
def predict_rating(user_id, movie_id, user_item_matrix, user_sim_df):
    if movie_id not in user_item_matrix.columns or user_id not in user_item_matrix.index:
        return None

    # Get users who rated this movie
    users_rated = user_item_matrix[movie_id].dropna()
    if users_rated.empty:
        return None

    # Similarities between the target user and other users
    similarities = user_sim_df.loc[user_id, users_rated.index]
    ratings = users_rated

    weighted_sum = (similarities * ratings).sum()
    sim_sum = similarities.sum()

    if sim_sum == 0:
        return user_item_matrix.loc[user_id].mean()  # fallback to user's average rating

    return weighted_sum / sim_sum

In [None]:
def recommend_top_n(user_id, user_item_matrix, user_sim_df, movies_df, n=5):
    # Movies the user hasn't rated
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isna()]

    predictions = []
    for movie_id in unrated_movies:
        pred = predict_rating(user_id, movie_id, user_item_matrix, user_sim_df)
        if pred is not None:
            predictions.append((movie_id, pred))

    # Sort by predicted rating
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]

    # Add titles
    top_n_with_titles = [(movie_id, movies_df.loc[movies_df.movieId == movie_id, 'title'].values[0], rating)
                         for movie_id, rating in top_n]

    return top_n_with_titles

In [None]:
top_recs = recommend_top_n(user_id=99476,
                           user_item_matrix=user_item_matrix,
                           user_sim_df=user_sim_df,
                           movies_df=movies,
                           n=5)

print("Top 5 Recommendations:")
for movie_id, title, score in top_recs:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")