# Collaborative Filtering Recommendation System

## Import Modules

In [157]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


## Import Dataset

In [158]:
# Ratings / interactions data. Because `names` is given without `header`,
# pandas treats the first line of the file as data and uses these labels
# for the columns.
df1 = pd.read_csv(
    "./Movie_data.csv",
    names=["user_id", "username", "item_id", "rating", "timestamp"],
)

# Interpret the raw timestamp values as UNIX epoch seconds (unit="s")
# and replace the column with pandas datetimes.
df1["timestamp"] = pd.to_datetime(df1["timestamp"], unit="s")

# Movie metadata; joined on `item_id` below.
df2 = pd.read_csv("./Movie_Id_Titles.csv")

# Left join keeps every ratings row (unmatched item_ids would get NaN metadata),
# then `item_id` is dropped because the rest of the notebook identifies movies
# by `title`.
join_df = (
    df1
    .merge(df2, how="left", on="item_id")
    .drop(columns=["item_id"])
)

In [159]:
# Print column names, non-null counts and dtypes of the merged frame.
join_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   user_id    100003 non-null  int64         
 1   username   100003 non-null  object        
 2   rating     100003 non-null  int64         
 3   timestamp  100003 non-null  datetime64[ns]
 4   title      100003 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 3.8+ MB


In [160]:
# Missing values per column (`isnull` is the pandas alias of `isna`).
join_df.isnull().sum()

user_id      0
username     0
rating       0
timestamp    0
title        0
dtype: int64

In [161]:
# Preview the first five rows of the merged frame.
join_df.head(n=5)

Unnamed: 0,user_id,username,rating,timestamp,title
0,0,Shawn Wilson,5,1997-12-04 15:55:49,Star Wars (1977)
1,0,Shawn Wilson,5,1997-12-04 15:55:49,"Empire Strikes Back, The (1980)"
2,0,Shawn Wilson,1,1997-12-04 15:55:49,Gone with the Wind (1939)
3,196,Bessie White,3,1997-12-04 15:55:49,Kolya (1996)
4,196,Bessie White,4,1997-12-04 16:11:03,Mrs. Doubtfire (1993)


## Explore Dataset

### Dimensions of the Dataset

In [162]:
# DataFrame.shape is (rows, columns); DataFrame.size is the total cell count.
rows = join_df.shape[0]
cols = join_df.shape[1]
print(
    f"The shape of the dataset is: {rows} rows × {cols} columns",
    f"The size of the dataset is: {join_df.size:,}",
    sep="\n",
)


The shape of the dataset is: 100003 rows × 5 columns
The size of the dataset is: 500,015


### Statistical Summary of the Dataset

In [163]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating column.
join_df["rating"].describe()

count    100003.000000
mean          3.529864
std           1.125704
min           1.000000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

### Number of Ratings given by each User

In [164]:
# Ten usernames with the most ratings, largest count first.
(
    join_df
    .groupby("username")["rating"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

username
Anna Free           737
Jeanne Maldonado    685
Bradley Butler      636
Mary Cooks          540
Bobbie Reyes        518
June Miyamoto       493
Burt Jean           490
Richard Spelman     484
Ruth Shepherd       480
Charles Haskell     448
Name: rating, dtype: int64

In [165]:
# Mean rating given by each username, highest average first.
(
    join_df
    .groupby("username")["rating"]
    .mean()
    .sort_values(ascending=False)
)

username
Jamie Hinger      4.869565
Justin Snowden    4.833333
Edward Coleman    4.724138
Glenn Claycomb    4.703704
Lee Paterson      4.687500
                    ...   
Frances Hays      2.058036
Alice Bingle      2.050000
Frederick Abdo    1.985185
Anna Free         1.834464
Adela Vignola     1.491954
Name: rating, Length: 944, dtype: float64

### Number of Unique Movies and Users

In [166]:
# Distinct-value counts; dropna=False counts a missing value as one distinct
# entry, which matches what `.unique().shape[0]` reports.
n_movies = join_df["title"].nunique(dropna=False)
n_users = join_df["user_id"].nunique(dropna=False)

print(f"There are {n_movies} unique movies and {n_users} unique users.")

There are 1664 unique movies and 944 unique users.


## Create Interaction Matrix

In [167]:
# User x movie rating matrix:
#   rows    -> one per user_id
#   columns -> one per movie title
#   cells   -> that user's rating for that movie
# Several ratings for the same (user_id, title) pair are collapsed to their mean,
# and pairs with no rating are filled with 0.0, which the rest of the notebook
# reads as "unrated".
interaction_df = pd.pivot_table(
    join_df,
    index="user_id",
    columns="title",
    values="rating",
    aggfunc="mean",
    fill_value=0.0,
)

# Plain NumPy view of the matrix plus the labels needed to translate positions
# back to identifiers: row i <-> users[i], column j <-> movies[j].
interaction = interaction_df.to_numpy()
users = interaction_df.index.to_numpy()
movies = interaction_df.columns.to_numpy()

# Show the matrix (pandas truncates the display for large frames).
interaction_df

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Check Interaction Matrix Sparsity

In [168]:
# A cell equal to 0.0 is read as "not rated" (the pivot's fill value);
# any non-zero cell counts as an observed rating.
non_zero_entries = np.count_nonzero(interaction)

# Every (user, movie) cell of the matrix, rated or not.
total_entries = interaction.size
zero_entries = total_entries - non_zero_entries

# Share of empty cells: near 1.0 means very sparse, near 0.0 means dense.
sparsity = zero_entries / total_entries

print(
    f"Total entries: {total_entries:,}",
    f"Observed ratings (non-zero): {non_zero_entries:,}",
    f"Unrated (zero): {zero_entries:,}",
    f"Sparsity: {sparsity:.4f} ({sparsity * 100:.2f}% of the matrix is empty)",
    sep="\n",
)

Total entries: 1,570,816
Observed ratings (non-zero): 99,696
Unrated (zero): 1,471,120
Sparsity: 0.9365 (93.65% of the matrix is empty)


## Create Similarity Matrix

In [169]:
# User-user cosine similarity: each row of `interaction` is one user's rating
# vector, and entry (a, b) of the result is the cosine of the angle between the
# vectors of users a and b. The matrix entries are ratings or the 0.0 fill value,
# i.e. non-negative, so the similarities lie between 0.0 (no overlap in rated
# movies) and 1.0 (same direction).
user_similarity = cosine_similarity(interaction)

# Labelled copy for inspection: user_similarity_df.loc[a, b] looks up a pair by
# user_id instead of by row/column position.
user_similarity_df = pd.DataFrame(data=user_similarity, index=users, columns=users)

# Preview the first five rows.
user_similarity_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
0,1.0,0.120267,0.116561,0.0,0.161475,0.149818,0.061552,0.085212,0.228361,0.170817,...,0.139326,0.0,0.060997,0.157028,0.09639,0.0,0.150342,0.0,0.183361,0.118904
1,0.120267,1.0,0.168937,0.048388,0.064561,0.37967,0.429682,0.443097,0.320079,0.078385,...,0.372213,0.11986,0.26986,0.193343,0.197949,0.118722,0.315064,0.149086,0.181612,0.399432
2,0.116561,0.168937,1.0,0.113393,0.179694,0.073623,0.242106,0.108604,0.104257,0.16247,...,0.147095,0.310661,0.363328,0.410725,0.322713,0.231096,0.228793,0.162911,0.175273,0.106732
3,0.0,0.048388,0.113393,1.0,0.349781,0.021592,0.074018,0.067423,0.084419,0.062039,...,0.033885,0.043453,0.16714,0.071288,0.126278,0.026758,0.164539,0.102899,0.136757,0.02699
4,0.161475,0.064561,0.179694,0.349781,1.0,0.031804,0.068431,0.091507,0.18806,0.101284,...,0.054615,0.036784,0.133619,0.196561,0.146058,0.030202,0.196858,0.152041,0.171538,0.058752


## Provide Recommendations

In [170]:
def recommend_movies_for_user(target_user_id: int, k: int = 10, n_recommendations: int = 10) -> pd.DataFrame:
    """
    Recommend unrated movies to a user from the ratings of their most similar users.

    The function reads four module-level objects that must already exist:
    `user_similarity` (user-user similarity matrix), `interaction` (user x movie
    rating matrix in which 0 means "not rated"), `users` (row labels of
    `interaction`) and `movies` (column labels of `interaction`).

    Steps:
    1. Pick the k users with the highest similarity to the target user
       (the target user is never their own neighbor).
    2. For every movie, average the ratings of the neighbors who rated it
       (unweighted mean; neighbors who did not rate the movie are ignored).
    3. Return the best-scoring movies that the target user has not rated yet.

    Parameters
    ----------
    target_user_id : int
        The ID of the user for whom we want recommendations. Must exist in `users`.
    k : int, optional
        Number of most similar users to consider (default = 10). Values larger
        than the number of other users are clamped; values <= 0 select no
        neighbors and therefore yield an empty result.
    n_recommendations : int, optional
        Maximum number of movies to recommend (default = 10). Values <= 0 yield
        an empty result.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:
        - 'movie': movie title (or item identifier)
        - 'predicted_rating': average rating from the neighbors who rated it
        Rows are ordered from highest to lowest predicted rating. Fewer than
        `n_recommendations` rows are returned when fewer eligible movies exist;
        movies the target already rated, or that no neighbor rated, are never
        included.

    Raises
    ------
    RuntimeError
        If `user_similarity`, `interaction`, `users` or `movies` is not defined.
    ValueError
        If `target_user_id` is not present in `users`.
    """
    # ---- Safety checks -------------------------------------------------------
    # The function depends on notebook-level state; fail with a clear message
    # if the cells that build it have not been run.
    if "user_similarity" not in globals():
        raise RuntimeError("user_similarity matrix not found. Compute it with cosine_similarity(interaction) first.")

    if "interaction" not in globals() or "users" not in globals() or "movies" not in globals():
        raise RuntimeError("interaction, users, or movies not found. Make sure they are defined from interaction_df.")

    if target_user_id not in users:
        raise ValueError(f"User ID {target_user_id} not found in users array.")

    # ---- Index mappings ------------------------------------------------------
    # Translate the user ID into its row position in `interaction` / `user_similarity`.
    user_to_idx = {uid: idx for idx, uid in enumerate(users)}
    target_idx = user_to_idx[target_user_id]

    # ---- Step 1: Find k most similar users -----------------------------------
    # Work on a copy so the shared similarity matrix is not modified.
    sim_vector = user_similarity[target_idx].copy()

    # -inf pushes the target user to the very front of the ascending sort below.
    sim_vector[target_idx] = float("-inf")

    # Clamp k to [0, number of other users]:
    # - `[-0:]` would select *every* user instead of none, and
    # - k >= n_users would pull the target user into their own neighborhood.
    n_neighbors = min(max(k, 0), sim_vector.size - 1)

    if n_neighbors > 0:
        # argsort is ascending, so the last n_neighbors entries are the most similar.
        neighbor_indices = np.argsort(sim_vector)[-n_neighbors:]
        # Order neighbors from most to least similar.
        neighbor_indices = neighbor_indices[np.argsort(sim_vector[neighbor_indices])[::-1]]
    else:
        neighbor_indices = np.array([], dtype=int)

    # ---- Step 2: Average ratings of movies rated by these neighbors ----------
    # Shape: (n_neighbors, n_movies)
    neighbor_ratings = interaction[neighbor_indices, :]

    # 0 means "no rating", so only strictly positive cells count as ratings.
    rated_mask = neighbor_ratings > 0
    rating_sums = neighbor_ratings.sum(axis=0)
    rating_counts = rated_mask.sum(axis=0)

    # Mean over the neighbors who actually rated each movie; movies nobody rated
    # get 0.0 (the 0/0 warnings for those cells are suppressed on purpose).
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_neighbor_ratings = np.where(
            rating_counts > 0,
            rating_sums / rating_counts,
            0.0
        )

    # ---- Exclude movies already rated by the target user ---------------------
    target_already_rated = interaction[target_idx, :] > 0

    # Eligible = rated by at least one neighbor AND not yet rated by the target.
    candidate_mask = (rating_counts > 0) & (~target_already_rated)
    candidate_scores = np.where(candidate_mask, avg_neighbor_ratings, 0.0)

    # ---- Step 3: Select top-n movies by average neighbor rating --------------
    if n_recommendations > 0:
        top_movie_indices = np.argsort(candidate_scores)[-n_recommendations:][::-1]
        # When fewer than n_recommendations movies are eligible, the slice above
        # is padded with ineligible movies (score 0.0). Drop them so the result
        # never contains already-rated or unscored movies.
        top_movie_indices = top_movie_indices[candidate_mask[top_movie_indices]]
    else:
        top_movie_indices = np.array([], dtype=int)

    # Wrap results in a DataFrame for readability
    return pd.DataFrame({
        "movie": movies[top_movie_indices],
        "predicted_rating": candidate_scores[top_movie_indices]
    })

## View Recommendations

In [171]:
# Top 10 suggestions for user 20, based on that user's 10 nearest neighbors.
recommendations = recommend_movies_for_user(
    target_user_id=20,
    k=10,
    n_recommendations=10,
)
recommendations

Unnamed: 0,movie,predicted_rating
0,"Usual Suspects, The (1995)",5.0
1,"Quiet Man, The (1952)",5.0
2,Don Juan DeMarco (1995),5.0
3,To Catch a Thief (1955),5.0
4,Once Upon a Time in the West (1969),5.0
5,"Deer Hunter, The (1978)",5.0
6,"Beautician and the Beast, The (1997)",5.0
7,"Craft, The (1996)",5.0
8,Wallace & Gromit: The Best of Aardman Animatio...,5.0
9,Emma (1996),5.0
