# Advance Retrieval 101: Use Collaborative Filtering to Build a Movie Recommendation System

https://qdrant.tech/documentation/advanced-tutorials/collaborative-filtering/

In [2]:
from collections import defaultdict

import pandas as pd
from qdrant_client import QdrantClient, models
from qdrant_client.models import NamedSparseVector, PointStruct, SparseVector

## Prepare the data

In [4]:
# Load CSV file
ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv", low_memory=False)
movies_df = pd.read_csv("data/ml-latest-small/movies.csv", low_memory=False)

# Convert movieId in ratings_df and movies_df to string
ratings_df["movieId"] = ratings_df["movieId"].astype(str)
movies_df["movieId"] = movies_df["movieId"].astype(str)

rating = ratings_df["rating"]

# Normalize ratings
ratings_df["rating"] = (rating - rating.mean()) / rating.std()

# Merge rating with movie metadata to get movie titles
merged_df = ratings_df.merge(
    movies_df[["movieId", "title"]], left_on="movieId", right_on="movieId", how="inner"
)

# Aggregate ratings to handle duplicate (userId, title) pairs
ratings_agg_df = merged_df.groupby(["userId", "movieId"]).rating.mean().reset_index()
ratings_agg_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,0.478109
1,1,1009,-0.481096
2,1,101,1.437315
3,1,1023,1.437315
4,1,1024,1.437315


## Convert to Sparse

In [6]:
# Convert ratings to sparse vectors.
user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
for row in ratings_agg_df.itertuples():
    user_sparse_vectors[row.userId]["values"].append(row.rating)
    user_sparse_vectors[row.userId]["indices"].append(int(row.movieId))

## Upload the data

In [28]:
def data_generator():
    for user_id, sparse_vector in user_sparse_vectors.items():
        yield PointStruct(
            id=user_id,
            vector={
                "ratings": SparseVector(
                    indices=sparse_vector["indices"], values=sparse_vector["values"]
                )
            },
            payload={"user_id": user_id, "movie_id": sparse_vector["indices"]},
        )


client = QdrantClient(":memory:")
client.create_collection(
    collection_name="movies",
    sparse_vectors_config={"ratings": models.SparseVectorParams()},
    vectors_config={},
)
client.upload_points(
    collection_name="movies",
    points=data_generator(),
)

## Define query

In [23]:
my_ratings = {
    603: 1,  # Matrix
    13475: 1,  # Star Trek
    11: 1,  # Star Wars
    1091: -1,  # The Thing
    862: 1,  # Toy Story
    597: -1,  # Titanic
    680: -1,  # Pulp Fiction
    13: 1,  # Forrest Gump
    120: 1,  # Lord of the Rings
    87: -1,  # Indiana Jones
    562: -1,  # Die Hard
}

In [29]:
# Create sparse vector from my_ratings
def to_vector(ratings):
    vector = SparseVector(values=[], indices=[])
    for movie_id, rating in ratings.items():
        vector.values.append(rating)
        vector.indices.append(movie_id)
    return vector

## Run the query

In [48]:
# Perform the search
results = client.query_points(
    collection_name="movies", query=to_vector(my_ratings), using="ratings", limit=20
).points

In [49]:
# Convert results to scores and sort by score
def results_to_scores(results):
    movie_scores = defaultdict(lambda: 0)
    for result in results:
        for movie_id in result.payload["movie_id"]:
            movie_scores[movie_id] += result.score
    return movie_scores


# Convert results to scores and sort by score
movie_scores = results_to_scores(results)
top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)

In [59]:
for movieId, score in top_movies[:5]:
    movie = movies_df[movies_df['movieId'] == str(movieId)]
    print(movie.title.values[0], score)

Forrest Gump (1994) 44.63442826271057
Pulp Fiction (1994) 43.19412624835968
Pretty Woman (1990) 42.71452331542969
Star Wars: Episode V - The Empire Strikes Back (1980) 37.91998839378357
American Beauty (1999) 37.91849493980408
