In [59]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.storage import LocalFileStore
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import pandas as pd
import json
import os
from tqdm import tqdm
from sentence_transformers import CrossEncoder
import numpy as np
import time

from pathlib import Path

ROOT_DIR = Path("/home/alron/movie-recommender")
MOVIES_FILEPATH = ROOT_DIR/'data/movies_metadata.csv'


all_cols = pd.read_csv(MOVIES_FILEPATH).columns
search_cols = ["title", "overview", "genres"]
metadata_cols = [col for col in all_cols if col not in search_cols]

docs = CSVLoader(file_path=MOVIES_FILEPATH, metadata_columns=metadata_cols).load()


model_name = "BAAI/llm-embedder"
model_name_wo_org = Path(model_name).parts[-1]
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
bge_embedding_model = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

store = LocalFileStore(ROOT_DIR/f"data/cache_{model_name_wo_org}/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    bge_embedding_model, store, namespace=model_name_wo_org
)


text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

  all_cols = pd.read_csv(MOVIES_FILEPATH).columns
.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 9.79MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.21MB/s]
README.md: 100%|██████████| 28.8k/28.8k [00:00<00:00, 16.5MB/s]
config.json: 100%|██████████| 731/731 [00:00<00:00, 4.24MB/s]
config_sentence_transformers.json: 100%|██████████| 123/123 [00:00<00:00, 742kB/s]
model.safetensors: 100%|██████████| 438M/438M [01:08<00:00, 6.41MB/s] 
pytorch_model.bin: 100%|██████████| 438M/438M [00:40<00:00, 10.7MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 312kB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 749kB/s]
tokenizer.json: 100%|██████████| 712k/712k [00:00<00:00, 1.25MB/s]
tokenizer_config.json: 100%|██████████| 396/396 [00:00<00:00, 2.48MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 17.8MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 1.61MB/s]


In [60]:
%%time
from tqdm import tqdm

FAISS_INDEX_PATH = ROOT_DIR/f"data/faiss_index_{model_name_wo_org}"

def load_or_generate_index(docs, cache_path=FAISS_INDEX_PATH):

    if os.path.exists(FAISS_INDEX_PATH):
        db = FAISS.load_local(cache_path, cached_embedder)
    else:
        db = None
        with tqdm(total=len(docs), desc="Ingesting documents") as pbar:
            for d in docs:
                if db:
                    db.add_documents([d])
                else:
                    db = FAISS.from_documents([d], cached_embedder)
                pbar.update(1)
        # db = FAISS.from_documents(documents[:100], cached_embedder)
        db.save_local(FAISS_INDEX_PATH)
    
    return db


db = load_or_generate_index(documents)

Ingesting documents: 100%|██████████| 45466/45466 [1:22:08<00:00,  9.22it/s]


CPU times: user 10h 44min 42s, sys: 3.08 s, total: 10h 44min 45s
Wall time: 1h 22min 10s


In [61]:
def doc_to_data_dict(doc):
    data = doc.page_content.split("\n")
    data_dict = {}
    for kv_tuple in data:
        try:
            key, value = kv_tuple.split(":", maxsplit=1)
            data_dict[key] = value
        except Exception:
            pass

    data_dict.update(doc.metadata)
    return data_dict

def rerank(cross_encoder, query, docs):
    
    # So we create the respective sentence combinations
    data_dicts = [doc_to_data_dict(doc) for doc in docs]
    sentence_combinations = [[query, ','.join(data_dict["genres"])+ " " +data_dict["title"] +" " + data_dict["overview"]] for data_dict in data_dicts]

    # Compute the similarity scores for these combinations
    similarity_scores = cross_encoder.predict(sentence_combinations)

    # Sort the scores in decreasing order
    sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))

    # Print the scores
    # print("Query:", query)
    # for idx in sim_scores_argsort:
    #     print("{:.2f}\t{}".format(similarity_scores[idx], docs[idx]))

    reranked_docs = [docs[i] for i in sim_scores_argsort]
    scores = [similarity_scores[i] for i in sim_scores_argsort]

    return reranked_docs, scores
    

def print_doc_info(doc):
    data_dict = doc_to_data_dict(doc)

    print(data_dict['title'].strip())
    # print(doc.metadata["popularity"])
    print("---")
    print("Synopsis: ", data_dict['overview'].strip())
    genres = json.loads(data_dict['genres'].replace("'", '"'))
    print("Genres:", [x["name"] for x in genres])
    # print([x["name"] for x in ])
    print(f"Source: Row {doc.metadata['row']}, IMDB: https://imdb.com/title/{doc.metadata['imdb_id']}")
    print(doc.metadata)
    print()
    

In [64]:
cross_encoder = CrossEncoder('BAAI/bge-reranker-large')

In [63]:
QUERY = "ninja animated film for kids"
###########################################################

retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 20, "score_threshold": .5,})

start_retriever = time.time()
retriever_query = "Represent this sentence for searching relevant movie descriptions: " + QUERY
docs = retriever.get_relevant_documents(retriever_query)
end_retriever = time.time()
scores = [0 for i in range(len(docs))]


# Filter only to top 10 most popular
start_filter = time.time()
# docs = [doc for doc in docs if int(doc.metadata["vote_count"]) > 0]
# docs = sorted(docs, key=lambda doc: doc.metadata["vote_count"], reverse=True)
# docs = docs[:10]
end_filter = time.time()

start_reranker = time.time()
# docs, scores = rerank(cross_encoder, QUERY, docs)
# assert len(docs) > 0
# assert len(scores) > 0
end_reranker = time.time()

print(f"Retriever: {end_retriever-start_retriever:.2f}s")
print(f"Filter: {end_filter-start_filter:.2f}s")
print(f"Reranker: {end_reranker-start_reranker:.2f}s")


for doc, score in zip(docs, scores):
    print(f"[{score:.2f}]")
    print_doc_info(doc)

Retriever: 0.09s
Filter: 0.00s
Reranker: 0.00s
[0.00]
Pocket Ninjas
---
Synopsis:  Somewhere in New Jersey, a group masked rollerblading children "save the universe" by training to fight the evil Cobra Khan through a series of action-packed montages.  Along the way they hypothesize some slapstick hijinks in a balloon factory, and save the day through a hypothetical Sonic Virtual Reality battle.  A real treat!
Genres: ['Action', 'Family']
Source: Row 30443, IMDB: https://imdb.com/title/tt0107838
{'source': PosixPath('/home/alron/movie-recommender/data/movies_metadata.csv'), 'row': 30443, 'adult': 'False', 'belongs_to_collection': '', 'budget': '0', 'homepage': '', 'id': '16530', 'imdb_id': 'tt0107838', 'original_language': 'en', 'original_title': 'Pocket Ninjas', 'popularity': '0.325189', 'poster_path': '/zu53XzaQivIi6Mzxt1ktHW5747W.jpg', 'production_companies': '[]', 'production_countries': "[{'iso_3166_1': 'US', 'name': 'United States of America'}]", 'release_date': '1997-03-25', 'rev