In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.storage import LocalFileStore
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import pandas as pd
import json
import os
from tqdm import tqdm

from pathlib import Path

ROOT_DIR = Path("/home/alron/movie-recommender")
MOVIES_FILEPATH = ROOT_DIR/'data/movies_metadata.csv'


all_cols = pd.read_csv(MOVIES_FILEPATH).columns
search_cols = ["title", "overview", "genres"]
metadata_cols = [col for col in all_cols if col not in search_cols]

docs = CSVLoader(file_path=MOVIES_FILEPATH, metadata_columns=metadata_cols).load()

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
bge_embedding_model = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)
# embeddings = OpenAIEmbeddings()
store = LocalFileStore(ROOT_DIR/"data/cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    bge_embedding_model, store, namespace="BAAI-bge-large-en"
)


text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

  all_cols = pd.read_csv(MOVIES_FILEPATH).columns
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%time
from tqdm import tqdm

FAISS_INDEX_PATH = ROOT_DIR/"data/faiss_index"

def load_or_generate_index(docs, cache_path=FAISS_INDEX_PATH):

    if os.path.exists(FAISS_INDEX_PATH):
        db = FAISS.load_local(cache_path, cached_embedder)
    else:
        db = None
        with tqdm(total=len(docs), desc="Ingesting documents") as pbar:
            for d in docs:
                if db:
                    db.add_documents([d])
                else:
                    db = FAISS.from_documents([d], cached_embedder)
                pbar.update(1)
        # db = FAISS.from_documents(documents[:100], cached_embedder)
        db.save_local(FAISS_INDEX_PATH)
    
    return db


db = load_or_generate_index(documents)

CPU times: user 1.25 s, sys: 116 ms, total: 1.36 s
Wall time: 1.45 s


In [6]:
QUERY = "girl teen feel good film"

###########################################################
def print_doc_info(doc):
    data = doc.page_content.split("\n")
    data_dict = {}
    for kv_tuple in data:
        try:
            key, value = kv_tuple.split(":", maxsplit=1)
            data_dict[key] = value
        except Exception:
            pass

    print(data_dict['title'].strip())
    # print(doc.metadata["popularity"])
    print("---")
    print("Synopsis: ", data_dict['overview'].strip())
    genres = json.loads(data_dict['genres'].replace("'", '"'))
    print("Genres:", [x["name"] for x in genres])
    # print([x["name"] for x in ])
    
    print(f"Source: Row {doc.metadata['row']}, IMDB: https://imdb.com/title/{doc.metadata['imdb_id']}")
    print(doc.metadata)
    print()
    

retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 50, "score_threshold": .5,})
docs = retriever.get_relevant_documents(QUERY)
popularity_scores = [float(doc.metadata["popularity"]) for doc in docs]

# Re-rank results according to popularity scores
# reranked_docs = sorted(zip(docs, popularity_scores), key=lambda x: x[1], reverse=True)
reranked_docs = zip(docs,popularity_scores)

for doc, popularity_score in reranked_docs:
    # print(doc.page_content)
    print_doc_info(doc)
    


# SCORE_THRESHOLD = 0.5
# results = db.similarity_search_with_score(QUERY, k=500)
# results = [(doc, 1-score) for doc, score in results]
# results = [(doc, score) for doc, score in results if score >= SCORE_THRESHOLD]

# for doc, score in results:
#     print(f"{score:.2%}")
#     print_doc_info(doc)
    

The American Mall
---
Synopsis:  The executive producers of High School Musical keep the good times rolling with this upbeat musical comedy set in the one place every American teenager's home away from home - the local shopping mall. Ally (Nina Dobrev) is an optimistic adolescent singer/songwriter whose hard working mother owns the mall music shop frequented by every teen in town. When Ally shares her music with Joey (Rob Mayes), a janitor in the mall who harbors rock star ambitions, she is thrilled to find someone who can truly relate to her songs as well as her heart. Trouble looms on the horizon, however, in the form of the mall owner's spoiled rotten daughter Madison (Autumn Reeser). Madison is the kind of girl who's used to getting whatever she wants, and what she wants now could prove disastrous for both Ally's ambitions, and her mother's popular music store.
Genres: ['Romance', 'Comedy', 'Drama', 'Music']
Source: Row 35117, IMDB: https://imdb.com/title/tt1160313
{'source': Posix