In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
from dotenv import load_dotenv
# Load environment variables before any API client is created.
# NOTE(review): presumably this supplies the OpenAI API key that
# OpenAIEmbeddings needs further down — confirm against the local .env file.
load_dotenv()

In [None]:
import pandas as pd
# Load the pre-cleaned movie table. Later cells read its "id",
# "tagged_overview" and "genre" columns.
movies = pd.read_csv("movies_cleaned.csv")

In [None]:
# Display the loaded DataFrame as a visual sanity check.
movies

In [None]:
# Inspect the column that is exported to a text file and embedded below.
movies["tagged_overview"]

In [None]:
# Write the tagged overviews to a plain text file, one CSV record per row,
# without the index column and without a header row.
movies["tagged_overview"].to_csv(
    "tagged_overview.txt",
    header=False,
    index=False,
)

In [None]:
# Configure the splitter first: break on newlines only, with no overlap.
# NOTE(review): chunk_size=1 looks like a deliberate trick so that no two
# lines are ever merged into one chunk (one document per line) — confirm.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1, chunk_overlap=0)

# Read the exported overviews back in and cut them into documents.
raw_documents = TextLoader("tagged_overview.txt", encoding="utf-8").load()
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Peek at the first split document to check what the splitter produced.
documents[0]

In [None]:
# Embed every document with OpenAIEmbeddings and index the result in Chroma.
# NOTE(review): no collection name or persist directory is passed, so Chroma's
# defaults apply — confirm whether re-running this cell in the same kernel
# adds the documents a second time.
db_movies = Chroma.from_documents(documents=documents, embedding=OpenAIEmbeddings())

In [None]:
# Sanity-check retrieval: ask the vector store for the five documents that
# are closest to a free-text query, then display them.
query = "A movie which tells about story happened during world war 2"
docs = db_movies.similarity_search(query=query, k=5)
docs

In [None]:
# Look up the movie row behind the fourth search hit. The first
# whitespace-separated token of a stored line is parsed as the movie id.
# Lines that to_csv wrapped in double quotes start with a quote character,
# which would make int() raise, so surrounding quotes are stripped first.
movies[movies["id"] == int(docs[3].page_content.strip().strip('"').split()[0])]

In [None]:
import re

def retrieve_semantic_recommendations(query: str, top_k: int, initial_top_k: int = 50) -> pd.DataFrame:
    """Return the ``top_k`` movies most similar to ``query``, best match first.

    The vector store ``db_movies`` is searched for ``initial_top_k`` candidate
    documents. The leading integer of each document is taken as the movie id,
    and the matching rows of the notebook-level ``movies`` DataFrame are
    returned in the order in which the search ranked them.

    Args:
        query: Free-text description of the kind of movie wanted.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of candidate documents requested from the vector
            store before de-duplication. Defaults to 50. Fewer than ``top_k``
            rows come back when the candidate pool yields fewer distinct ids.

    Returns:
        Up to ``top_k`` rows of ``movies``, ordered by search rank.
    """
    recs = db_movies.similarity_search(query, k=initial_top_k)

    movie_ids = []
    for rec in recs:
        # Stored lines may be wrapped in double quotes; drop them and any
        # surrounding whitespace before looking for the id.
        text = rec.page_content.strip().strip('"').strip()

        # The movie id is the run of digits at the very start of the line.
        # Documents without a leading id are skipped.
        match = re.match(r"(\d+)", text)
        if match:
            movie_ids.append(int(match.group(1)))

    # Remove duplicate ids while keeping the first (best-ranked) occurrence.
    movie_ids = list(dict.fromkeys(movie_ids))
    rank = {movie_id: position for position, movie_id in enumerate(movie_ids)}

    # isin() yields rows in DataFrame order, which would discard the ranking,
    # so re-sort the matches by search rank before truncating. The stable sort
    # keeps rows that share an id in their original relative order.
    matched = movies[movies["id"].isin(movie_ids)]
    ranked = matched.sort_values("id", key=lambda ids: ids.map(rank), kind="stable")
    return ranked.head(top_k)


In [None]:
 retrieve_semantic_recommendations("A movie about a father and detective searching for 2 missing childs", 5)

In [None]:
# Count how often each individual genre occurs: split the comma-separated
# genre strings, put one genre per row, trim whitespace, then tally.
(
    movies["genre"]
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
    .reset_index()
)

In [None]:
# Keep only the rows whose genre text contains "Fiction". Rows with a
# missing genre are treated as non-matching.
movies.loc[movies["genre"].str.contains("Fiction", na=False)]