In [1]:
import re
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
import pandas as pd

movies = pd.read_csv("movies_cleaned.csv")

In [5]:
# add sign for splitting
movies["tagged_plots"] = movies["tagged_plots"] + "***"

In [7]:
movies["tagged_plots"].to_csv("tagged_plots.txt",
                                   sep = "\n",
                                   index = False,
                                   header = False)

In [8]:
raw_documents = TextLoader("tagged_plots.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="***")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 505, which is longer than the specified 0
Created a chunk of size 446, which is longer than the specified 0
Created a chunk of size 446, which is longer than the specified 0
Created a chunk of size 911, which is longer than the specified 0
Created a chunk of size 742, which is longer than the specified 0
Created a chunk of size 1293, which is longer than the specified 0
Created a chunk of size 1382, which is longer than the specified 0
Created a chunk of size 191, which is longer than the specified 0
Created a chunk of size 3647, which is longer than the specified 0
Created a chunk of size 491, which is longer than the specified 0
Created a chunk of size 882, which is longer than the specified 0
Created a chunk of size 269, which is longer than the specified 0
Created a chunk of size 410, which is longer than the specified 0
Created a chunk of size 670, which is longer than the specified 0
Created a chunk of size 615, which is longer than the specified 0
Created

In [14]:
# Load a free embedding model (MiniLM is lightweight & fast)
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create ChromaDB with local embeddings
db_movies = Chroma.from_documents(documents, embedding=embedding_function)

  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [31]:
query = "A movie to spy on the enemy"
docs = db_movies.similarity_search(query, k = 10)
docs

[Document(id='ab7a9aff-f792-4f1b-83e0-d80aaad68301', metadata={'source': 'tagged_plots.txt'}, page_content='"\n"4335336 The plot involves rogue elements of the communist Chinese army who use fantastic burrowing machines in an effort to place atomic bombs under major U.S. cities. The U.S. Navy sends troops underground to combat them. The film has been described as ""deliriously paranoid"".'),
 Document(id='8f117073-fad0-4b16-9d0c-413655bb23d8', metadata={'source': 'tagged_plots.txt'}, page_content='"\n1437226 The plot involves a U.S. effort to root out Nazi saboteurs at a shipyard during World War II. Pat O\'Brien plays an American intelligence officer who goes undercover at the yard, working at a construction job and looking for possible spies among the managers and employees.'),
 Document(id='4c4d4244-28dd-46bf-b3b2-fa4a7c4f6c1b', metadata={'source': 'tagged_plots.txt'}, page_content='"\n6645863 The film tells the tale of two policemen who go undercover to defeat narcotics trafficking

In [49]:
def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10,
) -> pd.DataFrame:
    recs = db_movies.similarity_search(query, k = 50)

    movies_list = []

    for i in range(0, len(recs)):
        movies_list += [int(recs[i].page_content.strip('"').strip("\n").strip('"').split()[0])]

    return movies[movies["unique_id"].isin(movies_list)]

In [51]:
retrieve_semantic_recommendations("A movie to spy on the enemy")

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Genre,Wiki Page,Plot,words_in_plot,unique_id,tagged_plots
64,1914,Captain Alvarez,American,Rollin S. Sturgeon,historical drama,https://en.wikipedia.org/wiki/Captain_Alvarez,A melodrama about an American who becomes a re...,41,1869617,1869617 A melodrama about an American who beco...
571,1926,Hands Up!,American,Clarence Badger,comedy,https://en.wikipedia.org/wiki/Hands_Up!_(1926_...,"The film tells the story of Jack, a spy for th...",62,1405581,"1405581 The film tells the story of Jack, a sp..."
3237,1942,Spy Smasher,American,William Witney,spy serial,https://en.wikipedia.org/wiki/Spy_Smasher_(ser...,Alan Armstrong (Kane Richmond) as the Spy Smas...,197,9113765,9113765 Alan Armstrong (Kane Richmond) as the ...
3692,1944,Secret Command,American,A. Edward Sutherland,drama,https://en.wikipedia.org/wiki/Secret_Command,The plot involves a U.S. effort to root out Na...,46,1437226,1437226 The plot involves a U.S. effort to roo...
5069,1951,Decision Before Dawn,American,Anatole Litvak,war,https://en.wikipedia.org/wiki/Decision_Before_...,"By late 1944, it is obvious that the Germans w...",507,5738014,"5738014 By late 1944, it is obvious that the G..."
6833,1958,Wind Across the Everglades,American,Nicholas Ray,drama,https://en.wikipedia.org/wiki/Wind_Across_the_...,"Set in the early 20th century, the film follow...",73,8488427,"8488427 Set in the early 20th century, the fil..."
6935,1959,Paratroop Command,American,William Witney,war drama,https://en.wikipedia.org/wiki/Paratroop_Command,Charlie is a soldier who suffers the scorn of ...,35,1768560,1768560 Charlie is a soldier who suffers the s...
7056,1960,Pay or Die,American,Richard Wilson,crime,https://en.wikipedia.org/wiki/Pay_or_Die,The film is a dramatization of the career of c...,51,4237026,4237026 The film is a dramatization of the car...
7087,1960,This Rebel Breed,American,Richard L. Bare,crime drama,https://en.wikipedia.org/wiki/This_Rebel_Breed,The film tells the tale of two policemen who g...,37,6645863,6645863 The film tells the tale of two policem...
7301,1962,Moon Pilot,American,James Neilson,comedy,https://en.wikipedia.org/wiki/Moon_Pilot,Air Force Capt. Richmond Talbot inadvertently ...,226,6627864,6627864 Air Force Capt. Richmond Talbot inadve...


In [66]:
retrieve_semantic_recommendations("A movie to spy on the enemy")

ValueError: invalid literal for int() with base 10: 'The'

In [64]:
movies["starts_with_num"] = movies["tagged_plots"].str.match(r'^\d+')


# Check if all rows start with a number
all_start_with_number = movies['starts_with_num'].all()
print("Do all sentences start with a number?", all_start_with_number)

Do all sentences start with a number? True
