In [5]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [6]:
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
import pandas as pd
movies = pd.read_csv("movies_cleaned.csv")

In [8]:
movies

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,release_year,tagged_overview
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,1994,278 Framed in the 1940s for the double murder ...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,1995,"19404 Raj is a rich, carefree, happy-go-lucky ..."
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,1972,"238 Spanning the years 1945 to 1955, a chronic..."
3,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,1974,240 In the continuing saga of the Corleone cri...
4,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...",92.056,2001-07-20,8.5,13093,2001,"129 A young girl, Chihiro, becomes trapped in ..."
...,...,...,...,...,...,...,...,...,...,...,...
6229,168098,Cell,"Horror,Science Fiction,Thriller",en,When a strange signal pulsates through all cel...,19.521,2016-07-06,4.7,910,2016,168098 When a strange signal pulsates through ...
6230,10196,The Last Airbender,"Action,Adventure,Fantasy",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347,2010,10196 The story follows the adventures of Aang...
6231,13995,Captain America,"Action,Science Fiction,War",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332,1990,"13995 During World War II, a brave, patriotic ..."
6232,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",en,A man named Farmer sets out to rescue his kidn...,15.159,2007-11-29,4.7,668,2007,2312 A man named Farmer sets out to rescue his...


In [9]:
movies["tagged_overview"]

0       278 Framed in the 1940s for the double murder ...
1       19404 Raj is a rich, carefree, happy-go-lucky ...
2       238 Spanning the years 1945 to 1955, a chronic...
3       240 In the continuing saga of the Corleone cri...
4       129 A young girl, Chihiro, becomes trapped in ...
                              ...                        
6229    168098 When a strange signal pulsates through ...
6230    10196 The story follows the adventures of Aang...
6231    13995 During World War II, a brave, patriotic ...
6232    2312 A man named Farmer sets out to rescue his...
6233    455957 Seeking justice for his partner’s murde...
Name: tagged_overview, Length: 6234, dtype: object

In [10]:
movies["tagged_overview"].to_csv("tagged_overview.txt", index=False, header=False)

In [11]:
raw_documents = TextLoader("tagged_overview.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 394, which is longer than the specified 1
Created a chunk of size 393, which is longer than the specified 1
Created a chunk of size 313, which is longer than the specified 1
Created a chunk of size 228, which is longer than the specified 1
Created a chunk of size 207, which is longer than the specified 1
Created a chunk of size 369, which is longer than the specified 1
Created a chunk of size 448, which is longer than the specified 1
Created a chunk of size 288, which is longer than the specified 1
Created a chunk of size 338, which is longer than the specified 1
Created a chunk of size 258, which is longer than the specified 1
Created a chunk of size 309, which is longer than the specified 1
Created a chunk of size 326, which is longer than the specified 1
Created a chunk of size 402, which is longer than the specified 1
Created a chunk of size 228, which is longer than the specified 1
Created a chunk of size 245, which is longer than the specified 1
Created a 

In [12]:
documents[0]

Document(metadata={'source': 'tagged_overview.txt'}, page_content='"278 Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope."')

In [13]:
db_movies = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings()
)

In [14]:
query = "A movie which tells about story happened during world war 2"
docs = db_movies.similarity_search(query, k = 5)
docs

[Document(id='8dc14f75-dc6f-42d4-8212-aad97d9ac461', metadata={'source': 'tagged_overview.txt'}, page_content='"515001 A World War II satire that follows a lonely German boy whose world view is turned upside down when he discovers his single mother is hiding a young Jewish girl in their attic. Aided only by his idiotic imaginary friend, Adolf Hitler, Jojo must confront his blind nationalism."'),
 Document(id='d55bd49f-aa16-4f13-9e35-4177baf50150', metadata={'source': 'tagged_overview.txt'}, page_content='256962 An eight-year-old boy is willing to do whatever it takes to end World War II so he can bring his father home. The story reveals the indescribable love a father has for his little boy and the love a son has for his father.'),
 Document(id='7c112461-1586-43d6-ad27-443daa834cda', metadata={'source': 'tagged_overview.txt'}, page_content='"339987 A German soldier tries to determine if the Dutch resistance has planted a spy to infiltrate the home of Kaiser Wilhelm in Holland during th

In [16]:
import re

def retrieve_semantic_recommendations(query, top_k: int) -> pd.DataFrame:
    recs = db_movies.similarity_search(query, k=50)
    movie_ids = []

    for rec in recs:
        # Remove leading/trailing quotes and spaces
        text = rec.page_content.strip().lstrip('"').rstrip('"').strip()

        # Extract the first integer (the movie ID)
        match = re.match(r'(\d+)', text)
        if match:
            movie_ids.append(int(match.group(1)))

    # Remove duplicates while preserving order
    movie_ids = list(dict.fromkeys(movie_ids))

    # Return top_k rows from your main movies DataFrame
    return movies[movies["id"].isin(movie_ids)].head(top_k)


In [23]:
 res = retrieve_semantic_recommendations("A movie about a father and detective searching for 2 missing childs", 5)
 res.loc[res['title']=="Prisoners", 'overview'].values

array(["Keller Dover faces a parent's worst nightmare when his 6-year-old daughter, Anna, and her friend go missing. The only lead is an old motorhome that had been parked on their street. The head of the investigation, Detective Loki, arrests the driver, but a lack of evidence forces Loki to release his only suspect. Dover, knowing that his daughter's life is at stake, decides that he has no choice but to take matters into his own hands."],
      dtype=object)

In [18]:
movies["genre"].str.split(",").explode().str.strip().value_counts().reset_index()

Unnamed: 0,genre,count
0,Drama,2737
1,Comedy,2359
2,Thriller,1614
3,Action,1486
4,Adventure,1081
5,Romance,1066
6,Crime,884
7,Horror,816
8,Family,797
9,Science Fiction,746


In [19]:
movies[movies["genre"].str.contains("Fiction", na=False)]

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,release_year,tagged_overview
27,283566,Evangelion: 3.0+1.0 Thrice Upon a Time,"Animation,Action,Science Fiction,Drama",ja,"In the aftermath of the Fourth Impact, strande...",108.156,2021-03-08,8.4,528,2021,"283566 In the aftermath of the Fourth Impact, ..."
31,18491,Neon Genesis Evangelion: The End of Evangelion,"Drama,Animation,Science Fiction,Action",ja,The second of two theatrically released follow...,32.759,1997-07-19,8.4,1102,1997,18491 The second of two theatrically released ...
32,1891,The Empire Strikes Back,"Adventure,Action,Science Fiction",en,"The epic saga continues as Luke Skywalker, in ...",28.323,1980-05-20,8.4,14384,1980,1891 The epic saga continues as Luke Skywalker...
38,27205,Inception,"Action,Science Fiction,Adventure",en,"Cobb, a skilled thief who commits corporate es...",111.757,2010-07-15,8.4,31917,2010,"27205 Cobb, a skilled thief who commits corpor..."
49,618344,Justice League Dark: Apokolips War,"Animation,Action,Adventure,Fantasy,Science Fic...",en,Earth is decimated after intergalactic tyrant ...,41.778,2020-05-05,8.3,1189,2020,618344 Earth is decimated after intergalactic ...
...,...,...,...,...,...,...,...,...,...,...,...
6215,5550,RoboCop 3,"Action,Adventure,Crime,Science Fiction,Thriller",en,The mega corporation Omni Consumer Products is...,23.046,1993-04-17,4.7,958,1993,5550 The mega corporation Omni Consumer Produc...
6221,145221,Monsters: Dark Continent,"Thriller,Drama,Science Fiction",en,"Seven years on from the events of Monsters, an...",9.599,2014-10-09,4.7,279,2014,145221 Seven years on from the events of Monst...
6224,8285,The Spirit,"Action,Comedy,Thriller,Crime,Science Fiction",en,Down these mean streets a man must come. A he...,11.377,2008-12-25,4.7,803,2008,8285 Down these mean streets a man must come. ...
6229,168098,Cell,"Horror,Science Fiction,Thriller",en,When a strange signal pulsates through all cel...,19.521,2016-07-06,4.7,910,2016,168098 When a strange signal pulsates through ...
