In [None]:
pip install -U langchain-text-splitters

In [None]:
pip install chromadb

In [None]:
import pandas as pd

# Location of the movie metadata CSV read by this notebook.
MOVIES_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/movies_metadata.csv'

# Load the raw table and preview its first rows.
df = pd.read_csv(MOVIES_CSV_PATH)
df.head()

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import NLTKTextSplitter
import chromadb

In [None]:
import nltk
# Download the 'punkt_tab' NLTK resource. Presumably this is what the
# NLTKTextSplitter used below needs for sentence tokenization — TODO confirm
# against the installed NLTK / langchain-text-splitters versions.
nltk.download('punkt_tab')

In [None]:
# Keep only the title and overview columns. The explicit .copy() makes the
# result an independent frame, so assigning the 'chunks' column to it later
# does not raise pandas' SettingWithCopyWarning (pandas < 3).
df = df[['original_title', 'overview']].copy()

In [None]:
# Chunk-size limit handed to the splitter.
OVERVIEW_CHUNK_SIZE = 1500
text_splitter = NLTKTextSplitter(chunk_size=OVERVIEW_CHUNK_SIZE)

In [None]:
def split_overview(overview):
  """Split one movie overview into a list of text chunks.

  A missing value (anything ``pd.isna`` reports as NA) yields an empty list.
  Any other value is converted to ``str`` and passed to the module-level
  ``text_splitter``.
  """
  return [] if pd.isna(overview) else text_splitter.split_text(str(overview))

In [None]:
# Chunk every overview; rows whose overview is missing get an empty list.
overview_chunks = df['overview'].apply(split_overview)
df['chunks'] = overview_chunks

In [None]:
# Expand to one row per chunk, then renumber the rows from 0.
exploded_df = df.explode('chunks')
chunked_df = exploded_df.reset_index(drop=True)

In [None]:
# Sentence-embedding model used to vectorize the overview chunks.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
def encode_chunk(chunk):
  """Embed one text chunk with the module-level ``embedder``.

  Returns the embedding converted with ``.tolist()``, or ``None`` when
  ``chunk`` is not a string or contains only whitespace.
  """
  is_text = isinstance(chunk, str)
  if is_text and chunk.strip() != "":
    return embedder.encode(chunk).tolist()
  return None

In [None]:
# Embed each chunk; rows without usable text get None.
chunk_vectors = chunked_df['chunks'].apply(encode_chunk)
chunked_df['embeddings'] = chunk_vectors

In [None]:
# Discard rows for which no embedding was produced.
chunked_df = chunked_df.dropna(subset=['embeddings'])

In [None]:
# Chroma client plus the 'movies' collection. get_or_create_collection keeps
# this cell re-runnable: create_collection raises once a collection with the
# same name already exists in the running kernel.
client = chromadb.Client()
collection = client.get_or_create_collection(name='movies')

In [None]:
# Insert the rows in batches instead of issuing one collection.add call per
# row, and report success once after the loop rather than once per row.
ADD_BATCH_SIZE = 1000  # rows per collection.add call (conservative choice)

for start in range(0, len(chunked_df), ADD_BATCH_SIZE):
  batch = chunked_df.iloc[start:start + ADD_BATCH_SIZE]
  collection.add(
      # The DataFrame index label, as a string, is the record id.
      ids=[str(idx) for idx in batch.index],
      embeddings=batch['embeddings'].tolist(),
      metadatas=[
          {'original_title': title, 'chunk': chunk}
          for title, chunk in zip(batch['original_title'], batch['chunks'])
      ],
  )

print("Data successfully stored in ChromaDB.")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from chromadb import Client
import torch

In [None]:
# Reuse the 'all-MiniLM-L6-v2' model already loaded as `embedder` instead of
# loading a second copy; queries are then embedded by the same model instance
# that produced the stored chunk embeddings.
sentence_model = embedder

In [None]:
# Tokenizer matching the instruction-tuned checkpoint used for generation.
LLM_CHECKPOINT = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)

In [None]:
# Load the causal LM. Without an explicit dtype, from_pretrained loads the
# weights in float32; on a GPU, float16 halves the memory footprint. On a
# CPU-only runtime the original float32 behavior is kept.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map='auto',
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

In [None]:
# Generation options: the returned text includes the prompt
# (return_full_text=True) and at most 800 new tokens are generated.
generation_settings = dict(return_full_text=True, max_new_tokens=800)

# Text-generation pipeline built from the model and tokenizer loaded above.
text_generation_pipeline = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

In [None]:
def retrieve_documents(query, collection, top_k=5):
  """Return the chunks and titles of the nearest stored records for a query.

  The query is embedded with the module-level ``sentence_model`` and passed to
  ``collection.query``. Chunk text and title are read from each hit's
  metadata (keys ``'chunk'`` and ``'original_title'``).

  Args:
    query: Text to search for.
    collection: Object exposing ``query(query_embeddings=..., n_results=...)``
      that returns a mapping with a ``'metadatas'`` entry holding one list of
      metadata dicts per query embedding.
    top_k: Maximum number of hits to request.

  Returns:
    ``(chunks, titles)`` as two parallel lists. Both are empty when the query
    produced no hits.
  """
  query_embedding = sentence_model.encode(query).tolist()

  results = collection.query(
      query_embeddings=[query_embedding],
      n_results=top_k
  )

  # A single query embedding is sent, so its hits are in the first inner list.
  # Check 'metadatas' (the field read below); the outer list is non-empty even
  # when the inner list holds no hits, so the inner list is what gets tested.
  metadata_lists = results.get('metadatas') or [[]]
  hits = metadata_lists[0] or []
  if not hits:
    print("No results found for the query.")
    return [], []

  chunks = [hit['chunk'] for hit in hits]
  titles = [hit['original_title'] for hit in hits]
  # Return only after every hit has been collected.
  return chunks, titles

In [None]:
def generate_answer(query, chunks, titles, text_generation_pipeline):
    """Build a context-augmented prompt and return the pipeline's generated text.

    Args:
        query: The user's question, inserted into the prompt.
        chunks: Retrieved text chunks.
        titles: Titles paired positionally with ``chunks`` (pairing stops at
            the shorter of the two).
        text_generation_pipeline: Callable taking the prompt and returning a
            sequence of dicts that carry a ``'generated_text'`` entry.

    Returns:
        The ``'generated_text'`` value of the first result.
    """
    # One "Title/Chunk" paragraph per retrieved item, separated by blank lines.
    context_blocks = []
    for title, chunk in zip(titles, chunks):
        context_blocks.append(f"Title: {title}\nChunk: {chunk}")
    context = "\n\n".join(context_blocks)

    # [INST] ... [/INST]-delimited prompt carrying the question and the context.
    prompt = f"""[INST]
    Instruction: You're an expert in movie suggestions. Your task is to analyze carefully the context and come up with an exhaustive answer to the following question:
    {query}

    Here is the context to help you:

    {context}

    [/INST]"""

    # Only the first result returned by the pipeline is used.
    outputs = text_generation_pipeline(prompt)
    return outputs[0]['generated_text']

In [None]:
# Look up the 'movies' collection through a Chroma client.
client = chromadb.Client()
collection = client.get_collection(name='movies')

# Example question and the number of hits to request for it.
top_k = 5
query = "What are some good movies to watch on a rainy day?"

# Run retrieval and show what came back.
chunks, titles = retrieve_documents(query, collection, top_k=top_k)
print(f"Retrieved Chunks: {chunks}")
print(f"Retrieved Titles: {titles}")

In [None]:
# Call the language model only when retrieval produced both chunks and titles.
if not (chunks and titles):
    print("No relevant documents found to generate an answer.")
else:
    answer = generate_answer(
        query, chunks, titles, text_generation_pipeline
    )
    print(answer)