In [1]:
from pathlib import Path

import pandas as pd

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_ollama import OllamaLLM

In [2]:
# Load data
# Single place to repoint the notebook at a different data directory;
# both input files are resolved relative to it.
DATA_DIR = Path('/Users/borja/Documents/Somniumrema/projects/genai/grag/data')

# Metadata CSV that sits next to the transcripts.
metadata_df = pd.read_csv(DATA_DIR / 'acquired_metadata.csv')

# Read the full transcript corpus as one string. The encoding is pinned to
# UTF-8 because the transcripts contain non-ASCII punctuation (e.g. curly
# apostrophes); relying on the platform default encoding can fail or produce
# garbled text on systems whose default is not UTF-8.
text_data = (DATA_DIR / 'acquired_transcripts_all.txt').read_text(encoding='utf-8')

In [3]:
# Split the raw transcript text into overlapping chunks for embedding.
# Separators are tried in order: paragraph break, line break, sentence end,
# single space, and finally an empty-string fallback.
# NOTE(review): no metadata is passed to create_documents here, so the
# resulting chunks carry no metadata of their own.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 100,
    "separators": ["\n\n", "\n", ". ", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# The whole corpus is handed over as a single input text.
raw_texts = [text_data]
chunks = text_splitter.create_documents(raw_texts)

In [4]:
# Initialize embeddings
# FastEmbed-backed embedding model used for both indexing and querying.
# The on-disk model cache is configured through `cache_dir`; `cache_folder`
# is the HuggingFace-embeddings spelling and is not a FastEmbedEmbeddings
# field, so with it the weights were not cached under ./model_cache.
embeddings = FastEmbedEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_dir="./model_cache"
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Create ChromaDB with metadata
# Embed every chunk and store the vectors in a Chroma collection that lives
# under ./chroma_db. The collection metadata selects cosine distance for the
# HNSW index.
# NOTE(review): re-running this cell against an existing ./chroma_db may add
# the same chunks again — confirm, and clear the directory first if so.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db",
    collection_metadata={"hnsw:space": "cosine"}
)

# Flush the collection to disk. This call must start at column 0: the stray
# leading indentation it had before raised "IndentationError: unexpected
# indent" and prevented the cell from running at all.
# NOTE(review): newer Chroma releases persist automatically and deprecate this
# call — verify against the installed version.
db.persist()


In [6]:
# Prompt used by the QA chain. The retrieved passages are substituted for
# {context} and the user's question for {question}.
# The template text (including its indentation) is sent to the model as-is.
rag_template_text = """
    Based on the provided context from Acquired podcast transcripts, please answer the question.
    Be specific and cite examples when possible.
    
    Context: {context}
    Question: {question}
    
    Detailed answer:
    """

prompt_template = PromptTemplate(
    template=rag_template_text,
    input_variables=["context", "question"],
)

In [7]:
# Wire the retrieval-augmented QA pipeline together.
# Local Ollama model that generates the final answer.
llm = OllamaLLM(model="llama3.2")

# Retriever over the Chroma store, configured with k=4.
chunk_retriever = db.as_retriever(search_kwargs={"k": 4})

# "stuff" chain type with the custom prompt; the source documents are
# returned alongside the answer so the result can be inspected.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=chunk_retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [8]:
# Run one sample question through the QA chain; the chain expects the
# question under the "query" key.
sample_question = "When did Airbnb go public, what was the price per share?"
response = qa.invoke({"query": sample_question})


In [9]:
# Display the full response dict: the query, the generated result, and the
# retrieved source documents.
response

{'query': 'When did Airbnb go public, what was the price per share?',
 'result': 'Airbnb went public on December 10, 2020. The IPO price was $68 per share, and the initial public offering (IPO) raised $3.5 billion for the company. According to Yahoo Finance, the stock opened at $146 a share and later reached $159 a share, resulting in a significant increase of over 100% from the IPO price.',
 'source_documents': [Document(metadata={}, page_content='.  Ben: Close to it.  David: For the year, Airbnb is down in growth by 39%. They filed their S1, and then last night, on December 9, 2020, they priced the IPO at $68 a share and upraising $3.5 billion at a $47 billion market cap.  Ben: Big, man. That Silver Lake investment at $18 billion just six, eight months ago looks like a genius move.  David: Indeed. What did we say? We said they priced at $68 a share. Got Yahoo Finance pulled up here. Who’s currently trading.  Ben: I see it in the Acquired Slack. People are buzzing about it.  David: Yo