# Import Libraries

In [1]:
import os
import pandas as pd
from typing import List, Dict
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


# Parameters

In [None]:
# Gemini chat model used for answer generation.
MODEL_NAME = "gemini-2.0-flash-lite"

# Directory where the Chroma collection is persisted. Override it with the
# CHROMA_PERSIST_DIR environment variable; the default is the original location.
CHROMA_PERSIST_DIR = os.environ.get(
    "CHROMA_PERSIST_DIR",
    "/Users/ani/Projects/6_stock_portfolio_recommendation/chroma_db",
)

# SECURITY: never store an API key in the notebook source. The key is read from the
# environment instead; the key that used to be hard-coded here has been exposed and
# should be revoked and rotated.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    print("WARNING: GOOGLE_API_KEY is not set; Gemini API calls will fail until it is exported.")

# Initialize Google Generative AI client

In [3]:
# Hand the API key from the parameters cell (GOOGLE_API_KEY) to the google.generativeai SDK.
genai.configure(api_key=GOOGLE_API_KEY)

# Import data from CSV

In [28]:
# Load the S&P 500 sample and rename the Security / GICS headers to the
# Company_Name / Sector / Industry names used by the rest of the notebook.
df_sandp500 = (
    pd.read_csv('/Users/ani/Projects/6_stock_portfolio_recommendation/data/sp500_data_sample.csv')
    .rename(
        columns={
            'Security': 'Company_Name',
            'GICS Sector': 'Sector',
            'GICS Sub-Industry': 'Industry',
        }
    )
)
df_sandp500

Unnamed: 0,Ticker,Company_Name,Sector,Industry,Headquarters Location,Date added,CIK,Founded,Annualized_Return,YTD_Pct_Return,...,2023_Pct_Return,2022_Pct_Return,2021_Pct_Return,2020_Pct_Return,Market_Cap,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Years_Since_Founded
0,MSFT,Microsoft,Information Technology,Systems Software,"Redmond, Washington",1994-06-01,789019,1975,21.32,10.40,...,58.35,-27.69,55.79,22.27,3.421644e+12,10.81,0.27,78.51,1.18,50.0
1,NVDA,Nvidia,Information Technology,Semiconductors,"Santa Clara, California",2001-11-30,1045810,1993,72.79,-2.29,...,246.10,-51.44,124.48,48.40,3.295497e+12,6.61,0.53,137.30,2.07,32.0
2,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977,20.76,-17.44,...,54.80,-28.20,38.06,65.49,2.999856e+12,-10.79,0.30,69.06,1.27,48.0
3,AMZN,Amazon,Consumer Discretionary,Broadline Retail,"Seattle, Washington",2005-11-18,1018724,1994,10.66,-6.91,...,77.04,-50.71,4.64,31.80,2.176468e+12,1.80,0.36,29.80,1.43,31.0
4,GOOG,Alphabet Inc. (Class C),Communication Services,Interactive Media & Services,"Mountain View, California",2006-04-03,1652044,1998,19.39,-9.22,...,57.11,-38.84,67.43,22.35,2.090085e+12,0.16,0.31,62.44,1.24,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,APA,APA Corporation,Energy,Oil & Gas Exploration & Production,"Houston, Texas",1997-07-28,1841666,1954,10.16,-25.26,...,-15.71,69.08,83.27,21.51,6.137973e+09,-20.98,0.55,18.30,1.39,71.0
499,CZR,Caesars Entertainment,Consumer Discretionary,Casinos & Gaming,"Reno, Nevada",2021-03-22,1590895,1973,-5.72,-17.52,...,10.93,-55.49,30.63,105.79,5.590180e+09,-22.33,0.56,-10.32,1.97,52.0
500,ENPH,Enphase Energy,Information Technology,Semiconductor Materials & Equipment,"Fremont, California",2021-01-07,1463101,2006,-7.06,-42.00,...,-47.83,43.65,6.21,194.02,5.430658e+09,-44.18,0.68,-10.33,1.48,19.0
501,BRK.B,Berkshire Hathaway,Financials,Multi-Sector Holdings,"Omaha, Nebraska",2010-02-16,1067983,1839,,,...,,,,,,,,,,


# Step 4: Convert DataFrame to Documents

In [30]:
def dataframe_to_documents(df: pd.DataFrame) -> List[Document]:
    """Turn every row of ``df`` into one LangChain ``Document``.

    The page content is one ``"<column>: <value>"`` line per column, joined with
    newlines. The metadata always carries ``row_index`` (the row's index label) and,
    for each of ``Ticker``, ``Sector``, ``Industry`` and ``Company_Name`` that exists
    as a column, that row's value.

    Args:
        df: Frame whose rows are converted; column names are used as field labels.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    filter_fields = ['Ticker', 'Sector', 'Industry', 'Company_Name']
    docs = []
    for row_label, record in df.iterrows():
        # Text body: every column rendered as "name: value".
        body = "\n".join(f"{name}: {record[name]}" for name in df.columns)

        # Metadata used for filtering; only fields present in this frame are copied.
        meta = {'row_index': row_label}
        meta.update({name: record[name] for name in filter_fields if name in df.columns})

        docs.append(Document(page_content=body, metadata=meta))
    return docs


# Step 5: Set up Text Splitter

In [31]:
def setup_text_splitter(chunk_size: int = 500, chunk_overlap: int = 50):
    """Build the ``RecursiveCharacterTextSplitter`` used to chunk documents.

    Lengths are measured with ``len`` (characters). The splitter tries the
    separators in order: blank line, newline, space, then individual characters.

    Args:
        chunk_size: Maximum length of a chunk. Defaults to 500, the value that was
            previously hard-coded. Documents longer than this are split into
            several chunks.
        chunk_overlap: Length shared between consecutive chunks. Defaults to 50.

    Returns:
        The configured ``RecursiveCharacterTextSplitter``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

# Step 6: Create Embeddings and Vector Store

In [32]:
def create_vector_store(documents: List[Document]):
    """Embed ``documents`` with Gemini embeddings and index them in Chroma.

    Args:
        documents: LangChain documents to embed and store.

    Returns:
        The Chroma vector store built from ``documents``, using
        ``CHROMA_PERSIST_DIR`` as its persist directory.
    """
    # Embedding model, authenticated with the notebook-level API key.
    gemini_embedder = GoogleGenerativeAIEmbeddings(
        google_api_key=GOOGLE_API_KEY,
        model="models/embedding-001",
    )

    # Build the collection from the given documents in the on-disk directory.
    store = Chroma.from_documents(
        persist_directory=CHROMA_PERSIST_DIR,
        embedding=gemini_embedder,
        documents=documents,
    )

    # Explicitly persist the collection before handing it back.
    store.persist()
    return store

# Step 7: Set up Gemini LLM

In [33]:
def setup_llm():
    """Create the Gemini chat model named by ``MODEL_NAME``.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance with temperature 0.7, authenticated
        with ``GOOGLE_API_KEY``.
    """
    return ChatGoogleGenerativeAI(
        google_api_key=GOOGLE_API_KEY,
        temperature=0.7,
        model=MODEL_NAME,
    )

# Step 8: Create RAG Chain

In [34]:
def create_rag_chain(vector_store, llm):
    """Wire a retriever and an LLM into a ``RetrievalQA`` "stuff" chain.

    Args:
        vector_store: Store whose ``as_retriever`` supplies the context documents;
            the retriever is asked for the top 3 matches per query.
        llm: Language model that answers from the retrieved context.

    Returns:
        The ``RetrievalQA`` chain, configured to also return its source documents.
    """
    # Prompt text, spelled out line by line so the exact whitespace sent to the
    # model is visible. Placeholders: {context} and {question}.
    advisor_template = (
        "You are a helpful stock portfolio advisor. Use the following pieces of context to answer the question at the end. \n"
        "    If you don't know the answer based on the context, just say that you don't know, don't try to make up an answer.\n"
        "    \n"
        "    Context:\n"
        "    {context}\n"
        "    \n"
        "    Question: {question}\n"
        "    \n"
        "    Helpful Answer:"
    )
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=advisor_template,
    )

    top_matches_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=top_matches_retriever,
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
    )

# Step 9: Query the RAG System

In [41]:
def query_rag_system(rag_chain, query: str, show_sources: bool = False):
    """Ask the RAG chain one question, print the answer, and return the raw result.

    Args:
        rag_chain: Chain to query. It receives ``{"query": query}`` and its result
            must contain a ``'result'`` entry.
        query: Natural-language question.
        show_sources: When True, also print a 200-character preview and the
            metadata of each entry under ``'source_documents'`` in the result.
            Defaults to False, which prints only the query and the answer.

    Returns:
        The result mapping produced by the chain, unchanged.
    """
    payload = {"query": query}

    # Calling a LangChain chain directly (``chain(inputs)``) is deprecated in favour
    # of ``invoke``. Plain callables without ``invoke`` still use the call form.
    run = getattr(rag_chain, "invoke", None)
    result = run(payload) if callable(run) else rag_chain(payload)

    print(f"Query: {query}")
    print(f"\nAnswer: {result['result']}")

    if show_sources:
        print("\nSource Documents:")
        for position, doc in enumerate(result.get('source_documents', []), start=1):
            print(f"\nSource {position}:")
            print(f"Content: {doc.page_content[:200]}...")
            print(f"Metadata: {doc.metadata}")

    return result

# Additional utility functions

In [42]:
def load_existing_vector_store():
    """Open the Chroma store located at ``CHROMA_PERSIST_DIR``.

    Returns:
        A ``Chroma`` instance bound to the persist directory, using the same
        Gemini embedding model (``models/embedding-001``) as at creation time.
    """
    return Chroma(
        embedding_function=GoogleGenerativeAIEmbeddings(
            google_api_key=GOOGLE_API_KEY,
            model="models/embedding-001",
        ),
        persist_directory=CHROMA_PERSIST_DIR,
    )

def add_documents_to_existing_store(new_documents: List[Document]):
    """Append ``new_documents`` to the persisted vector store.

    Args:
        new_documents: Documents to add to the store.

    Returns:
        The vector store after the documents were added and ``persist()`` was called.
    """
    store = load_existing_vector_store()
    store.add_documents(new_documents)
    store.persist()  # persist the additions before returning
    return store

def delete_vector_store():
    """Remove the directory at ``CHROMA_PERSIST_DIR`` and everything in it.

    Destructive: the whole tree is deleted. Does nothing (and prints nothing)
    when the path does not exist.
    """
    import shutil

    if not os.path.exists(CHROMA_PERSIST_DIR):
        return

    shutil.rmtree(CHROMA_PERSIST_DIR)
    print(f"Deleted vector store at {CHROMA_PERSIST_DIR}")

# Step 10: Main execution function

In [43]:
def main():
    """Run the whole pipeline once and ask the demo question.

    Steps, in order: copy ``df_sandp500``, convert its rows to documents, chunk
    them, build the vector store, create the LLM, assemble the RAG chain, then
    send each demo query through ``query_rag_system``. Progress is printed
    along the way.
    """
    heavy_rule = "=" * 50
    light_rule = "-" * 50

    # Work on a copy of the frame loaded earlier in the notebook.
    print("Creating sample DataFrame...")
    frame = df_sandp500.copy()
    print(f"DataFrame shape: {frame.shape}")

    # One document per row.
    print("\nConverting DataFrame to documents...")
    row_documents = dataframe_to_documents(frame)

    # Chunk the row documents (long rows yield more than one chunk).
    splitter = setup_text_splitter()
    chunks = splitter.split_documents(row_documents)
    print(f"Number of document chunks: {len(chunks)}")

    print("\nCreating vector store...")
    store = create_vector_store(chunks)

    print("\nSetting up Gemini LLM...")
    chat_model = setup_llm()

    print("\nCreating RAG chain...")
    chain = create_rag_chain(store, chat_model)

    # Demo run.
    print("\n" + heavy_rule)
    print("TESTING RAG SYSTEM")
    print(heavy_rule)

    demo_questions = (
        "What is the Annualized Return of Apple?",
    )
    for question in demo_questions:
        print("\n" + light_rule)
        query_rag_system(chain, question)

In [44]:
# Run the full pipeline via main(), but only when this code executes as the
# top-level module (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Creating sample DataFrame...
DataFrame shape: (503, 21)

Converting DataFrame to documents...
Number of document chunks: 593

Creating vector store...

Setting up Gemini LLM...

Creating RAG chain...

TESTING RAG SYSTEM

--------------------------------------------------
Query: What is the Annualized Return of Apple?

Answer: The Annualized Return of Apple is 20.76.
