# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document

from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from tqdm import tqdm
import yfinance as yf
import lxml    

# Import Data

In [15]:
# Location of the stock metrics CSV that feeds the rest of the notebook.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
stock_data_csv = '/Users/ani/Projects/6_stock_portfolio_recommendation/data/stock_data.csv'

# Read the CSV into a DataFrame and show it as the cell's output.
df_sandp500 = pd.read_csv(stock_data_csv)
df_sandp500

Unnamed: 0,Ticker,Company_Name,Sector,Industry,Headquarters_Location,Founded_Year,Annualized_Return,YTD_Pct_Return,2024_Pct_Return,2023_Pct_Return,2022_Pct_Return,2021_Pct_Return,2020_Pct_Return,Market_Cap,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Years_Since_Founded
0,MSFT,Microsoft,Information Technology,Systems Software,"Redmond, Washington",1975,21.32,10.40,14.50,58.35,-27.69,55.79,22.27,3.421644e+12,10.81,0.27,78.51,1.18,50.0
1,NVDA,Nvidia,Information Technology,Semiconductors,"Santa Clara, California",1993,72.79,-2.29,178.87,246.10,-51.44,124.48,48.40,3.295497e+12,6.61,0.53,137.30,2.07,32.0
2,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1977,20.76,-17.44,35.56,54.80,-28.20,38.06,65.49,2.999856e+12,-10.79,0.30,69.06,1.27,48.0
3,AMZN,Amazon,Consumer Discretionary,Broadline Retail,"Seattle, Washington",1994,10.66,-6.91,46.33,77.04,-50.71,4.64,31.80,2.176468e+12,1.80,0.36,29.80,1.43,31.0
4,GOOG,Alphabet Inc. (Class C),Communication Services,Interactive Media & Services,"Mountain View, California",1998,19.39,-9.22,36.95,57.11,-38.84,67.43,22.35,2.090085e+12,0.16,0.31,62.44,1.24,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,APA,APA Corporation,Energy,Oil & Gas Exploration & Production,"Houston, Texas",1954,10.16,-25.26,-33.86,-15.71,69.08,83.27,21.51,6.137973e+09,-20.98,0.55,18.30,1.39,71.0
499,CZR,Caesars Entertainment,Consumer Discretionary,Casinos & Gaming,"Reno, Nevada",1973,-5.72,-17.52,-30.05,10.93,-55.49,30.63,105.79,5.590180e+09,-22.33,0.56,-10.32,1.97,52.0
500,ENPH,Enphase Energy,Information Technology,Semiconductor Materials & Equipment,"Fremont, California",2006,-7.06,-42.00,-47.67,-47.83,43.65,6.21,194.02,5.430658e+09,-44.18,0.68,-10.33,1.48,19.0
501,BRK.B,Berkshire Hathaway,Financials,Multi-Sector Holdings,"Omaha, Nebraska",1839,,,,,,,,,,,,,


# Create text chunks for RAG

In [3]:
# Column names, in frame order; each one becomes a "name: value" line.
columns = df_sandp500.columns.tolist()

# Build one retrieval document per DataFrame row:
#   - page_content: newline-terminated "column: value" lines, skipping missing values
#   - metadata: the row's ticker and company name
documents = [
    {
        "page_content": "".join(
            f"{col}: {row[col]}\n" for col in columns if pd.notna(row[col])
        ),
        "metadata": {"Ticker": row['Ticker'], "Company_Name": row['Company_Name']},
    }
    for _, row in df_sandp500.iterrows()
]

# Show the first document as a sanity check of the generated text and metadata.
print("\nExample of a processed document for RAG:")
sample_document = documents[0]
print(sample_document['page_content'])
print(sample_document['metadata'])


Example of a processed document for RAG:
Ticker: MSFT
Company_Name: Microsoft
GICS Sector: Information Technology
GICS Sub-Industry: Systems Software
Headquarters Location: Redmond, Washington
Date added: 1994-06-01
CIK: 789019
Founded: 1975
Annualized_Return: 21.32
YTD_Pct_Return: 10.4
2024_Pct_Return: 14.5
2023_Pct_Return: 58.35
2022_Pct_Return: -27.69
2021_Pct_Return: 55.79
2020_Pct_Return: 22.27
Market_Cap: 3421643997184.0
Pct_Diff_200_MA: 10.81
Annualized_Volatility: 0.27
Sharpe_Ratio: 78.51
Beta: 1.18
Years_Since_Founded: 50.0

{'Ticker': 'MSFT', 'Company_Name': 'Microsoft'}


# Load Embedding Model

In [4]:
# Name of the sentence-transformer model used to build the embedding function
# that is later handed to the Chroma vector store.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_function = SentenceTransformerEmbeddings(model_name=embedding_model_name)

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


# Local Vector Database (ChromaDB)

In [5]:
# Wrap each prepared dict in a LangChain Document so it can be stored in Chroma.
langchain_documents = [
    Document(page_content=doc["page_content"], metadata=doc["metadata"])
    for doc in documents
]

# On-disk location of the persisted Chroma collection.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
persist_directory = "/Users/ani/Projects/6_stock_portfolio_recommendation/chroma_db"

print(f"\nInitializing ChromaDB at: {persist_directory}")

try:
    # Open (or create) the persisted store, then populate it only when it is empty
    # so that re-running the cell does not insert the same documents twice.
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)

    # Query the collection size once and reuse it for both the check and the messages.
    stored_count = vectorstore._collection.count()
    if stored_count == 0:
        print("ChromaDB is empty. Adding documents...")
        vectorstore.add_documents(langchain_documents)
        print(f"Added {len(langchain_documents)} documents to ChromaDB.")
    else:
        print(f"ChromaDB already contains {stored_count} documents. Skipping addition.")
        # A non-empty store is reused as-is, so it can silently drift from the data
        # currently loaded in the notebook. Surface a size mismatch instead of hiding it.
        if stored_count != len(langchain_documents):
            print(
                f"WARNING: ChromaDB holds {stored_count} documents but the current data "
                f"produced {len(langchain_documents)}. The persisted index may be stale; "
                "delete the persist directory and re-run this cell to rebuild it."
            )
except Exception as e:
    # Best-effort fallback: if opening or populating the store failed, try to build
    # a fresh store from the documents in the same directory.
    print(f"Error loading ChromaDB, attempting to create new: {e}")
    vectorstore = Chroma.from_documents(
        langchain_documents,
        embedding_function,
        persist_directory=persist_directory
    )
    print(f"Created new ChromaDB and added {len(langchain_documents)} documents.")


print("\nVector database (ChromaDB) setup complete.")


Initializing ChromaDB at: /Users/ani/Projects/6_stock_portfolio_recommendation/chroma_db


  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)


ChromaDB already contains 503 documents. Skipping addition.

Vector database (ChromaDB) setup complete.


# Integrate the Local LLM (Ollama)

In [6]:
# Ollama model that answers the questions in the RAG chain.
ollama_model_name = "mistral"
llm = Ollama(model=ollama_model_name)

# Confirm which model the client was configured with.
print(f"\nOllama LLM initialized with model: {llm.model}")


Ollama LLM initialized with model: mistral


  llm = Ollama(model="mistral")


# Build the RAG Chain with LangChain
1. The user's question is passed to the retriever.
2. The retriever fetches relevant context chunks.
3. The question and context are formatted into the prompt template.
4. The formatted prompt is sent to the LLM.
5. The LLM's response is parsed as a string.

In [7]:
# Number of most-similar documents retrieved per question and placed in the prompt.
RETRIEVER_TOP_K = 3

# Retriever setup
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})


def format_docs(docs):
    """Join the text of the retrieved documents into one plain-text context block.

    Piping the retriever straight into the prompt would interpolate the list of
    Document objects itself, so the model would receive their Python repr
    (constructor syntax, escaped newlines) instead of the readable
    "column: value" lines stored in ``page_content``.

    Args:
        docs: Iterable of retrieved documents exposing a ``page_content`` string.

    Returns:
        The documents' ``page_content`` values separated by blank lines; an empty
        string when nothing was retrieved.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Define the prompt template for the LLM
prompt_template = ChatPromptTemplate.from_template("""
Answer the question based ONLY on the following context.
If the answer cannot be found in the context, politely state that you don't have enough information.

Context:
{context}

Question:
{question}
""")

# Chain: the question goes to the retriever, whose results are flattened to text for
# {context}; the untouched question fills {question}; the prompt is sent to the LLM
# and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

print("\nRAG chain built successfully.")


RAG chain built successfully.


# Query RAG System

In [8]:
# --- Example Queries ---

def ask_rag(question):
    """Run one question through the RAG chain, echoing the question and the answer.

    Args:
        question: Natural-language question passed to ``rag_chain.invoke``.

    Returns:
        The chain's answer for the question.
    """
    print(f"\nQuestion: {question}")
    answer = rag_chain.invoke(question)
    print(f"Answer: {answer}")
    return answer


print("\n--- Querying the RAG System ---")

# Query 1: Simple factual lookup
query1 = "What is the annualized return for Google (GOOGL)?"
response1 = ask_rag(query1)

# Query 2: Comparative question
query2 = "Which company has a higher market cap, Apple or Microsoft?"
response2 = ask_rag(query2)

# Query 3: Question requiring aggregation/filtering based on context
# NOTE(review): the retriever only supplies a handful of rows per question, so the
# model cannot see every matching stock; treat the answer to this query as incomplete.
query3 = "List all stocks in the Information Technology industry with an annualized return greater than 28%."
response3 = ask_rag(query3)

# Query 4: Question where information might be limited
# Expected: "I don't have enough information..." because CEO is not in our DataFrame.
query4 = "Tell me about the CEO of Tesla."
response4 = ask_rag(query4)

# Query 5: General question about an industry
query5 = "What are the key characteristics of companies in the Energy industry based on the data?"
response5 = ask_rag(query5)


--- Querying the RAG System ---

Question: What is the annualized return for Google (GOOGL)?
Answer:  The annualized return for Google (GOOGL) is 19.19%.

Question: Which company has a higher market cap, Apple or Microsoft?
Answer:  Based on the provided context, Apple (AAPL) has a Market Cap of $2999855611904 and Microsoft (MSFT) has a Market Cap of $3421643997184. Therefore, Microsoft has a higher market cap than Apple.

Question: List all stocks in the Information Technology industry with an annualized return greater than 28%.
Answer:  Based on the provided context, only one stock, Seagate Technology (STX), has an annualized return greater than 28%. Therefore, the list would include 'Seagate Technology'.

Question: Tell me about the CEO of Tesla.
Answer:  I don't have enough information in the provided context to tell you about the CEO of Tesla. The context only includes data about the company itself, not its executives or leadership. You might want to check a separate source for d

In [9]:
# Display the loaded DataFrame to inspect its columns and values.
df_sandp500

Unnamed: 0,Ticker,Company_Name,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,Annualized_Return,YTD_Pct_Return,...,2023_Pct_Return,2022_Pct_Return,2021_Pct_Return,2020_Pct_Return,Market_Cap,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Years_Since_Founded
0,MSFT,Microsoft,Information Technology,Systems Software,"Redmond, Washington",1994-06-01,789019,1975,21.32,10.40,...,58.35,-27.69,55.79,22.27,3.421644e+12,10.81,0.27,78.51,1.18,50.0
1,NVDA,Nvidia,Information Technology,Semiconductors,"Santa Clara, California",2001-11-30,1045810,1993,72.79,-2.29,...,246.10,-51.44,124.48,48.40,3.295497e+12,6.61,0.53,137.30,2.07,32.0
2,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977,20.76,-17.44,...,54.80,-28.20,38.06,65.49,2.999856e+12,-10.79,0.30,69.06,1.27,48.0
3,AMZN,Amazon,Consumer Discretionary,Broadline Retail,"Seattle, Washington",2005-11-18,1018724,1994,10.66,-6.91,...,77.04,-50.71,4.64,31.80,2.176468e+12,1.80,0.36,29.80,1.43,31.0
4,GOOG,Alphabet Inc. (Class C),Communication Services,Interactive Media & Services,"Mountain View, California",2006-04-03,1652044,1998,19.39,-9.22,...,57.11,-38.84,67.43,22.35,2.090085e+12,0.16,0.31,62.44,1.24,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,APA,APA Corporation,Energy,Oil & Gas Exploration & Production,"Houston, Texas",1997-07-28,1841666,1954,10.16,-25.26,...,-15.71,69.08,83.27,21.51,6.137973e+09,-20.98,0.55,18.30,1.39,71.0
499,CZR,Caesars Entertainment,Consumer Discretionary,Casinos & Gaming,"Reno, Nevada",2021-03-22,1590895,1973,-5.72,-17.52,...,10.93,-55.49,30.63,105.79,5.590180e+09,-22.33,0.56,-10.32,1.97,52.0
500,ENPH,Enphase Energy,Information Technology,Semiconductor Materials & Equipment,"Fremont, California",2021-01-07,1463101,2006,-7.06,-42.00,...,-47.83,43.65,6.21,194.02,5.430658e+09,-44.18,0.68,-10.33,1.48,19.0
501,BRK.B,Berkshire Hathaway,Financials,Multi-Sector Holdings,"Omaha, Nebraska",2010-02-16,1067983,1839,,,...,,,,,,,,,,


In [10]:
# List every company whose sector name contains 'technology' (case-insensitive),
# ordered from highest to lowest annualized return.
#
# The sector column appears as 'GICS Sector' in some displays of this frame and as
# 'Sector' in others, so resolve whichever one is present instead of hard-coding a
# name that fails with a KeyError after the CSV is reloaded.
sector_column = next(
    (name for name in ('GICS Sector', 'Sector') if name in df_sandp500.columns),
    None,
)
if sector_column is None:
    raise KeyError("Expected a 'GICS Sector' or 'Sector' column in df_sandp500")

# na=False treats rows with a missing sector as non-matching.
tech_companies = df_sandp500[df_sandp500[sector_column].str.contains('Technology', case=False, na=False)]
tech_companies.sort_values('Annualized_Return', ascending=False)

Unnamed: 0,Ticker,Company_Name,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,Annualized_Return,YTD_Pct_Return,...,2023_Pct_Return,2022_Pct_Return,2021_Pct_Return,2020_Pct_Return,Market_Cap,Pct_Diff_200_MA,Annualized_Volatility,Sharpe_Ratio,Beta,Years_Since_Founded
1,NVDA,Nvidia,Information Technology,Semiconductors,"Santa Clara, California",2001-11-30,1045810,1993,72.79,-2.29,...,246.10,-51.44,124.48,48.40,3.295497e+12,6.61,0.53,137.30,2.07,32.0
332,SMCI,Supermicro,Information Technology,"Technology Hardware, Storage & Peripherals","San Jose, California",2024-03-18,1375365,1993,72.05,33.18,...,238.97,79.96,41.55,19.25,2.388466e+10,2.96,0.80,90.12,1.74,32.0
23,PLTR,Palantir Technologies,Information Technology,Application Software,"Denver, Colorado",2024-09-23,1321655,2003,69.21,75.26,...,168.70,-65.35,-22.08,147.89,3.109889e+11,79.29,0.73,94.57,1.97,22.0
7,AVGO,Broadcom,Information Technology,Semiconductors,"Palo Alto, California",2014-05-08,1730168,1961,56.82,4.67,...,106.27,-13.00,61.13,55.25,1.138201e+12,26.54,0.41,139.66,1.54,64.0
90,ANET,Arista Networks,Information Technology,Communications Equipment,"Santa Clara, California",2018-08-28,1596532,2004,42.66,-22.50,...,94.78,-14.69,103.07,23.84,1.088129e+11,-9.53,0.43,98.19,1.44,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,EPAM,EPAM Systems,Information Technology,IT Consulting & Other Services,"Newtown, Pennsylvania",2021-12-14,1352010,1993,-5.69,-23.86,...,-10.28,-48.97,92.05,53.23,9.885330e+09,-15.26,0.53,-10.83,1.38,32.0
461,AKAM,Akamai Technologies,Information Technology,Internet Services & Infrastructure,"Cambridge, Massachusetts",2007-07-12,1086222,1998,-6.40,-20.43,...,39.28,-28.26,11.21,-0.68,1.080757e+10,-17.03,0.30,-21.06,0.85,27.0
500,ENPH,Enphase Energy,Information Technology,Semiconductor Materials & Equipment,"Fremont, California",2021-01-07,1463101,2006,-7.06,-42.00,...,-47.83,43.65,6.21,194.02,5.430658e+09,-44.18,0.68,-10.33,1.48,19.0
465,SWKS,Skyworks Solutions,Information Technology,Semiconductors,"Irvine, California",2015-03-12,4127,2002,-8.24,-20.33,...,27.43,-41.69,4.20,29.27,1.036340e+10,-14.84,0.41,-20.33,1.56,23.0


In [11]:
# Follow-up question: compare one company's returns across two calendar years.
# This rebinds `query3` / `response3`, replacing the values assigned by the
# example-queries cell.
query3 = (
    "Was the return for NVIDIA higher or lower in 2022, compared to 2021?"
)

# Echo the question, run it through the RAG chain, then show the answer.
print(f"\nQuestion: {query3}")
response3 = rag_chain.invoke(query3)
print(f"Answer: {response3}")


Question: Was the return for NVIDIA higher or lower in 2022, compared to 2021?
Answer:  The return for Nvidia (NVDA) was lower in 2022, compared to 2021. In 2021, it returned 124.48%, while in 2022 it returned -51.44%.
