# Install Libraries

In [2]:
! pip install yfinance langchain_pinecone openai python-dotenv langchain-community sentence_transformers

# ! pip install yfinance openai python-dotenv sentence_transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import concurrent.futures
import json
import os
import signal
import threading

import numpy as np
import requests
import yfinance as yf
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm


# Set up global functions

In [4]:
# Define a timeout handler
def timeout_handler(signum, frame):
    """Signal handler for SIGALRM: aborts the interrupted call by raising TimeoutError."""
    raise TimeoutError("Execution timed out")

# Set the signal handler
signal.signal(signal.SIGALRM, timeout_handler)

# Load environment variables from .env file FIRST, so that the API keys read
# through os.getenv() below are already populated on a fresh kernel
load_dotenv(override=True)

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Connect to your Pinecone index
pc_index = pc.Index("stocks")

# Initialize tracking lists
successful_tickers = []
unsuccessful_tickers = []

In [5]:
def get_stock_info(symbol: str) -> dict:
    """
    Retrieves and formats detailed information about a stock from Yahoo Finance (yf).

    When called from the main thread, the lookup is bounded by a 7-second SIGALRM
    alarm; the handler registered for SIGALRM is expected to raise TimeoutError,
    which is reported like any other lookup failure.

    Args:
        symbol (str): The stock ticker symbol to look up.

    Returns:
        dict: A dictionary containing detailed stock information, including ticker, name,
              business summary, city, state, country, industry, sector, 10-day average
              volume and market cap. Returns None (after printing the error) when the
              lookup fails or times out.
    """
    # Python runs signal handlers in the main thread only, so an alarm armed from a
    # worker thread would interrupt the main thread instead of this call.
    use_alarm = (
        hasattr(signal, "SIGALRM")
        and threading.current_thread() is threading.main_thread()
    )

    try:
        if use_alarm:
            signal.alarm(7)
        # The `.info` access must stay inside the alarm window: it is the step the
        # timeout is meant to bound, and cancelling the alarm before it leaves the
        # lookup unprotected.
        stock_info = yf.Ticker(symbol).info
    except Exception as e:
        print(f"Error fetching stock info for {symbol}: {e}")
        return None
    finally:
        # Always cancel a pending alarm so it cannot fire later in unrelated code
        if use_alarm:
            signal.alarm(0)

    properties = {
        "Ticker": stock_info.get('symbol', 'Information not available'),
        'Name': stock_info.get('longName', 'Information not available'),
        # No placeholder here: callers treat a missing summary as "nothing to embed"
        'Business Summary': stock_info.get('longBusinessSummary'),
        'City': stock_info.get('city', 'Information not available'),
        'State': stock_info.get('state', 'Information not available'),
        'Country': stock_info.get('country', 'Information not available'),
        'Industry': stock_info.get('industry', 'Information not available'),
        'Sector': stock_info.get('sector', 'Information not available'),
        '10-Day AVG Volume': stock_info.get('averageDailyVolume10Day', 'Information not available'),
        'Market Cap': stock_info.get('marketCap', 'Information not available'),
    }

    return properties

In [6]:
# Per-thread cache of loaded SentenceTransformer models, keyed by model name.
# Thread-local storage keeps one model instance per worker thread, as before,
# while avoiding a fresh model load on every single call.
_embedding_models = threading.local()


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Generates embeddings for the given text using a specified Hugging Face model.

    The model is loaded the first time a thread asks for a given model name and is
    reused by that thread afterwards.

    Args:
        text (str): The input text to generate embeddings for.
        model_name (str): The name of the Hugging Face model to use.
                          Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        np.ndarray: The generated embeddings as a NumPy array.
    """
    models = getattr(_embedding_models, "by_name", None)
    if models is None:
        models = _embedding_models.by_name = {}

    if model_name not in models:
        models[model_name] = SentenceTransformer(model_name)

    return models[model_name].encode(text)


def cosine_similarity_between_sentences(sentence1, sentence2):
    """
    Embeds two sentences and reports how similar they are.

    Args:
        sentence1 (str): First sentence of the pair.
        sentence2 (str): Second sentence of the pair.

    Returns:
        float: Cosine similarity of the two sentence embeddings
               (1 means identical direction, -1 means opposite).

    Notes:
        The score is also printed, formatted to four decimal places.
    """
    # Embed each sentence in turn and lay the vector out as a single-row matrix,
    # which is the 2D shape cosine_similarity() expects.
    row_first, row_second = (
        np.array(get_huggingface_embeddings(sentence)).reshape(1, -1)
        for sentence in (sentence1, sentence2)
    )

    # The result is a 1x1 matrix; its only cell is the score
    similarity_score = cosine_similarity(row_first, row_second)[0][0]
    print(f"Cosine similarity between the two sentences: {similarity_score:.4f}")
    return similarity_score

In [7]:
# Test similarity score
# Fetch Apple's profile. get_stock_info returns None when the lookup fails,
# in which case the subscript below raises TypeError.
aapl_info = get_stock_info("AAPL")

aapl_description = aapl_info['Business Summary']

# Free-text request to compare against the company description
company_description = "I want to find companies that make smartphones and are headquarted in California"

# Prints the score and also returns it
similarity = cosine_similarity_between_sentences(aapl_description, company_description)

Cosine similarity between the two sentences: 0.3635


# Create Vector Index

## Get all the Stocks in the Stock Market

In [8]:
def get_company_tickers():
    """
    Downloads and parses the Stock ticker symbols from the GitHub-hosted SEC company tickers JSON file.

    On success the parsed content is also written to 'company_tickers.json' in the
    current working directory.

    Returns:
        dict: A dictionary containing company tickers and related information,
              or None when the download fails (the reason is printed).

    Notes:
        The data is sourced from the official SEC website via a GitHub repository:
        https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json
    """
    # URL to fetch the raw JSON file from GitHub
    url = "https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json"

    # Bound the request so a stalled connection cannot hang the notebook, and
    # report network errors through the same "return None" path as a bad status.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download file. Request error: {exc}")
        return None

    # Checking if the request was successful
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    # Parse the JSON content directly
    # decode as utf-8 (Unicode --Standard)
    company_tickers = json.loads(response.content.decode('utf-8'))

    # Optionally save the content to a local file for future use
    with open("company_tickers.json", "w", encoding="utf-8") as file:
        json.dump(company_tickers, file, indent=4)

    print("File downloaded successfully and saved as 'company_tickers.json'")
    return company_tickers

company_tickers = get_company_tickers()

File downloaded successfully and saved as 'company_tickers.json'


In [9]:
len(company_tickers)

9998

## Inserting Stocks into Pinecone

In [10]:
# API key read from the environment
# NOTE(review): this variable is not referenced again in the visible cells — confirm it is still needed
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Pinecone index and namespace shared by the ingestion and query cells
index_name = "stocks"
namespace = "stock-descriptions-rich"

hf_embeddings = HuggingFaceEmbeddings() # Embedding object to pass as a parameter to the vector store; to embed a text directly, call get_huggingface_embeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=hf_embeddings)

  hf_embeddings = HuggingFaceEmbeddings() # use as parameter only. if you want to execute, use $get_huggingface_embeddings()
  hf_embeddings = HuggingFaceEmbeddings() # use as parameter only. if you want to execute, use $get_huggingface_embeddings()


In [10]:
# PROCESS FEW AT A TIME
# pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# index = pc.Index(index_name)

# # Delete all the stocks in the Pinecone index "stock-descriptions"
# # index.delete(delete_all=True, namespace=namespace)

# company_tickers = load_test_tickers()

# # Prepare your tickers: remove already processed + remove duplicates
# tickers_to_process = list(set([company_tickers[num]['ticker'] for num in company_tickers.keys() if company_tickers[num]['ticker'] not in successful_tickers]))

# # Process them
# print(f'{len(tickers_to_process)}:', tickers_to_process)

# # generate embeddings for one stock
# stock_ticker = tickers_to_process[3]
# stock_data = get_stock_info(stock_ticker)
# stock_description = stock_data['Business Summary']
# stock_embedding = get_huggingface_embeddings(stock_description)

# # insert the stock embedding into the Pinecone index
# stock_embedding_list = stock_embedding.tolist() if isinstance(stock_embedding, np.ndarray) else stock_embedding

# index.upsert(vectors=[
#   {"id":str(stock_data["Ticker"]), "values":stock_embedding_list, "metadata":stock_data}
#   ], namespace=namespace)


In [11]:
# List all vectors in the index (CAREFUL: too much data!)
# for ids in index.list(prefix='', limit=10, namespace=namespace):
#     print(ids) # ['pref1', 'pref2', 'pref3']

### Sequential Process Embeddings

[![](https://mermaid.ink/img/pako:eNqNkl1rgzAUhv9KyMXYwF74cSVjYA0rhY6WKRQWe5FqWqU1cUm8GKX_ffmwa65Gc6Hn5LxvzmM8F1jzhsIUHgUZWlCiigG9MlwoXp_AqpPqdS_esmyzCsBivV7o10fxXgagLFbZDsxmb2COi44dzxRsBK-plDoBZSsoaXbuNPeU4941qWBBv0fKVEfOvud5yejh0NWdLr1U0LnMmts2eYhDMLsZgEHa3TV5aEXbEG-zZZnasiXfLMGnaScVeNKRHDiT1DNunRGF5pMFtUaAiCKe5h4hp84jHHks9mJ8mMjBRBOMrT9G45wommis84bjYThZHuPYwzA_xqeIHUU8UZjyYxDOiOIJwhj_uRKnzhOc3FHsdHgoiUNJJhRTfgzFGVEyoRijj0JZAwPYU9GTrtFjfDHbFVQt7WkFUx02RJzMMF21joyKFz-shqkSIw2g4OOxhemBnKXOxqEhiqKO6DHt_3YHwr44v-XXX7It6B4?type=png)](https://mermaid.live/edit#pako:eNqNkl1rgzAUhv9KyMXYwF74cSVjYA0rhY6WKRQWe5FqWqU1cUm8GKX_ffmwa65Gc6Hn5LxvzmM8F1jzhsIUHgUZWlCiigG9MlwoXp_AqpPqdS_esmyzCsBivV7o10fxXgagLFbZDsxmb2COi44dzxRsBK-plDoBZSsoaXbuNPeU4941qWBBv0fKVEfOvud5yejh0NWdLr1U0LnMmts2eYhDMLsZgEHa3TV5aEXbEG-zZZnasiXfLMGnaScVeNKRHDiT1DNunRGF5pMFtUaAiCKe5h4hp84jHHks9mJ8mMjBRBOMrT9G45wommis84bjYThZHuPYwzA_xqeIHUU8UZjyYxDOiOIJwhj_uRKnzhOc3FHsdHgoiUNJJhRTfgzFGVEyoRijj0JZAwPYU9GTrtFjfDHbFVQt7WkFUx02RJzMMF21joyKFz-shqkSIw2g4OOxhemBnKXOxqEhiqKO6DHt_3YHwr44v-XXX7It6B4)

In [14]:
def sequential_process_embeddings():
    """
    Embeds and stores every company in ``company_tickers`` in Pinecone, one stock at a time.

    Stocks whose info cannot be fetched, or that have no business summary, are
    reported and skipped so that one bad ticker does not abort the whole run.
    Each record is stored under its ticker as id (the same convention as
    ``process_stock``), so re-running the loop overwrites records instead of
    adding duplicates.

    Returns:
        str: "Done" once every ticker has been visited.
    """
    for idx, stock in company_tickers.items():
        stock_ticker = stock['ticker']
        print(f"Processing stock {idx} / {len(company_tickers)} :", stock_ticker)

        # get_stock_info returns None on failure, and the summary itself may be missing
        stock_data = get_stock_info(stock_ticker)
        stock_description = stock_data.get('Business Summary') if stock_data else None
        if not stock_description:
            print(f"Skipping {stock_ticker}: no business summary available")
            continue

        docs = Document(page_content=stock_description, metadata=stock_data)
        print(docs)
        PineconeVectorStore.from_documents(
            documents=[docs],
            ids=[stock_ticker],
            embedding=hf_embeddings,
            index_name=index_name,
            namespace=namespace
        )
    return "Done"

### Parallelizing Embeddings

[![](https://mermaid.ink/img/pako:eNqFk0uLgzAQgP_KkMOe7MHXRZaC1baXvsDCwqqHrGarVJMSE9hS-983NnWxULceRmf4Pp2MyQVlLCfIQweOTwXsw4SCuvw4Eiw7wqpsxPsXn_r-bmXAcrtdqts6WuwN2Ecr3wB__blJYTKZwixe45JCwKjgrKoI7zwXdphjlVXwwfiR8CbVH9BxdjPbqKxlJTAlTDbVuYXAjDUNZveSHWcZaRromkj_F61etIbire8Xpt2b9tDslvpCdHrRGYrddF6Ibi-6D4vsBjqcUWDe_NCMl0TAG6gfwwmEWOA7FlgasEYBWwP2KOBowBkFXA24Y4COoW61DRklLcwvQUHUHooEFrK53hHrAbkX7WdF51nRfVLUcX4fs8y6ObawMON-phvyI2CGRVakyEA14TUuc7XnL52ZIFGQmiTIU4855scEJfSqOCwFi840Q57gkhiIM3kokPeNq0Zl8pRjQcISq4NT_1VPmH4y1ufXXwdkCM8?type=png)](https://mermaid.live/edit#pako:eNqFk0uLgzAQgP_KkMOe7MHXRZaC1baXvsDCwqqHrGarVJMSE9hS-983NnWxULceRmf4Pp2MyQVlLCfIQweOTwXsw4SCuvw4Eiw7wqpsxPsXn_r-bmXAcrtdqts6WuwN2Ecr3wB__blJYTKZwixe45JCwKjgrKoI7zwXdphjlVXwwfiR8CbVH9BxdjPbqKxlJTAlTDbVuYXAjDUNZveSHWcZaRromkj_F61etIbire8Xpt2b9tDslvpCdHrRGYrddF6Ibi-6D4vsBjqcUWDe_NCMl0TAG6gfwwmEWOA7FlgasEYBWwP2KOBowBkFXA24Y4COoW61DRklLcwvQUHUHooEFrK53hHrAbkX7WdF51nRfVLUcX4fs8y6ObawMON-phvyI2CGRVakyEA14TUuc7XnL52ZIFGQmiTIU4855scEJfSqOCwFi840Q57gkhiIM3kokPeNq0Zl8pRjQcISq4NT_1VPmH4y1ufXXwdkCM8)

In [12]:
def refresh_tracking_lists():
    """
    Rebuilds the two tracking lists from their text files in the working directory.

    Each file holds one ticker per line; blank lines are ignored and surrounding
    whitespace is stripped. A missing file yields an empty list and a notice.

    Returns:
        tuple: (successful tickers, unsuccessful tickers), each a list of str.
    """
    sources = (
        ("successful", 'successful_tickers.txt'),
        ("unsuccessful", 'unsuccessful_tickers.txt'),
    )

    loaded = {}
    for label, path in sources:
        # Start empty so a missing file still produces a usable list
        entries = []
        try:
            with open(path, 'r') as handle:
                entries = [row.strip() for row in handle if row.strip()]
            print(f"Loaded {len(entries)} {label} tickers")
        except FileNotFoundError:
            print(f"No existing {label} tickers file found")
        loaded[label] = entries

    return loaded["successful"], loaded["unsuccessful"]

successful_tickers, unsuccessful_tickers = refresh_tracking_lists()

Loaded 9037 successful tickers
Loaded 1134 unsuccessful tickers


In [13]:
def process_stock_pc(stock_ticker: str) -> str:
    """
    Get a stock ticker info, embed its business summary and upsert it into Pinecone.
    Uses global variables (successful_tickers and unsuccessful_tickers) as tracking lists to guarantee ticker is processed only once.

    Every outcome is appended both to the matching in-memory list and to the
    matching text file ('successful_tickers.txt' / 'unsuccessful_tickers.txt').

    Args:
        stock_ticker (str): The ticker symbol to process.

    Returns:
        str: A status message. It starts with "ERROR" when the stock could not be
             processed, which is what parallel_process_stocks checks for.
    """

    # Skip if already processed
    if stock_ticker in successful_tickers:
        return f"Already processed {stock_ticker}"

    try:
        # Get and store stock data
        stock_data = get_stock_info(stock_ticker)
        if stock_data is None:
            # get_stock_info signals a failed lookup with None
            raise ValueError("no stock info returned")

        stock_description = stock_data['Business Summary']
        if not stock_description:
            raise ValueError("no business summary available")

        # generate embeddings for one stock
        stock_embedding = get_huggingface_embeddings(stock_description)

        # insert the stock embedding into the Pinecone index
        stock_embedding_list = stock_embedding.tolist() if isinstance(stock_embedding, np.ndarray) else stock_embedding

        # Pinecone does not accept null metadata values, so leave unset fields out
        metadata = {key: value for key, value in stock_data.items() if value is not None}

        pc_index.upsert(vectors=[
            {"id": str(stock_data["Ticker"]), "values": stock_embedding_list, "metadata": metadata}
        ], namespace=namespace)

        # Track success
        with open('successful_tickers.txt', 'a') as f:
            f.write(f"{stock_ticker}\n")
        successful_tickers.append(stock_ticker)

        return f"Processed {stock_ticker} successfully"

    except Exception as e:
        # Track failure. The list stores the bare ticker, matching what is written to
        # the file and what process_stock records; the error text goes in the
        # returned message.
        with open('unsuccessful_tickers.txt', 'a') as f:
            f.write(f"{stock_ticker}\n")
        unsuccessful_tickers.append(stock_ticker)

        return f"ERROR processing {stock_ticker}: {e}"

In [14]:
def process_stock(stock_ticker: str) -> str:
    """
    Fetches one stock's info and stores its business summary through the Pinecone vector store.

    The global tracking lists (successful_tickers and unsuccessful_tickers) and their
    text files record the outcome, so a ticker that was already stored is not stored again.

    Args:
        stock_ticker (str): The ticker symbol to process.

    Returns:
        str: A status message; it starts with "ERROR" when processing failed.
    """

    # Nothing to do for a ticker that was already stored
    if stock_ticker in successful_tickers:
        return f"Already processed {stock_ticker}"

    try:
        info = get_stock_info(stock_ticker)
        summary = info['Business Summary']
        document = Document(page_content=summary, metadata=info)

        # The ticker doubles as the record id
        PineconeVectorStore.from_documents(
            documents=[document],
            ids=[stock_ticker],
            embedding=hf_embeddings,
            index_name=index_name,
            namespace=namespace
        )

        # Record the success on disk first, then in memory
        with open('successful_tickers.txt', 'a') as success_log:
            success_log.write(f"{stock_ticker}\n")
        successful_tickers.append(stock_ticker)

        return f"Processed {stock_ticker} successfully"

    except Exception as e:
        # Record the failure on disk first, then in memory
        with open('unsuccessful_tickers.txt', 'a') as failure_log:
            failure_log.write(f"{stock_ticker}\n")
        unsuccessful_tickers.append(stock_ticker)

        return f"ERROR processing {stock_ticker}: {e}"

def parallel_process_stocks(tickers: list, max_workers: int = 10) -> None:
    """
    Processes a list of stock tickers in parallel using a thread pool executor.

    Every ticker is submitted to ``process_stock_pc`` and each result is printed as
    soon as it completes. Processing stops at the first result that starts with
    "ERROR" or the first exception raised by a task: all tasks that have not started
    yet are cancelled, tasks already running are allowed to finish, and
    SystemExit(1) is raised.

    Args:
        tickers (list): Ticker symbols to process.
        max_workers (int): Maximum number of worker threads. Defaults to 10.

    Raises:
        SystemExit: With code 1 when a task reports an error or raises.
    """
    # Module "concurrent.futures" is part of the standard library and provides a high-level
    # interface for asynchronously executing callables using threads or processes.

    # - ThreadPoolExecutor: Manages a pool of threads to execute tasks concurrently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # "submit" schedules one call per ticker and returns a future for it;
        # the dictionary maps each future back to its ticker for reporting.
        future_to_ticker = {
            executor.submit(process_stock_pc, ticker): ticker
            for ticker in tickers
        }

        def _stop_processing():
            # Leaving the `with` block waits for every task still queued, and
            # shutdown(wait=False) does not drop them, so queued tasks must be
            # cancelled explicitly for "stop on error" to actually stop.
            for pending in future_to_ticker:
                pending.cancel()
            raise SystemExit(1)

        # - as_completed(): Returns an iterator over the futures in the order of completion.
        for future in concurrent.futures.as_completed(future_to_ticker):
            ticker = future_to_ticker[future]
            try:
                # - result(): Retrieves the result of the callable associated with the future.
                result = future.result()
                print(result)

                # Stop on error (SystemExit is not an Exception subclass, so it
                # passes through the handler below)
                if result.startswith("ERROR"):
                    print(f"Stopping program due to error in {ticker}")
                    _stop_processing()

            except Exception as exc:
                print(f'{ticker} generated an exception: {exc}')
                print("Stopping program due to exception")
                _stop_processing()

In [15]:
# Prepare your tickers: remove already processed + remove duplicates.
# Set arithmetic avoids rescanning the successful list once per company.
all_tickers = {entry['ticker'] for entry in company_tickers.values()}
processed_tickers = set(successful_tickers)
tickers_to_process = list(all_tickers - processed_tickers)

# Report (counts are over ticker symbols, not over the dictionary keys)
print("REPORT")
print("Unique tickers:", len(all_tickers))
print("Unique successful tickers:", len(processed_tickers))
print("Unique to process:", len(tickers_to_process))

REPORT
Unique tickers: 9998
Unique successful tickers: 9037
Unique to process: 961


In [16]:
# Process them
# parallel_process_stocks(tickers_to_process, max_workers=20)

# Explore

## Web Scraping

In [22]:
import requests
from bs4 import BeautifulSoup

def get_content_from_url(url):
    """
    Downloads a web page and returns the text of all its <p> elements.

    Args:
        url (str): Address of the page to fetch.

    Returns:
        str: The text of every paragraph in the document, joined with newlines.
    """
    # Bound the request so an unresponsive server cannot hang the cell
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Paragraphs are collected from the whole document, not from one container
    paragraphs = [p.text for p in soup.find_all('p')]
    return "\n".join(paragraphs)

url = "https://review.firstround.com/the-uncomfortable-truth-a-3x-founders-guide-to-intellectual-honesty/"
print(get_content_from_url(url))

Crossbeam CEO and co-founder Bob Moore shares his tools for quashing biases in pursuit of the truth at every stage of company building.
Founders are unrelenting optimists. It’s practically a requisite trait to start a company — you have to suspend disbelief to build toward the future state you’ve imagined.
But if grit goes untempered by realism, blind spots can emerge. When you’ve put everything on the line to make your startup work, it’s easy for happy ears syndrome to set in, only taking in what you want to hear and subconsciously filtering out the rest.
Three-time founder Bob Moore ran into this pitfall while building his first two startups, which, by his own account, netted good-not-great outcomes. In a postmortem, he could easily point to bad timing, market dynamics or fearsome competitors — in other words, things outside of his control. Instead, he calls out the limits of his own judgment as a young founder.
Moore launched his first company, an analytics platform called RJMetrics

## LLMs

In [17]:
# process_stock_pc("MOND")

In [63]:
# EXECUTE CODE CREATED BY AN LLM

from langchain_core.tools import Tool
from langchain_experimental.utilities import PythonREPL # read-eval-print-loop

# NOTE(review): the REPL executes whatever source string it is handed — confirm only trusted code reaches it
python_repl = PythonREPL();

# Expose the REPL as a tool; the description is the usage hint shown to the tool's consumer
tools = [
    Tool(
        name="python_repl",
        description="A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
        func=python_repl.run,
    )
]

# The tool cannot return a value; it can only be used to capture what the script prints to the console
script_prompt = """
def funct():
    import yfinance as yf
    ticker = "TSLA"
    stock = yf.Ticker(ticker)
    return stock.info
print(funct())
"""

# Run the script through the first (and only) tool
tools[0].run(script_prompt)

"{'address1': '1 Tesla Road', 'city': 'Austin', 'state': 'TX', 'zip': '78725', 'country': 'United States', 'phone': '512 516 8177', 'website': 'https://www.tesla.com', 'industry': 'Auto Manufacturers', 'industryKey': 'auto-manufacturers', 'industryDisp': 'Auto Manufacturers', 'sector': 'Consumer Cyclical', 'sectorKey': 'consumer-cyclical', 'sectorDisp': 'Consumer Cyclical', 'longBusinessSummary': 'Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive, and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehicle sales, a 

## Yahoo Finance API

Reference:
* https://ranaroussi.github.io/yfinance/index.html

In [59]:
ticker = "TSLA"
stock = yf.Ticker(ticker)

# # General info
# stock.info['sectorKey'] # stock.info['sector'].lower().replace(" ", "-")
# stock.info['industryKey'] # stock.info['industry'].lower().replace(" ", "-")

# # Sector
# yf.Sector(stock.info['sectorKey']).industries.sort_values(by='key', ascending=True)
# yf.Sector(stock.info['sectorKey']).top_companies
yf.Sector(stock.info['sectorKey']).research_reports

# # Industry
# yf.Industry(stock.info['industryKey']).research_reports

# # SEC Filings
# stock.get_sec_filings()
# # Basic Stock Information
# get_stock_info(ticker)
# # Stock History
# stock.history(period="5d")
# # Stock Financials
# stock.financials
# # Stock Earnings (income statement)
# stock.income_stmt
# # Stock Cash Flow (cash flow statement)
# stock.cashflow
# # Stock Balance Sheet (balance sheet)
# stock.balance_sheet
# # Stock Market Capitalization
# stock.info['marketCap']

[{'id': 'ARGUS_42037_TopBottomInsiderActivity_1733484442000',
  'headHtml': 'Daily – Vickers Top Buyers & Sellers for 12/06/2024',
  'provider': 'Argus Research',
  'reportDate': '2024-12-06T11:27:22Z',
  'reportTitle': 'The Vickers Top Buyers & Sellers is a daily report that identifies the five companies the largest insider purchase transactions based on the dollar value of the transactions as well as the five companies the largest insider sales transactions based on the dollar value of the transactions.',
  'reportType': 'Top/Bottom Insider Activity'},
 {'id': 'ARGUS_42036_InsiderActivity_1733484442000',
  'headHtml': 'Daily – Vickers Top Insider Picks for 12/06/2024',
  'provider': 'Argus Research',
  'reportDate': '2024-12-06T11:27:22Z',
  'reportTitle': 'The Vickers Top Insider Picks is a daily report that utilizes a proprietary algorithm to identify 25 companies with compelling insider purchase histories based on transactions over the past three months.',
  'reportType': 'Insider

# Perform RAG

In [235]:
query = "What companies make electric cars?"

## Qualify user query

In [236]:
# build prompt
prompt_qualifier = f"""
You are a financial analyst, helping a client find stocks that are relevant to them. You will be given a raw query and your job is to qualify this query, adding context and keywords that will help delineate the user's interest and help you find the most relevant stocks.
If the user expresses a preference, e.g. location, industry, political affiliation, branding positioning, etc, add that to the query.
Keep your output short and concise, conversational, but detailed with the context and keywords added. Do not include any additional text other than the qualified query.
Finally, do not include any requirement that the company be publicly traded.
Your output should be a JSON object with the following keys:
* "query": the qualified query

Examples:

$$Raw query$$ "What are some companies that manufacture consumer hardware?"
$$Qualified query$$ {json.dumps({"query": "Consumer-facing tech companies specializing in hardware development such as smartphones, laptops, smart home devices, gaming consoles, and VR headsets, listed on major US stock exchanges."})}

$$Raw query$$ "What are some companies that sell home decor?"
$$Qualified query$$ {json.dumps({"query":"Retailers specializing in home furniture, furnishings, and decorative items, such as furniture stores, home goods stores, and online home decor marketplaces, with a strong e-commerce presence."})}
"""

# get qualified query
client = OpenAI(
  base_url="https://api.groq.com/openai/v1",
  api_key=os.getenv("GROQ_API_KEY")
)

llm_response_long = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": prompt_qualifier},
        {"role": "user", "content": f"Raw query: {query}"}
    ],
    # The reply is parsed with json.loads below, so request a JSON object from the
    # API instead of relying on the model to emit bare JSON with no extra text
    response_format={"type": "json_object"},
)

query_long = json.loads(llm_response_long.choices[0].message.content)['query']

print(query_long)

Companies manufacturing fully electric vehicles (EVs) with zero tailpipe emissions, including luxury brands, mass market automotive manufacturers, and new entrants from the technology and mobility sectors, with a focus on European, North American, and Asian markets.


In [237]:
prompt_shortener = f"""
You were hired to write a query for a financial analyst. Your query will be used to match stocks with a similarity search. It must be short and conciese and dense with relevant keywords. Keep it at 2 sentences max. You will be given a raw query and your job is to write a short query that keeps the essence of the raw query.
Your output must be only the requested query. No additional text.
"""

# Condense the qualified query into a short, keyword-dense search text
llm_response_qualified = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": prompt_shortener},
        {"role": "user", "content": f"Raw query: {query_long}"}
    ]
)

# The reply is taken as plain text (no JSON parsing for this step)
query_shortened = llm_response_qualified.choices[0].message.content

print(query_shortened)

Fully electric vehicle manufacturers operating in Europe, North America, and Asia, across luxury, mass market, and emerging tech sectors.


In [238]:
# check if the qualified query uses special filters: Country, Sector, Market Cap, Volume

filter_options_example = '''{"$and": [
  {"Market Cap": {"$gt": 1000}},
  {"10-Day AVG Volume": {"$gte": 1000000}}
]}'''

example_for = '{"$and": [{"genre": {"$eq": "drama"}}, {"year": {"$gte": 2020}}]}'
example_or = '{"$or": [{"genre": {"$eq": "drama"}}, {"year": {"$gte": 2020}}]}'

filter_prompt = f"""
Check this query and assess if the answer to it requires any special filters. There are 2 filter options are: 'Market Cap' and '10-Day AVG Volume'.
If the answer requires any of these filters, return the filter options in a dictionary with the filter name as key and the required value assigned to it. Do not include any other key different than the filter options specified above. If the answer does not require any of these filters, return an empty dictionary.
Your output must be a JSON object with the dictionary of filters. No additional text.

Use any of the following operators:
Filter	Description	Supported types
$eq:    Matches vectors with metadata values that are equal to a specified value.	Number, string, boolean
$ne:	Matches vectors with metadata values that are not equal to a specified value.	Number, string, boolean
$gt:	Matches vectors with metadata values that are greater than a specified value.	Number
$gte:	Matches vectors with metadata values that are greater than or equal to a specified value.	Number
$lt:	Matches vectors with metadata values that are less than a specified value.	Number
$lte:	Matches vectors with metadata values that are less than or equal to a specified value.	Number
$in:	Matches vectors with metadata values that are in a specified array.	String, number
$nin:	Matches vectors with metadata values that are not in a specified array.	String, number
$exists:	Matches vectors with the specified metadata field.	Boolean

Or combine operators:
Operator	Example
$and	    {example_for}
$or         {example_or}

Examples: {filter_options_example}
"""

filters = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": filter_prompt},
        {"role": "user", "content": f"Query: {query_shortened}"}
    ],
    # The reply is parsed with json.loads below, so request a JSON object from the
    # API instead of relying on the model to emit bare JSON with no extra text
    response_format={"type": "json_object"},
)

# An empty dictionary means "no metadata filter"
filter_options = json.loads(filters.choices[0].message.content)
print(filter_options)

print(len(query_long), len(query_shortened), query_shortened)

{}
266 137 Fully electric vehicle manufacturers operating in Europe, North America, and Asia, across luxury, mass market, and emerging tech sectors.


## Get relevant stocks

In [239]:
query = query_shortened

## Get relevant data about the stocks

In [240]:
raw_query_embedding = get_huggingface_embeddings(query)

In [241]:
filter_options

{}

In [242]:
top_matches = pc_index.query(
  vector=raw_query_embedding.tolist(),
  top_k=50,
  include_metadata=True,
  namespace=namespace,
  filter=filter_options
  )

In [243]:
# sort top_matches by 'Market Cap'
def get_market_cap(market_cap):
  """
  Converts a 'Market Cap' metadata value into a float usable as a sort key.

  Args:
      market_cap: A number, a numeric string, or a placeholder such as
                  'Information not available' (None is accepted as well).

  Returns:
      float: The numeric market cap, or 0.0 when the value is missing or not numeric.
  """
  try:
    return float(market_cap)
  except (TypeError, ValueError):
    # Placeholders and missing values sort as the smallest market cap
    return 0.0

top_matches['matches'].sort(key=lambda x: get_market_cap(x['metadata']['Market Cap']), reverse=True)

In [244]:
print([x.id for x in top_matches['matches']])
for match in top_matches['matches']:
    print(f"{match['id']} - {match['score']} - {match['metadata']['Market Cap']} - {match['metadata']['Business Summary']}")

['TSLA', 'LAAOF', 'LI', 'RIVN', 'XPEV', 'VFS', 'POAHY', 'POAHF', 'NIO', 'NIOIF', 'LCID', 'ZK', 'FAURY', 'PSNYW', 'PSNY', 'PLUG', 'EVGO', 'DAN', 'FURCF', 'XCH', 'CHPT', 'NWTN', 'MPAA', 'ALLGF', 'NKLA', 'ELVA', 'KNDI', 'LEV', 'CVGI', 'FFIE', 'CENN', 'NAAS', 'XOS', 'GP', 'MULN', 'LOBO', 'AIEV', 'JZXN', 'PEV', 'ZAPP', 'AYRO', 'NXU', 'NVVE', 'ZKGCF', 'EGOXF', 'ZEVY', 'EVGOW', 'VFSWW', 'NVVEW', 'NWTNW']
TSLA - 0.569392085 - 1186085076992.0 - Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive, and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles th

In [213]:
contexts = [item['metadata']['Business Summary'] for item in top_matches['matches'][:6]]

In [214]:
print("\n\n-------\n\n".join(contexts))

NVIDIA Corporation provides graphics and compute and networking solutions in the United States, Taiwan, China, Hong Kong, and internationally. The Graphics segment offers GeForce GPUs for gaming and PCs, the GeForce NOW game streaming service and related infrastructure, and solutions for gaming platforms; Quadro/NVIDIA RTX GPUs for enterprise workstation graphics; virtual GPU or vGPU software for cloud-based visual and virtual computing; automotive platforms for infotainment systems; and Omniverse software for building and operating metaverse and 3D internet applications. The Compute & Networking segment comprises Data Center computing platforms and end-to-end networking platforms, including Quantum for InfiniBand and Spectrum for Ethernet; NVIDIA DRIVE automated-driving platform and automotive development agreements; Jetson robotics and other embedded platforms; NVIDIA AI Enterprise and other software; and DGX Cloud software and services. The company's products are used in gaming, pro

In [172]:
augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [173]:
filter_options

{}

In [174]:
print(augmented_query)

<CONTEXT>
Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive, and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehicle sales, a network of Tesla Superchargers, and in-app upgrades; purchase financing and leasing services; services for electric vehicles through its company-owned service locations and Tesla mobile service technicians; and vehicle limited warranties and extended service plans. The Energy Generation and Storage segment engages in the design, manufacture, installation, sale, and leasing of solar energy

In [175]:
client = OpenAI(
  base_url="https://api.groq.com/openai/v1",
  api_key=os.getenv("GROQ_API_KEY")
)

In [176]:
system_prompt = f"""You are an expert at providing answers about stocks. Please answer the question provided, in a conversational tone, but keep it professional and sharp. Assume the context in the prompt is part of your knowledge base, not something provided to you. Whenever you are asked to provide a list of companies, look for the companies with the highest market cap, within the prompt provided.
"""

llm_response = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)

response = llm_response.choices[0].message.content
print(response)

When it comes to electric vehicle (EV) manufacturers that focus on battery technology and sustainable energy, there are several notable companies leading the charge. Here are some of the top players in this space, with a high market capitalization:

1. **Tesla, Inc. (TSLA)** - Tesla is one of the pioneers in the EV industry and a leader in battery technology. They offer a range of electric vehicles, including the Model S, Model X, Model 3, Model Y, and the upcoming Cybertruck.
2. **Lucid Group, Inc. (LCID)** - Lucid is known for its luxury electric vehicles, such as the Lucid Air and the Lucid Gravity, which offer advanced battery technology and sustainable energy solutions.
3. **XPeng Inc. (XPEV)** - XPeng is a Chinese EV manufacturer that focuses on developing advanced battery technology and sustainable energy solutions. Their vehicles, such as the G3 and P7, are designed with both performance and sustainability in mind.
4. **Rivian Inc. (RIVN)** - While not listed in the provided co