In [1]:
!pip install langchain langchain_community langchain-chroma langchain_text_splitters -U -q
!pip install sentence-transformers -q

In [2]:
import os
import itertools
import pickle

In [3]:
from google.colab import drive
import os

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Change directory to where your notebook is located
notebook_directory = '/content/drive/My Drive/Colab Notebooks/2024 NLP GenAI/NLP/project'
os.chdir(notebook_directory)

# Step 3: Verify the current working directory
print("Current working directory:", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/My Drive/Colab Notebooks/2024 NLP GenAI/NLP/project


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.schema import Document


emb_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        # is_separator_regex = False,
    )

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [6]:
# Config
# ticker = ['AMD']
ticker = ['NVDA', 'AMD', 'INTC', 'QCOM', 'AVGO', 'DELL']
year = ['2024']
combinations = list(itertools.product(ticker, year))

### DB for markdown based SEC filings

In [7]:
markdown_dir = "./output/SEC_EDGAR_FILINGS_MD"

md_content_list = []

for combo in combinations:
  ticker, year = combo
  for md_dirs in os.listdir(os.path.join(markdown_dir,f"{ticker}-{year}")):
    md_file_path = os.path.join(markdown_dir,f"{ticker}-{year}",md_dirs,f"{md_dirs}.md")
    print(f'{ticker}-{year}-{md_dirs}')
    with open(md_file_path, 'r') as file:
      content = file.read()
    md_content_list.append([content, ticker, year, '-'.join(md_dirs.split('-')[-2:])])

NVDA-2024-nvda-20240428-10-Q2
NVDA-2024-nvda-20240128-10-K
AMD-2024-amd-20240330-10-Q1
INTC-2024-intc-20240330-10-Q1
QCOM-2024-qcom-20240324-10-Q1
AVGO-2024-avgo-20240505-10-Q2
AVGO-2024-avgo-20240204-10-Q1
DELL-2024-dell-20240503-10-Q2


In [8]:
sec_markdown_docs = []

for md_content in md_content_list:
  md_header_splits = markdown_splitter.split_text(md_content[0])
  for md_header_docs in md_header_splits:
    # Add extra metadata
    md_header_docs.metadata.update({"ticker":md_content[1]})
    md_header_docs.metadata.update({"year":md_content[2]})
    md_header_docs.metadata.update({"filing_type":md_content[3]})
  sec_markdown_docs.extend(md_header_splits)

In [9]:
%%time
sec_filings_md_db = Chroma.from_documents(sec_markdown_docs, emb_fn,
                                          persist_directory="./sec-filings-md-db",
                                          collection_name="sec_filings_md")

CPU times: user 55.7 s, sys: 5.52 s, total: 1min 1s
Wall time: 1min 2s


### DB for Earnings Call

In [10]:
earnings_call_dir = "./output/earnings_call"

earnings_call_list = []

for combo in combinations:
  ticker, year = combo
  ec_file_path = os.path.join(earnings_call_dir,f"{ticker}-{year}","earnings_calls.pkl")
  print(f'{ticker}-{year}-earnings_call')
  with open(ec_file_path, 'rb') as file:
    loaded_earnings_call = pickle.load(file)
  # loaded_earnings_call['ticker'] = ticker
  # loaded_earnings_call['year'] = year
  earnings_call_list.append(loaded_earnings_call)

NVDA-2024-earnings_call
AMD-2024-earnings_call
INTC-2024-earnings_call
QCOM-2024-earnings_call
AVGO-2024-earnings_call
DELL-2024-earnings_call


In [11]:
earnings_call_docs = []

for earnings_call in earnings_call_list:
  earnings_calls_splits = text_splitter.split_documents(earnings_call['earnings_docs'])
  earnings_call_docs.extend(earnings_calls_splits)

In [12]:
%%time
earnings_call_db = Chroma.from_documents(earnings_calls_splits, emb_fn,
                                         persist_directory="./earnings-call-db",
                                         collection_name="earnings_call")

CPU times: user 31.2 s, sys: 2.48 s, total: 33.6 s
Wall time: 35.1 s


### Queries

In [14]:
company_ticker = 'AMD'

query_1 = (
    f"Industry and Market Analysis: "
    f"1. What are the main trends and drivers in company {company_ticker}? "
    f"2. Who are the major competitors in this industry? "
    f"3. What is the company's market share relative to its competitors?"
)

query_2 = (
    f"Financial Performance:"
    f"1. For company {company_ticker}, what has been the company's historical financial performance?"
    f"2. How have revenue, profits, and key financial metrics (e.g., EBITDA, ROE, ROA) evolved over time?"
    f"3. What are the company's main sources of revenue and profit?"
)

In [16]:
relevant_docs = sec_filings_md_db.similarity_search(query_1, k=1)
relevant_docs[0]

Document(metadata={'Header 1': 'Advanced Micro Devices, Inc. (Exact Name Of Registrant As Specified In Its Charter)', 'Header 2': 'Notes To Condensed Consolidated Financial Statements (Unaudited) Note 1 - The Company', 'filing_type': '10-Q1', 'ticker': 'AMD', 'year': '2024'}, page_content="Advanced Micro Devices, Inc. is a global semiconductor company. References herein to AMD or the Company mean Advanced Micro Devices, Inc. and its consolidated subsidiaries. AMD's products include x86 microprocessors (CPUs) and graphics processing units (GPUs), as standalone devices or as incorporated into accelerated processing units (APUs), chipsets, data center and professional GPUs, embedded processors, semi-custom System-on-Chip (SoC) products, microprocessor and SoC development services and technology, data processing units (DPUs), Field Programmable Gate Arrays (FPGAs), System on Modules (SOMs), Smart Network Interface Cards (SmartNICs), Artificial Intelligence (AI) Accelerators and Adaptive So

In [17]:
relevant_docs = earnings_call_db.similarity_search(query_1, k=3)
relevant_docs[0]

Document(metadata={'quarter': 'Q1', 'speaker': 'Wamsi Mohan'}, page_content="Yes, thank you so much. I was wondering if you could talk about how you're thinking about pricing and share? And maybe also reference, how you're looking at elasticity of demand as a response to pricing? I think, Yvonne, you just said it's definitely a more competitive environment. So, where is Dell going to be kind of flexing more in its ability to drive more share with price, and in what areas do you think that that won't be quite as much, where there might not be as much elasticity of demand? Thank you so much.")

In [18]:
relevant_docs = sec_filings_md_db.similarity_search(query_2, k=3)
relevant_docs[0]

Document(metadata={'Header 1': 'Advanced Micro Devices, Inc. (Exact Name Of Registrant As Specified In Its Charter)', 'Header 2': 'Note 2 - Basis Of Presentation And Significant Accounting Policies', 'filing_type': '10-Q1', 'ticker': 'AMD', 'year': '2024'}, page_content="Basis of *Presentation.* The accompanying unaudited condensed consolidated financial statements of AMD have been prepared in accordance with U.S. generally accepted accounting principles (U.S. GAAP) for interim financial information and the instructions to Form 10-Q and Article 10 of Regulation S-X. The results of operations for the three months ended March 30, 2024 shown in this report are not necessarily indicative of results to be expected for the full year ending December 28, 2024 or any other future period. In the opinion of the Company's management, the information contained herein reflects all adjustments necessary for a fair presentation of the Company's results of operations, financial position, cash flows and

In [19]:
relevant_docs = earnings_call_db.similarity_search(query_2, k=3)
relevant_docs[0]

Document(metadata={'quarter': 'Q4', 'speaker': 'Rob Williams'}, page_content="Thanks, everyone, for joining us. With me today are Jeff Clarke, Yvonne McGill and Tyler Johnson. Our earnings materials are available on our IR website, and I encourage you to review these materials and the presentation, which includes additional content to complement our discussion this afternoon. Guidance will be covered on today's call. During this call, unless otherwise indicated, all references to financial measures refer to non-GAAP financial measures, including non-GAAP gross margin, operating expenses, operating income, net income, diluted earnings per share and adjusted free cash flow. A reconciliation of these measures to their most directly comparable GAAP measures can be found in our web deck and our press release. Growth percentages refer to year-over-year change unless otherwise specified. Statements made during this call that relate to future results and events are forward-looking statements b

### Load and examine the persistent databases

In [30]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
load_sec_filings_md_db = Chroma(persist_directory="./sec-filings-md-db",
                                embedding_function=emb_fn,
                                collection_name="sec_filings_md")
# load_earnings_call_db = Chroma(persist_directory="./earnings-call-db", embedding_function=emb_fn)



In [32]:
# load_sec_filings_md_db.get()
# if not specifying collection_name, then the line above return a dictionary with empty values.

In [21]:
import sqlite3

# Path to your SQLite database file
db_path = './sec-filings-md-db/chroma.sqlite3'

# Connect to the SQLite database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# List all tables
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("Tables in the database:")
for table in tables:
    print(table[0])

Tables in the database:
migrations
embeddings_queue
collection_metadata
segments
segment_metadata
tenants
databases
collections
embeddings
embedding_metadata
max_seq_id
embedding_fulltext_search
embedding_fulltext_search_data
embedding_fulltext_search_idx
embedding_fulltext_search_content
embedding_fulltext_search_docsize
embedding_fulltext_search_config


In [24]:
for table in tables:
  cursor.execute(f"SELECT COUNT(*) FROM {table[0]}")
  num_documents = cursor.fetchone()[0]
  print(f"Table {table[0]} has {num_documents} rows.")

Table migrations has 11 rows.
Table embeddings_queue has 387 rows.
Table collection_metadata has 0 rows.
Table segments has 4 rows.
Table segment_metadata has 0 rows.
Table tenants has 1 rows.
Table databases has 1 rows.
Table collections has 2 rows.
Table embeddings has 387 rows.
Table embedding_metadata has 2267 rows.
Table max_seq_id has 1 rows.
Table embedding_fulltext_search has 387 rows.
Table embedding_fulltext_search_data has 1145 rows.
Table embedding_fulltext_search_idx has 945 rows.
Table embedding_fulltext_search_content has 387 rows.
Table embedding_fulltext_search_docsize has 387 rows.
Table embedding_fulltext_search_config has 1 rows.


In [33]:
in_session_db_output = sec_filings_md_db.similarity_search(query_1, k=1)

loaded_db_output = load_sec_filings_md_db.similarity_search(query_1, k=1)

in_session_db_output[0] == loaded_db_output[0]

True