# 1 - Main components

## Chat model

In [1]:
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

# Load the variables from .env
load_dotenv()

# Initialize your model
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite")

## Embeddings model

In [2]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

## Vector Store

In [3]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="earnings_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# 2 - Indexing

## Load documents

In [None]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="transcripts/TSLA_Q4_2025.json",
    # '.' refers to the root object of the JSON file
    jq_schema=".", 
    # Use the dot-notation path directly in the content_key
    content_key="full_conference_call_transcript",
    text_content=True
)

docs = loader.load()

, and goodbye.

**Vaibhav Taneja:** Alright. Cool.


In [21]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader

# 1. Define the parameters for each JSON file
loader_kwargs = {
    "jq_schema": ".",
    "content_key": "full_conference_call_transcript",
    "text_content": True
}

# 2. Use DirectoryLoader to find all .json files in your transcripts folder
loader = DirectoryLoader(
    path="./transcripts/",
    glob="*.json",
    loader_cls=JSONLoader,
    loader_kwargs=loader_kwargs
)

# 3. Load all files into a single list of documents
docs = loader.load()

# Check how many documents were loaded
print(f"Loaded {len(docs)} documents.")

# Accessing content from different files
for doc in docs:
    print(f"Source: {doc.metadata.get('source')}")
    print(f"Excerpt: {doc.page_content[:50]}...\n")

Loaded 2 documents.
Source: /Users/blakefery/ml-workspace/ev-earnings-rag/transcripts/TSLA_Q4_2025.json
Excerpt: **Elon Musk:** Thanks, Travis. So I have updated t...

Source: /Users/blakefery/ml-workspace/ev-earnings-rag/transcripts/RIVN_Q4_2025.json
Excerpt: Robert Scaringe: Thanks, Chip. Good afternoon, eve...



## Split documents

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

print(f"Split earnings into {len(all_splits)} sub-documents.")

Split earnings into 136 sub-documents.


## Storing documents

In [23]:
document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])

['05ed1ec8-2ddc-4da1-97b7-95b75d48602b', 'eb99ec01-6d51-4e16-8e57-502406bb597d', '16a04be4-b00c-447b-92df-91535719ee31']


In [24]:
print(len(document_ids))

136


# 3 - Retrieval and generation

## RAG agent

In [25]:
from langchain.tools import tool

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    retrieved_docs = vector_store.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [26]:
from langchain.agents import create_agent


tools = [retrieve_context]
# If desired, specify custom instructions
prompt = (
    "You have access to a tool that retrieves context from company earnings calls transcripts. "
    "Use the tool to help answer user queries."
)
agent = create_agent(model, tools, system_prompt=prompt)

In [28]:
query = (
    "What is the first comment Robert Scaringe makes?"
)

for event in agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    event["messages"][-1].pretty_print()


What is the first comment Robert Scaringe makes?
Tool Calls:
  retrieve_context (bc82bfc7-dc96-4e9a-bd4c-041d65203cb2)
 Call ID: bc82bfc7-dc96-4e9a-bd4c-041d65203cb2
  Args:
    query: What is the first comment Robert Scaringe makes?
Name: retrieve_context

Source: {'seq_num': 1, 'start_index': 0, 'source': '/Users/blakefery/ml-workspace/ev-earnings-rag/transcripts/RIVN_Q4_2025.json'}
Content: Robert Scaringe: Thanks, Chip. Good afternoon, everyone, and thanks for joining us for today's call. 2025 was a year focused on execution at Rivian as we laid the foundations for scaling our business. Our team progressed the development of our technology road map in R2, while simultaneously driving continued improvement in our customer experience and our path to profitability. In founding Rivian, I wanted to demonstrate how a clean sheet technology-focused vehicle could eliminate long accepted compromises and provide consumers choice. Our goal with the launch of our R1 products was to establish 