### Box Client Initialization
- Establishes connection to Box using CCG (Client Credentials Grant) authentication
- Verifies connection by retrieving user information

In [1]:
import os
from dotenv import load_dotenv
from box_sdk_gen import (
    BoxCCGAuth, 
    CCGConfig, 
    BoxClient, 
    FileTokenStorage,
    BoxAPIError,
)
# Read the .env configuration file
load_dotenv()

client_id = os.getenv("BOX_CLIENT_ID")
client_secret = os.getenv("BOX_CLIENT_SECRET")
user_id = os.getenv("BOX_SUBJECT_ID")

# Fail fast with a readable message when a setting is absent; otherwise None
# would be handed to CCGConfig and the problem would only show up later.
_required_settings = {
    "BOX_CLIENT_ID": client_id,
    "BOX_CLIENT_SECRET": client_secret,
    "BOX_SUBJECT_ID": user_id,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing Box settings (set them in .env or the environment): "
        + ", ".join(_missing_settings)
    )

# Create a CCGConfig instance; tokens are persisted in the local ".ccg.db" file
box_config = CCGConfig(client_id=client_id,
                   client_secret=client_secret,
                   user_id=user_id,
                   token_storage=FileTokenStorage(".ccg.db"))
# Create a BoxCCGAuth instance
box_auth = BoxCCGAuth(box_config)
# Create a BoxClient instance
box_client = BoxClient(box_auth)
# Test the connection by fetching the current user
try:
    user = box_client.users.get_user_me()
except BoxAPIError as e:
    print(f"Failed to connect to Box: {e}")
    # Re-raise instead of calling exit(): the exception halts "Run All" and
    # keeps the traceback, so later cells never run without a working client.
    raise
print(f"Connected to Box as {user.name} ({user.id})")

Connected to Box as RB Admin (31519033281)


### Document Upload
- Creates a new folder in Box root directory
- Uploads all documents from the local sample data folder
- Handles duplicate files gracefully

In [2]:
from box_sdk_gen import CreateFolderParent, UploadFileAttributes, UploadFileAttributesParentField

# Upload sample data to the Box instance

def upload_folder(client: BoxClient, folder_id, local_folder_path)->str:
    """Mirror a local folder into Box and return the Box folder's ID.

    A Box folder named after the last component of ``local_folder_path`` is
    created under ``folder_id``. If Box reports a name conflict (HTTP 409),
    the existing folder is reused instead. Every file found by walking the
    local folder is then uploaded directly into that one Box folder; files in
    nested local sub-directories are not placed in matching sub-folders.

    Args:
        client: Box client used for the folder and upload calls.
        folder_id: ID of the Box folder that will contain the new folder.
        local_folder_path: Path of the local folder to upload.

    Returns:
        The ID of the Box folder that was created or reused.

    Raises:
        BoxAPIError: For any Box error other than a 409 name conflict.
    """
    # Normalize before taking the basename so a trailing slash does not
    # produce an empty folder name.
    local_folder_path = os.path.abspath(local_folder_path)

    # Create a new folder in Box
    try:
        box_folder = client.folders.create_folder(
            name=os.path.basename(local_folder_path),
            parent=CreateFolderParent(id=folder_id))
        print(f"Created folder: {box_folder.name} ({box_folder.id})")
    except BoxAPIError as e:
        if e.response_info.body['status'] != 409:
            # Anything other than a name conflict is a real failure; without
            # this re-raise box_folder would be unbound below.
            raise
        # Folder already exists, get its ID
        box_folder = client.folders.get_folder_by_id(
            e.response_info.body['context_info']['conflicts'][0]['id'])
        print(f"Folder already exists: {box_folder.name} ({box_folder.id})")

    # Every upload targets the same Box folder, so build the parent once.
    parent = UploadFileAttributesParentField(id=box_folder.id,
                                             type="folder")

    # Upload files to the folder
    for root, _, files in os.walk(local_folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            file_attributes = UploadFileAttributes(
                name=file_name,
                parent=parent,
            )
            with open(file_path, 'rb') as file_stream:
                try:
                    box_file = client.uploads.upload_file(
                        attributes=file_attributes, file=file_stream).entries[0]
                    print(f"Uploaded file: {box_file.name} ({box_file.id})")
                except BoxAPIError as e:
                    if e.response_info.body['status'] != 409:
                        # Do not hide genuine upload failures.
                        raise
                    print(f"File already exists: {file_name} ({e.response_info.context_info['conflicts']['id']})")

    return box_folder.id
# Local sample documents; they are uploaded beneath Box folder "0".
local_folder_path = "../sample_data/Q4 Tech earnings-Demo"
box_folder_id = upload_folder(
    client=box_client,
    folder_id="0",
    local_folder_path=local_folder_path,
)

Created folder: Q4 Tech earnings-Demo (323228490092)
File already exists: Apple_analysis.docx (1874924987657)
File already exists: Tesla_analysis.docx (1874969973544)
File already exists: Microsoft_analysis.docx (1874972038992)
File already exists: Meta_analysis.docx (1874954337477)
File already exists: NVIDIA_analysis.docx (1874960079989)


In [3]:
# Settings for the other services, taken from the process environment
# (either value is None when its variable is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MONGODB_URI = os.environ.get("MONGODB_URI")

### Document Processing
- Uses `BoxLoader` from LangChain to load documents from Box
- Splits documents into chunks (200 characters with a 20-character overlap)
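- Prepares the chunks for embedding with OpenAI's embedding model (the embeddings themselves are generated when the chunks are added to the vector store in the next section)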

In [4]:
from langchain_box.document_loaders import BoxLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch an access token from the CCG auth object and hand it to the loader.
auth_token = box_auth.retrieve_token().access_token

# Load the documents from the Box folder used in the upload step.
loader = BoxLoader(
    box_developer_token=auth_token,
    box_folder_id=box_folder_id,  # type: ignore
)
data = loader.load()

# Split the loaded documents into chunks of 200 with an overlap of 20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
)
docs = text_splitter.split_documents(data)

### Vector Store Setup
- Stores document chunks and embeddings in MongoDB Atlas
- Creates a vector search index with 1536 dimensions (OpenAI embedding size)
- Enables semantic search capabilities

In [5]:
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
# Embedding model with the library's default settings.
embedding_model = OpenAIEmbeddings()

# Vector store backed by the "langchain_db.earnings_rag" namespace.
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=MONGODB_URI,
    namespace="langchain_db.earnings_rag",
    embedding=embedding_model,
    index_name="vector_index",
)

# Store the document chunks in the vector store.
vector_store.add_documents(docs)

# Create the vector search index through the store's helper method.
vector_store.create_vector_search_index(dimensions=1536)

### RAG Chain Creation
- Combines document retrieval with language model generation
- Uses a custom prompt template for context-aware responses
- Returns both answers and source document citations

In [6]:
import pprint
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt: the retrieved context is placed ahead of the user's question.
template = """
Use the following pieces of context to answer the question at the end.
{context}
Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

# Chat model that writes the answer.
llm = ChatOpenAI(model="gpt-4o")

# Expose the Atlas vector store as a retriever.
retriever = vector_store.as_retriever()

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Build the chain: the input mapping feeds the prompt, then the model,
# then the parser that turns the model output into a string.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | custom_rag_prompt | llm | StrOutputParser()

# Ask a question through the chain and show the answer.
question = "What are the major tech companies challenges for 2026?"
answer = rag_chain.invoke(question)
pprint.pprint(answer)

# Query the retriever directly to list the source documents for the question.
documents = retriever.invoke(question)
print("\nSource documents:")
pprint.pprint(documents)

('The provided context focuses on asking about Apple specifically and does not '
 'offer detailed insights into the challenges major tech companies may face in '
 '2026. However, based on general industry trends and challenges faced by tech '
 'companies, some possible headwinds for major tech companies in 2026 could '
 'include:\n'
 '\n'
 '1. **Shifting Consumer Preferences:** As the context mentions, there could '
 'be shifting preferences for Western technology, possibly influenced by '
 'geopolitical tensions or cultural factors.\n'
 '\n'
 '2. **Regulatory Constraints:** Increasing scrutiny by governments worldwide '
 'may lead to stricter regulations concerning data privacy, antitrust issues, '
 "and digital market exclusivity, impacting tech companies' operations and "
 'strategies.\n'
 '\n'
 '3. **Innovation Pressure:** The rapid pace of technological advancement '
 'requires continuous innovation, which can be both a challenge and a '
 'necessity for staying competitive, especi