### Instantiate a Box client

In [1]:
import os
from dotenv import load_dotenv
from box_sdk_gen import (
    BoxCCGAuth, 
    CCGConfig, 
    BoxClient, 
    FileTokenStorage,
    BoxAPIError,
)
# Read the .env configuration file
load_dotenv()

client_id = os.getenv("BOX_CLIENT_ID")
client_secret = os.getenv("BOX_CLIENT_SECRET")
user_id = os.getenv("BOX_SUBJECT_ID")

# Create a BoxCCGConfig instance
config = CCGConfig(client_id=client_id,
                   client_secret=client_secret,
                   user_id=user_id,
                   token_storage=FileTokenStorage(".ccg.db"))
# Create a BoxCCGAuth instance
auth = BoxCCGAuth(config)
# Create a BoxClient instance
client = BoxClient(auth)
# Test the connection
try:
    user = client.users.get_user_me()
    print(f"Connected to Box as {user.name} ({user.id})")
except BoxAPIError as e:
    print(f"Failed to connect to Box: {e}")
    exit(1)

Connected to Box as RB Admin (31519033281)


In [2]:
from box_sdk_gen import CreateFolderParent, UploadFileAttributes, UploadFileAttributesParentField

# Upload sample data to the Box instance

def upload_folder(client: BoxClient, folder_id, local_folder_path)->str:
    # Create a new folder in Box
    try:
        box_folder = client.folders.create_folder(
            name=os.path.basename(local_folder_path),
            parent=CreateFolderParent(id=folder_id))
    except BoxAPIError as e:
        if e.response_info.body['status'] == 409:
            # Folder already exists, get its ID
            box_folder = client.folders.get_folder_by_id(
                e.response_info.body['context_info']['conflicts'][0]['id'])

    print(f"Created folder: {box_folder.name} ({box_folder.id})")

    # Upload files to the new folder
    local_folder_path = os.path.abspath(local_folder_path)
    for root, _, files in os.walk(local_folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            parent = UploadFileAttributesParentField(id=box_folder.id,
                                                     type="folder")
            file_attributes = UploadFileAttributes(
                name=file_name,
                parent=parent,
            )
            with open(file_path, 'rb') as file_stream:
                
                try:
                    box_file = client.uploads.upload_file(
                        attributes=file_attributes, file=file_stream).entries[0]
                    print(f"Uploaded file: {box_file.name} ({box_file.id})")
                except BoxAPIError as e:
                    if e.response_info.body['status'] == 409:
                        print(f"File already exists: {file_name} ({e.response_info.context_info['conflicts']['id']})")

    return box_folder.id
local_folder_path = "sample_data/Q4 Tech earnings-Demo"
box_folder_id = upload_folder(client, "0", local_folder_path)

Created folder: Q4 Tech earnings-Demo (323228490092)
File already exists: Apple_analysis.docx (1874924987657)
File already exists: Tesla_analysis.docx (1874969973544)
File already exists: Microsoft_analysis.docx (1874972038992)
File already exists: Meta_analysis.docx (1874954337477)
File already exists: NVIDIA_analysis.docx (1874960079989)


In [3]:
import os
load_dotenv()
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
MONGODB_URI=os.getenv("MONGODB_URI")

In [4]:
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
embedding_model = OpenAIEmbeddings()
# Instantiate vector store
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
   connection_string = MONGODB_URI,
   namespace = "langchain_db.earnings_rag",
   embedding=embedding_model,
   index_name="vector_index"
)

In [5]:
from langchain_box.document_loaders import BoxLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

auth_token = auth.retrieve_token().access_token
# Load the PDF
loader = BoxLoader(box_developer_token=auth_token,
                   box_folder_id=box_folder_id,
                   )
data = loader.load()

# Split PDF into documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
docs = text_splitter.split_documents(data)

# Add data to the vector store
vector_store.add_documents(docs)

# Use helper method to create the vector search index
vector_store.create_vector_search_index(
   dimensions = 1536
)

In [6]:
import pprint
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

llm = ChatOpenAI(model = "gpt-4o")

# Instantiate Atlas Vector Search as a retriever
retriever = vector_store.as_retriever()

# Define prompt template
template = """
Use the following pieces of context to answer the question at the end.
{context}
Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

def format_docs(docs):
   return "\n\n".join(doc.page_content for doc in docs)

# Create chain
rag_chain = (
   {"context": retriever | format_docs, "question": RunnablePassthrough()}
   | custom_rag_prompt
   | llm
   | StrOutputParser()
)
# Prompt the chain
question = "What are the major tech companies concerned with in the next few months?"
answer = rag_chain.invoke(question)
pprint.pprint(answer)

# Return source documents
documents = retriever.invoke(question)
print("\nSource documents:")
pprint.pprint(documents)

('The major tech companies are likely concerned with the trajectory around '
 'infrastructure and capital expenditures (capex) in the next few months. '
 'However, it is indicated that it might be too early to have a strong opinion '
 'on what this means for them.')

Source documents:
[Document(id='68361740d5e188fcf0b37b30', metadata={'_id': '68361740d5e188fcf0b37b30', 'source': 'https://dl.boxcloud.com/api/2.0/internal_files/1874969973544/versions/2067810227944/representations/extracted_text/content/', 'title': 'Tesla_analysis_docx'}, page_content='companies that are outside of Tesla in maybe the second half of next year, something like that.'),
 Document(id='683624d1ad50fb0f8fa49f97', metadata={'_id': '683624d1ad50fb0f8fa49f97', 'source': 'https://dl.boxcloud.com/api/2.0/internal_files/1874969973544/versions/2067810227944/representations/extracted_text/content/', 'title': 'Tesla_analysis_docx'}, page_content='companies that are outside of Tesla in maybe the second half of next year, 