In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredPowerPointLoader

### Load Documents
## PowerPoint Presentations
# Location of the PowerPoint deck to ingest; point this at your own .pptx file.
file_path = "/home/datascience/demo/pptx/nacie_integration_performance_discussion_20250814.pptx"

# Build a loader for that file and read it into LangChain Document objects.
loader = UnstructuredPowerPointLoader(file_path)
documents = loader.load()

# `documents` now holds the loaded Document objects. Print the metadata of the
# first one as a quick sanity check (uncomment the next line to dump its text).
#print(documents[0].page_content)
first_document = documents[0]
print(first_document.metadata)

## Split Text
# Settings handed to RecursiveCharacterTextSplitter:
#   chunk_size         - upper bound on the size of each chunk.
#   chunk_overlap      - amount shared by consecutive chunks so context carries over.
#   length_function    - len, so sizes are measured in characters.
#   is_separator_regex - False, so separators are treated literally.
# A coarser alternative tried earlier: chunk_size=1000, chunk_overlap=50.
splitter_options = {
    "chunk_size": 100,
    "chunk_overlap": 20,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks.
split_docs = text_splitter.split_documents(documents)

# Report how many documents went in and how many chunks came out.
print(f"Number of original documents: {len(documents)}")
print(f"Number of split chunks: {len(split_docs)}\n")

# Uncomment to dump every chunk:
#for i, chunk in enumerate(split_docs):
#    print(f"Chunk {i+1}:\n{chunk.page_content}\n---")


{'source': '/home/datascience/demo/pptx/nacie_integration_performance_discussion_20250814.pptx'}
Number of original documents: 1
Number of split chunks: 208



In [6]:
### Embed Documents
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA

# Connection settings for the OCI Generative AI embedding model, gathered in
# one mapping so they are easy to read and tweak before the client is built.
embedding_settings = {
    "model_id": "cohere.embed-english-light-v3.0",
    "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    "compartment_id": "ocid1.compartment.oc1..aaaaaaaa52sp42nqmtwwzzvmp5mmldri26razhrbyw7cvixmims7p5crsg7a",
}
oci_embeddings = OCIGenAIEmbeddings(**embedding_settings)

# Embed the chunk texts in fixed-size batches, one embed_documents() call per
# batch, collecting the vectors in chunk order.
EMBED_BATCH_SIZE = 16  # number of chunk texts sent per embed_documents() call

l = len(split_docs)
embeddings = []
# Iterate over batch start offsets. range() stops before `l`, so no empty
# trailing batch is issued when the chunk count is an exact multiple of the
# batch size (e.g. 208 chunks = 13 full batches), and no call at all is made
# when there are no chunks.
for batch_start in range(0, l, EMBED_BATCH_SIZE):
    batch_docs = split_docs[batch_start:batch_start + EMBED_BATCH_SIZE]
    subdocs = [item.page_content for item in batch_docs]
    embeddings.extend(oci_embeddings.embed_documents(subdocs))



In [7]:
### Build Vector Store (FAISS)
# Build the FAISS index from the embeddings that were already computed, pairing
# each chunk's text with its vector by position.
texts = [item.page_content for item in split_docs]

# zip() stops at the shorter input, so a stale or partial `embeddings` list
# (for example after re-splitting without re-running the embedding cell) would
# silently leave chunks out of the index. Fail loudly instead.
if len(texts) != len(embeddings):
    raise ValueError(
        f"Expected one embedding per chunk, but got {len(embeddings)} embeddings "
        f"for {len(texts)} chunks; re-run the embedding cell."
    )

text_embedding_pairs = list(zip(texts, embeddings))
vectorstore = FAISS.from_embeddings(text_embedding_pairs, oci_embeddings)

In [11]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOCIGenAI

### Build Chain (OCI chat-based Retrieval QA)
# Retriever over the FAISS store, using the store's default search settings.
retriever = vectorstore.as_retriever()

# Alternative prompts kept for experimentation.
# Strict, context-only variant:
#rag_prompt_template = """Answer the question based only on the following context:
#{context}
#Question: {question}
#"""
# Fallback variant that also tells the model not to mention its source:
#rag_prompt_template = """Try to answer the question using only the following context, but, if that fails, use your general knowledge.  You don't have to mention whether you did or did not use the context.
#{context}
#Question: {question}
#"""

# Active prompt: prefer the retrieved context, fall back to general knowledge.
# The instruction previously read "Try to the question ..." (missing verb);
# it now states the task explicitly. {context} and {question} are the
# template's input variables.
rag_prompt_template = """Try to answer the question using only the following context, but, if that fails, use your general knowledge.
{context}
Question: {question}
"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

# Generation settings forwarded to the chat model through model_kwargs.
generation_params = {"temperature": 0.7, "max_tokens": 500}

# Chat model client for OCI Generative AI.
oci_chat = ChatOCIGenAI(
    compartment_id="ocid1.compartment.oc1..aaaaaaaa52sp42nqmtwwzzvmp5mmldri26razhrbyw7cvixmims7p5crsg7a",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    model_id="cohere.command-a-03-2025",
    model_kwargs=generation_params,
)

# Retrieval QA chain: the retriever supplies the context, and the custom
# prompt is handed over through chain_type_kwargs.
prompt_overrides = {"prompt": rag_prompt}
rag = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=oci_chat,
    chain_type_kwargs=prompt_overrides,
)

# Put each demo question to the chain in turn and print what invoke() returns.
demo_questions = (
    "What are the 4 pillars of performance?",
    "Why is Productivity important?",
    "Can I ignore any of the 4 pillars of performance?",
)
for question in demo_questions:
    print(rag.invoke(question))

{'query': 'What are the 4 pillars of performance?', 'result': 'The 4 pillars of performance, as outlined in the context, are:\n\n1. **Growth**  \n2. **Productivity**  \n3. **Health**  \n4. **Innovation**'}
{'query': 'Why is Productivity important?', 'result': "Based on the provided context, Productivity is important because it **measures individual contribution to revenue**. This suggests that tracking productivity helps assess how much each person is directly impacting the company's financial performance."}
{'query': 'Can I ignore any of the 4 pillars of performance?', 'result': 'Based on the provided context, the 4 pillars of performance are:\n\n1. **Growth**\n2. **Productivity**\n3. **Health**\n4. **Innovation**\n\nThe context emphasizes the "Evolving 4-Pillar Approach," suggesting that these pillars are interconnected and essential for overall performance. There is no indication that any pillar can be ignored. In fact, the repetition of the pillars implies their equal importance.\n