In [89]:
### Load Documents
# All loader/splitter imports for this cell live together so the cell runs on a
# fresh kernel. TextLoader was previously used without being imported.
from langchain_community.document_loaders import (
    TextLoader,
    UnstructuredPowerPointLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter

## PowerPoint Presentations
# Replace with the actual path to your PowerPoint file
file_path = "./example.pptx"

# Load the presentation. `data` holds a list of LangChain Document objects;
# inspect them with data[0].page_content / data[0].metadata.
# NOTE(review): `data` is not used by the text pipeline below - confirm whether
# this load is still needed.
pptx_loader = UnstructuredPowerPointLoader(file_path)
data = pptx_loader.load()

## Load Text
loader = TextLoader("mgs5/mgs5_cassette_tapes.txt")
documents = loader.load()

## Split Text
# CHUNK_SIZE: maximum size of each chunk, measured by `length_function`
#             (characters here, because length_function=len).
# CHUNK_OVERLAP: number of characters shared between consecutive chunks,
#                helping to maintain context across chunk boundaries.
# An alternative configuration that was tried: CHUNK_SIZE=1000, CHUNK_OVERLAP=50.
CHUNK_SIZE = 100
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,  # Use character length for chunk size
    is_separator_regex=False,  # Treat separators literally
)

# Split the loaded documents
split_docs = text_splitter.split_documents(documents)

# Summarize the split; uncomment the loop below to dump every chunk.
print(f"Number of original documents: {len(documents)}")
print(f"Number of split chunks: {len(split_docs)}\n")

#for i, chunk in enumerate(split_docs):
#    print(f"Chunk {i+1}:\n{chunk.page_content}\n---")


Number of original documents: 1
Number of split chunks: 4920



In [90]:
### Embed Documents
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA

oci_embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-english-light-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaa52sp42nqmtwwzzvmp5mmldri26razhrbyw7cvixmims7p5crsg7a",
)

# Number of chunks sent to the embedding service per request.
# NOTE(review): 16 is presumably chosen to stay under a per-request input
# limit of the service - confirm against the OCI GenAI documentation.
EMBED_BATCH_SIZE = 16

# Embed the chunks batch by batch, keeping the vectors in the same order as
# `split_docs`. Stepping the range by the batch size (instead of iterating
# `len // 16 + 1` times) never produces an empty trailing batch when the chunk
# count is an exact multiple of the batch size, and does nothing at all when
# there are no chunks.
embeddings = []
for start in range(0, len(split_docs), EMBED_BATCH_SIZE):
    batch = split_docs[start:start + EMBED_BATCH_SIZE]
    #print(f"Embedding from index {start} to {start + EMBED_BATCH_SIZE}...")
    batch_texts = [doc.page_content for doc in batch]
    embeddings.extend(oci_embeddings.embed_documents(batch_texts))



Embedding from index 0 to 16...
Embedding from index 16 to 32...
Embedding from index 32 to 48...
Embedding from index 48 to 64...
Embedding from index 64 to 80...
Embedding from index 80 to 96...
Embedding from index 96 to 112...
Embedding from index 112 to 128...
Embedding from index 128 to 144...
Embedding from index 144 to 160...
Embedding from index 160 to 176...
Embedding from index 176 to 192...
Embedding from index 192 to 208...
Embedding from index 208 to 224...
Embedding from index 224 to 240...
Embedding from index 240 to 256...
Embedding from index 256 to 272...
Embedding from index 272 to 288...
Embedding from index 288 to 304...
Embedding from index 304 to 320...
Embedding from index 320 to 336...
Embedding from index 336 to 352...
Embedding from index 352 to 368...
Embedding from index 368 to 384...
Embedding from index 384 to 400...
Embedding from index 400 to 416...
Embedding from index 416 to 432...
Embedding from index 432 to 448...
Embedding from index 448 to 464...

In [91]:
### Build Vector Store (FAISS)
# The vectors were computed up front in the embedding cell, so the index is
# built from (text, vector) pairs rather than re-embedding the texts.
# `oci_embeddings` is still handed to FAISS as the embedding model.
texts = [item.page_content for item in split_docs]

# zip() silently truncates to the shorter input. Fail loudly instead if the
# embedding cell was interrupted or is stale relative to `split_docs`, because
# otherwise the index would quietly cover only part of the corpus.
if len(texts) != len(embeddings):
    raise ValueError(
        f"Have {len(texts)} chunks but {len(embeddings)} embeddings; "
        "re-run the embedding cell before building the vector store."
    )

text_embedding_pairs = list(zip(texts, embeddings))
vectorstore = FAISS.from_embeddings(text_embedding_pairs, oci_embeddings)

In [92]:
### Build Chain (OCI chat-based Retrieval QA)
# Imported here so this cell runs on a fresh kernel. Both names were
# previously used without being imported anywhere in the notebook.
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate

retriever = vectorstore.as_retriever()

# The prompt prefers the retrieved context but lets the model fall back to its
# general knowledge. A stricter variant that was tried:
#   "Answer the question based only on the following context:"
# The instruction sentence was missing its verb ("Try to the question ...").
rag_prompt_template = """Try to answer the question using only the following context, but, if that fails, use your general knowledge.
{context}
Question: {question}
"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

oci_chat = ChatOCIGenAI(
    model_id="cohere.command-a-03-2025",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaa52sp42nqmtwwzzvmp5mmldri26razhrbyw7cvixmims7p5crsg7a",
    model_kwargs={"temperature": 0.7, "max_tokens": 500},
)

rag = RetrievalQA.from_chain_type(
    llm=oci_chat,
    retriever=retriever,
    chain_type_kwargs={"prompt": rag_prompt},
)

# Sample questions about the loaded corpus. Earlier smoke-test questions
# (favorite pokemon, Oracle founding year / headquarters) are left out.
questions = [
    "Who is Benedict 'Kazuhira' Miller?",
    "Are Venom Snake and Revolver Ocelot friends?",
    "What is the name of Anderson's AI?",
    "What is Zero's fear once the Cold War is over?",
    "How many cassette tapes are there in Metal Gear Solid 5?",
]

# Each call prints the dict returned by the chain (the recorded output shows
# 'query' and 'result' keys).
for question in questions:
    print(rag.invoke(question))

{'query': "Who is Benedict 'Kazuhira' Miller?", 'result': 'Based on the provided context, Benedict "Kazuhira" Miller appears to be a character involved in a mission or investigation. He is discussing a target, a Soviet soldier who was passing information, and mentions that a specific message is their only clue. However, the context does not provide enough information to definitively identify who Benedict "Kazuhira" Miller is.\n\nUsing general knowledge, Benedict "Kazuhira" Miller is likely a character from the *Metal Gear* video game series, specifically *Metal Gear Solid V: The Phantom Pain*. In the game, he is a key figure and a mentor-like character to the protagonist, Venom Snake. He is known for his leadership and involvement in the Diamond Dogs organization.'}
{'query': 'Are Venom Snake and Revolver Ocelot friends?', 'result': "Based on the provided context, it is not explicitly stated whether Venom Snake and Revolver Ocelot are friends. The dialogue suggests a level of familiari