In [1]:
import logging
import sys
from pinecone import Pinecone, ServerlessSpec
import os
from openai import OpenAI
import yaml
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, PromptTemplate
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from IPython.display import Markdown, display
from dotenv import load_dotenv, find_dotenv
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
_=load_dotenv(find_dotenv())
client = OpenAI()
pc = Pinecone()

  from tqdm.autonotebook import tqdm


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/bam/miniconda3/envs/bam/lib/python3.12/site-packages/pinecone_plugins'])
Discovering subpackages in _NamespacePath(['/home/bam/miniconda3/envs/bam/lib/python3.12/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
Installing plugin inference into Pinecone


In [None]:

def extract_metadata_from_markdown(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        if lines and lines[0].strip() == '---':
            end_index = None
            for i in range(1, len(lines)):
                if lines[i].strip() == '---':
                    end_index = i
                    break
            
            if end_index:
                yaml_content = ''.join(lines[1:end_index])
                metadata = yaml.safe_load(yaml_content)
                
                url = metadata.get('url', "https://www.churchofjesuschrist.org/study/general-conference/2024/04?lang=eng")
                author = metadata.get('author', None)
                if author:
                    author = author.replace('\xa0', ' ')
                
                return url, author
    
    return "https://www.churchofjesuschrist.org/study/general-conference/2024/04?lang=eng", None

In [None]:
def file_metadata_extractor(file_path):
    if file_path.endswith('.md'):
        url, author = extract_metadata_from_markdown(file_path)
        if url:
            return {'url': url, 'Author': author}
    return {}

In [2]:
# docs = SimpleDirectoryReader("data1", file_metadata=file_metadata_extractor, recursive=True).load_data()
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [None]:
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model 
)
nodes = splitter.get_nodes_from_documents(docs)

In [None]:
def add_metadata_to_nodes(nodes, docs):
    for node, doc in zip(nodes, docs):
        node.metadata = doc.metadata
    return nodes
nodes_with_metadata = add_metadata_to_nodes(nodes, docs)

In [3]:
index_name = "hy-sema"

existing_indexes = [index['name'].strip().lower() for index in pc.list_indexes()]
index_name_normalized = index_name.strip().lower()

if index_name_normalized not in existing_indexes:
    print(f"Index '{index_name}' does not exist. Creating a new index.")
    pc.create_index(index_name, dimension=1536, metric="dotproduct", spec=ServerlessSpec(cloud="aws", region="us-east-1"))
else:
    print(f"Index '{index_name}' already exists. Connecting to the existing index.")

Index 'hy-sema' already exists. Connecting to the existing index.


In [4]:
hy_sema_index = pc.Index("hy-sema")

In [None]:
def check_vector_exists(pinecone_index, doc_id):
    existing_vector = pinecone_index.fetch(ids=[doc_id])
    return existing_vector is not None and len(existing_vector['vectors']) > 0

In [5]:
from llama_index.core import StorageContext


vector_store = PineconeVectorStore(
    pinecone_index=hy_sema_index,
    add_sparse_vector=True,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex(nodes=[], embed_model=embed_model, storage_context=storage_context)



In [6]:
# query_engine = vector_index.as_query_engine(similarity_top_k=5, model="gpt-4o-mini")
# response = query_engine.query("What did Pres. Nelson say about the Cardston Temple?")
# display(Markdown(f"{response}"))
# # display(Markdown(f"{response}"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


President Russell M. Nelson mentioned that there was a functioning temple in Cardston, Alberta, Canada, among the six temples that were operational when he was born.

In [37]:
meta_tmpl_str ="""
You are an AI assistant tasked with answering questions about the April 2024 General Conference of the Church of Jesus Christ of Latter-Day Saints. You will be provided with the content of the conference talks and a question to answer. Your goal is to provide accurate and relevant information based solely on the content of these talks.

Here is the content of the conference talks:

<conference_talks>
{{CONFERENCE_TALKS}}
</conference_talks>

When answering questions, follow these guidelines:

1. Carefully read and analyze the conference talks to find relevant information and think deeply how the relevant information can be used to answer the question.
2. Only use information explicitly stated in the provided talks. Do not include external knowledge or personal interpretations.
3. If the question cannot be answered using the information in the talks, state that the answer is not found in the provided content.
4. Provide direct quotes from the talks when possible to support your answer. Use quotation marks and indicate the speaker's name for each quote.
5. If multiple talks address the question, synthesize the information from all relevant sources.
6. Maintain a respectful and reverent tone when discussing religious topics.

Be as concise and complete as possible

Use the following XML tags to structure your response:
<answer><main_content></main_content><limitations></limitations>
</answer>

Now, please answer the following question based on the conference talks provided:

<question>
{{QUESTION}}
</question>"""

meta_tmpl = PromptTemplate(meta_tmpl_str)

In [32]:
query_engine.update_prompts(
    {"response_synthesizer:summary_template": meta_tmpl}
)

In [33]:
prompts_dict = query_engine.get_prompts()

In [38]:
# set Logging to DEBUG for more detailed outputs
query_engine = vector_index.as_query_engine(response_mode="tree_summarize")

In [39]:
def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

In [41]:
display_prompt_dict(prompts_dict)

**Prompt Key**: response_synthesizer:summary_template<br>**Text:** <br>


You are an AI assistant tasked with answering questions about the April 2024 General Conference of the Church of Jesus Christ of Latter-Day Saints. You will be provided with the content of the conference talks and a question to answer. Your goal is to provide accurate and relevant information based solely on the content of these talks.

Here is the content of the conference talks:

<conference_talks>
{{CONFERENCE_TALKS}}
</conference_talks>

When answering questions, follow these guidelines:

1. Carefully read and analyze the conference talks to find relevant information and think deeply how the relevant information can be used to answer the question.
2. Only use information explicitly stated in the provided talks. Do not include external knowledge or personal interpretations.
3. If the question cannot be answered using the information in the talks, state that the answer is not found in the provided content.
4. Provide direct quotes from the talks when possible to support your answer.

<br><br>

In [None]:
query_engine = vector_index.as_query_engine(similarity_top_k=5, model="gpt-4o-mini")
response = query_engine.query("What did Pres. Nelson say about the Cardston Temple?")
display(Markdown(f"{response}"))