In [None]:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
import logging
import sys
from collections.abc import Iterator
from sqlalchemy import make_url, create_engine, MetaData
from llama_index.core import ServiceContext, SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
#from llama_index.vector_stores import PGVectorStore
import textwrap
import openai

# customize textnode - purpose is to add id to each node
#from llama_index.schema import TextNode
# customize stages of querying  https://docs.llamaindex.ai/en/latest/understanding/querying/querying.html
from llama_index.core import get_response_synthesizer
from llama_index.core.indices.vector_store.retrievers.retriever import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
import pandas as pd

# Send log records to stdout so they show up in the notebook cell output.
# A single basicConfig call installs exactly one stdout handler on the root logger.
# force=True first removes any handlers already attached to the root logger, so
# re-running this cell does not stack handlers and each record is printed once.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,  # logging.DEBUG for more verbose output
    force=True,
)

In [None]:
import os

# Azure OpenAI connection settings.
# The key and endpoint are read from the environment so that secrets do not have to be
# typed into (and saved with) the notebook. Each falls back to an empty string when the
# variable is unset.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
api_version = "2023-07-01-preview"

# Build the two Azure OpenAI clients used by the notebook: a chat-completion LLM and an
# embedding model. Both receive the same api_key / azure_endpoint / api_version values.
_azure_connection = {
    "api_key": api_key,
    "azure_endpoint": azure_endpoint,
    "api_version": api_version,
}

# Chat-completion model; "ailab-llm" is passed as the deployment name.
llm = AzureOpenAI(model="gpt-4", deployment_name="ailab-llm", **_azure_connection)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002", deployment_name="ada", **_azure_connection
)

In [None]:
from llama_index.core import Settings

# Register the Azure LLM and embedding model on llama_index's Settings object
# (presumably picked up as defaults by the llama_index calls in later cells).
Settings.llm, Settings.embed_model = llm, embed_model

In [None]:
# Build a storage context from whatever was persisted under ./index
# (the path is relative to the notebook's working directory).
storage_context = StorageContext.from_defaults(
    persist_dir="./index",
)

In [None]:
from llama_index.core import StorageContext, load_index_from_storage

# Load the index out of the storage context built from ./index.
index = load_index_from_storage(storage_context=storage_context)

In [None]:
# Post-processing step applied to retrieved nodes, configured with a 0.7 similarity cutoff.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.7)

# Response synthesizer with default settings.
response_synthesizer = get_response_synthesizer()

# Retriever over the loaded index, asking for the 15 most similar nodes; configured
# explicitly so the retrieved nodes and their metadata can be inspected when debugging.
retriever = VectorIndexRetriever(index=index, similarity_top_k=15)

# Assemble the query engine from the three stages above.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)


In [None]:
# NOTE(review): `df_q` is not defined in any visible cell of this notebook. It must already
# exist in the kernel and carry 'question' and 'chunk_id' columns, which the evaluation
# loop reads. TODO: load it explicitly so the notebook survives Restart & Run All.
#
# Add the (initially empty) output columns to df_q, then work on a copy.
for _new_col in (
    'llamaindex_response',
    'llamaindex_top_15_doc',
    'llamaindex_top_15_doc_id',
    'llamaindex_answer_placement',
):
    df_q[_new_col] = ''
df_result = df_q.copy()
def find_string_position(string_list, target_string):
    """
    Locate the first element of a list that equals a target string.

    Parameters:
    - string_list: The strings to scan, in order.
    - target_string: The value to look for (compared with ==).

    Returns:
    - The zero-based position of the first match, or -1 when nothing matches.
    """
    matching_positions = (
        pos for pos, candidate in enumerate(string_list) if candidate == target_string
    )
    # next() with a default yields the first match, or -1 if the generator is empty.
    return next(matching_positions, -1)
# Run every question through the query engine and record, per row:
#   - the response object,
#   - the source nodes that came back (in the order the response lists them),
#   - the 'id_' metadata value of each of those nodes,
#   - the position of the row's expected chunk_id within those ids (-1 if absent).
#
# Rows are addressed by position for both reads and writes, so the loop also works when
# df_result's index is not 0..n-1 (e.g. after filtering); writing with a label equal to
# the loop counter would then hit the wrong row or append new ones.
_result_col_pos = {
    col: df_result.columns.get_loc(col)
    for col in (
        'llamaindex_response',
        'llamaindex_top_15_doc',
        'llamaindex_top_15_doc_id',
        'llamaindex_answer_placement',
    )
}
for i, (question, expected_chunk_id) in enumerate(
    zip(df_result['question'], df_result['chunk_id'])
):
    print('i', i)
    # query
    response = query_engine.query(question)
    print(response.get_formatted_sources())
    print("query was:", question)
    print("answer was:", response)
    # Take however many source nodes actually came back. The query engine is configured
    # with a similarity cutoff, so fewer than 15 nodes may survive; indexing a fixed
    # range(15) would raise IndexError in that case.
    top_k_result = list(response.source_nodes)
    # 'id_' is a custom metadata key expected on every node; a KeyError here means the
    # index was built without it.
    top_k_result_id = [node.metadata['id_'] for node in top_k_result]
    # Write row by row so that results gathered so far survive a failure on a later query.
    df_result.iat[i, _result_col_pos['llamaindex_response']] = response
    df_result.iat[i, _result_col_pos['llamaindex_top_15_doc']] = top_k_result
    df_result.iat[i, _result_col_pos['llamaindex_top_15_doc_id']] = top_k_result_id
    df_result.iat[i, _result_col_pos['llamaindex_answer_placement']] = find_string_position(
        top_k_result_id, expected_chunk_id
    )
df = df_result

In [None]:
# Allow up to 10000 characters per cell so long questions/answers are not truncated
# in the rendered table.
pd.set_option("display.max_colwidth", 10000)
# Last expression of the cell: shows the question, where the expected chunk ranked,
# and the generated response.
df.loc[:, ['question', 'llamaindex_answer_placement', 'llamaindex_response']]

In [None]:
import os

# Persist the evaluation results. The output directory is created first: to_csv does not
# create missing directories, and failing here would come only after the whole (slow)
# query loop has already run.
output_csv_path = './data/good_qna_llamaindex_answer.csv'
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
# utf-8-sig writes a BOM at the start of the file.
df.to_csv(output_csv_path, encoding='utf-8-sig')