In [None]:
# Install the third-party packages for this notebook into the kernel's environment.
# NOTE(review): no versions are pinned here; consider pinning for reproducibility.
%pip install llama_index llama-index-embeddings-huggingface llama-index-llms-huggingface streamlit python-dotenv python-pptx

In [None]:
from llama_index.core import StorageContext, load_index_from_storage, SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
import helpers.my_XML as myXML
import pandas as pd

In [None]:
# Embedding model registered globally below.
# Other checkpoints noted in the original comment:
#   "dunzhang/stella_en_1.5B_v5", "Snowflake/snowflake-arctic-embed-m-v1.5"
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# The same checkpoint name is passed for both the model and its tokenizer.
# Other checkpoint noted in the original comment: "meta-llama/Llama-3.2-1B-Instruct"
llm_checkpoint = "meta-llama/Llama-3.2-3B-Instruct"
local_llm = HuggingFaceLLM(
    model_name=llm_checkpoint,
    tokenizer_name=llm_checkpoint,
)

# Register both models on the llama_index global Settings object.
Settings.embed_model = embed_model
Settings.llm = local_llm

# Suffix appended to every persisted index folder name in the cells below,
# so the storage folder name reflects the model combination set here.
model_names_for_path = "_llama3B_bgeL"

In [None]:
# Root folder; every path below is built by appending to it.
base_path = "./_MANUALS/"

# Source folders, one per document collection.
FCOM_path = base_path + "XML_C_Ops_FCOM_A318_A319_A320_A321_21-Aug-2024_DLH/"
FCTM_path = base_path + "FCTM_A320_PEGMA/"
FCM_path = base_path + "FCM_PEGMA/"
CCC_path = base_path + "CCC_Individual_Chunks/"
background_path = base_path + "background/"
briefings_path = base_path + "Briefing PPTs/"

# Folder under which the index cells below persist their indexes.
index_storage_path = base_path + "storage/"

# Echo the resolved locations, one per line, so a wrong layout is easy to spot.
print(
    f"FCOM path: {FCOM_path}",
    f"FCTM path: {FCTM_path}",
    f"FCM path: {FCM_path}",
    f"CCC path: {CCC_path}",
    f"Background path: {background_path}",
    f"Briefings path: {briefings_path}",
    "--------------------------------",
    f"Index storage path: {index_storage_path}",
    sep="\n",
)

Create your folder structure according to the schema above, or adjust the paths to match the layout you choose. Otherwise, the cells below will fail with errors because the folders cannot be found!

## FCOM

In [None]:
# Load the persisted FCOM index if possible; otherwise build and persist it.
FCOM_index = None
index_store = index_storage_path + "FCOM" + model_names_for_path

try:
    storage_context = StorageContext.from_defaults(persist_dir=index_store)
    FCOM_index = load_index_from_storage(storage_context)
except Exception as load_error:
    # Best-effort load: the reason is printed and the index is built below.
    print(load_error)

if not FCOM_index:
    # Titles are parsed from the FCOM structure XML and passed to the index builder.
    fcom_titles = myXML.parse_titles_bottom_up(
        FCOM_path + "DATA/XML_N_FCOM_DLH_TF_N_EU__20240821.xml"
    )
    FCOM_index = myXML.create_index(FCOM_path + "DATA/DU", fcom_titles)
    FCOM_index.storage_context.persist(persist_dir=index_store)

## FCTM

In [None]:
# Load the persisted FCTM index if possible; otherwise build and persist it.
FCTM_index = None
index_store = index_storage_path + "FCTM" + model_names_for_path

try:
    storage_context = StorageContext.from_defaults(persist_dir=index_store)
    FCTM_index = load_index_from_storage(storage_context)
except Exception as load_error:
    # Best-effort load: the reason is printed and the index is built below.
    print(load_error)

if not FCTM_index:
    # Titles come from the PEGMA-style document.xml inside the FCTM folder.
    fctm_titles = myXML.parse_titles_bottom_up_PEGMA(FCTM_path + "document.xml")
    FCTM_index = myXML.create_index(FCTM_path + "DU", fctm_titles)
    FCTM_index.storage_context.persist(persist_dir=index_store)

## FCM

In [None]:
# Load the persisted FCM index if possible; otherwise build and persist it.
FCM_index = None
index_store = index_storage_path + "FCM" + model_names_for_path
try:
    storage_context = StorageContext.from_defaults(
        persist_dir=index_store
    )
    FCM_index = load_index_from_storage(storage_context)
except Exception as e:
    # Best-effort load: the reason is printed and the index is built below.
    print(e)

if not FCM_index:
    # Assign to FCM_index (not FCTM_index): the freshly built FCM index must not
    # overwrite the FCTM index, and FCM_index must be usable by the evaluation cell.
    FCM_index = myXML.create_index(FCM_path + "DU",
                                   myXML.parse_titles_bottom_up_PEGMA(FCM_path + "document.xml"))
    FCM_index.storage_context.persist(persist_dir=index_store)

## LH Group Background Knowledge Base


In [None]:
# Load the persisted background index if possible; otherwise build and persist it.
background_index = None
index_store = index_storage_path + "background" + model_names_for_path

try:
    storage_context = StorageContext.from_defaults(persist_dir=index_store)
    background_index = load_index_from_storage(storage_context)
except Exception as load_error:
    # Best-effort load: the reason is printed and the index is built below.
    print(load_error)

if not background_index:
    # Titles come from the PEGMA-style document.xml inside the background folder.
    background_titles = myXML.parse_titles_bottom_up_PEGMA(
        background_path + "document.xml"
    )
    background_index = myXML.create_index(background_path + "DU", background_titles)
    background_index.storage_context.persist(persist_dir=index_store)

## Common Conversion Course


In [None]:
# Load the persisted CCC index if possible; otherwise build it from the PDFs.
CCC_index = None
index_store = index_storage_path + "CCC" + model_names_for_path

try:
    storage_context = StorageContext.from_defaults(persist_dir=index_store)
    CCC_index = load_index_from_storage(storage_context)
except Exception as load_error:
    # Best-effort load: the reason is printed and the index is built below.
    print(load_error)

if not CCC_index:
    # Read only the PDFs that sit directly in the CCC folder (no sub-folders).
    ccc_reader = SimpleDirectoryReader(
        input_dir=CCC_path,
        required_exts=[".pdf"],
        recursive=False,
    )
    ccc = ccc_reader.load_data()

    # The file name should be used for both the embedding and the LLM response,
    # so drop it from either exclusion list if it is present there.
    for document in ccc:
        for excluded_keys in (
            document.excluded_embed_metadata_keys,
            document.excluded_llm_metadata_keys,
        ):
            if "file_name" in excluded_keys:
                excluded_keys.remove("file_name")

    # Build the vector index, then persist it for the next run.
    CCC_index = VectorStoreIndex.from_documents(ccc, show_progress=True)
    CCC_index.storage_context.persist(persist_dir=index_store)

## Briefing PPTs

In [None]:
# Load the persisted briefings (PPT) index if possible; otherwise extract the
# PPTX content, chunk it, build the index and persist it.
PPT_index = None
json_output_path = briefings_path + "extracted_briefings.json"
json_chunks_path = briefings_path + "chunks"
index_store = index_storage_path + "briefings" + model_names_for_path

try:
    storage_context = StorageContext.from_defaults(
        persist_dir=index_store
    )
    PPT_index = load_index_from_storage(storage_context)
except Exception as e:
    # Best-effort load: the reason is printed and the index is built below.
    print(e)

if not PPT_index:
    # Extract the briefing decks to one JSON file, then split it into chunk files.
    myXML.process_pptx_files(briefings_path, json_output_path)
    myXML.create_simple_chunks_for_briefings_json(
        json_output_path, json_chunks_path)
    print(f"Extraction complete! Chunks saved to {json_chunks_path}")

    briefings_docs = SimpleDirectoryReader(
        input_dir=json_chunks_path,
        required_exts=[".json"],
        recursive=False
    ).load_data()
    # Assign to PPT_index so the variable is populated on the build path too;
    # previously only `briefings_index` was set and PPT_index stayed None.
    PPT_index = VectorStoreIndex.from_documents(
        briefings_docs, show_progress=True)
    PPT_index.storage_context.persist(persist_dir=index_store)

# Backward-compatible alias: the build path used to expose the index under this
# name, so keep it available whether the index was loaded or built.
briefings_index = PPT_index

# Testing

We have test cases for FCOM, FCTM, FCM and LH Background Knowledge.

In [None]:
# Column dtypes: every listed column is read as a string.
# The FCOM and FCTM test files additionally declare 'output_merged_duSol'.
merged_test_dtypes = {'input': str, 'input_question': str,
                      'output_merged_duSol': str, 'output': str}
plain_test_dtypes = {'input': str, 'input_question': str, 'output': str}

df_FCOM = pd.read_csv("tests/FCOM_test.csv", dtype=merged_test_dtypes)
df_FCTM = pd.read_csv("tests/FCTM_test.csv", dtype=merged_test_dtypes)
df_FCM = pd.read_csv("tests/FCM_test.csv", dtype=plain_test_dtypes)
df_background = pd.read_csv("tests/background_test.csv", dtype=plain_test_dtypes)

In [None]:
# Retrieval evaluation: for each (test set, index) pair and each top-k setting,
# report accuracy (share of queries whose expected id is among the top-k hits)
# and MRR (mean of 1/rank of the expected id, counting 0 when it is not retrieved).
for test_df, test_index in [(df_FCOM, FCOM_index), (df_FCTM, FCTM_index),
                            (df_FCM, FCM_index), (df_background, background_index)]:
    print("-------------------")

    n_cases = len(test_df)
    if n_cases == 0:
        # Avoid a ZeroDivisionError on an empty test file.
        print("No test cases found, skipping.")
        continue

    for k in [5, 25]:
        retriever = test_index.as_retriever(similarity_top_k=k)
        found_counter = 0
        reciprocal_ranks = []

        # The row label is unused; `_` keeps the loop from overwriting the
        # `index_store` path variable used by the index-building cells.
        for _, row in test_df.iterrows():
            input_value = row['input']
            res = retriever.retrieve(input_value)

            # Find if the correct output exists in the top-k results
            found = False
            for rank, snode in enumerate(res, start=1):
                # The last four characters of the node id are dropped before
                # comparing (presumably a file extension such as ".xml" - confirm).
                if row['output'] == (snode.node.id_)[:-4]:
                    found_counter += 1
                    reciprocal_ranks.append(1 / rank)
                    found = True
                    break

            if not found:
                reciprocal_ranks.append(0)

        # Calculate accuracy and MRR
        print("k: ", k)
        print("Accuracy:", found_counter / n_cases)
        print("MRR:", sum(reciprocal_ranks) / n_cases)

Testing of different tokenizer and embedding models can be done by simply changing the LLM/embedding model definition at the beginning or loading a different index accordingly.

Below, we showcase the FCOM test on an index built from merged XML files (i.e. within the XML structure, we merge all duSol documents that belong to the same duInv).

In [None]:
# Load the persisted merged-FCOM index if possible; otherwise merge the duSol
# files, build the index from the merged folder and persist it.
FCOM_merged_index = None
index_store = index_storage_path + "FCOM" + model_names_for_path + "_merged"

try:
    storage_context = StorageContext.from_defaults(persist_dir=index_store)
    FCOM_merged_index = load_index_from_storage(storage_context)
except Exception as load_error:
    # Best-effort load: the reason is printed and the index is built below.
    print(load_error)

if not FCOM_merged_index:
    # Write the merged files to DATA/DU_merged, next to the original DATA/DU folder.
    myXML.merge_all_duSol(
        XML_structure_path=FCOM_path + "DATA/XML_N_FCOM_DLH_TF_N_EU__20240821.xml",
        XML_folder_path=FCOM_path + "DATA/DU",
        output_folder=FCOM_path + "DATA/DU_merged",
    )

    # Titles are parsed with the duInv variant of the title parser.
    merged_titles = myXML.parse_titles_bottom_up_duInv(
        FCOM_path + "DATA/XML_N_FCOM_DLH_TF_N_EU__20240821.xml"
    )
    FCOM_merged_index = myXML.create_index(FCOM_path + "DATA/DU_merged", merged_titles)

    FCOM_merged_index.storage_context.persist(persist_dir=index_store)

In [None]:
# Retrieval evaluation for the merged FCOM index: expected ids come from the
# 'output_merged_duSol' column, so the query must run against FCOM_merged_index
# (the original cell queried the unmerged FCOM_index by mistake).
for test_df, test_index in [(df_FCOM, FCOM_merged_index)]:
    print("-------------------")

    n_cases = len(test_df)
    if n_cases == 0:
        # Avoid a ZeroDivisionError on an empty test file.
        print("No test cases found, skipping.")
        continue

    for k in [5, 25]:
        retriever = test_index.as_retriever(similarity_top_k=k)
        found_counter = 0
        reciprocal_ranks = []

        # The row label is unused.
        for _, row in test_df.iterrows():
            input_value = row['input']
            res = retriever.retrieve(input_value)

            # Find if the correct output exists in the top-k results
            found = False
            for rank, snode in enumerate(res, start=1):
                # The last four characters of the node id are dropped before
                # comparing (presumably a file extension such as ".xml" - confirm).
                if row['output_merged_duSol'] == (snode.node.id_)[:-4]:
                    found_counter += 1
                    reciprocal_ranks.append(1 / rank)
                    found = True
                    break

            if not found:
                reciprocal_ranks.append(0)

        # Calculate accuracy and MRR
        print("k: ", k)
        print("Accuracy:", found_counter / n_cases)
        print("MRR:", sum(reciprocal_ranks) / n_cases)