### STEP 1: Installing required libraries

In [1]:
!pip install spacy rapidfuzz pandas openpyxl

import spacy
from rapidfuzz import process, fuzz
import pandas as pd

Collecting rapidfuzz
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.12.1


###STEP 2: Load KPI Dataset

In [2]:
#Load KPI Dataset
from google.colab import files
uploaded = files.upload()

Saving test_data_modified_3.xlsx to test_data_modified_3.xlsx


###STEP 3: Extract KPI List

In [3]:
#Extract KPI List
df = pd.read_excel("test_data_modified_3.xlsx")
kpi_list = df["KPI"].dropna().unique().tolist()

In [4]:
kpi_list

['Certified Employee in Current GC',
 'Manpower Tracked for Certification- Competency',
 'Loss Time Injuries',
 'Loss Time Injuries (Contractor)',
 'Off-Job Man Hours (Emp)',
 'On-Job Man Hours (Contractor)',
 'On-Job Man Hours (EMP + Cont)',
 'On-Job Man Hours (Emp)',
 'Positive Responses to the Phishing Tests',
 'APNE/VCGNE',
 'AV & Office Devices',
 'App re-opened maint incidents',
 'Application Cost/Per Hr. (In-House)',
 'Applications Investment Capacity',
 'Applications Maintain Capacity',
 'Applications Run Capacity',
 'Assets Net Book Value',
 'Attrition',
 'Average Company Workforce',
 'BI-19 No. Overdue Projects (Past ERC)',
 'BI-19 No. of Projects with ? 3 Months To ERC',
 'Biz Travel',
 'Business Sustainability - Below Target',
 'Business Sustainability - Meeting Target',
 'Business Sustainability - Yellow',
 'CDPNE',
 'CPH Positions - Total',
 'CPH Positions Status - Acting',
 'CPH Positions Status - Permanent',
 'CPH Positions Status - Vacant',
 'CRM Delivery – Number of S

###STEP 6: Fine tune the spaCy model to make it understand the entities better

In [5]:
# Create training data
train_data = []

for kpi in kpi_list:
    sentence = f"This is an example sentence mentioning the KPI: {kpi}."
    start = len("This is an example sentence mentioning the KPI: ")
    end = start + len(kpi)
    train_data.append((sentence, {"entities": [(start, end, "KPI")]}))

# Example of the first training data entry
print(train_data[0])

('This is an example sentence mentioning the KPI: Certified Employee in Current GC.', {'entities': [(48, 80, 'KPI')]})


In [6]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
!pip install spacy-lookups-data

Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.5-py2.py3-none-any.whl.metadata (4.8 kB)
Downloading spacy_lookups_data-1.0.5-py2.py3-none-any.whl (98.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.5


In [9]:
import spacy
from spacy.training.example import Example
from spacy.training import offsets_to_biluo_tags

# Load pre-trained spaCy model
nlp = spacy.load("en_core_web_md")

# Add a new NER component if not already present
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# Add the custom KPI label
ner.add_label("KPI")

# Sample training data (ensure correct entity offsets)
train_data = [
    ("This is an example sentence mentioning the KPI: Net Profit Margin", {"entities": [(48, 66, "KPI")]}),
    ("Another sentence with a KPI: Revenue Growth Rate", {"entities": [(30, 51, "KPI")]}),
]

# Verify entity alignment
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    tags = offsets_to_biluo_tags(doc, annotations["entities"])
    if "-" in tags:
        print(f"Misaligned entities detected in: {text}")

# Convert training data into spaCy's format
train_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in train_data]

# Use resume_training instead of begin_training
optimizer = nlp.resume_training()

# Train the model
for epoch in range(10):
    losses = {}
    for example in train_examples:
        nlp.update([example], losses=losses)
    print(f"Epoch {epoch}, Loss: {losses}")

# Save the trained model
nlp.to_disk("kpi_ner_model")



Misaligned entities detected in: This is an example sentence mentioning the KPI: Net Profit Margin
Misaligned entities detected in: Another sentence with a KPI: Revenue Growth Rate
Epoch 0, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.29706067824087745}
Epoch 1, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.0001478927272616858}
Epoch 2, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 7.755721496519735e-08}
Epoch 3, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4.120458384099432e-09}
Epoch 4, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 6.281063500187479e-10}
Epoch 5, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 2.408707721384928e-10}
Epoch 6, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1.5218068111730958e-10}
Epoch 7, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1.815746417590366e-10}
Epoch 8, Loss: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 7.6551473362215

###STEP 5: Extract Potential KPI Entities:
  <ol>1. Remove Noise words</ol>
  <ol>2. Extract entities that are relevant to the query</ol>
  <ol>3. Extract entities using bigrams and trigrams for better context understanding</ol>
  <ol>4. Combine KPI entities extracted earlier</ol>



In [10]:
# Load fine-tuned spaCy model
nlp = spacy.load("/content/kpi_ner_model")

def extract_kpi_entities(query):
    doc = nlp(query)

    # Extract entities recognized as KPIs by the fine-tuned model
    extracted_entities = [ent.text for ent in doc.ents if ent.label_ == "KPI"]
    print(f"Extracted Entities (Using Fine-tuned Model): {extracted_entities}")

    # If no KPIs were recognized, fall back to basic token extraction (optional)
    if not extracted_entities:
        # Optionally, use your previous method to fall back to basic token extraction
        exclude_words = {"plan", "beginning", "cost", "total", "overall"}
        words = [token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "ADJ", "ORG", "MONEY", "PERCENT", "QUANTITY"]
                 and token.text.lower() not in exclude_words]
        bigrams = [" ".join(words[i:i + 2]) for i in range(len(words) - 1)]
        trigrams = [" ".join(words[i:i + 3]) for i in range(len(words) - 2)]

        extracted_entities = words + bigrams + trigrams

    return extracted_entities if extracted_entities else [query]


###STEP 6: Extract Top Matching KPIs
<ol>1. Ignore generic words unless part of a phrase </ol>
<ol>2. Adjust threshold for shorter terms </ol>
<ol>3. Apply fuzzy matching with adjusted threshold </ol>
<ol>4. Exact substring fallback for acronyms and dates</ol>
<ol>5. Handle "Number of X" cases (ensure relevant filtering)</ol>
<ol>6. Prioritize longer matches</ol>

In [11]:
def get_top_matching_kpis(query, top_n=3):
    extracted_entities = extract_kpi_entities(query)
    matched_kpis = []

    for entity in extracted_entities:
        if entity.lower() in {"group", "highest", "number", "plan", "cost", "total"}:
            continue

        # Lowered thresholds
        threshold = 65 if len(entity) <= 4 else 75

        best_matches = process.extract(entity, kpi_list, scorer=fuzz.token_set_ratio, limit=top_n)

        # Debug print to check potential matches before filtering
        print(f"Potential Matches for '{entity}': {best_matches}")

        filtered_matches = [match[0] for match in best_matches if match[1] >= threshold]

        if not filtered_matches:
            filtered_matches = [kpi for kpi in kpi_list if entity.lower() in kpi.lower()]

        if "number" in entity.lower():
            filtered_matches = [kpi for kpi in filtered_matches if any(word in kpi.lower() for word in extracted_entities)]

        print(f"Filtered Matches for '{entity}': {filtered_matches}")
        matched_kpis.extend(filtered_matches)

    matched_kpis = sorted(set(matched_kpis), key=len, reverse=True)
    return matched_kpis

###STEP 7: Use Relevant Queries to extract KPIs

In [12]:
# List of predefined questions
predefined_questions = [
    "what is the applications service labor productivity ratio for DJVC for July 2018",
    "Is there any under performing KPIs in ITSED organization across all periods?",
    "Which Kpis are under performing for f&ad for october 2024",
    "How about Test Phishing Email Failure?",
    "is this kpi healthy?",
    "What we discussed so far",
    "Hi",
    "How are you?",
    "Which Kpis are under performing for ssd for june 2024",
    "How about august 2024?",
    "how many patents granted for CAD in 2024 without limit?",
    "What is the total Number of Failed Responses to the Phishing Tests in September 2019",
    "Which group in ITSED has the highest number of permanent female employees?",
    "Which organization in ITSED has the highest % of female representation?",
    "what is applications service labor productivity kpi would mean",
    "Kpi formula for applications service labor productivity",
    "what is permanent female employees would mean",
    "what is the difference between female representation and permanent female employees kpi",
    "what is the kpi formula for female representation and permanent female employees kpi",
    "List the kpis present in cad",
    "How many kpis are present in the data",
    "what are the organizations present in the data",
    "Which group in ITSED has the highest number of female employees?",
    "Which group in ITSED has the highest % of female representation?",
    "what is the percentage of top performer retention for the period feb 2018?",
    "What is the Professional Certification % in CG in February 2019?",
    "During which period CG has highest % Professional Certification in 2019?",
    "What is the total Mean Time to Repair hours for ITI in October and May 2024",
    "What is the Total Recorded Case Frequency rate for SCMD in December 2021",
    "What is the total number of Traffic Violations across all the periods in 2024?",
    "Which kpis needs attention in CAD for July 2024",
    "what are the previous questions that I asked?",
    "Which division in CAD has the highest % of Test Phishing Email Failure?",
    "what is the top performing organization within CAD?",
    "what is the focus area for Supply Chain On-Time Delivery",
    "what is the category for Drilling Demand Forecasting Accuracy kpi",
    "what are the kpis that is under Alignment to National Interest category"
]

In [17]:
import time
import pandas as pd

qa_list = []

for question in predefined_questions:
    start_time = time.time()  # Start time

    matches = get_top_matching_kpis(question)
    top_matches = [f"{kpi}" for kpi in matches[:3]]  # Top 3 KPIs

    end_time = time.time()  # End time
    retrieval_time = round(end_time - start_time, 4)  # Compute time taken

    answer = ", ".join(top_matches) if top_matches else "No matching KPI found"
    qa_list.append({"Question": question, "Top 3 Matching KPIs": answer, "Time Taken": retrieval_time})


# for question in predefined_questions:
#     matches = find_best_kpis(question)
#     top_matches = [f"{kpi}: {score:.2f}" for kpi, score in matches[:3]]  # Top 3 KPIs
#     answer = ", ".join(top_matches) if top_matches else "No matching KPI found"
#     qa_list.append({"Question": question, "Top 3 Matching KPIs": answer})

# Convert to DataFrame
df = pd.DataFrame(qa_list, columns=["Query", "KPI Retrieved", "Time Taken (seconds)"])

# Convert list to DataFrame
df = pd.DataFrame(qa_list)

# Save to Excel
df.to_excel("KPI_Matching_final_four.xlsx", index=False)

print("Excel file 'KPI_Matching.xlsx' created successfully.")

Extracted Entities (Using Fine-tuned Model): []
Potential Matches for 'applications': [('Communications', 61.53846153846154, 44), ('Traffic Violations', 60.0, 199), ('Applications Run Capacity', 59.45945945945946, 15)]
Filtered Matches for 'applications': ['Applications Investment Capacity', 'Applications Maintain Capacity', 'Applications Run Capacity', "Active Applications' Users over a 9 days", 'Non-SAP Applications Utilization', "Total Applications' authorized users", 'SAP Applications Utilization']
Potential Matches for 'service': [('Safety Observations', 46.15384615384615, 173), ('Services Utilization', 44.44444444444444, 353), ('Saudi Service Contractors', 43.75, 174)]
Filtered Matches for 'service': ['Saudi Service Contractors', 'Service Contractor Saudization', 'Service Delivery Customer Satisfaction ()', 'Service Support Customer Satisfaction ()', 'Total Service Contractors', 'Services Utilization']
Potential Matches for 'labor': [('Controllable Cost', 36.36363636363637, 52), 

In [13]:
import pandas as pd

# Create a list to store question-answer pairs
qa_list = []

for question in predefined_questions:
    matches = get_top_matching_kpis(question)
    top_matches = [f"{kpi}" for kpi in matches[:3]]  # Top 3 KPIs
    answer = ", ".join(top_matches) if top_matches else "No matching KPI found"
    qa_list.append({"Question": question, "Top 3 Matching KPIs": answer})

# Convert list to DataFrame
df = pd.DataFrame(qa_list)

# Save to Excel
df.to_excel("KPI_Matching using Finetuned NER + Fuzzy.xlsx", index=False)

print("Excel file 'KPI_Finetuned NER + Fuzzy Matching.xlsx' created successfully.")

Extracted Entities (Using Fine-tuned Model): []
Potential Matches for 'applications': [('Communications', 61.53846153846154, 44), ('Traffic Violations', 60.0, 199), ('Applications Run Capacity', 59.45945945945946, 15)]
Filtered Matches for 'applications': ['Applications Investment Capacity', 'Applications Maintain Capacity', 'Applications Run Capacity', "Active Applications' Users over a 9 days", 'Non-SAP Applications Utilization', "Total Applications' authorized users", 'SAP Applications Utilization']
Potential Matches for 'service': [('Safety Observations', 46.15384615384615, 173), ('Services Utilization', 44.44444444444444, 353), ('Saudi Service Contractors', 43.75, 174)]
Filtered Matches for 'service': ['Saudi Service Contractors', 'Service Contractor Saudization', 'Service Delivery Customer Satisfaction ()', 'Service Support Customer Satisfaction ()', 'Total Service Contractors', 'Services Utilization']
Potential Matches for 'labor': [('Controllable Cost', 36.36363636363637, 52), 

In [None]:
# Test query
query = "Which group in CAD has the highest % of female representation?"
print(get_top_matching_kpis(query))

Extracted Entities (Using Fine-tuned Model): []
Potential Matches for 'Which group in CAD has the highest % of female representation?': [('Number of Failed Responses to the Phishing Tests', 49.09090909090909, 112), ('Female Representation', 45.78313253012048, 73), ('of active risks with high severity and ongoing treatments(', 43.333333333333336, 231)]
Filtered Matches for 'Which group in CAD has the highest % of female representation?': []
[]


In [None]:
get_top_matching_kpis("How many patents were granted for SCMD in April 2024?")

In [None]:
get_top_matching_kpis("What was CAD Controllable Cost plan in the beginning of 2024?")

In [None]:
get_top_matching_kpis("What is the Professional Certification % in F&AD in July 2024?")

In [None]:
get_top_matching_kpis("Which group in CAD has the highest number of female employees?")

In [None]:
get_top_matching_kpis("Which division in CAD has the highest % of Test Phishing Email Failure?")

In [None]:
get_top_matching_kpis("Which KPIs are underperforming for SSD in August 2024?")

In [None]:
get_top_matching_kpis("How many patents were granted for SCMD in April 2024?")

In [None]:
get_top_matching_kpis("What was CAD Controllable Cost plan in the beginning of 2024?")

In [None]:
get_top_matching_kpis("What is the Professional Certification % in F&AD in July 2024?")

In [None]:
get_top_matching_kpis("Which group in CAD has the highest % of female representation?")

In [None]:
get_top_matching_kpis("Which division in CAD has the highest % of Test Phishing Email Failure?")