## Importing Necessary NLP Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from collections import defaultdict
from difflib import SequenceMatcher
from itertools import permutations

## Loading Dataset

In [2]:
#Load KPI Dataset
from google.colab import files
uploaded = files.upload()

Saving test_data_modified_3.xlsx to test_data_modified_3.xlsx


In [3]:
# Load the dataset.
# The workbook name matches the file saved by the upload cell above; only the
# sheet named "Sheet1" is read into the DataFrame `df`.
file_path = "test_data_modified_3.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1")

## Extracting unique KPIs from the dataset

In [4]:
# Extract unique KPI names, KPI List names and Focus Area names.
# Each column has its missing values dropped and is de-duplicated, then
# converted to a plain Python list; these lists are the candidate sets
# passed to find_best_kpis later in the notebook.
kpi_list = df["KPI"].dropna().unique().tolist()

list_of_kpi_list = df["KPI List"].dropna().unique().tolist()

focus_areas_list = df["Focus Areas"].dropna().unique().tolist()

In [5]:
kpi_list

['Certified Employee in Current GC',
 'Manpower Tracked for Certification- Competency',
 'Loss Time Injuries',
 'Loss Time Injuries (Contractor)',
 'Off-Job Man Hours (Emp)',
 'On-Job Man Hours (Contractor)',
 'On-Job Man Hours (EMP + Cont)',
 'On-Job Man Hours (Emp)',
 'Positive Responses to the Phishing Tests',
 'APNE/VCGNE',
 'AV & Office Devices',
 'App re-opened maint incidents',
 'Application Cost/Per Hr. (In-House)',
 'Applications Investment Capacity',
 'Applications Maintain Capacity',
 'Applications Run Capacity',
 'Assets Net Book Value',
 'Attrition',
 'Average Company Workforce',
 'BI-19 No. Overdue Projects (Past ERC)',
 'BI-19 No. of Projects with ? 3 Months To ERC',
 'Biz Travel',
 'Business Sustainability - Below Target',
 'Business Sustainability - Meeting Target',
 'Business Sustainability - Yellow',
 'CDPNE',
 'CPH Positions - Total',
 'CPH Positions Status - Acting',
 'CPH Positions Status - Permanent',
 'CPH Positions Status - Vacant',
 'CRM Delivery – Number of S

## Tokenizing function using the regular expression module to handle punctuation better

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
# Tokenize using regex (handles punctuation)
def tokenize(text):
    """Return the word tokens found in ``text``.

    A token is a run of word characters; runs joined by single hyphens
    (e.g. "On-Job") are kept together as one token. All other punctuation
    is discarded.
    """
    word_pattern = re.compile(r"\b\w+(?:-\w+)*\b")
    return word_pattern.findall(text)

## Getting relevant synonyms for the input query

In [7]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Function to get synonyms only for keywords
def get_relevant_synonyms(word):
    """Collect a small set of WordNet synonyms for ``word``.

    Synsets are visited in the order WordNet returns them. Only synsets whose
    ``pos()`` is 'n', 'v' or 'a' contribute lemmas; underscores in lemma names
    become spaces, and lemmas equal to ``word`` (ignoring case) are skipped.
    The search stops after the first synset that leaves at least three
    synonyms collected, so the result may hold more than three entries.

    Returns:
        set[str]: the collected synonyms (possibly empty).
    """
    allowed_pos = ('n', 'v', 'a')
    lowered_word = word.lower()
    collected = set()

    for synset in wordnet.synsets(word):
        if synset.pos() in allowed_pos:
            candidates = (lemma.name().replace('_', ' ') for lemma in synset.lemmas())
            collected.update(
                candidate for candidate in candidates
                if candidate.lower() != lowered_word
            )
        # Checked after every synset (whatever its POS) to limit noise
        if len(collected) >= 3:
            break

    return collected

## Known abbreviations extracted from the KPI, KPI List and Focus Areas values

In [13]:
# Domain abbreviations that must be kept verbatim by the query pipeline:
#   * correct_spelling skips spell correction for tokens found here,
#   * fix_missing_spaces uses them to split merged tokens such as "KMIHours",
#   * expand_query always keeps them as keywords.
# Membership tests are case-sensitive (plain `in` on a set of strings).
# NOTE(review): entries containing a space or "+" ("ON JOB MH", "EMP+SMP")
# can never equal a single token produced by the regexes in tokenize() or
# fix_missing_spaces() -- confirm whether they are still needed.
KNOWN_ABBREVIATIONS = {"GC","APNE","VCGNE","AV","BI-19","ERC","CDPNE","CPH","CRM","BI19","DPSRs","USB","HSSE","IKTVA","IT","KM","KT112","KT113","LTI","MPS","YTD","MVA","OE","IDP","CHaRM","EMP","OJI","ON JOB MH","ON","JOB","MH","EMP+SMP","SAMIR","PDP","KMI","KPI","RDP","SC11-7","SLA","SMPs","AVL","SA","PwD","PHDs","ERM","KA","KE","CAPI","ISA","HR","KAs","LFC","MFC","SFC","MFC&SFC","DT","BP",
"CON","OES","DTO"}

## Spelling correction using the pyspellchecker module

In [9]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [14]:
import nltk
from spellchecker import SpellChecker

# Shared SpellChecker instance, created on first use. Building a SpellChecker
# loads its word-frequency dictionary, so constructing one per query (and
# expand_query runs for every candidate list) is needlessly slow.
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared SpellChecker, constructing it on the first call."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def correct_spelling(query: str) -> str:
    """Spell-correct the purely alphabetic words of ``query``.

    The query is split with ``tokenize`` (so punctuation is dropped), and
    each token is handled as follows:

    * tokens listed in ``KNOWN_ABBREVIATIONS`` (case-sensitive) and tokens
      containing non-letters (digits, hyphens, ...) are kept unchanged;
    * every other token is replaced by the spell checker's suggestion, or
      kept as-is when the checker returns nothing.

    Args:
        query: free-text user query.

    Returns:
        The processed tokens joined by single spaces.
    """
    spell = _get_spell_checker()
    corrected_words = []

    for word in tokenize(query):
        if word.isalpha() and (word not in KNOWN_ABBREVIATIONS):
            corrected_word = spell.correction(word)
            # correction() may yield None/empty when it has no candidate
            corrected_words.append(corrected_word if corrected_word else word)
        else:
            corrected_words.append(word)

    return " ".join(corrected_words)

In [15]:
correct_spelling("Contlable KMI Granted")

'constable KMI Granted'

## Counting the words shared between a KPI name and the query

In [16]:
def word_overlap(query, kpi):
    """Return how many distinct lower-cased tokens ``query`` and ``kpi`` share.

    Both strings are lower-cased and split with ``word_tokenize``; duplicates
    within either string count once.
    """
    query_tokens = set(word_tokenize(query.lower()))
    kpi_tokens = set(word_tokenize(kpi.lower()))
    shared_tokens = query_tokens.intersection(kpi_tokens)
    return len(shared_tokens)

## Function to check whether every word of a KPI / Focus Area / KPI List name appears in the query

In [17]:
def exact_match(query, kpi):
    """Return True when every token of ``kpi`` also occurs in ``query``.

    Both strings are lower-cased and split with ``word_tokenize``. Word order
    and repetition are ignored; only token membership matters.
    """
    query_tokens = set(word_tokenize(query.lower()))
    kpi_tokens = word_tokenize(kpi.lower())
    return all(token in query_tokens for token in kpi_tokens)

## Fix missing spaces using wordninja and KNOWN_ABBREVIATIONS

In [18]:
!pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/541.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.6/541.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541530 sha256=2a506f58ee72c320e4dfbff7e2c24f42c34c40e95223788605dc6d9f1d83d465
  Stored in directory: /root/.cache/pip/wheels/e6/66/9c/712044a983337f5d44f90abcd244bd4b8ad28ee64750404b50
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


In [25]:
import re
import wordninja

def fix_missing_spaces(query):
    """Re-insert spaces that are missing between merged words in ``query``.

    The query is cut into tokens (runs of letters, digits and ``& % $ @ # -``,
    or any other single non-space character). Each token is then handled as
    follows:

    * a token that is exactly a known abbreviation is kept unchanged;
    * a token that contains a known abbreviation is split around every
      occurrence of the longest such abbreviation; the abbreviation is kept
      verbatim and the remaining pieces are segmented with ``wordninja``;
    * any other purely alphabetic token is segmented with ``wordninja``;
    * everything else (numbers, symbols, mixed tokens) is kept unchanged.

    Args:
        query: free-text user query.

    Returns:
        The resulting tokens joined by single spaces.
    """
    # Tokenize while keeping special symbols (%, &, etc.)
    tokens = re.findall(r"[A-Za-z0-9&%$@#-]+|[\S]", query)

    # Longest abbreviation first. The alphabetical tie-break makes the choice
    # between equally long abbreviations independent of set iteration order,
    # which varies between interpreter runs for strings.
    abbreviations = sorted(KNOWN_ABBREVIATIONS, key=lambda abbr: (-len(abbr), abbr))

    fixed_tokens = []

    for token in tokens:
        # If the token is exactly a known abbreviation, keep it as is
        if token in KNOWN_ABBREVIATIONS:
            fixed_tokens.append(token)
            continue

        matched_abbr = next((abbr for abbr in abbreviations if abbr in token), None)

        if not matched_abbr:
            # isalpha() already excludes symbols such as %, & and $
            if token.isalpha():
                fixed_tokens.extend(wordninja.split(token))
            else:
                fixed_tokens.append(token)
            continue

        # Escape the abbreviation so characters like "+" are matched literally;
        # the capture group keeps the abbreviation itself in the split result.
        for part in re.split(f"({re.escape(matched_abbr)})", token):
            if part == matched_abbr:
                fixed_tokens.append(part)
            elif part:
                fixed_tokens.extend(wordninja.split(part))

    return " ".join(fixed_tokens)

In [26]:
query = "What are KMIHours"

print(fix_missing_spaces(query))

What are KMI Hours


In [27]:
query = "List me Target inKMIHours"

print(fix_missing_spaces(query))

List me Target in KMI Hours


In [28]:
query = "OutRDP"

print(fix_missing_spaces(query))

Out RDP


In [29]:
kpi_list

['Certified Employee in Current GC',
 'Manpower Tracked for Certification- Competency',
 'Loss Time Injuries',
 'Loss Time Injuries (Contractor)',
 'Off-Job Man Hours (Emp)',
 'On-Job Man Hours (Contractor)',
 'On-Job Man Hours (EMP + Cont)',
 'On-Job Man Hours (Emp)',
 'Positive Responses to the Phishing Tests',
 'APNE/VCGNE',
 'AV & Office Devices',
 'App re-opened maint incidents',
 'Application Cost/Per Hr. (In-House)',
 'Applications Investment Capacity',
 'Applications Maintain Capacity',
 'Applications Run Capacity',
 'Assets Net Book Value',
 'Attrition',
 'Average Company Workforce',
 'BI-19 No. Overdue Projects (Past ERC)',
 'BI-19 No. of Projects with ? 3 Months To ERC',
 'Biz Travel',
 'Business Sustainability - Below Target',
 'Business Sustainability - Meeting Target',
 'Business Sustainability - Yellow',
 'CDPNE',
 'CPH Positions - Total',
 'CPH Positions Status - Acting',
 'CPH Positions Status - Permanent',
 'CPH Positions Status - Vacant',
 'CRM Delivery – Number of S

In [30]:
focus_areas_list

['Value Creation',
 'Operational Efficiency',
 'Alignment to National Interest',
 'Business Sustainability',
 'Health Safety Security Environment (HSSE)',
 'Solution Delivery',
 'BP Solution Delivery',
 'Human Resources',
 'DTO Enablers',
 'Placeholder',
 'Safety & Cybersecurity',
 'Best Place to Work & Live',
 'Digitalization & Innovation',
 'Risk Management',
 'Sustainability & Citizenship',
 'Requisition To Pay',
 'Record To Report',
 'Order To Cash',
 'Hire To Retire',
 'Build To Dispose',
 'Treasury']

In [31]:
list_of_kpi_list

['Productivity KPIs',
 'Monthly Digest',
 'All KPIs',
 'Performance Measures',
 'Dashboard Performance',
 'Scorecard Performance',
 'Upstream KPIs',
 'Corporate Programs KPIs',
 'Performance Summary',
 'Corporate Dashboard',
 'Performance Eye',
 'DT Value Chain']

## Expanding the query so that synonyms of its keywords can also match the KPI list

In [32]:
from nltk.corpus import stopwords
from nltk import pos_tag

# Function to extract keywords & expand query
def expand_query(query):
    """Normalise ``query``, keep its keywords and add their synonyms.

    Steps:
      1. repair missing spaces, then correct spelling;
      2. tokenize and POS-tag the cleaned text;
      3. keep every known abbreviation, plus every token whose tag starts
         with N, V or J and that is not an English stop word;
      4. add the synonyms returned by ``get_relevant_synonyms`` for each
         kept keyword.

    Returns:
        str: the unique keywords and synonyms joined by spaces. They come
        from a set, so their order is not meaningful.
    """
    english_stop_words = set(stopwords.words('english'))

    cleaned_query = correct_spelling(fix_missing_spaces(query))
    tagged_tokens = pos_tag(tokenize(cleaned_query))

    content_tag_prefixes = ('N', 'V', 'J')
    keywords = [
        token
        for token, tag in tagged_tokens
        if token in KNOWN_ABBREVIATIONS  # abbreviations are always preserved
        or (tag.startswith(content_tag_prefixes) and token.lower() not in english_stop_words)
    ]

    expanded_terms = set(keywords)  # original keywords are always included
    for keyword in keywords:
        expanded_terms.update(get_relevant_synonyms(keyword))

    return " ".join(expanded_terms)

In [33]:
import nltk

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [34]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [35]:
print(expand_query("What inPHDs work"))

PHDs piece of work work employment Ph.D. PhD study


## Retrieving best matches using TF-IDF and Cosine Similarity

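Several thresholds were analysed and 0.3 was chosen for better retrieval accuracy. Note that `find_best_kpis` below defaults to `min_score=0.2` and `high_score=0.35`, and some test cells pass `min_score=0.1`; confirm which value is intended.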

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

def find_best_kpis(query, list_type, min_score=0.2, high_score=0.35):
    """Rank candidate names against ``query`` with TF-IDF cosine similarity.

    The query is expanded with ``expand_query`` and vectorised together with
    the candidates; each candidate is scored by cosine similarity to the
    expanded query.

    Args:
        query: free-text user query.
        list_type: candidate names to search (KPIs, KPI lists or focus areas).
        min_score: candidates must score strictly above this to be returned.
        high_score: when the best candidate is an exact match, the runner-up
            is returned as well only if both score above this value.

    Returns:
        list[tuple[str, float]]: ``(name, score)`` pairs sorted by descending
        score.

        * Exact-match case: the best candidate shares more than one word with
          the expanded query and all of its words occur in the raw query.
          Only the top one or two candidates above ``min_score`` are returned.
        * Otherwise: every candidate above ``min_score`` is returned.
        * An empty ``list_type`` yields ``[]``.
    """
    # Nothing to rank. Without this guard scikit-learn raises on the empty
    # candidate matrix before any result could be built.
    if len(list_type) == 0:
        return []

    expanded_query = expand_query(query)

    vectorizer = TfidfVectorizer()
    # The expanded query is appended last so it can be addressed as row -1
    tfidf_matrix = vectorizer.fit_transform(list(list_type) + [expanded_query])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]

    sorted_matches = sorted(zip(list_type, similarities), key=lambda x: x[1], reverse=True)
    filtered_matches = [(kpi, score) for kpi, score in sorted_matches if score > min_score]

    best_kpi = sorted_matches[0][0]
    is_exact = (
        bool(filtered_matches)
        and word_overlap(expanded_query, best_kpi) > 1
        and exact_match(query, best_kpi)
    )
    if is_exact:
        # Keep the runner-up only when both top candidates are strong matches
        if (len(filtered_matches) > 1
                and filtered_matches[0][1] > high_score
                and filtered_matches[1][1] > high_score):
            return filtered_matches[:2]
        return [filtered_matches[0]]

    return filtered_matches

# TEST CASES

## If no exact match is found, return all similar KPIs

In [48]:
# Test cell: run one query against the KPI, KPI List and Focus Area
# candidates, print every match returned, and time the three lookups.
import time

query = "what is the applications service labor productivity ratio for DJVC for July 2018?"

start_time = time.time()  # Start time

matches = find_best_kpis(query,kpi_list)

# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

# KPI lists and focus areas use a lower threshold (0.1) than the default in this cell
filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list, min_score = 0.1)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list, min_score = 0.1)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")


end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")


Top Matching KPIs:
Total Service Contractors: 0.36275234447375393
Service Contractor Saudization: 0.3253422261399763
Saudi Service Contractors: 0.3088555279459524
Service Delivery Customer Satisfaction (): 0.29302612581632353
Service Support Customer Satisfaction (): 0.28909019705376343
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 1.4381 seconds


In [49]:
# Test cell: run one query against the KPI, KPI List and Focus Area
# candidates, print every match returned, and time the three lookups.
import time

start_time = time.time()  # Start time

# Example query
query = "Which division in CAD has the highest % of Test Phishing Email?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")


end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Test Phishing Email Positive: 0.2080458595241983
Test Phishing Email Failure: 0.2047306793549044
Test Phishing Email Neutral: 0.2047306793549044
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 1.2842 seconds


In [50]:
# Test cell: a query that names KPI lists rather than a KPI. It is run against
# the KPI, KPI List and Focus Area candidates and the three lookups are timed.
import time

start_time = time.time()  # Start time

query = "What are the KPIs in the performance summary and dt value chain?"

matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Innovation Realized Value: 0.223128871249002
Innovation Realized Value ($): 0.223128871249002
Value Creation - Yellow: 0.2194821929712428
Top Matching KPI Lists:
DT Value Chain: 0.523767655789811
Top Matching Focus Areas:
Value Creation: 0.3079984700770705

Time taken: 0.8164 seconds


## Missing spaces between a word and an abbreviation (starts with the word)

In [51]:
# Test cell: the query contains a merged token ("PatchedSAP"). It is run
# against the KPI, KPI List and Focus Area candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "What are the PatchedSAP Systems?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Patch level Compliance: 0.24612509228138485
Patched SAP Systems: 0.23509842348815302
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.8258 seconds


## Missing spaces between an abbreviation and a word (starts with the abbreviation)

In [52]:
# Test cell: the query contains a merged token starting with an abbreviation
# ("KMIHours"). It is run against the KPI, KPI List and Focus Area candidates
# and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "List me Target KMIHours?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Target KMI Hours: 0.3138437494317441
Actual KMI Hours: 0.2115660363814194
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.8043 seconds


## Exact match with abbreviations

In [53]:
# Test cell: the query spells out a full KPI name that contains an
# abbreviation (plus a merged "isthe"). It is run against the KPI, KPI List
# and Focus Area candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "What isthe Scalability Index (DT Archived)?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Scalability Index (DT Archived): 0.42933466876161674
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.8505 seconds


## Jumbled Words

In [63]:
# Test cell: the query uses the KPI's words in a different order
# ("granted patents"). It is run against the KPI, KPI List and Focus Area
# candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "What are the granted patents in the FY24?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Patents Granted: 0.31448167325942156
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.8382 seconds


In [64]:
# Test cell: the query uses the KPI's words in a different order and without
# the parentheses. It is run against the KPI, KPI List and Focus Area
# candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "What is the DT Scalability Index Archived"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Scalability Index (DT Archived): 0.42933466876161674
Competency Index: 0.2374161864226096
Knowledge Management Index: 0.21573945021626398
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 1.0325 seconds


## Multiple exact-matching KPIs with abbreviations

In [54]:
# Test cell: the query is a bare abbreviation that appears in more than one
# KPI name. It is run against the KPI, KPI List and Focus Area candidates and
# the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "List all CDPNE"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
CDPNE: 0.34106129487240144
Total Sponsored CDPNE: 0.20807952540236188
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.8733 seconds


## Missing space (word-abbreviation-word)

In [55]:
# Test cell: the query contains a merged word-abbreviation-word token
# ("inKMIHours"). It is run against the KPI, KPI List and Focus Area
# candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "List all inKMIHours"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Target KMI Hours: 0.2802902898008559
Actual KMI Hours: 0.25866881962221927
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.9346 seconds


## Spelling check (Feamle -> Female)

In [56]:
# Test cell: the query contains deliberate misspellings ("higest", "feamle").
# It is run against the KPI, KPI List and Focus Area candidates and the
# lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "Which organization in ITSED has the higest % of feamle representation?"

matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Female Representation: 0.3285374668759515
Share best practices (12.1): 0.21096853046227196
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 0.8269 seconds


## Exact match without abbreviations

In [57]:
# Test cell: the query contains every word of a KPI name. It is run against
# the KPI, KPI List and Focus Area candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "Which group in ITSED has the highest number of permanent female employees?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Permanent Female Employees: 0.39003750784500946
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 1.3003 seconds


In [58]:
# Test cell: the query contains every word of a KPI name alongside symbols
# ("%", "F&AD"). It is run against the KPI, KPI List and Focus Area
# candidates and the lookups are timed.
import time

start_time = time.time()  # Start time

# Example query
query = "What is the Professional Certification % in F&AD in July 2024?"
matches = find_best_kpis(query,kpi_list)
# Print every KPI match returned (not limited to three) with its similarity score
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

filtered_matches_kpi_list = find_best_kpis(query,list_of_kpi_list)
print("Top Matching KPI Lists:")
for kpi, score in filtered_matches_kpi_list:
    print(f"{kpi}: {score}")

filtered_matches_focus_areas = find_best_kpis(query,focus_areas_list)
print("Top Matching Focus Areas:")
for kpi, score in filtered_matches_focus_areas:
    print(f"{kpi}: {score}")

end_time = time.time()  # End time
retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds, rounded to 4 decimals

print(f"\nTime taken: {retrieval_time} seconds")

Top Matching KPIs:
Professional Certification: 0.4236606149825768
Top Matching KPI Lists:
Top Matching Focus Areas:

Time taken: 1.0033 seconds


In [61]:
# List of predefined questions
# Evaluation questions, grouped by the behaviour each group is meant to exercise.
# Every entry must end with a comma: Python silently concatenates adjacent
# string literals, which would merge two questions into one.
predefined_questions = [

# SYNONYMS:
    "Which group in CAD has the highest % of women representation?",

# NOT EXACT MATCHING KPIS
    "what is the applications service labor productivity ratio for DJVC for July 2018",
    "What is the total Number of Failed Responses to the Phishing Tests in September 2019",
    "what is applications service labor productivity kpi would mean",
    "Kpi formula for applications service labor productivity",
    "what is the top performing organization within CAD?",
    "what is the focus area for Supply Chain On-Time Delivery",
    "what are the kpis that is under Alignment to National Interest category",
    "List me all the KPIs in the performance measures list",
    "What are the KPIs in the performance summary and dt value chain?",

# SHOULD NOT RETURN ANYTHING
    "Is there any under performing KPIs in ITSED organization across all periods?",
    "Which Kpis are under performing for f&ad for october 2024",
    "is this kpi healthy?",
    "What we discussed so far",
    "Hi",
    "How are you?",
    "Which Kpis are under performing for ssd for june 2024",
    "How about august 2024?",
    "List the kpis present in cad",
    "How many kpis are present in the data",
    "what are the organizations present in the data",
    "What is the total Mean Time to Repair hours for ITI in October and May 2024",
    "what are the previous questions that I asked?",
    "what is the category for Drilling Demand Forecasting Accuracy kpi",

# EXACT MATCHING KPIS
    "How about Test Phishing Email Failure?",
    "how many patents granted for CAD in 2024 without limit?",
    "what is permanent female employees would mean",
    "what is the difference between female representation and permanent female employees kpi",
    "what is the kpi formula for female representation and permanent female employees kpi",
    "Which group in ITSED has the highest number of female employees?",
    "Which group in ITSED has the highest % of female representation?",
    "what is the percentage of top performer retention for the period feb 2018?",
    "What is the Professional Certification % in CG in February 2019?",
    "During which period CG has highest % Professional Certification in 2019?",
    "What is the Total Recorded Case Frequency rate for SCMD in December 2021",
    "What is the total number of Traffic Violations across all the periods in 2024?",
    "Which division in CAD has the highest % of Test Phishing Email Failure?",

# ABBREVIATIONS:
    "What is the Scalability Index (DT Archived)?",

# MISSING SPACES:
    "What isthe Scalability Index (DT Archived)?",

# MISSING SPACES WITH ABBREVIATIONS: (Abbreviation at the beginning)
    "What areSMPs",

# MISSING SPACES WITH ABBREVIATIONS: (Abbreviation at the middle word-abbreviation-word)
    "Find all workingKMIHours",

# MISSING SPACES WITH ABBREVIATIONS: (Abbreviation at the end)
    "What are the PHDsfound",
]

In [62]:
import pandas as pd
import time

# Workbook the batch results are written to; also used in the final message
# so the printed name always matches the file actually created.
output_path = "KPI_Matching_using Wordnet (FINAL)(8).xlsx"


def _format_matches(question, candidates):
    """Return the matches of ``question`` in ``candidates`` as 'name: score' text."""
    found = find_best_kpis(question, candidates)
    formatted = [f"{kpi}: {score:.2f}" for kpi, score in found]
    return ", ".join(formatted) if formatted else "No matching KPI found"


# Create a list to store question-answer pairs
qa_list = []

for question in predefined_questions:
    start_time = time.time()  # Start time

    # All matches returned by find_best_kpis are reported, not just the top few
    answer = _format_matches(question, kpi_list)
    answer_kpi_list = _format_matches(question, list_of_kpi_list)
    answer_focus_areas = _format_matches(question, focus_areas_list)

    end_time = time.time()  # End time
    retrieval_time = round(end_time - start_time, 4)  # Elapsed seconds for the three lookups

    qa_list.append({"Question": question, "KPI": answer, "KPI List": answer_kpi_list, "Focus Areas": answer_focus_areas, "Retrieval Time": retrieval_time})

# Convert list to DataFrame. A separate name is used so the dataset `df`
# loaded at the top of the notebook is not overwritten (re-running the
# extraction cell would otherwise fail on the missing "KPI" column).
results_df = pd.DataFrame(qa_list)

# Save to Excel
results_df.to_excel(output_path, index=False)

print(f"Excel file '{output_path}' created successfully.")

Excel file 'KPI_Matching.xlsx' created successfully.
