### Importing Necessary NLP Libraries

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from collections import defaultdict
from difflib import SequenceMatcher
from itertools import permutations

### Loading Dataset

In [None]:
# Load the KPI dataset: prompt for a file through Colab's browser upload widget.
# NOTE(review): this cell's recorded output shows the upload being saved as
# "test_data_modified_3 (1).xlsx" because the original name already existed, so code
# that opens the file by its original name may read an older copy. Prefer reading
# from `uploaded` rather than assuming the original file name.
from google.colab import files
uploaded = files.upload()

Saving test_data_modified_3.xlsx to test_data_modified_3 (1).xlsx


In [None]:
# Fetch the WordNet corpus used by the synonym lookup (nltk.corpus.wordnet).
# omw-1.4 is downloaded alongside it; presumably some NLTK releases need it for
# WordNet lookups — TODO confirm for the NLTK version in use.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def get_relevant_synonyms(word, max_synonyms=3):
    """Collect WordNet synonyms of ``word`` from its noun, verb and adjective synsets.

    Synsets are visited in the order WordNet returns them. The size limit is
    checked between synsets, not between lemmas, so it is a soft cap: every
    lemma of a synset that has been started is kept, and the result can hold
    more than ``max_synonyms`` entries.

    Args:
        word: The keyword to look up.
        max_synonyms: Stop visiting further synsets once at least this many
            synonyms have been collected. Values <= 0 yield an empty set.

    Returns:
        A set of synonym strings (underscores replaced by spaces) that differ
        from ``word`` when compared case-insensitively.
    """
    # NOTE(review): WordNet tags satellite adjectives as 's', which this filter
    # excludes — confirm that is intended.
    allowed_pos = {'n', 'v', 'a'}
    word_lower = word.lower()

    synonyms = set()
    for syn in wordnet.synsets(word):
        # Soft cap: stop before opening another synset once enough were gathered.
        if len(synonyms) >= max_synonyms:
            break
        if syn.pos() not in allowed_pos:
            continue
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')
            if synonym.lower() != word_lower:  # Avoid self-references
                synonyms.add(synonym)
    return synonyms

In [None]:
# Load the dataset.
# Colab saves an upload under a new name ("... (1).xlsx") when the original name is
# already taken, so opening the hard-coded name can silently read an older copy.
# Read the bytes returned by files.upload() when they are available and fall back
# to the file on disk otherwise.
import io

file_path = "test_data_modified_3.xlsx"
uploaded_files = globals().get("uploaded") or {}
if uploaded_files:
    # files.upload() returns a dict of uploaded files whose values are the raw
    # file contents; use the first uploaded file.
    uploaded_bytes = next(iter(uploaded_files.values()))
    df = pd.read_excel(io.BytesIO(uploaded_bytes), sheet_name="Sheet1")
else:
    df = pd.read_excel(file_path, sheet_name="Sheet1")

### Extracting unique KPIs from the dataset

In [None]:
# Distinct, non-null values of the "KPI" column, as a plain Python list
kpi_column = df["KPI"].dropna()
kpi_list = kpi_column.unique().tolist()

### Tokenizing with the regular-expression module to handle punctuation better

In [None]:
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of word characters; punctuation and whitespace
    only act as separators and never appear in the result.
    """
    lowered = text.lower()
    return [found.group(0) for found in re.finditer(r"\b\w+\b", lowered)]

### Expanding the query to check whether synonyms of its keywords match the KPI list

In [None]:
from nltk.corpus import stopwords

def expand_query(query):
    """Extract the keywords of ``query`` and append their WordNet synonyms.

    The query is tokenized and POS-tagged; tokens whose tag starts with
    N, V or J and that are not English stop words are kept as keywords.
    Each keyword is then expanded through ``get_relevant_synonyms``.

    The result is deterministic: keywords come first in query order, followed
    by each keyword's synonyms in alphabetical order, with duplicates removed.
    (Joining a ``set`` directly would make the term order vary between kernel
    runs, so printed queries would not be reproducible.)

    Args:
        query: The free-text question.

    Returns:
        A single space-separated string of keywords and synonyms; an empty
        string when the query yields no keywords.
    """
    stop_words = set(stopwords.words('english'))

    # Tokenize and POS tag the words
    words = tokenize(query)
    pos_tags = pos_tag(words)

    # Keep only nouns, verbs and adjectives that are not stop words
    keywords = [
        word for word, tag in pos_tags
        if tag.startswith(('N', 'V', 'J')) and word not in stop_words
    ]

    # dict keys act as an insertion-ordered set: original keywords first,
    # then synonyms; a term already present keeps its first position.
    expanded_terms = dict.fromkeys(keywords)
    for word in keywords:
        expanded_terms.update(dict.fromkeys(sorted(get_relevant_synonyms(word))))

    return " ".join(expanded_terms)

### Retrieving best matches using TF-IDF and Cosine Similarity

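Several similarity thresholds were compared for retrieval accuracy; the code below uses `min_score=0.18` as its default. (An earlier version of this note said 0.3, which no longer matches the code or the outputs shown, where matches scoring below 0.3 are returned.)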

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_best_kpis(query, min_score=0.18, verbose=True):
    """Return the KPIs whose names are most similar to ``query``.

    The query is expanded with synonyms via ``expand_query``. A TF-IDF
    vectorizer is then fitted on the KPI names plus the expanded query, and
    the cosine similarity between the query and every KPI name is computed.

    Args:
        query: The free-text question.
        min_score: Only KPIs scoring strictly above this similarity are kept.
        verbose: When true (the default), print the expanded query.

    Returns:
        A list of ``(kpi_name, score)`` tuples sorted by score, highest first,
        so slicing the result (e.g. ``[:3]``) yields the best matches. KPIs
        with equal scores keep their relative order from ``kpi_list``. The
        list is empty when nothing clears the threshold or ``kpi_list`` is
        empty.
    """
    # Expand the query using WordNet synonyms
    expanded_query = expand_query(query)
    if verbose:
        print(expanded_query)

    # Nothing to compare against; the similarity step needs at least one KPI row.
    if not kpi_list:
        return []

    vectorizer = TfidfVectorizer()

    # The expanded query is appended last so it can be addressed as row -1.
    combined = kpi_list + [expanded_query]
    tfidf_matrix = vectorizer.fit_transform(combined)

    # Similarity of the query row against every KPI row
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

    filtered_matches = [
        (kpi, score)
        for kpi, score in zip(kpi_list, similarities[0])
        if score > min_score
    ]

    # Best match first; list.sort is stable, so ties keep KPI-list order.
    filtered_matches.sort(key=lambda match: match[1], reverse=True)
    return filtered_matches


In [None]:
import nltk

# Fetch the stop-word lists (read through stopwords.words('english') in expand_query)
# and the English averaged-perceptron tagger model (presumably what pos_tag loads —
# TODO confirm for the NLTK version in use). Must run before the first query.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
# Example query
query = "Which group in CAD has the highest % of women representation?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")


cleaning woman grouping blackguard group representation women hound char agency chemical group delegacy high highest internal representation radical charwoman heel bounder dog woman mental representation cleaning lady cad adult female high-pitched
Top Matching KPIs:
Female Representation: 0.40069610798940414


In [None]:
# Example query
query = "Which KPIs are underperforming for SSD in August 2024?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")


Aug underachieve underperforming kpis august underperform ssd
Top Matching KPIs:


In [None]:
# Example query
query = "How many patents were granted for SCMD in April 2024?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

april many letters patent award patents patent scmd grant Apr granted patent of invention allow
Top Matching KPIs:
Patents Granted: 0.2897102624908531


In [None]:
# Example query
query = "What was CAD Controllable Cost plan in the beginning of 2024?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

controllable programme kickoff plan blackguard beginning monetary value commencement hound toll get-go first showtime starting time heel offset bounder dog start outset design cad cost price program
Top Matching KPIs:
Controllable Cost: 0.22406205134517854
Controllable Cost ($): 0.22406205134517854


In [None]:
# Example query
query = "What is the Professional Certification % in F&AD in July 2024?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

corroboration certification degree Fahrenheit fluorine enfranchisement advert advertisement july pro master professional advertizement ad professional person atomic number 9 documentation advertizing f advertising
Top Matching KPIs:
Professional Certification: 0.4236606149825768


In [None]:
# Example query
query = "Which group in CAD has the highest % of female representation?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

grouping blackguard group representation hound agency chemical group delegacy high highest internal representation radical heel female person bounder female dog mental representation cad high-pitched
Top Matching KPIs:
Female CPH: 0.20622608757301267
Female Representation: 0.5497777505031809


In [None]:
# Example query
query = "Which group in CAD has the highest number of female employees?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

act grouping blackguard group number hound chemical group figure high highest radical routine heel female person bounder female dog employee bit turn cad high-pitched employees
Top Matching KPIs:
Female CPH: 0.22494708129419408
Female Representation: 0.1966562963051932
Permanent Female Employees: 0.25683210102082865
Permanent CPH Female Employees: 0.22205291766374996


In [None]:
# Example query
query = "Which division in CAD has the highest % of Test Phishing Email Failure?"
matches = find_best_kpis(query)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

trial netmail trial run unsuccessful person blackguard electronic mail part tryout hound cad division highest email section high loser nonstarter heel test bounder dog phishing variance high-pitched failure e-mail
Top Matching KPIs:
Test Phishing Email Failure: 0.2775654861316818
Test Phishing Email Failure (Repeated Violators): 0.2083246147987199
Test Phishing Email Neutral: 0.18952424757520922
Test Phishing Email Positive: 0.19259319175661593


In [None]:
# Fixed set of questions used to exercise the KPI retriever in one batch.
# It mixes KPI lookups with follow-ups and small talk ("Hi", "How are you?");
# presumably the latter are cases where no KPI should match — TODO confirm.
predefined_questions = [
    "Which group in CAD has the highest % of women representation?",
    "what is the applications service labor productivity ratio for DJVC for July 2018",
    "Is there any under performing KPIs in ITSED organization across all periods?",
    "Which Kpis are under performing for f&ad for october 2024",
    "How about Test Phishing Email Failure?",
    "is this kpi healthy?",
    "What we discussed so far",
    "Hi",
    "How are you?",
    "Which Kpis are under performing for ssd for june 2024",
    "How about august 2024?",
    "how many patents granted for CAD in 2024 without limit?",
    "What is the total Number of Failed Responses to the Phishing Tests in September 2019",
    "Which group in ITSED has the highest number of permanent female employees?",
    "Which organization in ITSED has the highest % of female representation?",
    "what is applications service labor productivity kpi would mean",
    "Kpi formula for applications service labor productivity",
    "what is permanent female employees would mean",
    "what is the difference between female representation and permanent female employees kpi",
    "what is the kpi formula for female representation and permanent female employees kpi",
    "List the kpis present in cad",
    "How many kpis are present in the data",
    "what are the organizations present in the data",
    "Which group in ITSED has the highest number of female employees?",
    "Which group in ITSED has the highest % of female representation?",
    "what is the percentage of top performer retention for the period feb 2018?",
    "What is the Professional Certification % in CG in February 2019?",
    "During which period CG has highest % Professional Certification in 2019?",
    "What is the total Mean Time to Repair hours for ITI in October and May 2024",
    "What is the Total Recorded Case Frequency rate for SCMD in December 2021",
    "What is the total number of Traffic Violations across all the periods in 2024?",
    "Which kpis needs attention in CAD for July 2024",
    "what are the previous questions that I asked?",
    "Which division in CAD has the highest % of Test Phishing Email Failure?",
    "what is the top performing organization within CAD?",
    "what is the focus area for Supply Chain On-Time Delivery",
    "what is the category for Drilling Demand Forecasting Accuracy kpi",
    "what are the kpis that is under Alignment to National Interest category"
]

In [None]:
import pandas as pd
import time

# Run every predefined question through the retriever and record, per question,
# the three highest-scoring KPIs and how long the retrieval took.
qa_list = []

for question in predefined_questions:
    # perf_counter is a monotonic clock, suited to measuring short intervals.
    start_time = time.perf_counter()

    matches = find_best_kpis(question)
    # Rank by similarity first so the slice really is the top 3, not merely the
    # first 3 matches in KPI-list order.
    ranked_matches = sorted(matches, key=lambda match: match[1], reverse=True)
    top_matches = [f"{kpi}: {score:.2f}" for kpi, score in ranked_matches[:3]]

    retrieval_time = round(time.perf_counter() - start_time, 4)  # seconds

    answer = ", ".join(top_matches) if top_matches else "No matching KPI found"
    qa_list.append({"Question": question, "Top 3 Matching KPIs": answer, "Time Taken": retrieval_time})

# Build the results table under its own name so the KPI dataset held in `df`
# is not overwritten (re-running the KPI extraction cell would otherwise fail).
# Column names come from the record keys above.
results_df = pd.DataFrame(qa_list)

# Save to Excel and report the file that was actually written
output_path = "KPI_Matching_final_four.xlsx"
results_df.to_excel(output_path, index=False)

print(f"Excel file '{output_path}' created successfully.")

cleaning woman grouping blackguard group representation women hound char agency chemical group delegacy high highest internal representation radical charwoman heel bounder dog woman mental representation cleaning lady cad adult female high-pitched
religious service divine service proportion labor coating labour djvc military service service july practical application covering armed service ratio working class applications proletariat productiveness application productivity
performing periods acting organisation period time period playing period of time system organization arrangement itsed kpis playacting
performing october acting advertizement playing advertisement degree Fahrenheit fluorine ad atomic number 9 Oct advert advertizing f advertising playacting
trial netmail email trial run nonstarter unsuccessful person test electronic mail tryout phishing loser failure e-mail
kpi healthy
discuss talk about discourse discussed
hi hullo how-do-you-do howdy hello

performing acting june pl

In [None]:
question = "Which Kpis are under performing for f&ad for october 2024"
matches = find_best_kpis(question)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")


performing october acting advertizement playing advertisement degree Fahrenheit fluorine ad atomic number 9 Oct advert advertizing f advertising playacting
Top Matching KPIs:


In [None]:
question = "How about Test Phishing Email Failure?"
matches = find_best_kpis(question)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")


trial netmail email trial run nonstarter unsuccessful person test electronic mail tryout phishing loser failure e-mail
Top Matching KPIs:
Test Phishing Email Failure: 0.37390408381297774
Test Phishing Email Failure (Repeated Violators): 0.28063079930280993
Test Phishing Email Neutral: 0.2553051214599986
Test Phishing Email Positive: 0.25943924771039867


In [None]:
question = "is this kpi healthy?"
matches = find_best_kpis(question)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")

kpi healthy
Top Matching KPIs:
Phishing KPI (Positive): 0.40669974358029193
reusable KPI Key from active Tile: 0.2638471653315122


In [None]:
question = "What we discussed so far"
matches = find_best_kpis(question)

# Display the top 3 matching KPIs with their similarity scores, best match first.
# Ranking here keeps the "top 3" promise regardless of the order of `matches`.
top_matches = sorted(matches, key=lambda match: match[1], reverse=True)[:3]
print("Top Matching KPIs:")
for kpi, score in top_matches:
    print(f"{kpi}: {score}")


discuss talk about discourse discussed
Top Matching KPIs:
