###Importing Necessary NLP Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from collections import defaultdict
from difflib import SequenceMatcher
from itertools import permutations

###Loading Dataset

In [2]:
#Load KPI Dataset
from google.colab import files
uploaded = files.upload()

Saving test_data_modified_3.xlsx to test_data_modified_3.xlsx


In [3]:
# Load the dataset
file_path = "test_data_modified_3.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1")

###Extracting unique KPIs from the dataset

In [4]:
# Extract unique KPI names
kpi_list = df["KPI"].dropna().unique().tolist()

###Tokenizing function using Regular Expression module to handle punctuations better

In [5]:
# Tokenize using regex (handles punctuation)
def tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())

###Retrieving best matches using TF-IDF and Cosine Similarity

Went through analysis using different thresholds and chose 0.3 for better accuracy in retrieval

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_best_kpis(query, min_score=0.3):
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Combine the KPI list and the query
    combined = kpi_list + [query]

    # Fit and transform the combined list
    tfidf_matrix = vectorizer.fit_transform(combined)

    # Compute cosine similarity (the last row is the query)
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

    # Get the matches where similarity score is greater than the threshold (min_score)
    filtered_matches = [(kpi, score) for kpi, score in zip(kpi_list, similarities[0]) if score > min_score]

    return filtered_matches

In [7]:
# Example query
query = "Which group in CAD has the highest % of female representation?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")


Top Matching KPIs:
Female Representation: 0.44141331419805163


In [8]:
# Example query
query = "Which KPIs are underperforming for SSD in August 2024?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")


Top Matching KPIs:


In [9]:
# Example query
query = "How many patents were granted for SCMD in April 2024?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:
Patents Granted: 0.4304182577769597


In [10]:
# Example query
query = "What was CAD Controllable Cost plan in the beginning of 2024?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:
Controllable Cost: 0.3911366336545692
Controllable Cost ($): 0.3911366336545692
Direct Controllable Cost: 0.30322487351867766


In [11]:
# Example query
query = "What is the Professional Certification % in F&AD in July 2024?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:
Professional Certification: 0.41190470855295336


In [12]:
# Example query
query = "Which group in CAD has the highest % of female representation?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:
Female Representation: 0.44141331419805163


In [13]:
# Example query
query = "Which group in CAD has the highest number of female employees?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:
Permanent Female Employees: 0.30836904745790417


In [14]:
# Example query
query = "Which division in CAD has the highest % of Test Phishing Email Failure?"
matches = find_best_kpis(query)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:
Test Phishing Email Failure: 0.5355621211324848
Test Phishing Email Failure (Repeated Violators): 0.40196198072255707
Test Phishing Email Neutral: 0.36568670497189615
Test Phishing Email Positive: 0.37160822741453736


In [15]:
# List of predefined questions
predefined_questions = [
    "what is the applications service labor productivity ratio for DJVC for July 2018",
    "Is there any under performing KPIs in ITSED organization across all periods?",
    "Which Kpis are under performing for f&ad for october 2024",
    "How about Test Phishing Email Failure?",
    "is this kpi healthy?",
    "What we discussed so far",
    "Hi",
    "How are you?",
    "Which Kpis are under performing for ssd for june 2024",
    "How about august 2024?",
    "how many patents granted for CAD in 2024 without limit?",
    "What is the total Number of Failed Responses to the Phishing Tests in September 2019",
    "Which group in ITSED has the highest number of permanent female employees?",
    "Which organization in ITSED has the highest % of female representation?",
    "what is applications service labor productivity kpi would mean",
    "Kpi formula for applications service labor productivity",
    "what is permanent female employees would mean",
    "what is the difference between female representation and permanent female employees kpi",
    "what is the kpi formula for female representation and permanent female employees kpi",
    "List the kpis present in cad",
    "How many kpis are present in the data",
    "what are the organizations present in the data",
    "Which group in ITSED has the highest number of female employees?",
    "Which group in ITSED has the highest % of female representation?",
    "what is the percentage of top performer retention for the period feb 2018?",
    "What is the Professional Certification % in CG in February 2019?",
    "During which period CG has highest % Professional Certification in 2019?",
    "What is the total Mean Time to Repair hours for ITI in October and May 2024",
    "What is the Total Recorded Case Frequency rate for SCMD in December 2021",
    "What is the total number of Traffic Violations across all the periods in 2024?",
    "Which kpis needs attention in CAD for July 2024",
    "what are the previous questions that I asked?",
    "Which division in CAD has the highest % of Test Phishing Email Failure?",
    "what is the top performing organization within CAD?",
    "what is the focus area for Supply Chain On-Time Delivery",
    "what is the category for Drilling Demand Forecasting Accuracy kpi",
    "what are the kpis that is under Alignment to National Interest category"
]

In [16]:
import pandas as pd
import time

# Create a list to store question-answer pairs
qa_list = []

for question in predefined_questions:
    start_time = time.time()  # Start time
    matches = find_best_kpis(question)
    top_matches = [f"{kpi}: {score:.2f}" for kpi, score in matches[:3]]  # Top 3 KPIs
    end_time = time.time()  # End time
    retrieval_time = round(end_time - start_time, 4)  # Compute time taken
    answer = ", ".join(top_matches) if top_matches else "No matching KPI found"
    qa_list.append({"Question": question, "Top 3 Matching KPIs": answer})

# Convert list to DataFrame
df = pd.DataFrame(qa_list)

# Save to Excel
df.to_excel("KPI_Matching.xlsx", index=False)

print("Excel file 'KPI_Matching.xlsx' created successfully.")

Excel file 'KPI_Matching.xlsx' created successfully.


In [17]:
question = "Which Kpis are under performing for f&ad for october 2024"
matches = find_best_kpis(question)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")


Top Matching KPIs:


In [18]:
question = "How about Test Phishing Email Failure?"
matches = find_best_kpis(question)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")


Top Matching KPIs:
Number of Phishing Test Recipients: 0.3287137192526166
Test Phishing Email Failure: 0.7505422151077068
Test Phishing Email Failure (Repeated Violators): 0.5633136166587831
Test Phishing Email Neutral: 0.5124770754971858
Test Phishing Email Positive: 0.5207755574016043


In [19]:
question = "is this kpi healthy?"
matches = find_best_kpis(question)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")

Top Matching KPIs:


In [20]:
question = "What we discussed so far"
matches = find_best_kpis(question)

# Display top 3 matching KPIs with their similarity scores
print("Top Matching KPIs:")
for kpi, score in matches:
    print(f"{kpi}: {score}")


Top Matching KPIs:
