In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): no versions are pinned here — confirm whether results need to be reproducible.
!pip install numpy pandas scikit-learn nltk gensim



In [None]:
import string

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK 'punkt' tokenizer package and the 'stopwords' corpus.
# NOTE(review): word_tokenize and stopwords are imported above but are not
# called by any cell visible here — confirm these downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import gensim.downloader as api

# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; `word_vectors` is the lookup used by get_word_embedding.
# NOTE(review): presumably a large download on first use — confirm before re-running.
word_vectors = api.load('word2vec-google-news-300')

# Sanity check: print the raw vector stored for "king" and whatever
# most_similar() returns for "computer".
print(word_vectors['king'])
print(word_vectors.most_similar("computer"))

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [None]:
# Load KPI dataset: upload the workbook into the runtime through Colab's
# file helper. This cell depends on google.colab, so it only runs inside Colab.
from google.colab import files
uploaded = files.upload()

Saving test_data_modified_3.xlsx to test_data_modified_3.xlsx


In [None]:
# Read sheet "Sheet1" of the uploaded workbook into a DataFrame.
# The file name must match the file saved by the upload cell above.
file_path = "test_data_modified_3.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1")

In [None]:
# Collect the distinct, non-null values of the "KPI" column as a plain list.
# These names are the candidates that every query is scored against.
kpi_list = df["KPI"].dropna().unique().tolist()

In [None]:
# Fit a TF-IDF vectorizer on the KPI names and keep the resulting matrix
# (one row per entry of kpi_list). hybrid_similarity projects each query into
# the same vocabulary with tfidf_vectorizer.transform().
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(kpi_list)

In [None]:
def get_word_embedding(sentence, vectors=None):
    """Average the word vectors of a sentence's in-vocabulary tokens.

    The sentence is lower-cased and split on whitespace. A token that is not
    found as-is is retried with leading/trailing punctuation removed, so
    "Failure?" is looked up as "failure" instead of being silently dropped.
    Tokens that match without stripping are used unchanged.

    Args:
        sentence: Text to embed.
        vectors: Word-vector store supporting ``word in vectors`` and
            ``vectors[word]``. Defaults to the notebook-level ``word_vectors``.

    Returns:
        The element-wise mean of the matched vectors. When no token matches,
        an all-zero vector whose length is ``vectors.vector_size`` if the
        store exposes that attribute, otherwise 300.
    """
    if vectors is None:
        # Resolved at call time so the function picks up the loaded model.
        vectors = word_vectors

    known_tokens = []
    for raw in sentence.lower().split():
        # Prefer the raw token; only fall back to the stripped form on a miss.
        token = raw if raw in vectors else raw.strip(string.punctuation)
        if token and token in vectors:
            known_tokens.append(token)

    if not known_tokens:
        # No usable token: return a zero vector of the model's dimensionality.
        return np.zeros(getattr(vectors, "vector_size", 300))

    return np.mean([vectors[token] for token in known_tokens], axis=0)

In [None]:
def hybrid_similarity(query, alpha=0.6, top_k=3, min_score=0.3):
    """Rank KPI names against a query with a TF-IDF / word-embedding blend.

    Args:
        query: Free-text question to match against ``kpi_list``.
        alpha: Weight of the TF-IDF cosine similarity; the embedding cosine
            similarity gets ``1 - alpha``.
        top_k: Maximum number of KPIs to return.
        min_score: Candidates whose blended score is not strictly greater
            than this value are dropped.

    Returns:
        A list of ``(kpi_name, score)`` tuples, best first, containing at most
        ``top_k`` entries; empty when nothing clears ``min_score``.
    """
    if top_k <= 0:
        # Guard: a slice of [-0:] would select every KPI instead of none.
        return []

    # Lexical similarity: query projected into the fitted TF-IDF vocabulary.
    query_tfidf = tfidf_vectorizer.transform([query])
    tfidf_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Semantic similarity: stack all KPI embeddings and score them in a single
    # cosine_similarity call instead of one call per KPI.
    query_embedding = np.asarray(get_word_embedding(query)).reshape(1, -1)
    kpi_embeddings = np.vstack([get_word_embedding(kpi) for kpi in kpi_list])
    embedding_scores = cosine_similarity(query_embedding, kpi_embeddings).flatten()

    # Weighted combination of TF-IDF and word-embedding similarity.
    final_scores = alpha * tfidf_scores + (1 - alpha) * embedding_scores

    # Highest scores first, then drop anything at or below the threshold.
    top_indices = final_scores.argsort()[-top_k:][::-1]
    return [(kpi_list[i], final_scores[i]) for i in top_indices if final_scores[i] > min_score]

In [None]:
# Benchmark queries that the reporting loop feeds to hybrid_similarity.
# The list mixes KPI lookups with follow-ups and small talk ("Hi",
# "How are you?") that name no KPI at all.
predefined_questions = [
    "what is the applications service labor productivity ratio for DJVC for July 2018",
    "Is there any under performing KPIs in ITSED organization across all periods?",
    "Which Kpis are under performing for f&ad for october 2024",
    "How about Test Phishing Email Failure?",
    "is this kpi healthy?",
    "What we discussed so far",
    "Hi",
    "How are you?",
    "Which Kpis are under performing for ssd for june 2024",
    "How about august 2024?",
    "how many patents granted for CAD in 2024 without limit?",
    "What is the total Number of Failed Responses to the Phishing Tests in September 2019",
    "Which group in ITSED has the highest number of permanent female employees?",
    "Which organization in ITSED has the highest % of female representation?",
    "what is applications service labor productivity kpi would mean",
    "Kpi formula for applications service labor productivity",
    "what is permanent female employees would mean",
    "what is the difference between female representation and permanent female employees kpi",
    "what is the kpi formula for female representation and permanent female employees kpi",
    "List the kpis present in cad",
    "How many kpis are present in the data",
    "what are the organizations present in the data",
    "Which group in ITSED has the highest number of female employees?",
    "Which group in ITSED has the highest % of female representation?",
    "what is the percentage of top performer retention for the period feb 2018?",
    "What is the Professional Certification % in CG in February 2019?",
    "During which period CG has highest % Professional Certification in 2019?",
    "What is the total Mean Time to Repair hours for ITI in October and May 2024",
    "What is the Total Recorded Case Frequency rate for SCMD in December 2021",
    "What is the total number of Traffic Violations across all the periods in 2024?",
    "Which kpis needs attention in CAD for July 2024",
    "what are the previous questions that I asked?",
    "Which division in CAD has the highest % of Test Phishing Email Failure?",
    "what is the top performing organization within CAD?",
    "what is the focus area for Supply Chain On-Time Delivery",
    "what is the category for Drilling Demand Forecasting Accuracy kpi",
    "what are the kpis that is under Alignment to National Interest category"
]

In [None]:
import os

# Resolve the tokenizer through NLTK's own data search path rather than a
# hard-coded /root/nltk_data directory, so the check also works when the data
# was downloaded to a different location or by a non-root user.
punkt_resource = "tokenizers/punkt"
try:
    nltk.data.find(punkt_resource)
except LookupError:
    # nltk.data.find raises LookupError when the resource cannot be located.
    print("Punkt tokenizer is missing!")
else:
    print("Punkt tokenizer is available.")

Punkt tokenizer is available.


In [None]:
import pandas as pd
import time

# Benchmark retrieval: one [query, retrieved KPIs, seconds taken] row per question.
results = []

for question in predefined_questions:
    # perf_counter is intended for measuring short durations and is not
    # affected by system clock adjustments, unlike time.time().
    start_time = time.perf_counter()

    matches = hybrid_similarity(question)  # Retrieve top KPIs

    # Format at most three matches as "name: score".
    top_matches = [f"{kpi}: {score:.2f}" for kpi, score in matches[:3]]

    retrieval_time = round(time.perf_counter() - start_time, 4)

    # Append result (query, KPIs retrieved, time taken)
    results.append([question, ", ".join(top_matches) if top_matches else "No matching KPI found", retrieval_time])

# Build the report in its own variable so the KPI dataset held in `df` is not
# overwritten; earlier cells that read df["KPI"] stay re-runnable.
report_df = pd.DataFrame(results, columns=["Query", "KPI Retrieved", "Time Taken (seconds)"])

# Save to Excel
report_df.to_excel("KPI_Retrieval_Report.xlsx", index=False)

print("Excel file 'KPI_Retrieval_Report.xlsx' created successfully.")


Excel file 'KPI_Retrieval_Report.xlsx' created successfully.
