In [2]:
import json
import numpy as np
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the raw incident records from disk and keep only the two columns
# the rest of the notebook reads: the incident id and its free-text description.
# The file is decoded explicitly as UTF-8 so parsing does not depend on the
# operating system's locale encoding.
with open("incident_data.json", "r", encoding="utf-8") as f:
    alert_data = json.load(f)

# .copy() detaches the two-column frame from the full frame it was sliced from,
# so the column added to `df` in a later cell is written to an independent object.
df = pd.DataFrame(alert_data)[["INCIDENT_ID", "DESCRIPTION"]].copy()
df.head()

Unnamed: 0,INCIDENT_ID,DESCRIPTION
0,INC-20250319-001,A critical system failure has been detected in...
1,INC-20250319-001,A critical system failure has been detected in...
2,INC001,Unable to establish a connection to the produc...
3,INC002,The customer-facing web application is down wi...
4,INC003,CPU usage on server-05 has been above 95% for ...


In [10]:
# Turn every incident description into a sentence embedding.
# The descriptions are passed in DataFrame row order; the clustering cell
# later assigns its labels back onto `df` row-for-row.
descriptions = df["DESCRIPTION"].tolist()
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(descriptions)

In [11]:
# Group the description embeddings into a fixed number of clusters and
# record each incident's cluster id on the DataFrame.
NUM_CLUSTERS = 5  # Tune to the data set; the RCA mapping below must list one label per cluster.

# random_state is fixed so repeated runs on the same embeddings start from the same seed.
# NOTE(review): n_init is left at the library default, which has differed between
# scikit-learn releases — consider pinning it so cluster ids stay stable across environments.
kmeans = KMeans(random_state=42, n_clusters=NUM_CLUSTERS)
cluster_labels = kmeans.fit(embeddings).labels_
df["cluster"] = cluster_labels

In [12]:
# Summarise each cluster by the TF-IDF terms with the highest mean weight
# across the descriptions assigned to it, printed strongest term first.
TOP_N_KEYWORDS = 5  # how many terms to show per cluster

vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["DESCRIPTION"])

# The vocabulary does not change between clusters, so build the array once.
feature_names = np.asarray(vectorizer.get_feature_names_out())
cluster_ids = df["cluster"].to_numpy()

for cluster_id in np.unique(cluster_ids):  # np.unique returns the ids sorted
    # Rows of X are positional (row i is the i-th description), so select them
    # with a boolean mask. Using df index labels here would pick the wrong rows
    # whenever df does not carry a default 0..n-1 index.
    row_mask = cluster_ids == cluster_id
    cluster_docs = X[row_mask]
    # Flatten the 1 x n_terms mean to a plain 1-D array regardless of whether
    # the sparse container returns a matrix or an ndarray.
    mean_tfidf = np.asarray(cluster_docs.mean(axis=0)).ravel()
    top_keywords = feature_names[np.argsort(mean_tfidf)[-TOP_N_KEYWORDS:]]
    print(f"Cluster {cluster_id} – Top Keywords: {', '.join(top_keywords[::-1])}")

Cluster 0 – Top Keywords: insufficient, vm, prod, storage, application
Cluster 1 – Top Keywords: database, user, unknown, security, admin
Cluster 2 – Top Keywords: network, failure, utc, disruptions, environment
Cluster 3 – Top Keywords: function, instance, aws, application, failed
Cluster 4 – Top Keywords: server, error, connection, 95, timeout


In [13]:
# Hand-assigned root-cause label for each cluster id.
# The position of a label in the tuple is the cluster id it belongs to (0-based).
# NOTE(review): these labels do not obviously line up with the top keywords printed
# for the clusters (e.g. cluster 0 lists "storage"/"insufficient", cluster 2 lists
# "network") — confirm the mapping against the keyword output before relying on it.
cluster_to_rca = dict(enumerate((
    "Database overload",
    "Authentication failure",
    "Disk space issue",
    "App crash",
    "Network latency",
)))

In [15]:
# Print a per-incident report: id, description, assigned cluster and the
# root-cause label mapped to that cluster ("Unknown" when the cluster has no label).
print("\n=== RCA Predictions for Each Alert ===\n")
report_columns = zip(df["INCIDENT_ID"], df["DESCRIPTION"], df["cluster"])
for incident_id, description, cluster in report_columns:
    rca = cluster_to_rca.get(cluster, "Unknown")
    print(f"Incident ID: {incident_id}")
    print(f"Description: {description}")
    print(f"Cluster: {cluster}")
    print(f"Predicted Root Cause: {rca}\n")


=== RCA Predictions for Each Alert ===

Incident ID: INC-20250319-001
Description: A critical system failure has been detected in the Production Environment
at 03:25 AM UTC. Immediate attention is required to prevent further service
disruptions.
Cluster: 2
Predicted Root Cause: Disk space issue

Incident ID: INC-20250319-001
Description: A critical system failure has been detected in the Production Environment
at 03:25 AM UTC. Immediate attention is required to prevent further service
disruptions.
Cluster: 2
Predicted Root Cause: Disk space issue

Incident ID: INC001
Description: Unable to establish a connection to the production database. Error: Connection timeout after 30 seconds. Affects multiple applications.
Cluster: 4
Predicted Root Cause: Network latency

Incident ID: INC002
Description: The customer-facing web application is down with error 500 Internal Server Error. Logs show OutOfMemoryException in the backend.
Cluster: 4
Predicted Root Cause: Network latency

Incident ID: I

In [None]:
# Persist the fitted KMeans estimator and the cluster-id -> root-cause mapping
# as two separate pickle files in the working directory.
# Only these two objects are written here; the embedding model is not saved.
artifacts = (
    ("kmeans_model.pkl", kmeans),
    ("cluster_to_rca.pkl", cluster_to_rca),
)
for filename, artifact in artifacts:
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)
print("Model and mapping saved.")