In [1]:
import json
import numpy as np
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load from JSON (or use your own source), keep only the columns the analysis
# needs, and drop rows whose DESCRIPTION is missing or blank.
# Built as one non-mutating chain so re-running the cell always starts from
# the file contents rather than from a previously modified frame.
df = (
    pd.read_json("incident_data.json")[["INCIDENT_ID", "DESCRIPTION"]]
    .dropna(subset=["DESCRIPTION"])
    .loc[lambda frame: frame["DESCRIPTION"].str.strip() != ""]
    # Renumber rows 0..n-1: filtering leaves gaps in the index, and arrays
    # built from this frame (one row per description) are addressed by
    # position, so labels and positions must agree.
    .reset_index(drop=True)
)

In [5]:
# Load the sentence-transformers model named "all-MiniLM-L6-v2".
model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode every description in frame order, so the i-th encoded row belongs to
# the i-th row of df (by position).
incident_texts = df["DESCRIPTION"].tolist()
embeddings = model.encode(incident_texts, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.78it/s]


In [13]:
# Cluster the description embeddings with DBSCAN using cosine distance.
# eps=0.5 is the neighbourhood radius; min_samples=2 lets a pair of similar
# incidents form a cluster. Points that join no cluster are labelled -1 (noise).
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='cosine')
df["cluster"] = dbscan.fit_predict(embeddings)
# Get unique clusters as plain Python ints: NumPy 2 prints scalars inside a
# list as `np.int64(0)`, which clutters the label list below. Plain ints
# compare identically against the "cluster" column.
unique_clusters = sorted(int(label) for label in df["cluster"].unique())
n_clusters = sum(label != -1 for label in unique_clusters)
print(f"\nTotal Clusters Found (excluding noise): {n_clusters}")
print(f"All cluster labels (including noise): {unique_clusters}")


Total Clusters Found (excluding noise): 3
All cluster labels (including noise): [np.int64(-1), np.int64(0), np.int64(1), np.int64(2)]


In [14]:
# Fit TF-IDF over all descriptions (English stop words removed).
# Row i of X corresponds to the i-th row of df *by position*.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["DESCRIPTION"])
# Vocabulary terms aligned with the columns of X; loop-invariant, so fetched once.
feature_names = np.array(vectorizer.get_feature_names_out())
print("\nTop Keywords per Cluster:")
for cluster_id in unique_clusters:
    if cluster_id == -1:
        print("\nCluster -1 (Noise): Skipped")
        continue
    # Select rows by position, not by index label: df's index can have gaps
    # after rows were dropped, while X is always addressed 0..n-1.
    cluster_rows = np.flatnonzero((df["cluster"] == cluster_id).to_numpy())
    cluster_matrix = X[cluster_rows]
    # Mean TF-IDF weight of each term across the cluster's descriptions.
    # np.asarray(...).ravel() flattens the 1 x n_terms result whether the
    # sparse mean comes back as np.matrix or as a plain ndarray.
    mean_tfidf = np.asarray(cluster_matrix.mean(axis=0)).ravel()
    # argsort is ascending, so the last five entries are the top-weighted terms.
    keywords = feature_names[np.argsort(mean_tfidf)[-5:]]  # Top 5
    print(f"\nCluster {cluster_id}:")
    print(f"  Total Alerts  : {len(cluster_rows)}")
    # Reverse so the highest-weighted keyword is listed first.
    print(f"  Top Keywords  : {', '.join(keywords[::-1])}")


Top Keywords per Cluster:

Cluster -1 (Noise): Skipped

Cluster 0:
  Total Alerts  : 2
  Top Keywords  : utc, service, prevent, 25, environment

Cluster 1:
  Total Alerts  : 2
  Top Keywords  : connection, timeout, database, error, sql

Cluster 2:
  Total Alerts  : 2
  Top Keywords  : network, latency, possible, central1, issue
