In [1]:
import pandas as pd

In [2]:
# Load the log dataset from a CSV file in the current working directory.
df = pd.read_csv("synthetic_logs.csv")

# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(2410, 5)

In [4]:
# Distinct values found in the `source` column.
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
# Distinct values found in the `target_label` column (used later as the classifier target).
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

# Embed every log message with a pretrained sentence-transformer model.
# astype(str) makes sure each cell is passed to the encoder as text.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    df['log_message'].astype(str).tolist(),
    show_progress_bar=True,
)

# Peek at the first two embedding vectors.
embeddings[:2]

  from tqdm.autonotebook import tqdm, trange
Batches: 100%|██████████| 76/76 [00:49<00:00,  1.55it/s]


array([[-1.02939650e-01,  3.35459337e-02, -2.20260564e-02,
         1.55101169e-03, -9.86912474e-03, -1.78956285e-01,
        -6.34410679e-02, -6.01761974e-02,  2.81109493e-02,
         5.99619560e-02, -1.72618292e-02,  1.43372314e-03,
        -1.49560094e-01,  3.15284380e-03, -5.66030778e-02,
         2.71685515e-02, -1.49890240e-02, -3.54037099e-02,
        -3.62936929e-02, -1.45410625e-02, -5.61489025e-03,
         8.75538290e-02,  4.55121100e-02,  2.50964109e-02,
         1.00187529e-02,  1.24266976e-02, -1.39923558e-01,
         7.68696368e-02,  3.14095467e-02, -4.15253872e-03,
         4.36902903e-02,  1.71250310e-02, -8.00950900e-02,
         5.74006513e-02,  1.89091228e-02,  8.55261683e-02,
         3.96398529e-02, -1.34371877e-01, -1.44363148e-03,
         3.06704850e-03,  1.76854059e-01,  4.44884459e-03,
        -1.69274323e-02,  2.24267393e-02, -4.35050912e-02,
         6.09031925e-03, -9.98168997e-03, -6.23972267e-02,
         1.07371937e-02, -6.04898203e-03, -7.14661032e-0

In [16]:
# Group the embeddings by cosine distance. With min_samples=1 a single
# message may form a cluster of its own.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1).fit(embeddings)
clusters = dbscan.labels_

# Keep each row's cluster id next to its log message.
df['clusters'] = clusters

# Cluster sizes, largest first.
df['clusters'].value_counts()

clusters
0      1017
5       147
11      100
13       86
7        60
       ... 
122       1
124       1
125       1
126       1
127       1
Name: count, Length: 136, dtype: int64

In [11]:
# Remove the 'dbscan_cluster' column if it is present. No visible cell in this
# notebook creates that column, so on a fresh kernel a plain drop raises
# KeyError; errors='ignore' makes this cell safe under Restart & Run All.
df.drop(columns='dbscan_cluster', errors='ignore', inplace=True)

In [17]:
# Preview the frame now that it carries the `clusters` column.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [18]:
# Inspect the rows that were assigned to cluster 1.
df.loc[df['clusters'] == 1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,bert,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,bert,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,bert,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,bert,1


In [20]:
# Size of every cluster, smallest first.
cluster_counts = df['clusters'].value_counts().sort_values(ascending=True)

# Ids of the clusters holding more than 10 messages.
large_clusters = cluster_counts[cluster_counts > 10].index

# For each of those clusters, show its size and up to five example messages.
for cluster_id in large_clusters:
    print(f"\nCluster {cluster_id} (Count: {cluster_counts.loc[cluster_id]}):")
    print(
        df.loc[df['clusters'] == cluster_id, 'log_message']
        .head(5)
        .to_string(index=False)
    )


Cluster 26 (Count: 11):
        Privilege elevation detected for user 5038
Elevation of admin privileges detected for user...
Elevation of admin privileges detected for user...
User 6069 has been granted elevated admin privi...

Cluster 59 (Count: 12):
Potential security threat: Admin privilege esca...
    Admin privilege escalation alert for user 2893
    Admin privilege escalation alert for user 8532
Potential security threat: Admin privilege esca...

Cluster 25 (Count: 13):
         System configuration is no longer valid
Configuration is corrupted throughout the system
     Cross-system configuration failure occurred
     System configuration is experiencing errors
        Configuration malfunction is system-wide

Cluster 42 (Count: 13):
User 5127 has escalated admin privileges withou...
            User 9745 has escalated to admin level
     User 8483 escalated privileges to admin level
            User 1987 has escalated to admin level
     User 8395 escalated privileges to admi

In [38]:
import re

def classify_log_message(log_message):
    """
    Classify a log message with a small, ordered set of regex rules.

    Each pattern is tried in order using a case-insensitive ``re.search``;
    the label of the first pattern that matches is returned.

    Parameters
    ----------
    log_message : str
        The raw log text. Non-string values (for example the float NaN that
        pandas uses for a missing cell) are treated as unmatched.

    Returns
    -------
    str or None
        The label of the first matching rule, or ``None`` when no rule
        matches. Callers can therefore use ``isnull()`` / ``notnull()`` to
        separate matched from unmatched rows.
    """
    # re.search raises TypeError on non-strings; treat them as "no match".
    if not isinstance(log_message, str):
        return None

    patterns = [
        # NOTE: the trailing '.' is unescaped, so it matches any single
        # character after "in"/"out", not only a literal period.
        (r'User User\d+ logged (in|out).', 'User Action'),
        (r'http status|wsgi\.server', 'HTTP Status'),
        (r'email service|failed transmission', 'Critical Error'),
        (r'unauthorized access|failed login|account experienced multiple failed', 'Security Alert'),
    ]
    for pattern, label in patterns:
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return None

In [34]:
# Spot-check the user-action rule; matching is case-insensitive, so "OUT" still matches.
classify_log_message("User User685 logged OUT.")

'User Action'

In [40]:
# Label every log message with the regex rules; rows that match no rule get None.
df['regex_label'] = df['log_message'].apply(classify_log_message)

In [42]:
# Labels the regex rules produced (None marks rows no rule matched).
df['regex_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', None,
       'User Action'], dtype=object)

In [43]:
# Preview the frame with the new `regex_label` column next to `target_label`.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,HTTP Status


In [45]:
# Rows that at least one regex rule was able to label.
df[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,HTTP Status
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,HTTP Status
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,Security Alert
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,HTTP Status
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,Critical Error


In [47]:
# Rows no regex rule matched; these still need another classifier.
df_non_regex = df.loc[df['regex_label'].isnull()]

In [48]:
# Number of rows (and columns) the regex rules left unlabeled.
df_non_regex.shape

(1249, 7)

In [49]:
# Preview the rows that were not labeled by regex.
df_non_regex.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters,regex_label
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert,3,
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5,
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1,
11,6/15/2025 11:44,ModernHR,Critical system unit error: unit ID Component55,Critical Error,bert,6,


In [None]:
# Find target labels with at most 5 examples among the rows regex did not label.
# These rare labels are filtered out before the logistic-regression classifier
# is trained, so they need separate handling.
# NOTE(review): in this dataset they appear to come from the LegacyCRM source - confirm.
label_counts = df_non_regex['target_label'].value_counts()
small_labels = label_counts.loc[label_counts <= 5].index
print(list(small_labels))



In [56]:
# Keep only the rows whose target label is not one of the rare labels.
df_non_legacy = df_non_regex.loc[~df_non_regex['target_label'].isin(small_labels)]

In [60]:
# Sources that remain after the rare-label rows were removed.
df_non_legacy['source'].unique()

array(['ModernHR', 'ThirdPartyAPI', 'ModernCRM', 'AnalyticsEngine',
       'BillingSystem'], dtype=object)

In [61]:
# Embed only the rows kept for the classifier, reusing the sentence-transformer
# `model` that was loaded earlier in the notebook.
filtered_embeddings = model.encode(
    df_non_legacy['log_message'].astype(str).tolist(),
    show_progress_bar=True,
)

# Peek at the first two embedding vectors.
filtered_embeddings[:2]

Batches: 100%|██████████| 39/39 [00:13<00:00,  2.96it/s]


array([[ 2.24610791e-02, -8.02895650e-02,  8.77409521e-03,
        -3.92057709e-02, -8.56306963e-03, -1.08223846e-02,
        -1.02909841e-01, -7.84475878e-02, -2.29802690e-02,
         3.29453796e-02,  1.51971518e-03,  3.58007438e-02,
         4.44075838e-02,  5.45145618e-03,  3.17093059e-02,
        -9.75915510e-03, -2.08439101e-02, -3.24148573e-02,
        -6.01219684e-02,  6.75506145e-02, -1.08872741e-01,
         9.33381636e-03,  6.05258495e-02,  3.40463221e-02,
         2.31043082e-02, -9.83717963e-02,  4.63556824e-03,
        -2.84142029e-02, -5.64913228e-02, -1.35368584e-02,
         8.31359997e-02, -2.26772074e-02, -9.20645893e-02,
        -3.62371877e-02,  8.02754611e-02,  1.33601248e-01,
         2.17525475e-02,  1.12199457e-02, -4.52408306e-02,
        -4.91912030e-02,  8.82793888e-02, -1.47098610e-02,
        -8.95307064e-02,  1.59341854e-03, -1.19982408e-02,
         4.09735627e-02, -1.46244187e-02, -1.41879013e-02,
         6.92769662e-02,  1.84404030e-02, -5.16961329e-0

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features are the sentence embeddings; targets are the annotated labels.
# Both are derived from df_non_legacy in the same row order.
X = filtered_embeddings
y = df_non_legacy['target_label']

# Hold out 20% of the rows for evaluation, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the classifier on the training split.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Per-label precision / recall / F1 on the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                     precision    recall  f1-score   support

     Critical Error       1.00      1.00      1.00        31
              Error       1.00      1.00      1.00        34
     Resource Usage       1.00      1.00      1.00        36
     Security Alert       1.00      1.00      1.00        68
System Notification       1.00      1.00      1.00        71
        User Action       1.00      1.00      1.00         9

           accuracy                           1.00       249
          macro avg       1.00      1.00      1.00       249
       weighted avg       1.00      1.00      1.00       249



In [None]:
import joblib
from pathlib import Path

# Persist the trained classifier. The previous path '..models/...' was missing
# the separator after '..', so it pointed at a directory literally named
# '..models' rather than the sibling 'models' folder.
# NOTE(review): assumes the model belongs in ../models relative to the notebook - confirm.
model_path = Path('../models/logistic_regression_model.joblib')

# Create the target directory if needed so the dump does not fail on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, str(model_path))

['models/logistic_regression_model.joblib']