In [2]:
import pandas as pd

# Load the synthetic log dataset; the path is relative to the current working directory.
df = pd.read_csv("dataset/synthetic_logs.csv")
# Preview the first rows to check which columns were read.
df.head()



Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [3]:
# List the distinct values of the `source` column.
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [4]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer


In [5]:
# Lightweight sentence-embedding model used to vectorise the raw log text.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message in the frame, in row order.
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)

In [6]:
# Peek at the first five embedding vectors.
embeddings[:5]

array([[-0.10293967,  0.03354587, -0.02202607, ...,  0.00457792,
        -0.04259717,  0.00322621],
       [ 0.00804575, -0.03573927,  0.0493874 , ...,  0.01538318,
        -0.06230948, -0.02774665],
       [-0.00908229,  0.13003926, -0.05275564, ...,  0.02014101,
        -0.05117096, -0.02930292],
       [-0.09751049,  0.04911301, -0.03977426, ...,  0.02477502,
        -0.03546081, -0.00018604],
       [-0.10468344,  0.05926032, -0.024885  , ...,  0.02502053,
        -0.03719302, -0.0256891 ]], dtype=float32)

In [7]:
# Group similar log messages with DBSCAN, using cosine distance between
# their embeddings.
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clustering.fit(embeddings)

# Store each row's cluster id on the frame so clusters can be inspected.
df['cluster'] = clustering.labels_

In [8]:
# Preview the frame again now that the `cluster` column has been added.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0


In [9]:
# Inspect the rows that were assigned to cluster 5.
df[df.cluster==5]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
8,2025-02-12 10:42:29,ThirdPartyAPI,"nova.compute.claims [req-a07ac654-8e81-416d-bfbb-189116b07969 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: bf8c824d-f099-4433-a41e-e3da7578262e] Total memory: 64172 MB, used: 512.00 MB",Resource Usage,bert,5
26,2025-03-03 17:11:11,ModernCRM,"nova.compute.claims [req-d6986b54-3735-4a42-9074-0ba7d9717de9 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: af5f7392-f7d4-4298-b647-c98924c64aa1] memory limit: 96258.00 MB, free: 95746.00 MB",Resource Usage,bert,5
40,2025-06-19 21:42:34,ThirdPartyAPI,"nova.compute.claims [req-72b4858f-049e-49e1-b31e-b562c5018eaf 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 63a0d960-70b6-44c6-b606-491478a5cadf] disk limit not specified, defaulting to unlimited",Resource Usage,bert,5
58,2025-09-13 14:45:14,AnalyticsEngine,"nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a5-7861d247cafa 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: d96a117b-0193-4549-bdcc-63b917273d1d] vcpu limit not specified, defaulting to unlimited",Resource Usage,bert,5
61,2025-04-27 11:18:18,ThirdPartyAPI,"nova.compute.claims [req-d38f479d-9bb9-4276-9688-52607e8fd350 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 95960536-049b-41f6-9049-05fc479b6a7c] disk limit not specified, defaulting to unlimited",Resource Usage,bert,5
...,...,...,...,...,...,...
2336,2025-12-10 11:53:33,AnalyticsEngine,"nova.compute.claims [req-97fcea79-42f7-4241-9b9e-63fe193c1929 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: b562ef10-ba2d-48ae-bf4a-18666cba4a51] disk limit not specified, defaulting to unlimited",Resource Usage,bert,5
2345,2025-12-22 01:38:48,BillingSystem,"nova.compute.claims [req-caeb3818-dab6-4e8d-9ea6-aceb23905ebc 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 78dc1847-8848-49cc-933e-9239b12c9dcf] vcpu limit not specified, defaulting to unlimited",Resource Usage,bert,5
2352,2025-02-18 00:16:44,ModernCRM,"nova.compute.claims [req-98474cd9-61e1-4afe-bd52-676a577b058f 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 70c1714b-c11b-4c88-b300-239afe1f5ff8] Total disk: 15 GB, used: 0.00 GB",Resource Usage,bert,5
2355,2025-11-28 18:03:55,BillingSystem,"nova.compute.claims [req-6f9ecdfe-481c-4535-9bdc-45d86085d739 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: c62f4f25-982c-4ea2-b5e4-93000edfcfbf] vcpu limit not specified, defaulting to unlimited",Resource Usage,bert,5


In [10]:
# Count rows per cluster and keep only the ids of clusters with more than 10 rows.
cluster_counts = df['cluster'].value_counts()
large_cluster_counts = cluster_counts.loc[cluster_counts > 10].index

# Print up to five example messages for each of those clusters.
for cluster in large_cluster_counts:
    sample_messages = df.loc[df['cluster'] == cluster, 'log_message'].head(5)
    print(f"Cluster{cluster}:")
    print(sample_messages.to_string(index=False))
    print()

Cluster0:
           nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
            nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
      nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

In [12]:
import re
# Ordered (compiled pattern, label) rules. They are compiled once here rather
# than rebuilt on every call. Sentence-ending periods are escaped so they match
# a literal "." instead of acting as the regex "any character" wildcard.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Classify a log message using a fixed set of regular-expression rules.

    Rules are tried in order and matched case-insensitively anywhere in the
    message (``search`` semantics); the label of the first matching rule wins.

    Args:
        log_message: The raw log text. Non-string values (for example ``None``
            or a NaN produced by pandas for a missing cell) are treated as
            unclassifiable instead of raising ``TypeError``.

    Returns:
        The label string of the first matching rule ("User Action" or
        "System Notification"), or ``None`` when no rule matches.
    """
    # Missing values reach this function when it is applied over a DataFrame
    # column; they cannot match any rule.
    if not isinstance(log_message, str):
        return None
    for compiled_pattern, label in _REGEX_RULES:
        if compiled_pattern.search(log_message):
            return label
    return None

In [13]:
# Sanity-check the regex classifier on a sample login message.
classify_with_regex("User User123 logged in.")

'User Action'

In [14]:
# Sanity-check the regex classifier on a sample reboot message.
classify_with_regex("System reboot initiated by user User179.")

'System Notification'

In [15]:
# Run the regex classifier over every message; rows matching no rule get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

# Show only the rows that received a regex label.
df.loc[df['regex_label'].notna()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by user User953.,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by user User175.,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


In [16]:
# Preview the first five rows that no regex rule matched.
df[df['regex_label'].isnull()].head(5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0,


In [17]:
# Keep an independent copy of the rows that the regex stage left unlabelled.
unmatched_mask = df['regex_label'].isna()
df_non_regex = df.loc[unmatched_mask].copy()
df_non_regex.shape

(1910, 7)

In [18]:
# Unlabelled rows whose source is LegacyCRM.
is_legacy = df_non_regex['source'] == "LegacyCRM"
df_legacy = df_non_regex.loc[is_legacy]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 due to missing contact information.,Workflow Error,llm,24,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecated and will be removed in version 3.2. Use 'fetchCustomerInfo' instead.,Deprecation Warning,llm,48,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 failed due to missing next action,Workflow Error,llm,62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID 9807 - undefined escalation level.,Workflow Error,llm,105,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.,Deprecation Warning,llm,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will be discontinued after 2025-06-01.,Deprecation Warning,llm,122,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not complete due to invalid priority level.,Workflow Error,llm,133,


In [19]:
# Unlabelled rows from every source except LegacyCRM.
is_not_legacy = df_non_regex['source'].ne("LegacyCRM")
df_non_legacy = df_non_regex.loc[is_not_legacy]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,"nova.osapi_compute.wsgi.server [req-96c3ec98-21a0-4af2-84a8-d4989512413e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" Return code: 200 len: 1916 time: 0.2677610",HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed logins,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,"nova.metadata.wsgi.server [req-b6d4a270-accb-4c3a-8179-9611e52e1768 - - - - -] 10.11.21.124,10.11.10.1 ""GET /openstack/2013-10-17 HTTP/1.1"" RCODE 200 len: 157 time: 0.2249990",HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [20]:
# Encode only the messages left for the ML classifier (no regex label, source
# other than LegacyCRM).
# The `model` created in the earlier embedding cell is reused here: it is the
# same 'all-MiniLM-L6-v2' checkpoint, so constructing it a second time only
# repeats the load without changing the embeddings.
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [21]:
# Number of embedding rows produced for the non-legacy, non-regex messages.
len(embeddings_filtered)

1903

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Features are the sentence embeddings; targets are the labels of the same rows.
X = embeddings_filtered
y = df_non_legacy['target_label'].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider stratify=y, which would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the classifier on the training portion and predict the held-out rows.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [24]:
import joblib
from pathlib import Path

# Destination for the trained classifier, relative to the current working directory.
MODEL_PATH = '../models/log_classifier.joblib'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, MODEL_PATH)

['models/log_classifier.joblib']