In [1]:
import pandas as pd
from pkg_resources import non_empty_lines
from sklearn.metrics import classification_report

# Read the synthetic log dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='datasets/synthetic_logs.csv')
df.head(n=5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [2]:
# Distinct values found in the `source` column.
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [3]:
# Distinct values found in the `target_label` column.
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2410, 5)

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

# Load the pre-trained sentence-transformer checkpoint used to embed log text.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message into a vector; the result has one row per message.
embedding = model.encode(
    df['log_message'].to_list()
)
embedding

  from .autonotebook import tqdm as notebook_tqdm


array([[-0.10293969,  0.03354593, -0.02202609, ...,  0.00457784,
        -0.04259722,  0.00322624],
       [ 0.00804572, -0.03573925,  0.04938739, ...,  0.01538318,
        -0.06230948, -0.02774668],
       [-0.00908222,  0.13003924, -0.05275575, ...,  0.02014106,
        -0.05117101, -0.02930294],
       ...,
       [-0.04022275,  0.04224353, -0.06610429, ...,  0.02363658,
        -0.00530871,  0.02044462],
       [-0.03603456,  0.01960893,  0.10052757, ...,  0.03668109,
        -0.02487843, -0.00578848],
       [ 0.01457428,  0.04911831, -0.00301355, ...,  0.01029736,
        -0.00068494,  0.00708863]], shape=(2410, 384), dtype=float32)

In [9]:
# Cluster the message embeddings with DBSCAN on cosine distance.
# With min_samples=1 a single message is enough to seed its own cluster.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
clusters = dbscan.fit(embedding).labels_

# Store each row's cluster id on the frame so clusters can be inspected below.
df['cluster'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0


In [12]:
# Show the rows assigned to cluster 3 to see which message pattern it groups.
df.loc[df['cluster'] == 3]


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert,3
45,5/22/2025 3:17,ThirdPartyAPI,Data replication task for shard 14 did not complete,Error,bert,3
98,12/7/2025 8:23,ModernCRM,Data replication task failed for shard 17,Error,bert,3
104,6/26/2025 16:19,ModernHR,Replication of data to shard 14 failed,Error,bert,3
190,3/6/2025 2:08,BillingSystem,Data replication task for shard 6 did not complete,Error,bert,3
228,6/15/2025 16:15,ModernCRM,Shard 1 data copy failed,Error,bert,3
262,2/2/2025 1:35,ModernHR,Data replication for shard 13 encountered an issue,Error,bert,3
263,6/19/2025 3:42,BillingSystem,Shard 8 synchronization task failed,Error,bert,3
286,2/5/2025 20:05,ThirdPartyAPI,Data replication for shard 16 was unsuccessful,Error,bert,3
297,11/17/2025 18:21,ModernCRM,Shard 2 experienced a replication failure,Error,bert,3


In [17]:
# Show the rows assigned to cluster 2.
df.loc[df['cluster'] == 2]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
1410,8/7/2025 18:51,ModernCRM,An unusual data access attempt was detected,Security Alert,bert,2
1426,3/3/2025 16:50,ThirdPartyAPI,Identified a possible unauthorized data access attempt,Security Alert,bert,2


In [19]:
# Count the rows per cluster and keep the ids of clusters with more than 10 rows.
cluster_counts = df['cluster'].value_counts()
large_cluster = cluster_counts.loc[cluster_counts.gt(10)].index

# For each large cluster, print a header, its first five messages, and a blank line.
for cluster in large_cluster:
    print(
        f"Cluster {cluster}:",
        df.loc[df['cluster'] == cluster, 'log_message'].head().to_string(index=False),
        "",
        sep="\n",
    )

Cluster 0:
           nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
            nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
      nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v

In [35]:
# Spot-check the regex classifier on a "backup started" message.
# classify_with_regex is not defined in the visible cells; it must already exist in the kernel.
classify_with_regex('Backup started at 2025-05-14 07:06:55.')

'System Notification'

In [36]:
# Spot-check the regex classifier on a "user logged out" message.
classify_with_regex('User User494 logged out.')

'User Action'

In [40]:
# Run the regex classifier over every message and store the result per row,
# then display only the rows where it produced a (non-null) label.
df['regex_label'] = df['log_message'].apply(classify_with_regex)
df.loc[df['regex_label'].notna()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by user User953.,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by user User175.,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


In [45]:
# Length of the regex_label column (null entries included).
df['regex_label'].shape

(2410,)

In [43]:
# Rows whose regex_label is null, taken as an independent copy of df.
df_non_regex = df.loc[df['regex_label'].isna()].copy()
df_non_regex

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,"nova.osapi_compute.wsgi.server [req-96c3ec98-21a0-4af2-84a8-d4989512413e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" Return code: 200 len: 1916 time: 0.2677610",HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed logins,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,"nova.metadata.wsgi.server [req-b6d4a270-accb-4c3a-8179-9611e52e1768 - - - - -] 10.11.21.124,10.11.10.1 ""GET /openstack/2013-10-17 HTTP/1.1"" RCODE 200 len: 157 time: 0.2249990",HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [44]:
# (rows, columns) of the subset that has no regex label.
df_non_regex.shape

(1910, 7)

In [48]:
# Print the target labels that occur at most 5 times among the rows
# without a regex label.
print(
    df_non_regex['target_label']
    .value_counts()
    .loc[lambda counts: counts <= 5]
    .index
    .tolist()
)



In [49]:
# Exclude rows whose source is 'LegacyCRM', then list the sources that remain.
df_non_legacy = df_non_regex.loc[df_non_regex['source'].ne('LegacyCRM')]
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [50]:
# Embed the log messages of the non-legacy subset and preview the first three vectors.
filtered_embeddings = model.encode(
    df_non_legacy['log_message'].to_list()
)
filtered_embeddings[:3]

array([[-0.10293969,  0.03354593, -0.02202609, ...,  0.00457784,
        -0.04259722,  0.00322624],
       [ 0.00804572, -0.03573925,  0.04938739, ...,  0.01538318,
        -0.06230948, -0.02774668],
       [-0.00908224,  0.13003926, -0.05275568, ...,  0.02014102,
        -0.051171  , -0.02930292]], shape=(3, 384), dtype=float32)

In [59]:
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import classification_report
# Features are the sentence embeddings of the non-legacy rows; the target is
# their target_label column.
# NOTE(review): the recorded classification report lists classes 0-4, which
# suggests y was label-encoded in a cell not shown here — confirm before re-running.
# NOTE(review): the one-space indent in this cell appears to rely on IPython
# stripping leading indentation; it would not run as a plain Python script.
 X = filtered_embeddings
 y = df_non_legacy['target_label']

In [64]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

# Fit a logistic-regression classifier on the training embeddings.
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out samples and print per-class precision / recall / F1.
y_pred = classifier.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.98      0.98      0.98        59
           2       1.00      1.00      1.00       301
           3       1.00      1.00      1.00        66
           4       0.99      0.99      0.99       102

    accuracy                           0.99       571
   macro avg       0.99      0.99      0.99       571
weighted avg       0.99      0.99      0.99       571



In [65]:
import os

import joblib

# Persist the fitted classifier to disk.
# NOTE(review): the recorded output of this cell shows
# 'models/log_classifier.joblib', i.e. it was last run with a different
# relative path than the one below — confirm which location is intended.
MODEL_PATH = '../models/log_classifier.joblib'

# joblib.dump opens the target file directly and does not create missing
# directories, so make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(classifier, MODEL_PATH)

['models/log_classifier.joblib']