In [7]:
import pandas as pd


df = pd.read_csv(r'D:\python for data sinceand ml\project-nlp-log-classification\training\dateset\synthetic_logs.csv')

In [8]:
df.describe()

Unnamed: 0,timestamp,source,log_message,target_label
count,2410,2410,2410,2410
unique,2402,6,2265,9
top,4/28/2025 20:42,ThirdPartyAPI,Backup completed successfully.,HTTP Status
freq,2,496,60,1017


In [9]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [10]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [11]:
from sentence_transformers import SentenceTransformer # type: ignore
from sklearn.cluster import DBSCAN
import numpy as np 

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(df['log_message'].tolist())

dbscan = DBSCAN(eps=0.24,min_samples=5,metric='cosine')

clusters = dbscan.fit_predict(embeddings)



In [13]:
df['cluster'] = clusters

In [14]:
df

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,6/27/2025 7:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,29
3,7/12/2025 0:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,6/2/2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
...,...,...,...,...,...
2405,8/13/2025 7:29,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,6
2407,8/3/2025 3:07,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1


In [16]:
cluster_count = df.cluster.value_counts()
large_cluster = cluster_count[cluster_count >10].index

for cluster in large_cluster:
  print(f'cluster {cluster}.')
  print(df[df['cluster']==cluster]['log_message'].head(30).to_string(index = False))
  

cluster 0.
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...
nova.osapi_compute.wsgi.server [req-d4f8d0c2-4f...
nova.osapi_compute.wsgi.server [req-6fe0e366-f2...
nova.osapi_compute.wsgi.server [req-5f1c2027-e1...
nova.osapi_compute.wsgi.server [req-945d1f31-a2...
nova.osapi_compute.wsgi.server [req-033d97b9-69...
nova.osapi_compute.wsgi.server [req-75bc6269-85...
nova.osapi_compute.wsgi.server [req-077c3c87-b7...
nova.osapi_compute.wsgi.server [req-4e83daf7-a2...
nova.osapi_compute.wsgi.server [req-bfce366e-98...
nova.osapi_compute.wsgi.server [req-5e6e042b-f9...
nova.metadata.wsgi.server [-] 10.11.21.138,10.1...
nova.osapi_compute.wsgi.server [req-fe9ef402-d3...
nova.metadata.wsgi.server [req-27e91939-3ba4-4d...
nova.osapi_compute.wsgi.server [req-aef59c8e-1f...
nova.osapi_compute.w

In [33]:
import re

def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).*": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.*": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.*": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action",
        r"Account with ID .* updated by .*": "User Action",
        r"Account with ID .* deleted by .*": "User Action",



    }

    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label

    return "Other"


In [34]:
classify_with_regex("User User123 logged in at 2023-10-01 12:00:00")

'User Action'

In [35]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)

In [36]:
df

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,6/27/2025 7:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,Other
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,Other
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,29,Other
3,7/12/2025 0:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,Other
4,6/2/2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,Other
...,...,...,...,...,...,...
2405,8/13/2025 7:29,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,Other
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,6,Other
2407,8/3/2025 3:07,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,Other
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,Other


In [37]:
df[df.regex_label.notnull()].shape

(2410, 6)

In [45]:
print(df_non_regex['target_label'].value_counts()[df_non_regex['target_label'].value_counts()>5].index.tolist())

['System Notification', 'User Action']


In [44]:
df_non_legacy = df_non_regex[df_non_regex.source != 'LegacyCRM']
df_non_legacy.source.unique()

array(['ModernHR', 'ThirdPartyAPI', 'ModernCRM', 'AnalyticsEngine',
       'BillingSystem'], dtype=object)

In [48]:
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
print(filtered_embeddings[:2])

[[-2.94228569e-02 -8.70681833e-04 -7.40980208e-02 -4.12792005e-02
  -3.42596397e-02 -3.25561352e-02 -1.47934128e-02  2.76827551e-02
  -4.61630300e-02 -4.23525088e-03 -1.12679303e-02  3.31216753e-02
   2.38445755e-02 -4.70358059e-02 -6.51653185e-02 -5.06461263e-02
  -1.15989283e-01 -9.52246636e-02 -1.11447200e-02  2.70234626e-02
  -4.68750075e-02 -1.13822939e-03 -2.06070598e-02 -4.14567962e-02
   1.20106369e-01 -1.89999659e-02 -2.41568051e-02  3.42948386e-03
  -1.85985323e-02  2.86994670e-02  2.99387425e-02 -2.48572533e-03
   3.80762061e-03  8.44719037e-02  5.40919751e-02 -1.89499650e-02
   2.14070398e-02 -2.14109924e-02 -5.78539111e-02  4.56983186e-02
   6.24554045e-03 -5.24354447e-03  3.81471477e-02  1.07134804e-01
  -4.37787175e-02  5.96478321e-02  6.28406107e-02 -1.32653434e-02
   1.20463688e-03  6.95034713e-02 -1.26250535e-01  4.86109182e-02
   2.73854826e-02  5.48527390e-02 -2.73892172e-02 -9.96047780e-02
  -2.04427652e-02 -1.53816072e-02  4.32989933e-02  9.05175321e-03
  -2.89553

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn. preprocessing import LabelEncoder

label_encoder = LabelEncoder()
x= filtered_embeddings
y = df_non_legacy['target_label']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=clf.classes_))


Model accuracy: 1.00
                     precision    recall  f1-score   support

System Notification       1.00      1.00      1.00        74
        User Action       1.00      1.00      1.00        26

           accuracy                           1.00       100
          macro avg       1.00      1.00      1.00       100
       weighted avg       1.00      1.00      1.00       100



In [58]:
import joblib
joblib.dump(clf, 'models/log_classifier_model.pkl')

['models/log_classifier_model.pkl']