In [2]:
import pandas as pd

df = pd.read_csv('myapp_logs_with_classification.nogit.csv')

df.app_name.unique()

array(['busybox', 'billing', 'legacycrm'], dtype=object)

In [3]:
df.target_classification.unique()



In [4]:
pd.set_option('display.max_colwidth', None)
df['message'] = df['message'].fillna('N/A')
df[df.level == 'default'].to_csv('default_level_logs.nogit.csv', index=False)

In [5]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings = model.encode(df['message'].tolist())

In [7]:
embeddings[:5]


array([[ 0.09903297, -0.048695  ,  0.01268109, ...,  0.0628672 ,
        -0.1253528 , -0.02058251],
       [ 0.05637626,  0.08952276,  0.0305897 , ..., -0.03325792,
        -0.0280063 ,  0.00731132],
       [-0.03860264, -0.06171542, -0.07587606, ...,  0.01474465,
         0.0189996 ,  0.08397783],
       [-0.02105866, -0.01128142,  0.00824801, ...,  0.01330624,
        -0.02337207,  0.05626958],
       [-0.01404935,  0.02050487,  0.02530796, ..., -0.02932672,
        -0.03665217,  0.03168793]], shape=(5, 384), dtype=float32)

In [8]:
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_

In [9]:
df.head(20)

Unnamed: 0,timestamp,namespace_name,app_name,level,log_type,message,target_classification,cluster
0,2025-03-09T04:53:12.143530588Z,myapp,busybox,default,application,I am busybox. Running normally..,info,0
1,2025-03-09T04:56:37.451478037Z,myapp,billing,default,application,security breach occurred. user tried to log in from outside of business hours,security alert,1
2,2025-03-09T04:56:50.646327206Z,myapp,legacycrm,default,application,The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.,warning,2
3,2025-03-09T04:57:20.651643036Z,myapp,legacycrm,default,application,API endpoint 'getCustomerDetails' is deprecated and will be removed in version 3.2. Use 'fetchCustomerInfo' instead.,warning,3
4,2025-03-09T04:57:47.467350244Z,myapp,billing,default,application,suspicious activity observed from ip 10.2.4.5,security alert,4
5,2025-03-09T04:59:07.487485571Z,myapp,billing,default,application,application is up and running within acceptable parameters,info,5
6,2025-03-09T05:01:17.516644851Z,myapp,billing,default,application,application is up and running within acceptable parameters,info,5
7,2025-03-09T05:01:47.522489869Z,myapp,billing,default,application,invoice generated successfully,info,6
8,2025-03-09T05:02:17.529301019Z,myapp,billing,default,application,invoice generated successfully,info,6
9,2025-03-09T05:03:40.717071599Z,myapp,legacycrm,default,application,The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.,warning,2


In [10]:
# Group by cluster to inspect patterns
clusters = df.groupby('cluster')['message'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)

In [11]:
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages) > 10:
        print(f"Cluster {cluster_id}:")
        for msg in messages[:5]:
            print(f"  {msg}")

Clustered Patterns:
Cluster 7:
  I am busybox. I have just faced an issue...
  I am busybox. I have just faced an issue...
  I am busybox. I have just faced an issue...
  I am busybox. I have just faced an issue...
  I am busybox. I have just faced an issue...
Cluster 0:
  I am busybox. Running normally..
  I am busybox. Running normally..
  I am busybox. Running normally..
  I am busybox. Running normally..
  I am busybox. Running normally..
Cluster 2:
  The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.
  The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.
  The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.
  The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.
  The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.
Cluster 5:
  application is up and running within acceptable parameters
  applicat

In [12]:
len(embeddings)

3477

In [13]:
X = embeddings
y = df['target_classification'].values

In [14]:
import os
os.environ.get("VERSION", f"notebook-output")

'notebook-output'

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

         error       1.00      1.00      1.00       431
          info       1.00      1.00      1.00       423
security alert       1.00      1.00      1.00        78

      accuracy                           1.00      1044
     macro avg       1.00      1.00      1.00      1044
  weighted avg       1.00      1.00      1.00      1044



# Test the model using ONNX

In [32]:
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Convert the model to ONNX format
initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_sklearn(clf, initial_types=initial_type)

# Save the ONNX model to a file
with open("../models/myclassifier/1/log_classifier.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [None]:
import onnxruntime as ort
import numpy as np
onnx_session = ort.InferenceSession("../models/myclassifier/1/log_classifier.onnx")

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')


import pandas as pd
test_df = pd.read_csv('myapp_logs-test.nogit.csv')
test_df['classification'] = ''
# test_logs = ["application failed to graceful shutdown", 
#              "login denied for user as it is out side of working hours", 
#              "App failed start due to insufficent memory",]
for index, row in test_df.iterrows():
    test_log = row['message']
    test_embeddings = model.encode([test_log])
    test_inputs = {onnx_session.get_inputs()[0].name: test_embeddings.astype(np.float32)}
    # print(test_inputs) --> {'float_input': array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...]], dtype=float32)}
    # print(onnx_session.get_inputs()[0]) --> NodeArg(name='float_input', type='tensor(float)', shape=[None, 384])
    test_probabilities = onnx_session.run(None, test_inputs)
    test_predicted_label = 'Unclassified'
    if(len(test_probabilities) < 2):
        probabilities_dict = test_probabilities[1][0]
        test_predicted_label = max(probabilities_dict, key=probabilities_dict.get)
        print(test_log, "--> Unclassified -->", test_predicted_label)
    else:
        test_predicted_label = test_probabilities[0][0]
        # print(test_log, "->", test_predicted_label, "-->", test_probabilities)
    test_df.at[index, 'classification'] = test_predicted_label

test_df.to_csv('myapp_logs-test_with_classification.nogit.csv', index=False)




float_input


NameError: name 'pd' is not defined

# Test the model classifier using JOBLIB

In [None]:
import joblib
joblib.dump(clf, '../models/log_classifier.joblib')

In [None]:
model_classification = joblib.load("../models/log_classifier.joblib")


test_df = pd.read_csv('myapp_logs-test.nogit.csv')
test_df['classification'] = ''
# test_logs = ["application failed to graceful shutdown", 
#              "login denied for user as it is out side of working hours", 
#              "App failed start due to insufficent memory",]
for index, row in test_df.iterrows():
    test_log = row['message']
    test_embeddings = model.encode([test_log])
    test_probabilities = model_classification.predict_proba(test_embeddings)[0]
    test_predicted_label = 'Unclassified'
    if max(test_probabilities) < 0.5:
        print(test_log, "--> Unclassified -->", test_probabilities)
    else:
        test_predicted_label = model_classification.predict(test_embeddings)[0]
        print(test_log, "->", test_predicted_label, "-->", test_probabilities)
    test_df.at[index, 'classification'] = test_predicted_label

test_df.to_csv('myapp_logs-test_with_classification.nogit.csv', index=False)

Lead conversion failed for prospect ID 3463 due to missing contact information. -> error --> [0.98159402 0.00580956 0.00480952 0.0077869 ]
application is up and running within acceptable parameters -> info --> [0.01150519 0.98177432 0.0039612  0.00275928]
Escalation rule execution failed for ticket ID 4339 - undefined escalation level. -> error --> [0.97228243 0.0121859  0.00512506 0.01040661]
application is up and running within acceptable parameters -> info --> [0.01150519 0.98177432 0.0039612  0.00275928]
I am busybox. Running normally.. -> info --> [1.45569395e-02 9.83163686e-01 1.40170382e-03 8.77670578e-04]
Escalation rule execution failed for ticket ID 6523 - undefined escalation level. -> error --> [0.9750762  0.01231893 0.0039439  0.00866097]
security breach occurred. user tried to log in from outside of business hours -> security alert --> [0.02638974 0.0189385  0.94393733 0.01073443]
backup completed -> info --> [0.03159122 0.957174   0.00449384 0.00674093]
Lead conversion f