In [1]:
import os
import re

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

In [2]:
def read_csv_dataset(file_path, sep=','):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        file_path: Location of the file; handed straight to ``pd.read_csv``.
        sep: Field delimiter, a comma unless overridden.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    dataset = pd.read_csv(file_path, sep=sep)
    return dataset

def convert_timestamps_to_datetime(df, timestamp_column_name='timestamp', timestamp_unit='s'):
    """Add a ``datetime`` column derived from an epoch-timestamp column.

    The frame is modified in place (the new column is written onto ``df``)
    and the same object is returned.

    Args:
        df: DataFrame holding the timestamp column.
        timestamp_column_name: Name of the column with the epoch values.
        timestamp_unit: Unit passed to ``pd.to_datetime`` (``'s'`` by default).

    Returns:
        The same ``df`` object, now carrying a ``datetime`` column.
    """
    epoch_values = df[timestamp_column_name]
    parsed = pd.to_datetime(epoch_values, unit=timestamp_unit)
    df['datetime'] = parsed
    return df

def get_data(file_path, sep=',', timestamp_column_name='timestamp', timestamp_unit='s'):
    """Read a CSV file, attach a ``datetime`` column, and report its shape.

    Args:
        file_path: Location of the CSV file.
        sep: Field delimiter forwarded to ``read_csv_dataset``.
        timestamp_column_name: Column holding the epoch timestamps.
        timestamp_unit: Unit of those timestamps (``'s'`` by default).

    Returns:
        The loaded DataFrame including the added ``datetime`` column.
    """
    loaded = convert_timestamps_to_datetime(
        read_csv_dataset(file_path, sep=sep),
        timestamp_column_name,
        timestamp_unit,
    )
    # Printed after the conversion, so the column count includes ``datetime``.
    print(f"Dataset shape: {loaded.shape}")
    return loaded

In [3]:
# Ground-truth failure records for the 2022-05-03 dataset.
failures_03_05_df = get_data(
    file_path='/Users/arinagoncharova/Documents/diploma/EDA/Aiops-Dataset/groundtruth/groundtruth-2022-05-03.csv'
)
failures_03_05_df.head()

Dataset shape: (50, 5)


Unnamed: 0,timestamp,level,cmdb_id,failure_type,datetime
0,1651510238,pod,shippingservice2-0,Kubernetes Container Memory Load,2022-05-02 16:50:38
1,1651511197,node,node-2,Node Memory Consumption,2022-05-02 17:06:37
2,1651512508,node,node-2,Node Disk Space Consumption,2022-05-02 17:28:28
3,1651515718,pod,adservice2-0,Kubernetes Container Network Resource Packet C...,2022-05-02 18:21:58
4,1651518109,pod,paymentservice-1,Kubernetes Container Network Packet Loss,2022-05-02 19:01:49


In [4]:
# Keep only the failures whose ``level`` is 'service'.
service_failures_03_05_df = failures_03_05_df.loc[
    failures_03_05_df['level'].eq('service')
]
service_failures_03_05_df

Unnamed: 0,timestamp,level,cmdb_id,failure_type,datetime
5,1651519906,service,productcatalogservice,Kubernetes Container Network Resource Packet D...,2022-05-02 19:31:46
8,1651523554,service,recommendationservice,Kubernetes Container Read I/O Load,2022-05-02 20:32:34
10,1651529574,service,frontend,Kubernetes Container Network Resource Packet D...,2022-05-02 22:12:54
13,1651533257,service,recommendationservice,Kubernetes Container Network Packet Loss,2022-05-02 23:14:17
14,1651535538,service,cartservice,Kubernetes Container Network Packet Loss,2022-05-02 23:52:18
16,1651538629,service,emailservice,Kubernetes Container Write I/O Load,2022-05-03 00:43:49
18,1651544319,service,frontend,Kubernetes Container Write I/O Load,2022-05-03 02:18:39
19,1651545391,service,adservice,Kubernetes Container Read I/O Load,2022-05-03 02:36:31
21,1651551592,service,currencyservice,Kubernetes Container Network Resource Packet C...,2022-05-03 04:19:52
22,1651553048,service,checkoutservice,Kubernetes Container CPU Load,2022-05-03 04:44:08


In [5]:
# Service log records for the 2022-05-03 dataset.
logs_03_05_service_df = get_data(
    file_path='/Users/arinagoncharova/Documents/diploma/EDA/Aiops-Dataset/data/2022-05-03/log/all/log_filebeat-testbed-log-service.csv'
)
logs_03_05_service_df.head()

Dataset shape: (5444332, 6)


Unnamed: 0,log_id,timestamp,cmdb_id,log_name,value,datetime
0,Cp6Bt38B8vQa58bZsQau,1651507200,frontend-1,log_frontend-service_application,"severity: debug, message: request complete",2022-05-02 16:00:00
1,EZ6Bt38B8vQa58bZqQWr,1651507200,cartservice-2,log_cartservice-service_application,Executing endpoint 'gRPC - /hipstershop.C...,2022-05-02 16:00:00
2,FZ6Bt38B8vQa58bZqQWr,1651507200,cartservice-2,log_cartservice-service_application,[40m[32minfo[39m[22m[49m: Microsoft.AspNet...,2022-05-02 16:00:00
3,Fp6Bt38B8vQa58bZqQWr,1651507200,cartservice-2,log_cartservice-service_application,Request finished in 0.6231ms 200 applicat...,2022-05-02 16:00:00
4,F56Bt38B8vQa58bZqQWr,1651507200,cartservice-2,log_cartservice-service_application,[40m[32minfo[39m[22m[49m: Microsoft.AspNet...,2022-05-02 16:00:00


In [6]:
def normalize_log(line: str) -> str:
    """Replace every run of digits in a log line with the ``<NUM>`` token.

    The input is first converted with ``str()``, so non-string values are
    accepted and normalized via their string form.

    Args:
        line: The log message to normalize.

    Returns:
        The message with each maximal digit sequence replaced by ``<NUM>``.
    """
    text = str(line)
    digit_run = re.compile(r'\d+')
    return digit_run.sub('<NUM>', text)

In [7]:
# Normalize each log message (digit runs -> <NUM>), then count the distinct results.
logs_03_05_service_df['value_normalized'] = logs_03_05_service_df['value'].map(normalize_log)
logs_03_05_service_df['value_normalized'].nunique()

84465

In [8]:
def get_model(model_name, device=None):
    """Load a SentenceTransformer model onto the selected device.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
        device: Optional explicit device string (e.g. ``"cpu"``, ``"cuda"``,
            ``"mps"``). When ``None`` the device is chosen automatically:
            CUDA if available, otherwise MPS if available, otherwise CPU.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    if device is None:
        # Previously only MPS was probed, so CUDA machines fell back to CPU.
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"
    print(f"Using device: {device}")
    return SentenceTransformer(model_name, device=device)

def compute_embeddings(model, texts_list, should_save=True, filename="/Users/arinagoncharova/Documents/diploma/repo/embeddings/AIOps/qween3_embeddings.parquet", batch_size=32):
    """Encode texts with ``model`` and return a text-to-embedding table.

    Args:
        model: Object exposing ``encode(...)`` and a ``device`` attribute
            (a ``SentenceTransformer`` in this notebook).
        texts_list: The texts to encode; also stored as the ``log_message``
            column of the result.
        should_save: When true, the result is written with ``to_parquet``.
        filename: Destination passed to ``to_parquet`` when saving.
        batch_size: Batch size forwarded to ``model.encode`` (32 by default).

    Returns:
        DataFrame with columns ``log_message`` and ``embedding`` (one array
        per row, taken from the ``encode`` output).
    """
    # Create the output directory *before* the expensive encode call, so a
    # missing folder cannot throw away the computed embeddings at save time.
    if should_save and isinstance(filename, (str, os.PathLike)):
        target = os.fspath(filename)
        # Skip URL-style destinations (e.g. "s3://..."); they are not local dirs.
        if isinstance(target, str) and "://" not in target:
            parent_dir = os.path.dirname(target)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    computed_emb = model.encode(
        texts_list,
        normalize_embeddings=True,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=model.device,
    )

    # list(...) splits the encode output along its first axis: one entry per text.
    text_emb_mapping = pd.DataFrame({
        'log_message': texts_list,
        'embedding': list(computed_emb),
    })

    if should_save:
        text_emb_mapping.to_parquet(filename)

    return text_emb_mapping

In [None]:
qween3_model = get_model("Qwen/Qwen3-Embedding-0.6B")
# Embed each distinct normalized message only once. The column holds one entry
# per log row, so passing it whole would re-encode every repeated message and
# store duplicate rows in the saved mapping.
unique_logs_list = logs_03_05_service_df['value_normalized'].drop_duplicates().tolist()
log_values_emb_mapping_qween3_df = compute_embeddings(
    qween3_model,
    unique_logs_list,
    True,
    "/Users/arinagoncharova/Documents/diploma/repo/embeddings/AIOps/qween3_embeddings.parquet"
)
log_values_emb_mapping_qween3_df.head()

Using device: mps


model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

  [2m2025-10-21T18:16:35.554234Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 0x10675cea0>), traceback: Some(<traceback object at 0x2d9ab9600>) }, [1;31mcaller[0m[31m: "src/progress_update.rs:313"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28

  [2m2025-10-21T18:16:35.559818Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 0x10675cea0>), traceback: Some(<traceback object at 0x2d9ab96c0>) }, [1;31mcaller[0m[31m: "src/progress_update.rs:313"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28

  [2m2025-10-21T18:16:35.560214Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 