## SCOM Calculation from Logs

In [1]:
import json

# Name of the analysed service; later cells look for it inside endpoint names.
service_name = "scenario1"

# Read the trace export. The context manager closes the handle even when
# json.load() raises, and the explicit encoding avoids locale-dependent decoding.
with open("./test_data/scenario1.json", encoding="utf-8") as file:
    result = json.load(file)

### Extract table names

In [2]:
import re

# Captures the identifier that follows FROM, JOIN, INTO, UPDATE or DELETE FROM.
# An optional surrounding quote character (backtick, single or double quote) is
# matched but kept OUTSIDE the capture group, so `orders`, "orders" and orders
# all yield the same table name. Exactly one of the five groups is non-empty
# per match. Matching stops at the first character that is not a word
# character or "$", so for a dotted name only the part before the dot is captured.
table_name_pattern = re.compile(
    r"""
    \bFROM\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?            |
    \bJOIN\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?            |
    \bINTO\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?            |
    \bUPDATE\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?          |
    \bDELETE\s+FROM\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?
    """,
    re.IGNORECASE | re.VERBOSE,
)

def extract_table_names(sql):
    """Return the table names found in ``sql``, in order of appearance.

    ``table_name_pattern.findall`` yields one tuple per match with one slot per
    capture group, e.g. ``('employees', '', '', '', '')``; only the slot of the
    alternative that matched is non-empty. The empty slots are dropped and the
    remaining names are collected into a flat list (duplicates are kept).
    """
    names = []
    for groups in table_name_pattern.findall(sql):
        names.extend(group for group in groups if group)
    return names


### Parse JSON Input

In [3]:
def is_number(value):
    """Return True if ``float(value)`` succeeds, otherwise False.

    Values that ``float()`` rejects with a ``ValueError`` (e.g. ``"abc"``, ``""``)
    or with a ``TypeError`` (e.g. ``None``, a list) are reported as not numeric
    instead of raising. Note that ``float()`` also accepts strings such as
    ``"nan"`` and ``"inf"``, so those count as numbers here.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

class Log:
    """One span of a trace, reduced to the fields used in this notebook.

    Attributes:
        span_id: Identifier of the span.
        trace_id: Identifier of the trace the span belongs to.
        parent_id: Span id taken from the span's references, or None.
        db_statements: List of SQL statement strings collected for the span.
        http_target: Request target (path, optionally with query string), or None.
        children: Starts as an empty list; the class itself never fills it.
        parent_endpoint: Starts as ""; meant to be set by the caller.
    """

    def __init__(self, span_id, trace_id, parent_id, db_statements, http_target):
        self.span_id = span_id
        self.trace_id = trace_id
        self.parent_id = parent_id
        self.db_statements = db_statements
        self.http_target = http_target
        self.children = []
        self.parent_endpoint = ""

    def __repr__(self):
        return f"Log(span_id={self.span_id}, trace_id={self.trace_id}, parent_id={self.parent_id}, db_statements={self.db_statements}, http_target={self.http_target})"

    def to_json(self):
        """Serialize the identifying fields to an indented JSON string."""
        return json.dumps({
            'spanId': self.span_id,
            'traceId': self.trace_id,
            'parentId': self.parent_id,
            'db_statements': self.db_statements,
            'http_target': self.http_target
        }, indent=2)

    def get_endpoint_name(self):
        """Return the normalized endpoint derived from ``http_target``.

        The query string is dropped, a numeric last path segment is removed
        (``/customers/count/9`` -> ``customers/count/``), empty segments caused
        by leading or duplicate slashes are discarded, and the result always
        ends with exactly one ``/``. Returns None when ``http_target`` is None.
        """
        if self.http_target is None:
            return None

        path = self.http_target.split('?')[0]
        parts = path.split('/')

        # Only the very last segment is checked, so a target with a trailing
        # slash ("/customers/9/") keeps its numeric segment.
        if is_number(parts[-1]):
            parts = parts[:-1]

        return '/'.join(part for part in parts if part) + '/'

    def get_db_statement(self):
        """Return the list of statements, or None when there are none.

        A missing list (``None``) is treated like an empty one.
        """
        if self.db_statements:
            return self.db_statements

        return None

    def get_table_names(self):
        """Return the table names of the span's first statement, or None.

        NOTE(review): only ``db_statements[0]`` is inspected; any further
        statements collected for the span are ignored. Confirm this is intended.
        """
        statements = self.get_db_statement()

        if statements is None:
            return None

        return extract_table_names(statements[0])

In [4]:
# Flatten the export into one Log per span. For every span the "db.statement"
# and "http.target" tags are read from the span's own tags first and then from
# the tags of each referenced span that is embedded under reference["span"].
logs = []

for data in result["data"]:
    for span in data["spans"]:
        parent_id = None
        db_statements = []
        http_target = None

        # "references" and "tags" are optional everywhere; a missing or null
        # entry is treated as empty instead of raising KeyError.
        references = span.get("references") or []

        tag_lists = [span.get("tags") or []]
        for reference in references:
            embedded = reference.get("span")
            if embedded:
                tag_lists.append(embedded.get("tags") or [])
            # With several references the last one determines the parent id.
            parent_id = reference["spanID"]

        for tags in tag_lists:
            for tag in tags:
                key = tag.get("key")
                if key == "db.statement":
                    db_statements.append(tag["value"])
                elif key == "http.target":
                    # A later occurrence (e.g. from a referenced span) overrides an earlier one.
                    http_target = tag["value"]

        logs.append(
            Log(
                span_id=span["spanID"],
                trace_id=span["traceID"],
                parent_id=parent_id,
                db_statements=db_statements,
                http_target=http_target,
            )
        )

# Spot check of one parsed span.
print(logs[1])


Log(span_id=9eb9be13e922fc0e, trace_id=d72d3bd03f1b13419c124474661a8b5a, parent_id=186e4c8f97c57c94, db_statements=['SELECT * FROM orders'], http_target=/scenario1/orders)


In [5]:
print(len(logs))

# Keep only the spans that have an http.target. A new list is built on purpose:
# calling logs.remove() inside `for log in logs` shifts the remaining items and
# makes the loop skip the element that follows every removed one.
logs = [log for log in logs if log.http_target is not None]

print(len(logs))

8
8


In [6]:
"""for log in logs: 
    endpoint_name = log.get_endpoint_name()

    if endpoint_name != None and service_name in endpoint_name:
        for l in logs:
            if l.trace_id == log.trace_id:
                l.parent_endpoint = log.get_endpoint_name()"""

# trace id -> endpoint of the first span in that trace whose endpoint name
# contains service_name.
# NOTE(review): this is a substring test, so service_name "scenario1" would also
# match an endpoint such as "scenario10/..." - confirm that this is acceptable.
parents = {}

for log in logs:
    endpoint_name = log.get_endpoint_name()
    if endpoint_name is not None and service_name in endpoint_name:
        # setdefault keeps the first endpoint seen for a trace.
        parents.setdefault(log.trace_id, endpoint_name)

# Give every span that has an endpoint the service endpoint of its trace.
for log in logs:
    if log.get_endpoint_name() is None:
        continue

    parent = parents.get(log.trace_id)
    if parent:
        log.parent_endpoint = parent

In [7]:
# Debug view: for every span, its parent endpoint followed by the table names
# extracted from it (None when the span has no statement), one value per line.
for log in logs:
    print(log.parent_endpoint, log.get_table_names(), sep="\n")

scenario1/orders/
None
scenario1/orders/
['orders']
scenario1/orders/
['products']
scenario1/employees/
None
scenario1/employees/
['customers']
scenario1/employees/
['employees']
scenario1/employees/
['employees']
scenario1/employees/
['customers']


In [8]:
from collections import Counter

# parent endpoint -> list of table names used below it (duplicates are kept so
# that the calls can be counted afterwards).
grouped_logs = {}

for log in logs:
    table_names = log.get_table_names()

    # Log.parent_endpoint starts as "" and is never None, so a comparison with
    # None can never filter anything; test truthiness to skip spans for which
    # no parent endpoint was found.
    if table_names is None or not log.parent_endpoint:
        continue

    grouped_logs.setdefault(log.parent_endpoint, []).extend(table_names)

# parent endpoint -> {table name: number of statements touching it}
calls = {url: dict(Counter(tables)) for url, tables in grouped_logs.items()}

print(calls)

{'scenario1/orders/': {'orders': 1, 'products': 1}, 'scenario1/employees/': {'customers': 2, 'employees': 2}}


### Calculate SCOM

In [9]:
def filter_empty_apis(apis):
    """Return a new dict with the entries of ``apis`` whose value is truthy.

    Entries with an empty (or otherwise falsy) value are left out; the order of
    the remaining entries is unchanged and ``apis`` itself is not modified.
    """
    kept = {}
    for endpoint, tables in apis.items():
        if tables:
            kept[endpoint] = tables
    return kept

def calculate_connection_intensity(i, j):
    """Return the overlap of two table collections relative to the smaller one.

    Both arguments are reduced to their distinct elements. The result is the
    number of shared elements divided by the size of the smaller distinct set,
    or the integer 0 when nothing is shared (which includes empty inputs).
    """
    first = set(i)
    second = set(j)
    shared = first & second

    if not shared:
        return 0

    smaller_size = min(len(first), len(second))
    return len(shared) / smaller_size

def scom(grouped_logs):
    """Compute the SCOM value for a mapping of endpoint -> used table names.

    Endpoints without tables are dropped first. For every unordered pair of the
    remaining endpoints the connection intensity of their distinct tables is
    computed; the result is the sum of these intensities divided by the number
    of pairs, n * (n - 1) / 2.

    Args:
        grouped_logs: Mapping from endpoint name to an iterable of table names
            (duplicates are allowed and ignored).

    Returns:
        The mean pairwise connection intensity as a float, or the string
        "Too few endpoints" when fewer than two endpoints have tables.
    """
    apis = filter_empty_apis(grouped_logs)

    n_of_apis = len(apis)
    if n_of_apis <= 1:
        return "Too few endpoints"

    # Distinct tables per endpoint, in the mapping's iteration order.
    table_sets = [set(tables) for tables in apis.values()]

    # Combining each endpoint only with the ones after it visits every
    # unordered pair exactly once, so no bookkeeping of seen pairs is needed.
    total_connection_intensity = 0
    for index, tables1 in enumerate(table_sets):
        for tables2 in table_sets[index + 1:]:
            total_connection_intensity += calculate_connection_intensity(tables1, tables2)

    n_of_pairs = n_of_apis * (n_of_apis - 1) / 2
    return total_connection_intensity / n_of_pairs

In [10]:
# Show the raw endpoint -> table-name lists (duplicates included) that are fed into scom().
print(f"Endpoints and used database tables: {grouped_logs}")

Endpoints and used database tables: {'scenario1/orders/': ['orders', 'products'], 'scenario1/employees/': ['customers', 'employees', 'employees', 'customers']}


In [11]:
# scom() returns a number, or the string "Too few endpoints" when fewer than two endpoints have tables.
print(f"SCOM: {scom(grouped_logs)}") 

SCOM: 0.0
