## SCOM Calculation from Logs

In [None]:
import json

# Load the log export. The context manager closes the file even when parsing
# fails, and the explicit encoding avoids depending on the platform default.
with open("../test_data/teastore/persistence_020624.json", encoding="utf-8") as file:
    result = json.load(file)

### Extract table names

In [None]:
import re

# Matches the identifier that follows FROM / JOIN / INTO / UPDATE / DELETE FROM.
# Optional quote characters around the identifier are matched but kept outside
# the capture groups, so a quoted and an unquoted spelling of the same table
# yield the same name. One capture group per keyword alternative.
table_name_pattern = re.compile(
    r"""
    \bFROM\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?|
    \bJOIN\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?|
    \bINTO\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?|
    \bUPDATE\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?|
    \bDELETE\s+FROM\s+[`'"]?([a-zA-Z_][\w$]*)[`'"]?
    """,
    re.VERBOSE | re.IGNORECASE,
)


def extract_table_names(sql):
    """Return the table names referenced in the SQL string *sql*.

    Names are returned in order of appearance, without surrounding quote
    characters. Duplicates are kept; an empty list is returned when nothing
    matches.
    """
    # findall() yields one 5-tuple per match in which only the group of the
    # alternative that matched is non-empty, e.g.
    #   [('employees', '', '', '', ''), ('', 'customers', '', '', '')]
    # Drop the empty strings and flatten to a plain list.
    return [
        name
        for groups in table_name_pattern.findall(sql)
        for name in groups
        if name
    ]


### Parse JSON Input

In [None]:
class Log:
    """A span's own tags together with the tags of a span it references.

    Both tag collections are expected to be lists of dicts that carry a
    "key" and a "value" entry; the methods below look tags up by "key".
    """

    def __init__(self, span_id, reference_tags, tags):
        # Stored as given; no copying or validation is performed.
        self.span_id = span_id
        self.reference_tags = reference_tags
        self.tags = tags

    def __repr__(self):
        return (
            f"Log(span_id={self.span_id}, "
            f"reference_tags={self.reference_tags}, tags={self.tags})"
        )

    def to_json(self):
        """Return the log as a JSON string indented by two spaces."""
        return json.dumps({
            'spanId': self.span_id,
            'reference_tags': self.reference_tags,
            'tags': self.tags
        }, indent=2)

    def get_endpoint_name(self):
        """Return the endpoint derived from the first "http.target" reference tag.

        The query string and the last path segment are removed and a trailing
        slash is appended, e.g. "/rest/products/42?x=1" -> "/rest/products/".
        Returns None when no "http.target" tag exists.
        """
        for tag in self.reference_tags:
            if tag["key"] == "http.target":
                path = tag["value"].partition('?')[0]
                # NOTE(review): presumably the last segment is a resource id
                # that should not distinguish endpoints -- confirm.
                return path.rpartition('/')[0] + '/'

        return None

    def get_db_statement(self):
        """Return the values of all "db.statement" tags, or None if there are none."""
        statements = [
            tag["value"] for tag in self.tags if tag["key"] == "db.statement"
        ]
        return statements or None

    def get_table_names(self):
        """Return the table names found in this log's SQL statements.

        Every "db.statement" tag is scanned, not only the first one. Returns
        None when the log has no "db.statement" tag.
        """
        statements = self.get_db_statement()
        if statements is None:
            return None

        return [
            name
            for statement in statements
            for name in extract_table_names(statement)
        ]

In [None]:
# Build one Log per (span, reference) pair for every reference that embeds
# the referenced span under the "span" key.
logs = []
for trace in result["data"]:
    for span in trace["spans"]:
        span_id, span_tags = span['spanID'], span['tags']
        logs.extend(
            Log(span_id=span_id, reference_tags=ref["span"]["tags"], tags=span_tags)
            for ref in span["references"]
            if "span" in ref
        )

# Debug helper: print the endpoint and table names of every Log that has table names
#for log in logs:
#    if log.get_table_names() != None:
#        print(log.get_endpoint_name())
#        print(log.get_table_names())


In [None]:
# Map each endpoint name to the distinct table names its logs touch, keeping
# the order in which the tables were first seen.
grouped_logs = {}

for log in logs:
    endpoint_name = log.get_endpoint_name()

    # Logs without an http.target reference tag cannot be assigned to an endpoint.
    if endpoint_name is None:
        continue

    # The endpoint is registered even when the log contributes no tables.
    endpoint_tables = grouped_logs.setdefault(endpoint_name, [])

    for name in log.get_table_names() or ():
        if name not in endpoint_tables:
            endpoint_tables.append(name)

for operation, group in grouped_logs.items():
    print(f"Operation: {operation}")
    for table in group:
        print(f"  {table}")

print(grouped_logs)

### Calculate SCOM

In [None]:
def calculateConnectionIntensity(i, j):
    """Return the share of common elements relative to the smaller collection.

    Both arguments are reduced to sets first, so duplicates do not count. The
    result is ``len(common) / min(len(set(i)), len(set(j)))``, or 0 when the
    two collections have nothing in common (including when either is empty).
    """
    # Materialise each input exactly once so that one-shot iterables are not
    # exhausted before their size is taken.
    first, second = set(i), set(j)
    common_attributes = first & second
    if not common_attributes:
        return 0

    # A non-empty intersection implies both sets are non-empty, so the
    # divisor is never zero.
    return len(common_attributes) / min(len(first), len(second))

def scom(apis):
    """Return the mean connection intensity over all pairs of endpoints.

    ``apis`` is a mapping from endpoint name to the collection of table names
    that endpoint uses. Each unordered pair of endpoints is scored with
    ``calculateConnectionIntensity`` and the sum of the scores is divided by
    the number of pairs, n * (n - 1) / 2.

    Returns the string "Too few endpoints" when fewer than two endpoints are
    given.
    """
    from itertools import combinations

    n_of_apis = len(apis)
    if n_of_apis <= 1:
        return "Too few endpoints"

    # combinations() yields every unordered pair of keys exactly once, in
    # insertion order, so no bookkeeping of already processed pairs is needed.
    #
    # The intensities are deliberately not weighted by the number of tables
    # involved in a pair: tables that are accessed only very rarely should not
    # be taken into account.
    total_weighted_connections = sum(
        calculateConnectionIntensity(apis[api1], apis[api2])
        for api1, api2 in combinations(apis, 2)
    )

    return total_weighted_connections / (n_of_apis * (n_of_apis - 1) / 2)

In [None]:
# Show the endpoint -> table-name mapping that is passed to scom().
print(f"Endpoints and used database tables: {grouped_logs}")

In [None]:
# Compute the SCOM value for the grouped endpoints and print it.
print(f"SCOM: {scom(grouped_logs)}") 