### KPI - Availability (Sev3: Burn Velocity > 2x in 6-hour windows)

In [1]:
#! pip install azure-identity azure-monitor-query langchain-openai


In [2]:
## Get the credential to query log analytics
import os

import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.monitor.query import LogsQueryClient, LogsQueryStatus

# Azure config
# The IDs default to the values this notebook was written against. Set the
# AZURE_TENANT_ID / LOGS_WORKSPACE_ID environment variables to point the
# notebook at another tenant or workspace without editing the code.
tenant_id = os.environ.get("AZURE_TENANT_ID", "72f988bf-86f1-41af-91ab-2d7cd011db47")
workspace_id = os.environ.get("LOGS_WORKSPACE_ID", "edf08e1f-916f-48c3-bc52-273492d63c8f")

# Auth
# NOTE(review): tenant_id is not passed to the credential below; it is kept
# only because other cells may reference it — confirm whether it is needed.
credential = DefaultAzureCredential()
client = LogsQueryClient(credential)

In [3]:
# Calculate the burn rate for slo = 99.95% over 6-hour bins (timeGrain) of the
# last 30 days, then pick the bin with the highest burn rate if it reaches the
# alert threshold.
# Intended schedule (per the original note): evaluate this alert every 5 minutes.

slo = 0.9995
timeGrain = "6h"
burn_rate_threshold = 2  # a bin is reported only when its burn rate is >= this value

query = f"""

let SLO = {slo};
let timeGrain = {timeGrain};
AzureDiagnostics
| where TimeGenerated > ago(30d)
  and Resource == "MSONECLOUDAPIFD"
  and Category == "FrontDoorAccessLog"
| where isnotempty(httpStatusCode_s)
| extend statusCode = toint(httpStatusCode_s)
| summarize
    totalRequests = count(),
    errorRequests = countif(statusCode >= 500)
    by bin(TimeGenerated, timeGrain)
| extend SLI = iff(totalRequests > 0, 1.0 - (1.0 * errorRequests / totalRequests), 1.0)
| extend burnRate = iff(totalRequests > 0, (1.0 - SLI) / (1.0 - SLO), 0.0)
| project TimeGenerated, totalRequests, errorRequests, SLI, burnRate
| order by TimeGenerated asc
"""

# Query execution (the time filter lives in the KQL itself, hence timespan=None)
response = client.query_workspace(
    workspace_id=workspace_id,
    query=query,
    timespan=None
)

# Fail loudly on a non-success result: without this, `df` would be undefined
# and the cell would die below with an unrelated NameError.
if response.status != LogsQueryStatus.SUCCESS:
    raise RuntimeError(
        f"Burn-rate query did not succeed (status={response.status}): "
        f"{getattr(response, 'partial_error', None)}"
    )

# Parse results
table = response.tables[0]
df = pd.DataFrame(data=table.rows, columns=table.columns)
df["TimeGenerated"] = pd.to_datetime(df["TimeGenerated"])
df = df.sort_values("TimeGenerated")

# Keep only the row(s) holding the peak burn rate, and only when that peak
# reaches the threshold; otherwise the frame is empty and later cells skip.
peak_burn_rate = df["burnRate"].max()
max_burnrate_df = df[(df["burnRate"] == peak_burn_rate) & (peak_burn_rate >= burn_rate_threshold)]

if max_burnrate_df.shape[0] > 0:
    # First peak row; downstream cells read its TimeGenerated as the spike time.
    max_burn_row = max_burnrate_df.iloc[0]
    print(max_burn_row)



TimeGenerated    2025-05-15 12:00:00+00:00
totalRequests                       860108
errorRequests                         2716
SLI                               0.996842
burnRate                          6.315486
Name: 19, dtype: object


In [20]:
from datetime import timedelta
import pandas as pd
from pandas import Timedelta
from azure.monitor.query import LogsQueryStatus

def merge_with_role_and_time(base_df, other_df, tolerance="2s"):
    """Attach to each ``base_df`` row the nearest-in-time ``other_df`` row of the same role.

    Rows are matched per ``AppRoleName`` with ``pd.merge_asof`` on
    ``TimeGenerated`` (``direction="nearest"``), within ``tolerance``.
    Columns present in both frames (other than the two merge keys) are taken
    from ``base_df``; only the columns unique to ``other_df`` are added.

    Neither input frame is modified.

    Args:
        base_df: Frame to enrich. Must contain ``TimeGenerated`` and
            ``AppRoleName`` unless it is empty.
        other_df: Frame providing extra columns, with the same two key columns.
        tolerance: Maximum time distance for a match, in any form accepted by
            ``pd.Timedelta`` (default ``"2s"``).

    Returns:
        ``base_df`` itself when either input is empty; otherwise a new frame
        sorted by ``TimeGenerated`` holding every base row (including rows
        whose ``AppRoleName`` is null) plus the matched extra columns, which
        are NaN where no match was found.
    """
    if base_df.empty or other_df.empty:
        return base_df

    merge_keys = ("TimeGenerated", "AppRoleName")

    # Work on sorted copies so the caller's frames are never mutated;
    # merge_asof requires both sides to be sorted on the `on` key.
    base_sorted = base_df.assign(
        TimeGenerated=pd.to_datetime(base_df["TimeGenerated"])
    ).sort_values(list(merge_keys))
    other_sorted = other_df.assign(
        TimeGenerated=pd.to_datetime(other_df["TimeGenerated"])
    ).sort_values(list(merge_keys))

    merged_chunks = []

    # dropna=False keeps base rows with a null role instead of silently
    # discarding them; such rows never match anything and pass through as-is.
    for role, base_group in base_sorted.groupby("AppRoleName", dropna=False):
        other_group = other_sorted[other_sorted["AppRoleName"] == role]

        if other_group.empty:
            merged_chunks.append(base_group)
            continue

        # Drop columns the base already has (except the merge keys) so the
        # base values win and merge_asof adds no _x/_y suffixed duplicates.
        overlapping = [
            col for col in other_group.columns
            if col in base_group.columns and col not in merge_keys
        ]
        other_group = other_group.drop(columns=overlapping)

        merged = pd.merge_asof(
            base_group,
            other_group,
            on="TimeGenerated",
            by="AppRoleName",
            direction="nearest",
            tolerance=pd.Timedelta(tolerance)
        )

        merged_chunks.append(merged)

    return pd.concat(merged_chunks).sort_values("TimeGenerated")


# ========== LOGIC ==========

if max_burnrate_df.shape[0] > 0:

    # Inspect a +/- 5 minute window around the timestamp of the peak burn-rate row.
    # NOTE(review): that timestamp comes from bin(TimeGenerated, timeGrain) in the
    # burn-rate query, so this window does not cover the whole bin — confirm intended.
    spike_time = max_burn_row['TimeGenerated']
    start_time = spike_time - timedelta(minutes=5)
    end_time = spike_time + timedelta(minutes=5)
    timespan = (start_time, end_time)

    # One KQL query per Application Insights table, all limited to the same
    # window and to roles whose name contains "msonecloudapi-prod".
    tables = {
        "AppRequests": f"""
            AppRequests
            | where TimeGenerated between (datetime({start_time.isoformat()}) .. datetime({end_time.isoformat()}))
            and AppRoleName contains "msonecloudapi-prod"
            and Success == false
            and toint(ResultCode) >= 500
            | project TimeGenerated, type="request", OperationName, Name, ResultCode, Duration = DurationMs, Success, AppRoleName
        """,
        "AppExceptions": f"""
            AppExceptions
            | where TimeGenerated between (datetime({start_time.isoformat()}) .. datetime({end_time.isoformat()}))
            and AppRoleName contains "msonecloudapi-prod"
            and ProblemId != "Microsoft.IdentityModel.S2S.S2SAuthenticationException"
            | project TimeGenerated, type="exception", OperationName, ProblemId, Message = OuterMessage, AppRoleName
        """,
        "AppTraces": f"""
            AppTraces
            | where TimeGenerated between (datetime({start_time.isoformat()}) .. datetime({end_time.isoformat()}))
            and AppRoleName contains "msonecloudapi-prod"
            and SeverityLevel >= 2
            | project TimeGenerated, type="trace", Message, SeverityLevel, AppRoleName
        """,
        "AppDependencies": f"""
            AppDependencies
            | where TimeGenerated between (datetime({start_time.isoformat()}) .. datetime({end_time.isoformat()}))
            and AppRoleName contains "msonecloudapi-prod"
            and Success == false
            | project TimeGenerated, type="dependency", Name, Target = Data, ResultCode, Success, Duration = DurationMs, AppRoleName, Server = AppRoleName
        """
    }

    # Query logs. Loop-local names are used so the notebook-level `df` and
    # `query` from the burn-rate cell are not overwritten.
    log_dict = {}
    for table_name, table_query in tables.items():
        resp = client.query_workspace(workspace_id, table_query, timespan=timespan)
        if resp.status == LogsQueryStatus.SUCCESS:
            tab = resp.tables[0]
            table_df = pd.DataFrame(tab.rows, columns=tab.columns)
            table_df["type"] = table_name  # optional, override type for traceability
            log_dict[table_name] = table_df
        else:
            # Keep going with the other tables, but make the gap visible.
            print(f"Warning: query for {table_name} returned status {resp.status}; table skipped")

    # Merge logs with preference starting from AppRequests
    merged_logs = log_dict.get("AppRequests", pd.DataFrame())

    for table_name in ["AppExceptions", "AppTraces", "AppDependencies"]:
        other_df = log_dict.get(table_name, pd.DataFrame())
        merged_logs = merge_with_role_and_time(merged_logs, other_df, tolerance="2s")

    # Final formatting. Skipped when no AppRequests frame was obtained, in which
    # case merged_logs is a column-less frame and sorting would raise KeyError.
    if "TimeGenerated" in merged_logs.columns:
        merged_logs = merged_logs.sort_values("TimeGenerated").reset_index(drop=True)

        if merged_logs.shape[0] > 30:
            # Cap at 30 rows; a fixed seed keeps the sample reproducible on re-run.
            merged_logs = merged_logs.sample(30, random_state=42).sort_values(
                ["AppRoleName", "TimeGenerated"]
            )
else:
    # No spike: still define merged_logs so the display cell does not NameError.
    merged_logs = pd.DataFrame()
# Output: merged_logs is your final DataFrame


In [21]:
# Show the merged log rows produced by the previous cell.
merged_logs

Unnamed: 0,TimeGenerated,type,OperationName,Name,ResultCode,Duration,Success,AppRoleName,ProblemId,Message,Target,Server
0,2025-05-15 11:55:00.619023+00:00,AppRequests,GET Product/Price,GET Product/Price,500,15.9157,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
49,2025-05-15 11:55:58.432913+00:00,AppRequests,GET Product/Price,GET Product/Price,500,1019.9718,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
69,2025-05-15 11:56:27.109604+00:00,AppRequests,GET Product/Price,GET Product/Price,500,1021.691,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
79,2025-05-15 11:56:36.866409+00:00,AppRequests,GET Product/Price,GET Product/Price,500,1024.3574,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
119,2025-05-15 11:57:34.384960+00:00,AppRequests,GET Product/Price,GET Product/Price,500,1028.0079,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
125,2025-05-15 11:57:43.452957+00:00,AppRequests,GET Product/Price,GET Product/Price,500,1025.3491,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
130,2025-05-15 11:57:57.196727+00:00,AppRequests,GET Product/Price,GET Product/Price,500,12.7288,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
140,2025-05-15 11:58:09.938178+00:00,AppRequests,GET Product/Price,GET Product/Price,500,2017.567,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
200,2025-05-15 11:59:16.222768+00:00,AppRequests,GET Product/Price,GET Product/Price,500,13.2668,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,
212,2025-05-15 11:59:29.241553+00:00,AppRequests,GET Product/Price,GET Product/Price,500,1020.3536,False,msonecloudapi-prod-japaneast,System.IO.FileNotFoundException at OneCloud.Da...,Could not load file or assembly 'OneCloud.Disp...,,


In [22]:
## Format for LLM
# Turn each merged log row into one line:
#   [<TimeGenerated>] [<type>] key=value | key=value ...
# Null fields (columns that found no match during the merge) are skipped so
# the prompt is not padded with "key=nan" entries.
if max_burnrate_df.shape[0] > 0:
    lines = []
    for _, row in merged_logs.iterrows():
        fields = " | ".join(
            f"{k}={row[k]}"
            for k in row.index
            if k not in ['TimeGenerated', 'type'] and pd.notna(row[k])
        )
        lines.append(f"[{row['TimeGenerated']}] [{row['type']}] " + fields)

    log_context = "\n".join(lines[:30])  # Trim for LLM input


In [23]:
# Prompt to LLM
# Builds the human-turn message: the queried time window (start_time/end_time),
# the formatted log lines (log_context), and the analysis request.
# Only built when a burn-rate spike was found; prompt_text is not assigned otherwise.

if max_burnrate_df.shape[0] > 0:
    prompt_text = f"""
    A spike in error burn rate was detected between {start_time} and {end_time}.

    Here are logs across request, exception, trace, and dependency tables:

    {log_context}

    Analyze the root cause and recommend how to mitigate or fix the issue.
    """


In [8]:
#prompt_text

In [24]:
# Connect to LLM using my libraries
from utility.llm_factory import LLMFactory
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Ask the LLM for a root-cause analysis, only when a burn-rate spike was found.
if max_burnrate_df.shape[0] > 0:
    # System message fixes the assistant's role and the shape of the answer;
    # the human message is filled with prompt_text at invoke time.
    prompt = ChatPromptTemplate.from_messages([
            ("system", """
                You are a skilled and detail-oriented Site Reliability Engineering (SRE) assistant.
                Your primary goal is to help identify the root cause of incidents based on system logs and telemetry.

                - Summarize the issue in 1 bullet point only.
                - Focus on pinpointing the **source of the error**, **server name** and the **exact or approximate time** it occurred.
                - Keep the **root-cause** elaborated in 2-3 bullet points max.
                - If applicable, correlate related events across logs.
                - Keep your **mitigation and next steps** clear and concise (2-3 bullet points max).
                - Avoid vague conclusions. Use specific log details to support your reasoning.
            """),
            ("human", "{prompt_text}")
        ])

    # print(prompt)

    # prompt -> model -> plain string
    llm = LLMFactory.get_llm('openai')
    chain = prompt | llm | StrOutputParser()
    result = chain.invoke({"prompt_text": prompt_text })

    print("=== Incident Analysis by LLM ===\n")
    print(result)


=== Incident Analysis by LLM ===

**Summary**: 
The application (msonecloudapi-prod-japaneast) is failing to execute GET Product/Price operation due to a missing assembly 'OneCloud.DisplayProductCatalog.M365'. The issue started around 2025-05-15 11:55 and continued till at least 2025-05-15 12:04.

**Root-Cause**: 
- The assembly 'OneCloud.DisplayProductCatalog.M365, Version=1.0.0.4281' is not found in the application's path which is resulting in a System.IO.FileNotFoundException.
- This failure is preventing the proper functioning of the 'ProductController.CreateProductRequestFrom' method within the 'OneCloud.DataConnector.M365.Controllers' namespace in the 'msonecloudapi-prod-japaneast' application.
- This could be due to an incorrect configuration or a missing deployment file in the server's file system.

**Mitigation and Next steps**:
1. Check the deployment package to confirm if 'OneCloud.DisplayProductCatalog.M365, Version=1.0.0.4281' assembly was included during build and deploy 