In [1]:
# from langchain.agents import initialize_agent, Tool
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from typing import Optional, TypedDict, Annotated
from pydantic.v1 import BaseModel, Field
import operator
# from langchain.prompts import PromptTemplate
import requests
from github import Github
import os
import getpass
from bandit.core import manager, config
from azure.identity import ManagedIdentityCredential
import config
import pandas as pd
import ollama
import tempfile
from langchain_ollama import ChatOllama
from langgraph.prebuilt import create_react_agent

In [12]:
# model_to_use = "llama3.3:latest"
# Chat model handle used by translate_rule() further down in this notebook.
llm = ChatOllama(
    model="llama3.3:latest",
    temperature=0.1,
)

In [5]:
def get_prometheus_rule_groups(subscription_id=config.SUBSCRIPTION, resource_group=config.RESOURCEGROUP, client_id=config.PROMETHEUS_CLIENT_ID, api_version="2023-03-01", timeout=30):
    """
    Fetch Prometheus rule groups from Azure Monitor and return as a DataFrame.

    Args:
        subscription_id (str): Azure subscription ID. Defaults to config.SUBSCRIPTION.
        resource_group (str): Azure resource group name. Defaults to config.RESOURCEGROUP.
        client_id (str): Client ID handed to ManagedIdentityCredential.
            Defaults to config.PROMETHEUS_CLIENT_ID.
        api_version (str): Value sent as the ``api-version`` query parameter.
        timeout (float | tuple): Timeout in seconds forwarded to ``requests.get`` so a
            stalled connection cannot block the notebook forever.

    Returns:
        pd.DataFrame: One row per rule group with the columns ``Name``, ``Location``,
        ``Description`` and ``Rules`` (the columns are present even when no rule
        group is returned), or None if the response status is not 200.

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...).
    """
    columns = ["Name", "Location", "Description", "Rules"]

    # Acquire a bearer token for the management endpoint via ManagedIdentityCredential
    credential = ManagedIdentityCredential(client_id=client_id)
    token = credential.get_token("https://management.azure.com/.default").token

    # Construct API URL for listing all rule groups in the resource group
    url = (
        f"https://management.azure.com/subscriptions/{subscription_id}"
        f"/resourceGroups/{resource_group}/providers/Microsoft.AlertsManagement"
        f"/prometheusRuleGroups"
    )

    # Set headers with the bearer token
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    # Make the GET request; requests has no default timeout, so pass one explicitly
    response = requests.get(
        url,
        headers=headers,
        params={"api-version": api_version},
        timeout=timeout,
    )

    # Anything other than 200 is reported and signalled to the caller with None
    if response.status_code != 200:
        print(f"Failed to fetch rule groups. Status Code: {response.status_code}")
        print(response.text)
        return None

    rule_groups = []
    for rule_group in response.json().get("value", []):
        properties = rule_group.get('properties', {})
        rule_groups.append({
            "Name": rule_group['name'],
            "Location": rule_group.get('location'),
            "Description": properties.get('description'),
            "Rules": properties.get('rules')
        })

    # Passing `columns` keeps the schema stable when the list is empty, so callers
    # can still reference the 'Rules' column.
    return pd.DataFrame(rule_groups, columns=columns)

In [24]:
alert_rules_df = get_prometheus_rule_groups()

# get_prometheus_rule_groups() returns None when the request fails; stop with a
# clear message instead of an AttributeError on the next line.
if alert_rules_df is None:
    raise RuntimeError(
        "Could not fetch Prometheus rule groups; see the status code and "
        "response body printed by get_prometheus_rule_groups()."
    )

# One row per entry of each group's 'Rules' value.
df_exploded = alert_rules_df.explode('Rules', ignore_index=True)

# Expand every 'Rules' entry into its own columns and put them next to the
# group-level columns.
df_normalized = pd.concat(
    [df_exploded.drop(columns=['Rules']), df_exploded['Rules'].apply(pd.Series)],
    axis=1
)
df_normalized

Unnamed: 0,Name,Location,Description,record,expression,labels,alert,enabled,severity,for,actions,resolveConfiguration,annotations
0,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_num_cpu:sum,"count without (cpu, mode) ( node_cpu_seconds_...",,,,,,,,
1,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_cpu_utilisation:rate5m,1 - avg without (cpu) ( sum without (mode) (r...,,,,,,,,
2,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_load1_per_cpu:ratio,"( node_load1{job=""node""}/ instance:node_num_...",,,,,,,,
3,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_memory_utilisation:ratio,1 - ( ( node_memory_MemAvailable_bytes{job...,,,,,,,,
4,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_vmstat_pgmajfault:rate5m,"rate(node_vmstat_pgmajfault{job=""node""}[5m])",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,RPRuleGroup,eastus,Resource Provider alerts,,increase(rp_faulttolerance_job_processed_count...,{'metric': 'rp_faulttolerance_job_processed_co...,RPUnableToCommunicateWithCVS,True,4.0,,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT10M'}",{'description': 'RP pod #$.labels.pod# is repo...
474,RPRuleGroup,eastus,Resource Provider alerts,,increase(rp_tor_down[10m]) > 0,"{'metric': 'rp_tor_down', 'service': 'anf-rp'}",RPTORDown,True,4.0,,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT10M'}",{'description': 'RP pod #$.labels.pod# is repo...
475,RPRuleGroup,eastus,Resource Provider alerts,,rate(rp_external_rp_request_reponses_bucket{le...,{'metric': 'rp_external_rp_request_reponses_bu...,RPUnableToCommunicateWithCVS,True,4.0,,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT10M'}",{'description': 'RP pod #$.labels.pod# is unab...
476,ONTAPRuleGroup-SRE,eastus,ONTAP alerts,,(count(node_failed_power > 0) by (ontap_cluste...,"{'metric': 'node_failed_power', 'service': 'On...",OntapPSUFailure,True,4.0,PT10M,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT1M'}",{'description': 'Ontap:Multiple nodes with PSU...


In [27]:
# from langchain_core.messages import AIMessage

# messages = [
#     (
#         "system",
#         "You are a helpful assistant that translates prometheus alert rules to understandable english. Translate the user question.",
#     ),
#     ("human", f"Translate the alert rule definitions in {df_normalized}, do it for all the rules in the rules column, add the results to a new column to the dataframe"),
# ]
# ai_msg = llm.invoke(messages)
# ai_msg

def translate_rule(row):
    """
    Ask the LLM for a plain-English translation of a single Prometheus rule.

    Args:
        row: Mapping-like row (for example the Series passed by
            ``DataFrame.apply(..., axis=1)``) with ``record`` and ``expression``
            entries and, optionally, an ``alert`` entry.

    Returns:
        str: The ``content`` of the model reply, or ``str(reply)`` when the reply
        has no ``content`` attribute.
    """
    def _is_missing(value):
        # None, or a float NaN (NaN is the only float that differs from itself).
        return value is None or (isinstance(value, float) and value != value)

    record = row['record']
    alert = row.get('alert')

    # Alerting rules have no recording-rule name; use the alert name so the
    # prompt does not read "Record: nan". Recording rules keep the original prompt.
    if _is_missing(record) and not _is_missing(alert):
        rule_label, rule_name = "Alert", alert
    else:
        rule_label, rule_name = "Record", record

    messages = [
        (
            "system",
            "You are a helpful assistant that translates prometheus alert rules to understandable english. Translate the user question.",
        ),
        ("human", f"Translate the following Prometheus rule:\n{rule_label}: {rule_name}\nExpression: {row['expression']}\nReturn only the translation."),
    ]
    ai_msg = llm.invoke(messages)
    return ai_msg.content if hasattr(ai_msg, "content") else str(ai_msg)

# Run translate_rule once per row (one llm.invoke call each) and store the
# replies in a 'translation' column.
df_normalized = df_normalized.assign(
    translation=df_normalized.apply(translate_rule, axis=1)
)
df_normalized

Unnamed: 0,Name,Location,Description,record,expression,labels,alert,enabled,severity,for,actions,resolveConfiguration,annotations,translation
0,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_num_cpu:sum,"count without (cpu, mode) ( node_cpu_seconds_...",,,,,,,,,Count the total number of nodes that are repor...
1,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_cpu_utilisation:rate5m,1 - avg without (cpu) ( sum without (mode) (r...,,,,,,,,,The average CPU utilization across all CPUs fo...
2,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_load1_per_cpu:ratio,"( node_load1{job=""node""}/ instance:node_num_...",,,,,,,,,"""The average load on a single CPU for each ins..."
3,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_memory_utilisation:ratio,1 - ( ( node_memory_MemAvailable_bytes{job...,,,,,,,,,This rule calculates the ratio of used memory ...
4,NodeRecordingRulesRuleGroup-eastus-stage-arc,eastus,Node Recording Rules RuleGroup - 0.1,instance:node_vmstat_pgmajfault:rate5m,"rate(node_vmstat_pgmajfault{job=""node""}[5m])",,,,,,,,,"""The rate of major page faults per second for ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,RPRuleGroup,eastus,Resource Provider alerts,,increase(rp_faulttolerance_job_processed_count...,{'metric': 'rp_faulttolerance_job_processed_co...,RPUnableToCommunicateWithCVS,True,4.0,,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT10M'}",{'description': 'RP pod #$.labels.pod# is repo...,There has been an increase in the number of fa...
474,RPRuleGroup,eastus,Resource Provider alerts,,increase(rp_tor_down[10m]) > 0,"{'metric': 'rp_tor_down', 'service': 'anf-rp'}",RPTORDown,True,4.0,,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT10M'}",{'description': 'RP pod #$.labels.pod# is repo...,There has been an increase in the number of ti...
475,RPRuleGroup,eastus,Resource Provider alerts,,rate(rp_external_rp_request_reponses_bucket{le...,{'metric': 'rp_external_rp_request_reponses_bu...,RPUnableToCommunicateWithCVS,True,4.0,,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT10M'}",{'description': 'RP pod #$.labels.pod# is unab...,The average rate of external RP requests that ...
476,ONTAPRuleGroup-SRE,eastus,ONTAP alerts,,(count(node_failed_power > 0) by (ontap_cluste...,"{'metric': 'node_failed_power', 'service': 'On...",OntapPSUFailure,True,4.0,PT10M,[{'actionGroupId': '/subscriptions/2f495c46-73...,"{'autoResolved': True, 'timeToResolve': 'PT1M'}",{'description': 'Ontap:Multiple nodes with PSU...,Alert when more than 2 nodes in the same ONTAP...


In [28]:
# Write the frame to CSV, with a header row and without the index column.
df_normalized.to_csv(
    "prom-alerts-def.csv",
    header=True,
    index=False,
    encoding="UTF-8",
)

In [26]:
# NOTE(review): in the visible source, `ai_msg` is assigned at module scope only
# in commented-out code (and as a local inside translate_rule), so this cell
# looks like it would raise NameError on a fresh kernel — confirm or remove.
ai_msg.content

'To translate the alert rule definitions into understandable English and add the results to a new column in the DataFrame, we\'ll need to iterate over each rule in the `Rules` column, parse its components (such as `record`, `alert`, `expr`, etc.), and then generate a human-readable description based on those components.\n\nGiven the complexity of Prometheus alert rules, which can include various metrics, operators, and functions, providing an exact translation for every possible rule is challenging without specific examples. However, we can create a basic function to handle common cases and provide a framework that you can extend as needed.\n\nBelow is a simplified Python example using pandas to process the DataFrame. This script includes a basic function `translate_rule` that attempts to parse and translate rules into human-readable text. Note that this is a simplified approach and might need adjustments based on the actual complexity of your rules.\n\n```python\nimport pandas as pd\n