# Alerts Helper

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/andrewm4894/netdata-gpt-notebooks/blob/main/notebooks/alerts_helper/alerts_helper.ipynb)

In [290]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from netdata_pandas.data_cloud import get_data_cloud
import openai
import pprint as pp
from urllib.parse import urlparse
import requests
import json
from datetime import datetime

# pull secrets from a local .env file into the process environment
load_dotenv()

# both values are None when the corresponding variable is not set
NETDATA_API_TOKEN = os.environ.get('NETDATA_API_TOKEN')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# hand the key to the openai module
openai.api_key = OPENAI_API_KEY

# Maps the short column keys of the alert-instance records to descriptive
# column names. The insertion order here is also the column order of the
# table built from it further down.
colnames_clean = {
    "nm": "alert name",
    "fami": "alert family",
    "info": "alert description",
    "ch_n": "chart name",
    "ctx": "chart context",
    "st": "alert status",
    "v": "alert latest value",
    "tr_v": "alert triggered value",
    "units": "alert units",
    "cl": "alert class",
    "cm": "alert component",
    "src": "alert source configuration",
    "t": "alert instance as at timestamp",
    "tr_t": "alert triggered at timestamp",
    "tp": "alert type",
    "to": "alert send to group",
}

In [291]:
# inputs: the Netdata Cloud space and room to query
space_id = "ea93d7b8-0df6-45c0-b13d-1560996c89eb"
room_id = "d8a4e0c5-7c79-4145-900e-83a9f06fcb6a"

# each space/room pair gets its own folder under output/
output_dir = '/'.join(('output', space_id, room_id))

# timestamp of this run, formatted as YYYYMMDD_HHMMSS
now = datetime.now().strftime('%Y%m%d_%H%M%S')

In [292]:
def get_alerts_cloud(space_id, room_id, api_token=None, base_url='https://app.netdata.cloud', node_ids=None):
    """Fetch the currently raised alerts for a Netdata Cloud room.

    Sends a POST request to
    ``{base_url}/api/v2/spaces/{space_id}/rooms/{room_id}/alerts`` selecting
    alerts with status ``raised`` and requesting the ``summary``, ``values``
    and ``instances`` options.

    Args:
        space_id: Space identifier, placed in the request path.
        room_id: Room identifier, placed in the request path.
        api_token: Bearer token for the ``Authorization`` header. When
            ``None``, the ``NETDATA_API_TOKEN`` environment variable is used.
        base_url: Scheme and host of the API; a trailing ``/`` is tolerated.
        node_ids: Optional iterable of node ids sent as ``scope.nodes`` in
            the request body. ``None`` or an empty iterable sends an empty
            list, which is what every call sent previously.
            NOTE(review): presumably this limits the result to those nodes;
            confirm against the API documentation.

    Returns:
        The decoded JSON body when the response status is 200, otherwise
        ``None`` (the status code and response text are printed).
    """
    if api_token is None:
        api_token = os.getenv('NETDATA_API_TOKEN')

    # Build the URL from the base_url argument so a caller-supplied host is
    # actually used.
    url = f"{base_url.rstrip('/')}/api/v2/spaces/{space_id}/rooms/{room_id}/alerts"
    headers = {
        'Accept': '*/*',
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_token}',
    }
    data = {
        # A fresh list is built on every call; no default object is shared.
        "scope": {"nodes": list(node_ids) if node_ids else [], "contexts": []},
        "selectors": {"status": ["raised"]},
        "options": ["summary", "values", "instances"],
    }
    r = requests.post(url, headers=headers, data=json.dumps(data))

    if r.status_code != 200:
        print(f'Error: {r.status_code, r.text}')
        return None

    return r.json()
    

def make_prompt(df_alert_instances):
    """Build the prompt asking the model to summarize the active alerts.

    Args:
        df_alert_instances: DataFrame of alert instances. Its rows are
            embedded in the prompt as a list of dicts, produced with
            ``to_dict(orient='records')``.

    Returns:
        The prompt text as a single string.
    """
    alert_records = df_alert_instances.to_dict(orient='records')

    prompt = f"""
    You are an experienced SRE and sysadmin. You are monitoring your infrastructure using Netdata Cloud.

    Below you are given a list of alert instances representing all active alerts in your Netdata Cloud space right now.

    Can you summarize and assess the state of your nodes and infrastructure given the list of active alerts?
    
    Please just condense it to a paragraph or two of the key and most important points.
    
    Provide a list of some potential next steps in order of priority.
    
    Use markdown to format your response.

    Here is the list of active alert instances:

    {alert_records}
    
    """

    return prompt


In [293]:
alerts = get_alerts_cloud(space_id, room_id, api_token=NETDATA_API_TOKEN)

# get_alerts_cloud returns None on a non-200 response; stop here with a clear
# message instead of failing later when the result is indexed.
if alerts is None:
    raise RuntimeError(f'Could not fetch alerts for space {space_id}, room {room_id}')

In [294]:
# lookup from each node's 'ni' value to its 'nm' value
nodes_map = {node['ni']: node['nm'] for node in alerts['nodes']}

In [295]:
# one row per alert instance returned by the API
df_alert_instances = pd.DataFrame(alerts['alert_instances'])

# translate each row's 'ni' value through nodes_map
df_alert_instances['node name'] = df_alert_instances['ni'].map(nodes_map)

# switch to the descriptive column names and keep only those, node name first
ordered_columns = ['node name', *colnames_clean.values()]
df_alert_instances = df_alert_instances.rename(columns=colnames_clean)[ordered_columns]

# both timestamp columns are parsed as seconds since the Unix epoch
for ts_column in ('alert instance as at timestamp', 'alert triggered at timestamp'):
    df_alert_instances[ts_column] = pd.to_datetime(df_alert_instances[ts_column], unit='s')

df_alert_instances

Unnamed: 0,node name,alert name,alert family,alert description,chart name,chart context,alert status,alert latest value,alert triggered value,alert units,alert class,alert component,alert source configuration,alert instance as at timestamp,alert triggered at timestamp,alert type,alert send to group
0,postgresql-0,10min_cpu_iowait,cpu,average CPU iowait time over the last 10 minutes,system.cpu,system.cpu,WARNING,0.0,46.320023,%,Utilization,CPU,20@/usr/lib/netdata/conf.d/health.d/cpu.conf,1970-01-01 00:00:00,2023-07-21 09:42:22,System,sysadmin
1,postgresql-0,10min_disk_utilization,nvme2n1,average percentage of time nvme2n1 disk was bu...,disk_util.nvme2n1,disk.util,WARNING,98.221667,99.619322,%,Utilization,Disk,133@/usr/lib/netdata/conf.d/health.d/disks.conf,2023-07-24 22:12:50,2023-07-19 10:00:31,System,silent
2,postgresql-0,load_average_15,load,system fifteen-minute load average,system.load,system.load,WARNING,9.48,8.69,load,Utilization,Load,,2023-07-24 22:12:50,2023-07-12 09:33:19,System,sysadmin
3,postgresql-0,load_average_5,load,system five-minute load average,system.load,system.load,WARNING,9.27,8.23,load,Utilization,Load,39@/usr/lib/netdata/conf.d/health.d/load.conf,2023-07-24 22:12:50,2023-07-24 22:09:12,System,sysadmin
4,netdata-collectors-0,httpcheck_web_service_bad_content,status,percentage of HTTP responses from https://www....,httpcheck_SOAP_test.request_status,httpcheck.status,CRITICAL,100.0,100.0,%,Workload,HTTP endpoint,,2023-07-24 22:12:51,2023-06-12 09:15:15,Web Server,webmaster
5,ml-demo-stable,disk_space_usage,/,disk / space utilization,disk_space._,disk.space,WARNING,85.016036,97.803746,%,Utilization,Disk,12@/etc/netdata/health.d/disks.conf,2023-07-24 22:12:24,2023-07-23 12:05:24,System,sysadmin
6,ip-10-20-137-182.ec2.internal,10min_disk_utilization,nvme2n1,average percentage of time nvme2n1 disk was bu...,disk_util.nvme2n1,disk.util,WARNING,98.208333,99.691382,%,Utilization,Disk,133@/usr/lib/netdata/conf.d/health.d/disks.conf,2023-07-24 22:12:50,2023-07-19 10:00:31,System,silent
7,ip-10-20-137-182.ec2.internal,load_average_15,load,system fifteen-minute load average,system.load,system.load,WARNING,9.48,13.28,load,Utilization,Load,,2023-07-24 22:12:50,2023-07-12 09:51:31,System,sysadmin
8,ip-10-20-137-182.ec2.internal,load_average_5,load,system five-minute load average,system.load,system.load,WARNING,9.27,8.58,load,Utilization,Load,39@/usr/lib/netdata/conf.d/health.d/load.conf,2023-07-24 22:12:50,2023-07-24 22:09:50,System,sysadmin


In [296]:
# build the prompt from the alerts table and print it for inspection
prompt = make_prompt(df_alert_instances)
print(prompt)


    You are an experienced SRE and sysadmin. You are monitoring your infrastructure using Netdata Cloud.

    Below you are given a list of alert instances representing all active alerts in your Netdata Cloud space right now.

    Can you summarize the and assess to state of your nodes and infrastructure given the dictionary of active alerts.
    
    Please just condense it to a paragraph or two of the key and most important points.
    
    Provide a list of some potential next steps in order of priority.
    
    Use markdown to format your response.

    Here is the list of active alert instances:

    
    


In [297]:
# the conversation is a single user message carrying the prompt
messages = [{"role": "user", "content": prompt}]

# request a chat completion from the OpenAI API
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
)

# keep the text of the first returned choice
first_choice = completion.choices[0]
reply_content = first_choice.message.content
#pp.pprint(reply_content)

In [298]:
# show the text returned by the model
print(reply_content)

Based on the provided list of active alert instances in the Netdata Cloud space, there are several key points to note about the state of the nodes and infrastructure:

1. The node 'postgresql-0' has three active alerts:
   - The '10min_disk_utilization' alert warns about the average percentage of time the 'nvme2n1' disk is busy over the last 10 minutes. The current value is 98.2216667 (%), and the triggered value is 99.6193220338983 (%). This falls under the 'Utilization' class and the 'Disk' component.

2. The node 'netdata-collectors-0' has an active alert:
   - The 'httpcheck_web_service_bad_content' alert is critical and indicates issues with the percentage of HTTP responses from the specified endpoint with unexpected content in the last 5 minutes. The latest and triggered values are both 100.0 (%). This alert falls under the 'Workload' class and the 'HTTP endpoint' component.

3. The node 'ml-demo-stable' has one active alert:
   - The 'disk_space_usage' alert warns about the disk

In [299]:
# Create the output folder (and any missing parents); exist_ok makes this a
# no-op when it is already there, so no separate existence check is needed.
os.makedirs(output_dir, exist_ok=True)

# Both files are written as UTF-8 explicitly so that non-ASCII characters in
# the prompt or the model reply do not depend on the platform default encoding.
file_name_prompt = f'{space_id}_{room_id}__{now}_PROMPT.txt'
with open(f'{output_dir}/{file_name_prompt}', 'w', encoding='utf-8') as f:
    f.write(prompt)

file_name_result = f'{space_id}_{room_id}__{now}_RESULT.md'
with open(f'{output_dir}/{file_name_result}', 'w', encoding='utf-8') as f:
    f.write(reply_content)
