# Alerts Helper

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/andrewm4894/netdata-gpt-notebooks/blob/main/notebooks/alerts_helper/alerts_helper.ipynb)

In [17]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from netdata_pandas.data_cloud import get_data_cloud
import openai
import pprint as pp
from urllib.parse import urlparse
import requests
import json
from datetime import datetime

# load tokens from .env file
load_dotenv()

NETDATA_API_TOKEN = os.getenv('NETDATA_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

openai.api_key = OPENAI_API_KEY

colnames_clean = {
    "nm": "alert name",
    "fami": "alert family",
    "info": "alert description",
    "ch_n": "chart name",
    "ctx": "chart context",
    "st": "alert status",
    "v": "alert latest value",
    "tr_v": "alert triggered value",
    "units": "alert units",
    "cl": "alert class",
    "cm": "alert component",
    #"src": "alert source configuration",
    "t": "alert instance as at timestamp",
    "tr_t": "alert triggered at timestamp",
    "tp": "alert type",
    "to": "alert send to group",
}

In [18]:
# inputs
space_id = 'ea93d7b8-0df6-45c0-b13d-1560996c89eb' 
room_id = 'd8a4e0c5-7c79-4145-900e-83a9f06fcb6a' # all nodes
#room_id = 'ae33f57b-b54e-4236-a6de-054da3f0a748' # machine learning room
output_dir = f'output/{space_id}/{room_id}'
now = datetime.now().strftime("%Y%m%d_%H%M%S")

In [19]:
def get_alerts_cloud(space_id, room_id, api_token=None, base_url='https://app.netdata.cloud', node_ids=[]):
    """
    """
    
    if api_token is None:
        api_token = os.getenv('NETDATA_API_TOKEN')
    
    base_url = 'https://app.netdata.cloud'
    url = f'{base_url}/api/v2/spaces/{space_id}/rooms/{room_id}/alerts'
    headers = {'Accept': '*/*', 'Content-Type': 'application/json', 'Authorization': f'Bearer {api_token}'}
    data = {"scope":{"nodes":[],"contexts":[]},"selectors":{"status":["raised"]},"options":["summary","values","instances"]}
    r = requests.post(url, headers=headers, data=json.dumps(data))
    
    if r.status_code != 200:
        
        print(f'Error: {r.status_code, r.text}')
        return None
    
    else:
            
        return r.json()
    

def make_prompt(df_alert_instances):
    prompt = f"""
    You are an experienced SRE and sysadmin. You are monitoring your infrastructure using Netdata Cloud.

    Below you are given a list of alert instances representing all active alerts in your Netdata Cloud space right now.

    Can you summarize the and assess to state of your nodes and infrastructure given the dictionary of active alerts.
    
    Please just condense it to a paragraph or two of the key and most important points.
    
    Provide a list of some potential next steps in order of priority (be as concrete as possible and provide examples where relevant or possible).
    
    Use markdown to format your response.

    Here is the list of active alert instances:

    {df_alert_instances.to_string()}
    
    """
    
    return prompt


In [20]:
alerts = get_alerts_cloud(space_id, room_id, api_token=NETDATA_API_TOKEN)

In [21]:
nodes_map = {n['ni']:n['nm'] for n in alerts['nodes']}

In [22]:
df_alert_instances = pd.DataFrame(alerts['alert_instances'])
df_alert_instances['node name'] = df_alert_instances['ni'].map(nodes_map)
df_alert_instances = df_alert_instances.rename(columns=colnames_clean, inplace=False)
df_alert_instances = df_alert_instances[['node name'] + list(colnames_clean.values())]
df_alert_instances['alert instance as at timestamp'] = pd.to_datetime(df_alert_instances['alert instance as at timestamp'], unit='s')
df_alert_instances['alert triggered at timestamp'] = pd.to_datetime(df_alert_instances['alert triggered at timestamp'], unit='s')
df_alert_instances

Unnamed: 0,node name,alert name,alert family,alert description,chart name,chart context,alert status,alert latest value,alert triggered value,alert units,alert class,alert component,alert instance as at timestamp,alert triggered at timestamp,alert type,alert send to group
0,redis-master-0,10min_cpu_usage,cpu,average CPU utilization over the last 10 minut...,system.cpu,system.cpu,WARNING,93.292337,87.82329,%,Utilization,CPU,2023-07-25 12:31:32,2023-07-25 09:49:30,System,sysadmin
1,postgresql-0,10min_cpu_iowait,cpu,average CPU iowait time over the last 10 minutes,system.cpu,system.cpu,WARNING,0.0,46.320023,%,Utilization,CPU,1970-01-01 00:00:00,2023-07-21 09:42:22,System,sysadmin
2,postgresql-0,10min_disk_backlog,nvme2n1,average backlog size of the nvme2n1 disk over ...,disk_backlog.nvme2n1,disk.backlog,WARNING,8967.222594,5321.927521,ms,Latency,Disk,2023-07-25 12:31:32,2023-07-25 01:16:50,System,silent
3,postgresql-0,10min_disk_utilization,nvme2n1,average percentage of time nvme2n1 disk was bu...,disk_util.nvme2n1,disk.util,WARNING,99.765,99.468333,%,Utilization,Disk,2023-07-25 12:31:32,2023-07-25 09:44:21,System,silent
4,postgresql-0,load_average_1,load,system one-minute load average,system.load,system.load,WARNING,17.94,16.62,load,Utilization,Load,2023-07-25 12:31:32,2023-07-25 11:52:32,System,sysadmin
5,postgresql-0,load_average_15,load,system fifteen-minute load average,system.load,system.load,WARNING,15.13,8.69,load,Utilization,Load,2023-07-25 12:31:32,2023-07-12 09:33:19,System,sysadmin
6,postgresql-0,load_average_5,load,system five-minute load average,system.load,system.load,WARNING,15.71,8.01,load,Utilization,Load,2023-07-25 12:31:32,2023-07-25 09:44:16,System,sysadmin
7,netdata-collectors-0,httpcheck_web_service_bad_content,status,percentage of HTTP responses from https://www....,httpcheck_SOAP_test.request_status,httpcheck.status,CRITICAL,100.0,100.0,%,Workload,HTTP endpoint,2023-07-25 12:31:42,2023-06-12 09:15:15,Web Server,webmaster
8,netdata-collectors-0,httpcheck_web_service_timeouts,status,percentage of timed-out HTTP requests to https...,httpcheck_POST_test.request_status,httpcheck.status,CRITICAL,95.0,40.0,%,Latency,HTTP endpoint,2023-07-25 12:31:42,2023-07-25 02:40:21,Web Server,webmaster
9,ml-demo-stable,ml_5min_system_ram,ram,rolling 5min anomaly rate for system.ram chart,system.ram,system.ram,WARNING,0.0,8.0,%,Anomaly,CPU,1970-01-01 00:00:00,2023-07-25 12:10:04,System,root


In [23]:
prompt = make_prompt(df_alert_instances)
print(prompt)


    You are an experienced SRE and sysadmin. You are monitoring your infrastructure using Netdata Cloud.

    Below you are given a list of alert instances representing all active alerts in your Netdata Cloud space right now.

    Can you summarize the and assess to state of your nodes and infrastructure given the dictionary of active alerts.
    
    Please just condense it to a paragraph or two of the key and most important points.
    
    Provide a list of some potential next steps in order of priority (be as concrete as possible and provide examples where relevant or possible).
    
    Use markdown to format your response.

    Here is the list of active alert instances:

                            node name                            alert name           alert family                                                                                                                                  alert description                          chart name     chart context alert status

In [24]:
# build messages list to pass to openai
messages=[
    {"role": "user", "content": prompt}
]

# call openai api
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=messages,
)

reply_content = completion.choices[0].message.content
#pp.pprint(reply_content)

In [25]:
print(reply_content)

Based on the active alerts in Netdata Cloud, there are several important points to consider about the state of the nodes and infrastructure:

1. CPU Utilization: Both the Redis master node and the PostgreSQL node are experiencing high CPU utilization. The Redis master node has a CPU utilization of 93.29%, while the PostgreSQL node has a CPU utilization of 93.32%. These high CPU utilizations indicate that the nodes may be under heavy load or may require optimization to improve performance.

2. Disk Backlog: The PostgreSQL node and the ml-demo-nightly-48h-training node are experiencing high disk backlog. The PostgreSQL node has a backlog size of 8967.22 ms for the nvme2n1 disk, while the ml-demo-nightly-48h-training node has a backlog size of 8967.22 ms for the nvme2n1 disk and 7984.33 ms for the sda disk. These high backlog sizes suggest that there may be significant IO operations queued up, potentially leading to performance issues.

3. Load Average: The PostgreSQL node and the ml-demo

In [26]:
if not os.path.exists(output_dir):
   os.makedirs(output_dir)

file_name_prompt = f'{space_id}_{room_id}__{now}_PROMPT.txt'
with open(f'{output_dir}/{file_name_prompt}', 'w') as f:
    f.write(prompt)
    
file_name_result = f'{space_id}_{room_id}__{now}_RESULT.md'
with open(f'{output_dir}/{file_name_result}', 'w') as f:
    f.write(reply_content)
