# Alerts Helper

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/andrewm4894/netdata-gpt-notebooks/blob/main/notebooks/alerts_helper/alerts_helper.ipynb)

In [61]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from netdata_pandas.data_cloud import get_data_cloud
import openai
import pprint as pp
from urllib.parse import urlparse
import requests
import json
from datetime import datetime

# Read the .env file so the API tokens below can be picked up from the environment.
load_dotenv()

# Each value is None when the corresponding variable is not set.
NETDATA_API_TOKEN = os.environ.get('NETDATA_API_TOKEN')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Hand the key to the openai module, which is used later for the chat completion call.
openai.api_key = OPENAI_API_KEY

# Maps the terse field names of an alert instance to readable column labels.
# Insertion order matters: it is reused as the column order of the alerts table.
colnames_clean = {
    # what the alert is
    'nm': 'alert name',
    'fami': 'alert family',
    'info': 'alert description',
    # which chart it is attached to
    'ch_n': 'chart name',
    'ctx': 'chart context',
    # current state and values
    'st': 'alert status',
    'v': 'alert latest value',
    'tr_v': 'alert triggered value',
    'units': 'alert units',
    # classification and origin
    'cl': 'alert class',
    'cm': 'alert component',
    'src': 'alert source configuration',
    # timestamps
    't': 'alert instance as at timestamp',
    'tr_t': 'alert triggered at timestamp',
    # type and notification target
    'tp': 'alert type',
    'to': 'alert send to group',
}

In [62]:
# Notebook inputs: the Netdata Cloud space/room to look at and where results are written.
space_id = 'ea93d7b8-0df6-45c0-b13d-1560996c89eb'
# Alternative room ("all nodes"): 'd8a4e0c5-7c79-4145-900e-83a9f06fcb6a'
room_id = 'ae33f57b-b54e-4236-a6de-054da3f0a748'  # machine learning room
# Output is grouped per space and room.
output_dir = '/'.join(('output', space_id, room_id))
# Timestamp of this run (local time, second resolution), formatted as YYYYMMDD_HHMMSS.
now = f'{datetime.now():%Y%m%d_%H%M%S}'

In [63]:
def get_alerts_cloud(space_id, room_id, api_token=None, base_url='https://app.netdata.cloud', node_ids=None, timeout=30):
    """Fetch the currently raised alerts of a Netdata Cloud room.

    Sends a POST request to
    ``{base_url}/api/v2/spaces/{space_id}/rooms/{room_id}/alerts`` asking for
    alerts with status "raised" and the "summary", "values" and "instances"
    options.

    Args:
        space_id: Id of the Netdata Cloud space.
        room_id: Id of the room inside that space.
        api_token: Bearer token for the request. When None, the
            ``NETDATA_API_TOKEN`` environment variable is used.
        base_url: Root URL of the Netdata Cloud instance to query.
        node_ids: Optional list of node ids placed in the request's
            ``scope.nodes``. None or an empty list sends an empty list.
        timeout: Seconds to wait for the server, passed to ``requests.post``
            so a stalled connection cannot hang the caller indefinitely.

    Returns:
        The decoded JSON response on HTTP 200; otherwise None, after printing
        the status code and response text.
    """
    if api_token is None:
        api_token = os.getenv('NETDATA_API_TOKEN')

    # Honour the caller-supplied base_url; tolerate a trailing slash.
    url = f"{base_url.rstrip('/')}/api/v2/spaces/{space_id}/rooms/{room_id}/alerts"
    headers = {
        'Accept': '*/*',
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_token}',
    }
    payload = {
        # NOTE(review): assumes scope.nodes accepts node ids, as the parameter
        # name suggests - confirm against the Netdata Cloud API.
        'scope': {'nodes': list(node_ids) if node_ids else [], 'contexts': []},
        'selectors': {'status': ['raised']},
        'options': ['summary', 'values', 'instances'],
    }
    r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=timeout)

    if r.status_code != 200:
        print(f'Error: {r.status_code, r.text}')
        return None

    return r.json()
    

def make_prompt(df_alert_instances):
    """Build the LLM prompt asking for a summary of the active alerts.

    Args:
        df_alert_instances: pandas DataFrame of alert instances. It is
            embedded in the prompt as a list of per-row dicts
            (``to_dict(orient='records')``).

    Returns:
        The prompt text as a single string.
    """
    prompt = f"""
    You are an experienced SRE and sysadmin. You are monitoring your infrastructure using Netdata Cloud.

    Below you are given a list of alert instances representing all active alerts in your Netdata Cloud space right now.

    Can you summarize and assess the state of your nodes and infrastructure given the dictionary of active alerts?
    
    Please just condense it to a paragraph or two of the key and most important points.
    
    Provide a list of some potential next steps in order of priority (be as concrete as possible and provide examples where relevant or possible).
    
    Use markdown to format your response.

    Here is the list of active alert instances:

    {df_alert_instances.to_dict(orient='records')}
    
    """

    return prompt


In [64]:
# Fetch the alerts of the configured space/room using the token loaded from the environment.
alerts = get_alerts_cloud(
    space_id=space_id,
    room_id=room_id,
    api_token=NETDATA_API_TOKEN,
)

In [65]:
# Lookup from each node's 'ni' value to its 'nm' value in the alerts response.
nodes_map = {
    node['ni']: node['nm']
    for node in alerts['nodes']
}

In [66]:
# One row per alert instance, straight from the API response.
df_alert_instances = pd.DataFrame(alerts['alert_instances'])

# Label every row with a node name by looking up its 'ni' value.
df_alert_instances = df_alert_instances.assign(
    **{'node name': df_alert_instances['ni'].map(nodes_map)}
)

# Swap the terse field names for readable labels, then keep only the
# labelled columns with the node name first.
df_alert_instances = df_alert_instances.rename(columns=colnames_clean)[
    ['node name', *colnames_clean.values()]
]

# Both timestamp columns are parsed as epoch seconds (unit='s').
for ts_column in ('alert instance as at timestamp', 'alert triggered at timestamp'):
    df_alert_instances[ts_column] = pd.to_datetime(df_alert_instances[ts_column], unit='s')

# Bare expression so the notebook renders the table.
df_alert_instances

Unnamed: 0,node name,alert name,alert family,alert description,chart name,chart context,alert status,alert latest value,alert triggered value,alert units,alert class,alert component,alert source configuration,alert instance as at timestamp,alert triggered at timestamp,alert type,alert send to group
0,ml-demo-nightly,10min_cpu_iowait,cpu,average CPU iowait time over the last 10 minutes,system.cpu,system.cpu,WARNING,41.065118,41.12046,%,Utilization,CPU,20@/usr/lib/netdata/conf.d/health.d/cpu.conf,2023-07-25 12:04:54,2023-07-25 11:52:54,System,sysadmin
1,ml-demo-nightly,10min_disk_backlog,ubuntu--vg-ubuntu--lv,average backlog size of the ubuntu--vg-ubuntu-...,disk_backlog.ubuntu--vg-ubuntu--lv,disk.backlog,WARNING,24476.275465,5410.095991,ms,Latency,Disk,154@/usr/lib/netdata/conf.d/health.d/disks.conf,2023-07-25 12:04:54,2023-07-25 11:46:06,System,silent
2,ml-demo-nightly,10min_disk_backlog,sda,average backlog size of the sda disk over the ...,disk_backlog.sda,disk.backlog,WARNING,23938.245206,5378.251514,ms,Latency,Disk,154@/usr/lib/netdata/conf.d/health.d/disks.conf,2023-07-25 12:04:54,2023-07-25 11:46:06,System,silent
3,ml-demo-nightly,load_average_1,load,system one-minute load average,system.load,system.load,WARNING,60.99577,21.59,load,Utilization,Load,55@/usr/lib/netdata/conf.d/health.d/load.conf,2023-07-25 12:04:34,2023-07-25 11:44:54,System,sysadmin
4,ml-demo-nightly,load_average_15,load,system fifteen-minute load average,system.load,system.load,WARNING,41.77,4.29,load,Utilization,Load,23@/usr/lib/netdata/conf.d/health.d/load.conf,2023-07-25 12:04:34,2023-07-25 11:06:54,System,sysadmin
5,ml-demo-nightly,load_average_5,load,system five-minute load average,system.load,system.load,WARNING,61.05887,11.01,load,Utilization,Load,39@/usr/lib/netdata/conf.d/health.d/load.conf,2023-07-25 12:04:34,2023-07-25 11:44:54,System,sysadmin
6,ml-demo-nightly,ml_5min_apps_cpu_dim_grafana,cpu,rolling 5min anomaly rate for each apps.cpu di...,apps.cpu,apps.cpu,WARNING,27.777778,27.777778,%,Anomaly,,86@/etc/netdata/health.d/ml.conf,2023-07-25 12:04:54,2023-07-25 12:00:24,,root
7,ml-demo-nightly,ml_5min_apps_cpu_dim_kernel,cpu,rolling 5min anomaly rate for each apps.cpu di...,apps.cpu,apps.cpu,WARNING,50.0,17.793594,%,Anomaly,,86@/etc/netdata/health.d/ml.conf,2023-07-25 12:04:54,2023-07-25 11:45:24,,root
8,ml-demo-nightly,ml_5min_apps_cpu_dim_ssh,cpu,rolling 5min anomaly rate for each apps.cpu di...,apps.cpu,apps.cpu,WARNING,22.222222,13.684211,%,Anomaly,,86@/etc/netdata/health.d/ml.conf,2023-07-25 12:04:54,2023-07-25 11:50:24,,root
9,ml-demo-nightly,ml_5min_apps_mem_dim_auth,mem,rolling 5min anomaly rate for each apps.mem di...,apps.mem,apps.mem,WARNING,38.888889,38.888889,%,Anomaly,,97@/etc/netdata/health.d/ml.conf,2023-07-25 12:04:54,2023-07-25 12:00:24,,root


In [67]:
# Render the prompt from the alerts table and echo it for inspection.
prompt = make_prompt(df_alert_instances=df_alert_instances)
print(prompt)


    You are an experienced SRE and sysadmin. You are monitoring your infrastructure using Netdata Cloud.

    Below you are given a list of alert instances representing all active alerts in your Netdata Cloud space right now.

    Can you summarize the and assess to state of your nodes and infrastructure given the dictionary of active alerts.
    
    Please just condense it to a paragraph or two of the key and most important points.
    
    Provide a list of some potential next steps in order of priority (be as concrete as possible and provide examples where relevant or possible).
    
    Use markdown to format your response.

    Here is the list of active alert instances:

    
    


In [68]:
# Single-turn conversation: the whole prompt is sent as one user message.
messages = [{"role": "user", "content": prompt}]

# Ask OpenAI for a chat completion.
completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)

# Keep only the text of the first returned choice.
reply_content = completion.choices[0].message.content
# Debug aid: pp.pprint(reply_content)

In [69]:
# Show the text of the model's reply in the notebook output.
print(reply_content)






To address these issues and improve the state of the nodes and infrastructure, some potential next steps in order of priority could be:

1. Investigate and resolve the high CPU iowait time by identifying the cause of the increased iowait and optimizing disk or I/O operations if necessary. This could involve checking for any resource-intensive applications or disk bottlenecks.

2. Address the high disk backlogs by analyzing the disk performance and identifying potential bottlenecks. This could involve optimizing disk I/O or considering increasing disk capacity if necessary.

3. Check the system load averages and examine the load patterns over time to understand the cause of the high load. Consider optimizing resource utilization, scaling up resources, or load balancing if needed.

4. Investigate and resolve the anomalies in CPU, memory, and processes usage. This could involve checking the specific applications or users associated with the anomalies, analyzing their resource consump

In [70]:
# Create the output folder if needed; exist_ok makes this a no-op when it
# already exists and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Save the prompt next to the result so each answer can be traced back to its input.
# UTF-8 is set explicitly so writing does not depend on the platform default
# encoding (the reply may contain non-ASCII characters).
file_name_prompt = f'{space_id}_{room_id}__{now}_PROMPT.txt'
with open(f'{output_dir}/{file_name_prompt}', 'w', encoding='utf-8') as f:
    f.write(prompt)

file_name_result = f'{space_id}_{room_id}__{now}_RESULT.md'
with open(f'{output_dir}/{file_name_result}', 'w', encoding='utf-8') as f:
    f.write(reply_content)
