# Exploration Notebook for CloudWatch Alarm Analyser

## Setup Dependencies and Clients
Local VSCode Jupyter doesn't play nice with IsengardCLI so I set the creds EnvVars explicitly

In [1]:
import datetime
import json
import os

import boto3

# Set the AWS profile and region environment variables explicitly for this
# kernel session.
os.environ["AWS_PROFILE"] = "andevelt+docker-Admins"
os.environ["AWS_REGION"] = "eu-west-1"

# CloudWatch client created for the region configured above.
cw_client = boto3.client("cloudwatch", region_name=os.environ["AWS_REGION"])


## Function to Download All Alarms
Places the alarm dictionaries into separate metric and composite alarm lists.

In [2]:
def retrieve_all_cw_alarms(client, verbose=True):
    """Download every alarm returned by the ``describe_alarms`` paginator.

    Walks all pages and sorts the alarm dictionaries into two lists by type.

    Args:
        client: Object exposing ``get_paginator("describe_alarms")`` (the
            boto3 CloudWatch client in this notebook).
        verbose: When True (the default, which keeps the previous behaviour)
            every raw page is printed as it is received. Pass False to avoid
            dumping the full payload into the cell output.

    Returns:
        A ``(metric_alarms, composite_alarms)`` tuple of lists of alarm
        dictionaries, in the order the paginator produced them.
    """
    paginator = client.get_paginator("describe_alarms")

    metric_alarms_list = []
    composite_alarms_list = []
    for page in paginator.paginate():
        if verbose:
            print(f"Page:\n {page}")
        # Treat a missing key as "no alarms of that type on this page" for
        # both alarm types, instead of raising KeyError for one of them.
        metric_alarms_list.extend(page.get("MetricAlarms", []))
        composite_alarms_list.extend(page.get("CompositeAlarms", []))

    return metric_alarms_list, composite_alarms_list

## Set Sample Alarm to Experiment

In [3]:
# Download all alarms once; the two lists are reused by later cells.
metric_alarms, comp_alarms = retrieve_all_cw_alarms(cw_client)

# Print every metric alarm dictionary.
for alarm in metric_alarms:
    print(alarm)

# NOTE(review): index 49 is hard-coded, so this raises IndexError whenever
# fewer than 50 metric alarms are returned.
sample_alarm = metric_alarms[49]

# NOTE(review): the sample alarm is printed twice, once bare and once labelled.
print(sample_alarm)
print(f"\nSample Alarm:\n{sample_alarm}")

Page:
 {'CompositeAlarms': [], 'MetricAlarms': [{'AlarmName': 'AWS/DynamoDB SuccessfulRequestLatency TableName=Services-ddbpetadoption7B7CFEC9-1EQRFE36HVUQA Operation=Scan', 'AlarmArn': 'arn:aws:cloudwatch:eu-west-1:083012691457:alarm:AWS/DynamoDB SuccessfulRequestLatency TableName=Services-ddbpetadoption7B7CFEC9-1EQRFE36HVUQA Operation=Scan', 'AlarmDescription': 'This alarm detects a high latency for the DynamoDB table operation ( indicated by the dimension value of the `Operation` in the alarm). See [this troubleshooting document](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/TroubleshootingLatency.html) for troubleshooting latency issues in Amazon DynamoDB.', 'AlarmConfigurationUpdatedTimestamp': datetime.datetime(2024, 5, 3, 15, 6, 54, 421000, tzinfo=tzutc()), 'ActionsEnabled': True, 'OKActions': [], 'AlarmActions': [], 'InsufficientDataActions': [], 'StateValue': 'OK', 'StateReason': 'Threshold Crossed: 10 out of the last 10 datapoints were not greater than the 

## General Alarm Checks
We only need the alarm info from describe_alarms to check these things
### Alarm has description? 

In [4]:
def alarm_has_description(alarm):
    """Tell whether the alarm carries a non-blank ``AlarmDescription``.

    A missing key, a ``None`` value and a whitespace-only string all count as
    "no description".

    Args:
        alarm: Alarm dictionary; only ``AlarmDescription`` is read.

    Returns:
        True when a description with visible characters is present.
    """
    description = alarm.get("AlarmDescription")
    return description is not None and description.strip() != ""


### Alarms With High Thresholds or Too Many Data Points
These alarms are unlikely to ever trigger.

In [5]:
def alarm_theshold_too_high(alarm, max_threshold=30.0):
    """Tell whether the alarm's static ``Threshold`` exceeds ``max_threshold``.

    The previous truthiness test (``not alarm_threshold``) reported a
    legitimate threshold of ``0`` as "too high", and did the same for alarms
    that have no ``Threshold`` key at all. Neither can be called "too high",
    so both now return False.

    Args:
        alarm: Alarm dictionary; only ``Threshold`` is read.
        max_threshold: Largest threshold still considered reasonable.
            Defaults to 30.0, the limit that used to be hard-coded.

    Returns:
        True only when a threshold is present and strictly greater than
        ``max_threshold``.
    """
    alarm_threshold = alarm.get("Threshold")

    if alarm_threshold is None:
        # No static threshold to judge.
        return False
    return bool(alarm_threshold > max_threshold)


def alarm_data_points_too_high(alarm, max_data_points=15):
    """Tell whether the alarm needs more than ``max_data_points`` to fire.

    ``DatapointsToAlarm`` is only set for "M out of N" alarms. When it is
    absent the check falls back to ``EvaluationPeriods``, rather than
    reporting the alarm as "too high" merely because the optional field is
    missing. Alarms that expose neither value return False.

    Args:
        alarm: Alarm dictionary; ``DatapointsToAlarm`` and
            ``EvaluationPeriods`` are read.
        max_data_points: Largest number of data points still considered
            reasonable. Defaults to 15, the limit that used to be hard-coded.

    Returns:
        True only when the effective number of data points is strictly
        greater than ``max_data_points``.
    """
    data_points = alarm.get("DatapointsToAlarm")
    if data_points is None:
        data_points = alarm.get("EvaluationPeriods")

    if data_points is None:
        # Nothing to judge on this alarm.
        return False
    return bool(data_points > max_data_points)

### Alarm with No Actions

In [6]:
def alarm_has_actions(alarm):
    """Tell whether the alarm has at least one entry in ``AlarmActions``.

    Uses ``.get`` like the other checks in this notebook, so an alarm
    dictionary without the key counts as "no actions" instead of raising
    ``KeyError``.

    Args:
        alarm: Alarm dictionary; only ``AlarmActions`` is read.

    Returns:
        True when ``AlarmActions`` is present and non-empty.
    """
    return bool(alarm.get("AlarmActions"))

In [None]:
# Report every metric alarm that fails the description check.
# The f-strings use single quotes for the dictionary keys: reusing the
# enclosing double quote inside an f-string is a SyntaxError before Python 3.12.
for alarm in metric_alarms:
    if not alarm_has_description(alarm):
        print(f"Missing Description for Alarm: {alarm['AlarmName']}")
    # if alarm_theshold_too_high(alarm):
    #     print(
    #         f"Alarm Threshold Too High: {alarm['AlarmName']} Threshold set at {alarm.get('Threshold')}"
    #     )
    # if alarm_data_points_too_high(alarm):
    #     print(
    #         f"Alarm Data Points Too High: {alarm['AlarmName']} Data Points set at {alarm.get('DatapointsToAlarm')}"
    #     )
    # if not alarm_has_actions(alarm):
    #     print(f"Missing Actions for Alarm: {alarm['AlarmName']}")


## Get History for Alarm

In [7]:
def get_alarm_history(client, alarm):
    """Fetch the history entries of one alarm.

    The previous body ignored both parameters and always queried the global
    ``cw_client`` for the global ``sample_alarm``; the arguments are now used.

    Args:
        client: Object exposing ``describe_alarm_history`` (the boto3
            CloudWatch client in this notebook).
        alarm: Alarm dictionary; its ``AlarmName`` selects the history.

    Returns:
        The ``AlarmHistoryItems`` list from the response. Only the single
        response of one call is returned.
    """
    response = client.describe_alarm_history(AlarmName=alarm["AlarmName"])
    # TODO: Probably need to add pagination to get full history
    return response["AlarmHistoryItems"]

In [8]:
import boto3
import logging
import os

# Fetch the history entries used by the history-based checks below.
alarm_history = get_alarm_history(cw_client, sample_alarm)

# Show the raw entries and how many were returned.
print(alarm_history)
print(len(alarm_history))

# NOTE(review): index 3 is hard-coded, so this raises IndexError when fewer
# than four history entries are returned.
print(alarm_history[3]["HistoryData"])

[{'AlarmName': 'CatAdoptionsAnomalyAlarm', 'AlarmType': 'MetricAlarm', 'Timestamp': datetime.datetime(2024, 11, 13, 13, 41, 35, 180000, tzinfo=tzutc()), 'HistoryItemType': 'StateUpdate', 'HistorySummary': 'Alarm updated from OK to ALARM', 'HistoryData': '{"version":"1.0","oldState":{"stateValue":"OK","stateReason":"Thresholds Crossed: 1 datapoint [258.0 (11/11/24 23:33:00)] was not less than the lower thresholds [256.50483729283144].","stateReasonData":{"version":"1.0","queryDate":"2024-11-12T23:33:35.171+0000","startDate":"2024-11-11T23:33:00.000+0000","period":86400,"recentDatapoints":[258.0],"recentLowerThresholds":[256.50483729283144],"evaluatedDatapoints":[{"timestamp":"2024-11-11T23:33:00.000+0000","value":258.0,"threshold":256.50483729283144}]}},"newState":{"stateValue":"ALARM","stateReason":"Thresholds Crossed: 1 datapoint [268.0 (12/11/24 13:41:00)] was less than the lower thresholds [269.9004881593985].","stateReasonData":{"version":"1.0","queryDate":"2024-11-13T13:41:35.174+

## History Based Checks
A series of checks that require going through the history of an alarm to determine the outcome.
We may also limit this to a fixed window of history (for example, the last 2 weeks).

### Long Lived Alarm State
Alarm triggers and isn't resolved for > 48 hours

In [None]:
def long_lived_alarm(alarm_history):
    """Count alarm episodes that stayed in ALARM for more than 48 hours.

    ``alarm_history`` is expected newest-first (the order in which this
    notebook receives it), so it is walked in reverse to see each
    "OK to ALARM" entry before the "ALARM to OK" entry that resolves it.
    The previous forward walk paired every resolution with the *following*
    trigger, which produced negative durations and could never count anything.

    Args:
        alarm_history: Iterable of history entry dictionaries. Each entry
            needs ``HistoryItemType`` and ``HistorySummary``; state updates
            also need ``HistoryData``, a JSON string whose ``oldState`` and
            ``newState`` contain ``stateReasonData.startDate``.

    Returns:
        Number of trigger/resolution pairs whose ``startDate`` values are
        more than 48 hours apart.
    """

    def _state_start(history_data, state_key):
        # Parse the startDate recorded for one side of a state transition.
        return datetime.datetime.strptime(
            history_data[state_key]["stateReasonData"]["startDate"],
            "%Y-%m-%dT%H:%M:%S.%f%z",
        )

    long_lived_alarm_count = 0
    open_trigger = None  # "OK to ALARM" entry still waiting for its resolution

    for entry in reversed(list(alarm_history)):
        if entry["HistoryItemType"] != "StateUpdate":
            continue

        summary = entry["HistorySummary"]
        if summary == "Alarm updated from OK to ALARM":
            if open_trigger is not None:  # I assume this cannot happen
                print(
                    "Error: Alarm cannot be triggered twice without being resolved."
                )
                break
            open_trigger = entry

        elif summary == "Alarm updated from ALARM to OK":
            if open_trigger is None:
                # TODO: remove this when grabbing FULL history (the trigger of
                # this resolution lies outside the retrieved entries)
                continue
            resolve_data = json.loads(entry["HistoryData"])
            trigger_data = json.loads(open_trigger["HistoryData"])
            open_trigger = None

            # Sanity check that nothing was missed in between: the ALARM state
            # the trigger entered must be the state the resolution left.
            if resolve_data["oldState"] != trigger_data["newState"]:
                print(
                    "Error: Alarm cannot be triggered twice without being resolved."
                )
                break

            time_to_solve = _state_start(resolve_data, "newState") - _state_start(
                trigger_data, "newState"
            )
            if time_to_solve > datetime.timedelta(hours=48):
                long_lived_alarm_count += 1
    return long_lived_alarm_count


### Never Alerted
Alarms that have never alerted since creation

In [9]:
def never_alerted(alarm_history):
    """Tell whether the history contains no transition into ALARM.

    Accepts either the list of history entries (what ``get_alarm_history``
    returns in this notebook) or a response dictionary with an
    ``AlarmHistoryItems`` key. The previous body indexed the argument with
    ``"AlarmHistoryItems"`` unconditionally, which raises ``TypeError`` for
    the list form, and it treated any history entry at all (including
    non-state-update entries) as "has alerted".

    Args:
        alarm_history: List of history entry dictionaries, or a dictionary
            holding that list under ``AlarmHistoryItems``.

    Returns:
        True when no ``StateUpdate`` entry has ``newState.stateValue`` equal
        to ``"ALARM"`` in its ``HistoryData`` JSON; False otherwise.
    """
    if isinstance(alarm_history, dict):
        items = alarm_history.get("AlarmHistoryItems") or []
    else:
        items = alarm_history or []

    for item in items:
        if item.get("HistoryItemType") != "StateUpdate":
            continue
        history_data = json.loads(item.get("HistoryData") or "{}")
        new_state = history_data.get("newState") or {}
        if new_state.get("stateValue") == "ALARM":
            return False
    return True

In [10]:
def get_alarm_start_time(alarm, state_type="newState"):
    """Parse the ``startDate`` of one side of a state-update history entry.

    Args:
        alarm: History entry dictionary whose ``HistoryData`` is a JSON string
            containing ``<state_type>.stateReasonData.startDate``.
        state_type: Which side of the transition to read, ``"newState"``
            (default) or ``"oldState"``.

    Returns:
        The ``startDate`` value as a timezone-aware ``datetime``.

    Raises:
        ValueError: If ``state_type`` is neither ``"newState"`` nor
            ``"oldState"``.
    """
    allowed_state_types = ("newState", "oldState")
    if state_type not in allowed_state_types:
        raise ValueError("state_type must be either 'newState' or 'oldState'")

    history_data = json.loads(alarm["HistoryData"])
    start_date_text = history_data[state_type]["stateReasonData"]["startDate"]
    return datetime.datetime.strptime(start_date_text, "%Y-%m-%dT%H:%M:%S.%f%z")


### Noisy Alarm
Alarms which close within 2 minutes or trigger twice within 12 hours.

In [None]:
def alarm_history_checks(alarm_history):
    """Run all history-based checks in a single pass over the history.

    The history is walked in reverse order. For every "OK to ALARM" entry the
    gap between the previous OK state's ``startDate`` and the new ALARM
    state's ``startDate`` is measured; for every "ALARM to OK" entry the gap
    between the ALARM state's ``startDate`` and the new OK state's
    ``startDate`` is measured.

    The messages printed for "ALARM to OK" entries used to interpolate
    variables that are only assigned while handling "OK to ALARM" entries, so
    they showed stale values or raised ``UnboundLocalError`` when a resolution
    came first. They now report the times of the episode being resolved.

    Args:
        alarm_history: Reversible sequence of history entry dictionaries, as
            consumed by ``get_alarm_start_time``.

    Returns:
        Dictionary with four counters:
        ``long_lived_alarm_count`` (resolved after 48 hours or more),
        ``long_term_issue_count`` (re-triggered within 24 hours),
        ``recurring_in_12_hours_count`` (re-triggered within 12 hours) and
        ``short_alarm_count`` (resolved within 2 minutes).
    """
    long_lived_alarm_count = 0
    long_term_issue_count = 0
    recurring_in_12_hours_count = 0
    short_alarm_count = 0

    for alarm in reversed(alarm_history):
        if alarm["HistoryItemType"] != "StateUpdate":
            print("Non Status Update data")
            continue

        if alarm["HistorySummary"] == "Alarm updated from OK to ALARM":
            alarm_start_time = get_alarm_start_time(
                alarm, state_type="newState"
            )
            prev_alarm_close_time = get_alarm_start_time(
                alarm, state_type="oldState"
            )
            time_between_close_and_trigger = (
                alarm_start_time - prev_alarm_close_time
            )
            # Both checks may fire for the same entry: a 12-hour recurrence is
            # also counted as a 24-hour "long term issue".
            if time_between_close_and_trigger <= datetime.timedelta(hours=24):
                long_term_issue_count += 1
                print(
                    f"Long Term Issue Alarm.\nPrev Close: {prev_alarm_close_time}\nNew Trigger: {alarm_start_time}.\nTime Delta: {time_between_close_and_trigger}"
                )
            if time_between_close_and_trigger <= datetime.timedelta(hours=12):
                recurring_in_12_hours_count += 1
                print(
                    f"Recurring in 12 Hours Alarm.\nPrev Close: {prev_alarm_close_time}\nNew Trigger: {alarm_start_time}.\nTime Delta: {time_between_close_and_trigger}"
                )

        elif alarm["HistorySummary"] == "Alarm updated from ALARM to OK":
            alarm_close_time = get_alarm_start_time(
                alarm, state_type="newState"
            )
            alarm_start_time = get_alarm_start_time(
                alarm, state_type="oldState"
            )
            time_to_solve = alarm_close_time - alarm_start_time
            if time_to_solve >= datetime.timedelta(hours=48):
                long_lived_alarm_count += 1
                print(
                    f"Long Lived Alarm.\nTriggered: {alarm_start_time}\nClosed: {alarm_close_time}.\nTime Delta: {time_to_solve}"
                )
            elif time_to_solve <= datetime.timedelta(minutes=2):
                short_alarm_count += 1
                print(
                    f"Short Term Alarm.\nTriggered: {alarm_start_time}\nClosed: {alarm_close_time}.\nTime Delta: {time_to_solve}"
                )

    return {
        "long_lived_alarm_count": long_lived_alarm_count,
        "long_term_issue_count": long_term_issue_count,
        "recurring_in_12_hours_count": recurring_in_12_hours_count,
        "short_alarm_count": short_alarm_count,
    }


In [24]:
# Run the single-pass history checks on the sample alarm's history.
response = alarm_history_checks(alarm_history)

print(f"Alarm history entries: {len(alarm_history)}")

# Pretty-print the dictionary of counters returned by the checks.
print(json.dumps(response, indent=4))


Long Term Issue Alarm.
Prev Close: 2024-11-10 09:26:00+00:00
New Trigger: 2024-11-10 09:36:00+00:00.
Time Delta: 0:10:00
Recurring in 12 Hours Alarm.
Prev Close: 2024-11-10 09:26:00+00:00
New Trigger: 2024-11-10 09:36:00+00:00.
Time Delta: 0:10:00
Long Term Issue Alarm.
Prev Close: 2024-11-10 09:54:00+00:00
New Trigger: 2024-11-10 10:02:00+00:00.
Time Delta: 0:08:00
Recurring in 12 Hours Alarm.
Prev Close: 2024-11-10 09:54:00+00:00
New Trigger: 2024-11-10 10:02:00+00:00.
Time Delta: 0:08:00
Long Term Issue Alarm.
Prev Close: 2024-11-10 10:16:00+00:00
New Trigger: 2024-11-11 04:41:00+00:00.
Time Delta: 18:25:00
Long Term Issue Alarm.
Prev Close: 2024-11-11 07:18:00+00:00
New Trigger: 2024-11-11 07:36:00+00:00.
Time Delta: 0:18:00
Recurring in 12 Hours Alarm.
Prev Close: 2024-11-11 07:18:00+00:00
New Trigger: 2024-11-11 07:36:00+00:00.
Time Delta: 0:18:00
Long Term Issue Alarm.
Prev Close: 2024-11-11 07:45:00+00:00
New Trigger: 2024-11-11 08:02:00+00:00.
Time Delta: 0:17:00
Recurring in 

In [None]:
def noisy_alarm(alarm_history):
    """Count short-lived and quickly recurring alarm episodes.

    The history is walked in reverse order so that each "OK to ALARM" entry
    is seen before the "ALARM to OK" entry that resolves it.

    Fixes over the previous version:
      * Every trigger is compared with the trigger immediately before it. The
        old 12-hour stack was popped without being refilled, so only every
        other pair of triggers was compared.
      * The sanity check compares the ALARM state entered by the trigger with
        the state left by the resolution. It used to compare the OK state
        after the episode with the OK state before it, which does not match
        when walking in this order and aborted the loop.
      * Unused locals were removed and timestamp parsing goes through
        ``get_alarm_start_time``.

    Args:
        alarm_history: Reversible sequence of history entry dictionaries, as
            consumed by ``get_alarm_start_time``.

    Returns:
        ``(short_alarm_count, recurring_alarm_count)``: episodes resolved in
        under 2 minutes, and triggers that came less than 12 hours after the
        previous trigger.
    """
    short_alarm_count = 0
    recurring_alarm_count = 0
    open_trigger = None  # trigger still waiting for its "ALARM to OK" entry
    previous_trigger = None  # latest trigger seen, for the recurrence check

    for alarm in reversed(alarm_history):
        if alarm["HistoryItemType"] != "StateUpdate":
            print("Non Status Update data")
            continue

        if alarm["HistorySummary"] == "Alarm updated from OK to ALARM":
            print("Alarm updated from OK to ALARM")
            open_trigger = alarm
            if previous_trigger is not None:
                retrigger_time = get_alarm_start_time(
                    alarm
                ) - get_alarm_start_time(previous_trigger)
                print(f"Retrigger time: {retrigger_time}")
                if retrigger_time < datetime.timedelta(hours=12):
                    recurring_alarm_count += 1
            previous_trigger = alarm

        elif alarm["HistorySummary"] == "Alarm updated from ALARM to OK":
            if open_trigger is None:
                # TODO: remove this when grabbing FULL history (the trigger of
                # this resolution lies outside the retrieved entries)
                continue
            alarm_trigger = open_trigger
            open_trigger = None
            alarm_hist = json.loads(alarm["HistoryData"])
            alarm_trigger_hist = json.loads(alarm_trigger["HistoryData"])

            # Sanity check something hasn't been missed by comparing state changes
            if alarm_hist["oldState"] != alarm_trigger_hist["newState"]:
                print(
                    "Error: Alarm cannot be triggered twice without being resolved."
                )
                break

            time_to_solve = get_alarm_start_time(alarm) - get_alarm_start_time(
                alarm_trigger
            )
            print(f"Resolve time: {time_to_solve}")
            if time_to_solve < datetime.timedelta(minutes=2):
                short_alarm_count += 1
    return short_alarm_count, recurring_alarm_count


### Alarms Indicating Long Term Issues
Alarms occurring at least once a day, every 2 days or less.

In [None]:
def long_term_issue_alarms(alarm_history):
    """Placeholder for the "long term issue" check; not implemented yet.

    Args:
        alarm_history: Currently unused.

    Returns:
        Always ``None``.
    """
    return None

## Notes
For the checks requiring alarm history, it makes much more sense to peruse the history once and perform each of the checks in parallel as we go along. Each check will have its own data structure (queue/stack) to track as we go and can use that structure plus the new entry to inform a decision.

In [None]:
# Count the episodes in the sample history that exceed the 48 hour limit.
long_lived_alarms = long_lived_alarm(alarm_history)

In [None]:
# noisy_alarm returns (short_alarm_count, recurring_alarm_count), in that order.
noisy_2min_alarm, recurring_12hour_alarm = noisy_alarm(alarm_history)

In [None]:
# Show the count of episodes that resolved in under 2 minutes.
print(noisy_2min_alarm)

In [None]:
# for item in alarm_history:
#     print(item)

# Inspect the first history entry: print the raw entry, then decode its
# HistoryData JSON string and show the type found at each nesting level.
print(alarm_history)
print(alarm_history[0])
print(type(json.loads(alarm_history[0]["HistoryData"])))
print(json.loads(alarm_history[0]["HistoryData"]))
print(type(json.loads(alarm_history[0]["HistoryData"])["oldState"]))
# Type of oldState.stateReasonData.
print(
    type(
        json.loads(alarm_history[0]["HistoryData"])["oldState"][
            "stateReasonData"
        ]
    )
)
# Type of oldState.stateReasonData.startDate.
print(
    type(
        json.loads(alarm_history[0]["HistoryData"])["oldState"][
            "stateReasonData"
        ]["startDate"]
    )
)

In [None]:
from datetime import timedelta

# Sample timestamp string used to try out the parsing format below.
dt = "2024-10-02T14:36:59.792+0000"

# Print the type returned by strptime for this string and format.
print(type(datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S.%f%z")))

## LLM Shenanigans

In [None]:
def generate_message(
    bedrock_runtime,
    model_id,
    system_prompt,
    messages,
    temperature=0.0,
    max_tokens=2000,
):
    """Send one request through ``bedrock_runtime`` and decode the reply.

    Args:
        bedrock_runtime: Object exposing ``invoke_model(body=..., modelId=...)``
            (the boto3 ``bedrock-runtime`` client in this notebook).
        model_id: Identifier passed to ``invoke_model`` as ``modelId``.
        system_prompt: Text sent as the ``system`` field of the request body.
        messages: List of message dictionaries sent as the ``messages`` field.
        temperature: Value sent as the ``temperature`` field. Defaults to 0.0,
            the value that used to be hard-coded.
        max_tokens: Value sent as the ``max_tokens`` field. Defaults to 2000,
            the value that used to be hard-coded.

    Returns:
        The response body parsed from JSON.
    """
    body = json.dumps(
        {
            "anthropic_version": "bedrock-2023-05-31",
            "system": system_prompt,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
    )

    response = bedrock_runtime.invoke_model(body=body, modelId=model_id)
    # The response "body" is a stream-like object; read it fully before decoding.
    response_body = json.loads(response.get("body").read())

    return response_body

In [None]:
# Bedrock runtime client and model used by the LLM experiments.
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-west-2")
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

# System prompt shared by every request in this section.
system_prompt = """
You are a helpful assistant that analyzes CloudWatch Alarm data.
You take Alarm information or alarm history data and use it to answer the user query.
Provide clear and consice answers and always explain your logic.
If you don't know the answer, say "I don't know".
"""

# User turn: the question followed by one alarm per line inside <alarms> tags.
u_msg = "Given the following list of alarms, return the name of all the alarms which have no description.\n<alarms>\n"
for alarm in metric_alarms:
    u_msg += f"{alarm}\n"
u_msg += "</alarms>"

# Prompt with user turn only.
user_message = dict(role="user", content=u_msg)
messages = [user_message]

response = generate_message(bedrock_runtime, model_id, system_prompt, messages)
print(json.dumps(response, indent=4))

In [None]:
# Dump the raw history entries that the next prompt is built from.
print(alarm_history)

In [None]:
# User turn: the question followed by one history entry per line inside
# <alarms> tags.
u_msg2 = "Given the following list of events on an alarm, tell me if this alarm should be considred noisy. A noisy alarm is defined as an alarm which triggers and clears within 2 minutes or triggers more than once in 12 hours.\n<alarms>\n"
for alarm in alarm_history:
    u_msg2 += f"{alarm}\n"
u_msg2 += "</alarms>"

# Prompt with user turn only.
user_message = dict(role="user", content=u_msg2)
messages = [user_message]

response = generate_message(bedrock_runtime, model_id, system_prompt, messages)
print(json.dumps(response, indent=4))