# Exploration Notebook for Cloudwatch Alarm Analyser

## Setup Dependencies and Clients
Local VSCode Jupyter doesn't play nice with IsengardCLI so I set the creds EnvVars explicitly

In [2]:
import datetime
import json
import os

import boto3

os.environ["AWS_PROFILE"] = "andevelt+docker-Admins"
os.environ["AWS_REGION"] = "eu-west-1"

cw_client = boto3.client("cloudwatch", region_name=os.environ["AWS_REGION"])


## Function to Download All Alarms
Places the alarm dictionaries into a separate metrics and composite alarm lists.

In [3]:
def retrieve_all_cw_alarms(client):
    paginator = client.get_paginator("describe_alarms")

    metric_alarms_list = []
    composite_alarms_list = []
    for page in paginator.paginate():
        print(f"Page:\n {page}")
        for alarm in page["MetricAlarms"]:
            # print(f"Metric Alarm retrieved: {alarm}")
            metric_alarms_list.append(alarm)
        for alarm in page.get("CompositeAlarms", []):
            # print(f"Composite Alarm retrieved: {alarm}")
            composite_alarms_list.append(alarm)

    # print(f"Total Metric Alarms: {len(metric_alarms_list)}")
    # print(f"Total Composite Alarms: {len(composite_alarms_list)}")

    return metric_alarms_list, composite_alarms_list

## Set Sample Alarm to Experiment

In [4]:
metric_alarms, comp_alarms = retrieve_all_cw_alarms(cw_client)

for alarm in metric_alarms:
    print(alarm)

sample_alarm = metric_alarms[48]

print(sample_alarm)

Page:
 {'CompositeAlarms': [], 'MetricAlarms': [{'AlarmName': 'AWS/DynamoDB SuccessfulRequestLatency TableName=Services-ddbpetadoption7B7CFEC9-1EQRFE36HVUQA Operation=Scan', 'AlarmArn': 'arn:aws:cloudwatch:eu-west-1:083012691457:alarm:AWS/DynamoDB SuccessfulRequestLatency TableName=Services-ddbpetadoption7B7CFEC9-1EQRFE36HVUQA Operation=Scan', 'AlarmDescription': 'This alarm detects a high latency for the DynamoDB table operation ( indicated by the dimension value of the `Operation` in the alarm). See [this troubleshooting document](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/TroubleshootingLatency.html) for troubleshooting latency issues in Amazon DynamoDB.', 'AlarmConfigurationUpdatedTimestamp': datetime.datetime(2024, 5, 3, 15, 6, 54, 421000, tzinfo=tzutc()), 'ActionsEnabled': True, 'OKActions': [], 'AlarmActions': [], 'InsufficientDataActions': [], 'StateValue': 'OK', 'StateReason': 'Threshold Crossed: 10 out of the last 10 datapoints were not greater than the 

## General Alarm Checks
We only need the alarm info from describe_alarms to check these things
### Alarm has description? 

In [5]:
def alarm_has_description(alarm):
    alarm_desc = alarm.get("AlarmDescription")

    if alarm_desc is None or alarm_desc.strip() == "":
        return False

    return True


### Alarms With Long Thersholds or Too Many Data Points
These alarms are probably unlikely to ever trigger

In [6]:
def alarm_theshold_too_high(alarm):
    alarm_threshold = alarm.get("Threshold")

    if not alarm_threshold or alarm_threshold > 30.0:
        return True
    return False


def alarm_data_points_too_high(alarm):
    alarm_data_points = alarm.get("DatapointsToAlarm")

    if not alarm_data_points or alarm_data_points > 15:
        return True
    return False

### Alarm with No Actions

In [7]:
def alarm_has_actions(alarm):
    return True if len(alarm["AlarmActions"]) > 0 else False

In [8]:
for alarm in metric_alarms:
    if not alarm_has_description(alarm):
        print(f"Missing Description for Alarm: {alarm["AlarmName"]}")
    if alarm_theshold_too_high(alarm):
        print(
            f"Alarm Threshold Too High: {alarm["AlarmName"]} Threshold set at {alarm.get("Threshold")}"
        )
    if alarm_data_points_too_high(alarm):
        print(
            f"Alarm Data Points Too High: {alarm["AlarmName"]} Data Points set at {alarm.get("DatapointsToAlarm")}"
        )
    if not alarm_has_actions(alarm):
        print(f"Missing Actions for Alarm: {alarm["AlarmName"]}")


Alarm Threshold Too High: AWS/DynamoDB SuccessfulRequestLatency TableName=Services-ddbpetadoption7B7CFEC9-1EQRFE36HVUQA Operation=Scan Threshold set at 100000.0
Missing Actions for Alarm: AWS/DynamoDB SuccessfulRequestLatency TableName=Services-ddbpetadoption7B7CFEC9-1EQRFE36HVUQA Operation=Scan
Missing Description for Alarm: Anomaly-cat-adoptions
Alarm Threshold Too High: Anomaly-cat-adoptions Threshold set at None
Missing Actions for Alarm: ApplicationInsights/Services/AWS/ApplicationELB/HTTPCode_Target_4XX_Count/app/Servic-PetSi-tiy2LBEKxwQG/87664db7fd32e031/
Missing Actions for Alarm: ApplicationInsights/Services/AWS/ApplicationELB/HTTPCode_Target_4XX_Count/app/Servic-lista-07Nv8kCyNRa5/d8a105c7192e2064/
Missing Actions for Alarm: ApplicationInsights/Services/AWS/ApplicationELB/HTTPCode_Target_4XX_Count/app/Servic-payfo-LBPQv3iGDH6T/2511dcb0ea9a972b/
Missing Actions for Alarm: ApplicationInsights/Services/AWS/ApplicationELB/HTTPCode_Target_4XX_Count/app/Servic-searc-qIOLT5maicWz/9d

## Get History for Alarm

In [9]:
def get_alarm_history(client, alarm):
    response = cw_client.describe_alarm_history(
        AlarmName=sample_alarm["AlarmName"]
    )
    return response[
        "AlarmHistoryItems"
    ]  ## TODO: Probably need to add pagination to get full history

In [11]:
alarm_history = get_alarm_history(cw_client, sample_alarm)

print(alarm_history)


[{'AlarmName': 'CatsAdoptedTooLow', 'AlarmType': 'MetricAlarm', 'Timestamp': datetime.datetime(2024, 10, 23, 21, 47, 19, 330000, tzinfo=tzutc()), 'HistoryItemType': 'StateUpdate', 'HistorySummary': 'Alarm updated from ALARM to OK', 'HistoryData': '{"version":"1.0","oldState":{"stateValue":"ALARM","stateReason":"Threshold Crossed: 1 out of the last 1 datapoints [1.0 (23/10/24 20:46:00)] was less than or equal to the threshold (2.0) (minimum 1 datapoint for OK -> ALARM transition).","stateReasonData":{"version":"1.0","queryDate":"2024-10-23T21:46:19.329+0000","startDate":"2024-10-23T20:46:00.000+0000","statistic":"Sum","period":3600,"recentDatapoints":[1.0],"threshold":2.0,"evaluatedDatapoints":[{"timestamp":"2024-10-23T20:46:00.000+0000","sampleCount":1.0,"value":1.0}]}},"newState":{"stateValue":"OK","stateReason":"Threshold Crossed: 1 out of the last 1 datapoints [6.0 (23/10/24 20:47:00)] was not less than or equal to the threshold (2.0) (minimum 1 datapoint for ALARM -> OK transition)

## History Based Checks
A series of checks that require going through the history of an alarm to determine
May also limit this to 2 weeks of history or something

### Long Lived Alarm State
Alarm triggers and isn't resolved for > 48 hours

In [40]:
def long_lived_alarm(alarm_history):
    long_lived_alarm_count = 0
    alarm_stack = []  # In reality, this should ALWAYS be len==1 or len==0 but i like .pop
    for alarm in alarm_history:
        if len(alarm_stack) > 1:  # I assume this is the case
            print(
                "Error: Alarm cannot be triggered twice without being resolved."
            )
            break

        if alarm["HistoryItemType"] != "StateUpdate":
            continue

        if alarm["HistorySummary"] == "Alarm updated from OK to ALARM":
            alarm_stack.append(alarm)

        elif alarm["HistorySummary"] == "Alarm updated from ALARM to OK":
            if (
                len(alarm_stack) == 0
            ):  # TODO: remove this when grabbing FULL history (catches case where pagination happens on active alarm)
                continue
            alarm_trigger = alarm_stack.pop()
            alarm_hist = json.loads(alarm["HistoryData"])
            alarm_trigger_hist = json.loads(alarm_trigger["HistoryData"])

            # Sanity check something hasn't been missed by comparing state changes
            if alarm_hist["newState"] != alarm_trigger_hist["oldState"]:
                print(
                    "Error: Alarm cannot be triggered twice without being resolved."
                )
                break

            alarm_time_string = alarm_trigger_hist["newState"][
                "stateReasonData"
            ]["startDate"]
            alarm_time = datetime.datetime.strptime(
                alarm_time_string, "%Y-%m-%dT%H:%M:%S.%f%z"
            )

            ok_time_string = alarm_hist["newState"]["stateReasonData"][
                "startDate"
            ]
            ok_time = datetime.datetime.strptime(
                ok_time_string, "%Y-%m-%dT%H:%M:%S.%f%z"
            )

            time_to_solve = ok_time - alarm_time
            if time_to_solve > datetime.timedelta(hours=48):
                long_lived_alarm_count += 1
    return long_lived_alarm_count


### Never Alerted
Alarms that have never alerted since creation

In [12]:
def never_alerted(alarm_history):
    if not alarm_history["AlarmHistoryItems"]:
        return True
    return False

### Noisy Alarm
Alarms which close within 2 minutes or twice within 12 hours.

In [13]:
def noisy_alarm(alarm_history):
    short_alarm_count = 0
    recurring_alarm_count = 0
    alarm_stack = []  # In reality, this should ALWAYS be len==1 or len==0 but i like .pop
    alarm_12_hour_stack = []
    for alarm in alarm_history:
        if len(alarm_stack) > 1:  # I assume this is the case
            print(
                "Error: Alarm cannot be triggered twice without being resolved."
            )
            break

        if alarm["HistoryItemType"] != "StateUpdate":
            continue

        if alarm["HistorySummary"] == "Alarm updated from OK to ALARM":
            alarm_stack.append(alarm)
            if len(alarm_12_hour_stack) == 0:
                alarm_12_hour_stack.append(alarm)
            else:
                last_alarm = alarm_12_hour_stack.pop()
                last_alarm_hist = json.loads(last_alarm["HistoryData"])
                last_alarm_start_time_string = last_alarm_hist["newState"][
                    "stateReasonData"
                ]["startDate"]
                last_alarm_start_time = datetime.datetime.strptime(
                    last_alarm_start_time_string, "%Y-%m-%dT%H:%M:%S.%f%z"
                )

                alarm_hist = json.loads(alarm["HistoryData"])
                alarm_start_time_string = last_alarm_hist["newState"][
                    "stateReasonData"
                ]["startDate"]

                alarm_start_time = datetime.datetime.strptime(
                    alarm_start_time_string, "%Y-%m-%dT%H:%M:%S.%f%z"
                )

                retrigger_time = alarm_start_time - last_alarm_start_time
                if retrigger_time < datetime.timedelta(hours=12):
                    recurring_alarm_count += 1

        elif alarm["HistorySummary"] == "Alarm updated from ALARM to OK":
            if (
                len(alarm_stack) == 0
            ):  # TODO: remove this when grabbing FULL history (catches case where pagination happens on active alarm)
                continue
            alarm_trigger = alarm_stack.pop()
            alarm_hist = json.loads(alarm["HistoryData"])
            alarm_trigger_hist = json.loads(alarm_trigger["HistoryData"])

            # Sanity check something hasn't been missed by comparing state changes
            if alarm_hist["newState"] != alarm_trigger_hist["oldState"]:
                print(
                    "Error: Alarm cannot be triggered twice without being resolved."
                )
                break

            alarm_time_string = alarm_trigger_hist["newState"][
                "stateReasonData"
            ]["startDate"]
            alarm_time = datetime.datetime.strptime(
                alarm_time_string, "%Y-%m-%dT%H:%M:%S.%f%z"
            )

            ok_time_string = alarm_hist["newState"]["stateReasonData"][
                "startDate"
            ]
            ok_time = datetime.datetime.strptime(
                ok_time_string, "%Y-%m-%dT%H:%M:%S.%f%z"
            )

            time_to_solve = ok_time - alarm_time
            if time_to_solve < datetime.timedelta(minutes=2):
                short_alarm_count += 1
    return short_alarm_count, recurring_alarm_count


### Alarms Indicating Long Term Issues
Alarms occuring at least once a day, every 2 days or less.

In [None]:
def long_term_issue_alarms(alarm_history):
    pass

In [41]:
long_lived_alarms = long_lived_alarm(alarm_history)

In [14]:
noisy_2min_alarm, recurring_12hour_alarm = noisy_alarm(alarm_history)

IndexError: pop from empty list

In [12]:
# for item in alarm_history:
#     print(item)

print(alarm_history)
print(alarm_history[0])
print(type(json.loads(alarm_history[0]["HistoryData"])))
print(json.loads(alarm_history[0]["HistoryData"]))
print(type(json.loads(alarm_history[0]["HistoryData"])["oldState"]))
print(
    type(
        json.loads(alarm_history[0]["HistoryData"])["oldState"][
            "stateReasonData"
        ]
    )
)
print(
    type(
        json.loads(alarm_history[0]["HistoryData"])["oldState"][
            "stateReasonData"
        ]["startDate"]
    )
)

[{'AlarmName': 'CatsAdoptedTooLow', 'AlarmType': 'MetricAlarm', 'Timestamp': datetime.datetime(2024, 10, 23, 21, 47, 19, 330000, tzinfo=tzutc()), 'HistoryItemType': 'StateUpdate', 'HistorySummary': 'Alarm updated from ALARM to OK', 'HistoryData': '{"version":"1.0","oldState":{"stateValue":"ALARM","stateReason":"Threshold Crossed: 1 out of the last 1 datapoints [1.0 (23/10/24 20:46:00)] was less than or equal to the threshold (2.0) (minimum 1 datapoint for OK -> ALARM transition).","stateReasonData":{"version":"1.0","queryDate":"2024-10-23T21:46:19.329+0000","startDate":"2024-10-23T20:46:00.000+0000","statistic":"Sum","period":3600,"recentDatapoints":[1.0],"threshold":2.0,"evaluatedDatapoints":[{"timestamp":"2024-10-23T20:46:00.000+0000","sampleCount":1.0,"value":1.0}]}},"newState":{"stateValue":"OK","stateReason":"Threshold Crossed: 1 out of the last 1 datapoints [6.0 (23/10/24 20:47:00)] was not less than or equal to the threshold (2.0) (minimum 1 datapoint for ALARM -> OK transition)

In [None]:
from datetime import timedelta

dt = "2024-10-02T14:36:59.792+0000"

print(type(datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S.%f%z")))