# Summary of Anomalies

This notebook provides information on how to identify anomaly points/regions in NAB signals. In the study, this information was collected using Excel.

In [1]:
#import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob

In [5]:
#count the amounth of group anomalies
def count_anomaly_groups(df):
    """
    Counts the number of groups of consecutive 1s (anomalies) in the 'is_anomaly' column of a DataFrame.

    Args:
    - df(pd.DataFrame): The DataFrame containing the 'is_anomaly' column.

    Returns:
    - int: The count of groups of consecutive 1s.
    """
    group_count = 0
    current_group = 0

    for value in df['is_anomaly']:
        if value == 1:
            current_group += 1
        else:
            if current_group > 1:
                group_count += 1
            current_group = 0

    # Check the last group if the column ends with 1s
    if current_group > 1:
        group_count += 1

    return group_count

In [29]:
def create_summary_NAB(df_info, df, file_name, groups):
     """
     Creates a summary of NAB data and appends it to a DataFrame containing summary information.

     Args:
     - df_info (pd.DataFrame): The DataFrame containing existing summary information.
     - df (pd.DataFrame): The DataFrame containing NAB data.
     - file_name (str): The name of the file or dataset.
     - groups (int): The number of groups of consecutive anomalies.

     Returns:
     - pd.DataFrame: The updated DataFrame containing the summary information.
     """

     length = len(df)
     num_anomalies = sum(df.is_anomaly)

     df_info = df_info.append({'File Name': file_name,
                              'Length': length,
                              'Amount of Anomaly Points': num_anomalies,
                              'Amount of Anomaly Groups': groups},
                              ignore_index=True)
     return df_info

In [25]:
#become file lists
csv_files = glob.glob("NAB_new/*.csv")
file_names = [os.path.splitext(os.path.basename(file))[0] for file in csv_files]
file_names

['ec2_disk_write_bytes_c0d644_anomaly',
 'ec2_network_in_5abac7_anomaly',
 'Twitter_volume_IBM_anomaly',
 'rds_cpu_utilization_cc0c53_anomaly',
 'TravelTime_387_anomaly',
 'TravelTime_451_anomaly',
 'speed_t4013_anomaly',
 'ec2_cpu_utilization_fe7f93_anomaly',
 'art_load_balancer_spikes_anomaly',
 'Twitter_volume_AAPL_anomaly',
 'exchange-2_cpc_results_anomaly',
 'Twitter_volume_AMZN_anomaly',
 'Twitter_volume_FB_anomaly',
 'ec2_cpu_utilization_825cc2_anomaly',
 'rogue_agent_key_updown_anomaly',
 'art_daily_jumpsdown_anomaly',
 'grok_asg_anomaly_anomaly',
 'ec2_network_in_257a54_anomaly',
 'exchange-2_cpm_results_anomaly',
 'ambient_temperature_system_failure_anomaly',
 'art_increase_spike_density_anomaly',
 'elb_request_count_8c0756_anomaly',
 'ec2_cpu_utilization_5f5533_anomaly',
 'ec2_request_latency_system_failure_anomaly',
 'cpu_utilization_asg_misconfiguration_anomaly',
 'Twitter_volume_UPS_anomaly',
 'ec2_cpu_utilization_53ea38_anomaly',
 'Twitter_volume_CVS_anomaly',
 'exchange

In [30]:
#create a DataFrame for summary
df_summary = pd.DataFrame(columns=['File Name', 'Length', 'Amount of Anomaly Points', 'Amount of Anomaly Groups'])

In [31]:
#add information to df_summary
for file_name in file_names:
        df = pd.read_csv(f'NAB_new/{file_name}.csv')
        group = count_anomaly_groups(df)
        df_summary = create_summary_NAB(df_summary, df, file_name, group)


  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = df_info.append({'File Name': file_name,
  df_info = 

In [33]:
#create a .csv file with summary
df_summary.to_csv('Summary/anomaly_summary_NAB.csv',index=False)

In [23]:
summary_dfs = []  # Initialize a list to store DataFrames

for file_name in file_names:
    df = pd.read_csv(f'NAB_new/{file_name}.csv')
    group = count_anomaly_groups(df)
    summary_df = create_summary_NAB(df_info.copy(), df, file_name, group)  # Create a new DataFrame for each iteration
    summary_dfs.append(summary_df)

# Concatenate all the summary DataFrames into one
df_info = pd.concat(summary_dfs, ignore_index=True)


  df_info = df_info.append({'File Name': file_name,


InvalidIndexError: Reindexing only valid with uniquely valued Index objects