In [2]:
import os
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Functions for anomaly detection and evaluation
def sliding_window_std(df, column_name, window_size, threshold, start, end):
    results = []
    problem_vals = []  # List to store values that exceed the threshold

    for i in range(start,end):
        # Calculate the window boundaries
        start_index = max(0, i - window_size // 2)
        end_index = min(len(df), i + window_size // 2 + 1)
        
        # Get the numbers within the window
        window = df[column_name].iloc[start_index:end_index]
        
        # Calculate the standard deviation of the window
        std_dev = window.std()
        
        # Append the result to the results list
        # result list conations 3 things (number, window,std of the window)
        results.append((i, df.iloc[i][column_name], window.tolist(), std_dev))
        # print(results[i],'\n')

        if len(results) <= 2:
            # print("Skipping iteration#", i)
            # print("##################################################")
            continue
            
        if len(results) >= 3:
            # print("val1:",results[-3][3])
            # print("val2:",results[-2][3])
            # print("current std:",std_dev)
            # print("current iteration:", i)
            # print("##################################################")

            avg_prev_results = (results[-2][3] + results[-3][3]) / 2
            threshold_value = avg_prev_results + threshold

            if std_dev > threshold_value:
                problem_vals.append((i - window_size // 2, window.iloc[-1]))

            # print('Problem list:', problem_vals)
    return results, problem_vals

def actual_anomaly(df):
    anomaly_rows = []

    # Iterate over each row in the DataFrame
    for i, row in df.iterrows():
        if row['anomaly'] == 1:
            # Append a tuple containing the row number and the value from the 'anomaly' column to the result list
            anomaly_rows.append((i, row['value']))
    # print(anomaly_rows)
    return anomaly_rows

def find_normal_values(original_lst, anomaly_lst):
    normal_values = []
    for num in original_lst:
        is_anomaly = False
        for win_num, value in anomaly_lst:
            if num == value:
                is_anomaly = True
                break
        if not is_anomaly:
            normal_values.append(num)
    return normal_values

def finding_row_number(original_lst, anomaly_lst):
    rows = []
    i = 1
    for num in original_lst:
        i +=1
        for win_num, value in anomaly_lst:
            if num == value:
                rows.append((i,num))
    return rows

def categorize_points(original_lst, anomaly_lst):
    normal = []
    seasonal = []
    anomalies = []

    for i, num in enumerate(original_lst):
        if num not in (value for _, value in anomaly_lst):
            normal.append((i, num))
        elif i % 12 == 0 or i % 168 ==0:
            seasonal.append((i, num))
        else:
            anomalies.append((i, num))

    return normal, seasonal, anomalies

def evaluate_anomaly_detection(true_labels, predicted_labels):
    # Generate confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    # Extract true positives (TP) and true negatives (TN)
    TP = cm[1, 1]  # Actual positive (1) and predicted positive (1)
    TN = cm[0, 0]  # Actual negative (0) and predicted negative (0)
    FP = cm[0, 1]  # Actual negative (0) but predicted positive (1)
    FN = cm[1, 0]  # Actual positive (1) but predicted negative (0)

    # # Output true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN)
    # print("True Positives (TP):", TP)
    # print("True Negatives (TN):", TN)
    # print("False Positives (FP):", FP)
    # print("False Negatives (FN):", FN)

    # # Generate classification report
    # report = classification_report(true_labels, predicted_labels)
    # print("\nClassification Report:\n", report)

    # Calculate Accuracy
    accuracy = (TP + TN) / (TP + TN + FP + FN)

    # Calculate Precision
    precision = TP / (TP + FP)

    # Calculate Recall
    recall = TP / (TP + FN)

    # Calculate F1-score
    f1_score = 2 * (precision * recall) / (precision + recall)

    # print("Accuracy:", accuracy)
    # print("Precision:", precision)
    # print("Recall:", recall)
    # print("F1-score:", f1_score)

    return accuracy

df = pd.read_csv("frontend_cpu_data.csv")

# Specify the column name containing the data
column_name = 'Value'

window_size = 5
threshold = 4  # Adjust threshold as needed

start = 0
end = 20

for i in range(5):
    # df = df[start:end]
    
    std_devs, anomalies = sliding_window_std(df, column_name, window_size, threshold,start ,end)
    
    # actual = actual_anomaly(df)
    # normal_values = find_normal_values(df[column_name], anomalies)
    
    anomaly_row_list = finding_row_number(df[column_name], anomalies)
    normal_list, seasonal_anomalies, other_anomalies = categorize_points(df[column_name], anomaly_row_list)
    
    # Initialize and mark predicted anomalies
    df['predicted_anomalies'] = 0
    for anomaly_index, _ in other_anomalies:
        df.at[anomaly_index, 'predicted_anomalies'] = 1

    pd.set_option('display.max_rows', None)
    print("this is the df:\n", df)
    start+=end
    end+=end
    print("This is start point:", start)
    print("This is end point:", end)

   
# std_devs, anomalies = sliding_window_std(df, column_name, window_size, threshold)

# # actual = actual_anomaly(df)
# # normal_values = find_normal_values(df[column_name], anomalies)

# anomaly_row_list = finding_row_number(df[column_name], anomalies)
# normal_list, seasonal_anomalies, other_anomalies = categorize_points(df[column_name], anomaly_row_list)

# # Initialize and mark predicted anomalies
# df['predicted_anomalies'] = 0
# for anomaly_index, _ in other_anomalies:
#     df.at[anomaly_index, 'predicted_anomalies'] = 1
    
# # Evaluate anomaly detection and return accuracy
# accuracy = evaluate_anomaly_detection(df['anomaly'], df['predicted_anomalies'])
# print(accuracy)

this is the df:
                Timestamp     Value  anomaly  predicted_anomalies
0    2024-06-05 12:01:57  29.57400        0                    0
1    2024-06-05 12:02:02  23.42330        0                    0
2    2024-06-05 12:02:07  20.96727        0                    0
3    2024-06-05 12:02:12  29.47260        0                    0
4    2024-06-05 12:02:17  29.47484        0                    0
5    2024-06-05 12:02:22  29.47484        0                    0
6    2024-06-05 12:02:27  29.54237        0                    0
7    2024-06-05 12:02:32  29.54237        0                    0
8    2024-06-05 12:02:37  23.40051        0                    0
9    2024-06-05 12:02:42  29.50646        0                    0
10   2024-06-05 12:02:47  25.15984        0                    0
11   2024-06-05 12:02:52  22.69858        0                    0
12   2024-06-05 12:02:57  25.59037        0                    0
13   2024-06-05 12:03:02  23.12026        0                    0
14   202

IndexError: single positional indexer is out-of-bounds

In [31]:
std_devs

[(0, 29.574, [29.574, 23.4233, 20.96727], 4.433565065568943),
 (1, 23.4233, [29.574, 23.4233, 20.96727, 29.4726], 4.348217508384902),
 (2,
  20.96727,
  [29.574, 23.4233, 20.96727, 29.4726, 29.47484],
  4.098131692627752),
 (3,
  29.4726,
  [23.4233, 20.96727, 29.4726, 29.47484, 29.47484],
  4.080236081699195),
 (4,
  29.47484,
  [20.96727, 29.4726, 29.47484, 29.47484, 29.54237],
  3.812115367604974),
 (5,
  29.47484,
  [29.4726, 29.47484, 29.47484, 29.54237, 29.54237],
  0.037407849844650516),
 (6,
  29.54237,
  [29.47484, 29.47484, 29.54237, 29.54237, 23.40051],
  2.731831799549525),
 (7,
  29.54237,
  [29.47484, 29.54237, 29.54237, 23.40051, 29.50646],
  2.735303441603874),
 (8,
  23.40051,
  [29.54237, 29.54237, 23.40051, 29.50646, 25.15984],
  2.9422064751900057),
 (9,
  29.50646,
  [29.54237, 23.40051, 29.50646, 25.15984, 22.69858],
  3.285852051107901),
 (10,
  25.15984,
  [23.40051, 29.50646, 25.15984, 22.69858, 25.59037],
  2.6537498250155385),
 (11,
  22.69858,
  [29.50646, 2

In [32]:
anomalies

[(82, 15.44513), (83, 15.44513), (120, 12.01868), (121, 19.28995)]

In [33]:
pd.set_option('display.max_rows', None)
print(df)

               Timestamp     Value  anomaly  predicted_anomalies
0    2024-06-05 12:01:57  29.57400        0                    0
1    2024-06-05 12:02:02  23.42330        0                    0
2    2024-06-05 12:02:07  20.96727        0                    0
3    2024-06-05 12:02:12  29.47260        0                    0
4    2024-06-05 12:02:17  29.47484        0                    0
5    2024-06-05 12:02:22  29.47484        0                    0
6    2024-06-05 12:02:27  29.54237        0                    0
7    2024-06-05 12:02:32  29.54237        0                    0
8    2024-06-05 12:02:37  23.40051        0                    0
9    2024-06-05 12:02:42  29.50646        0                    0
10   2024-06-05 12:02:47  25.15984        0                    0
11   2024-06-05 12:02:52  22.69858        0                    0
12   2024-06-05 12:02:57  25.59037        0                    0
13   2024-06-05 12:03:02  23.12026        0                    0
14   2024-06-05 12:03:07 

In [29]:
for i in range(len(df)):
    print("This is start point:", start)
    print("This is end point:", end)
    df = df.iloc[start:end]
    
    std_devs, anomalies = sliding_window_std(df, column_name, window_size, threshold)
    
    # actual = actual_anomaly(df)
    # normal_values = find_normal_values(df[column_name], anomalies)
    
    anomaly_row_list = finding_row_number(df[column_name], anomalies)
    normal_list, seasonal_anomalies, other_anomalies = categorize_points(df[column_name], anomaly_row_list)
    
    # Initialize and mark predicted anomalies
    df['predicted_anomalies'] = 0
    for anomaly_index, _ in other_anomalies:
        df.at[anomaly_index, 'predicted_anomalies'] = 1

    pd.set_option('display.max_rows', None)
    print("this is the df:\n", df)

    # Add new entries to the dataframe
    df = pd.concat([df, new_data])
    
    # Update new_data for future iterations (optional)
    new_data = new_data.iloc[window_size:]  # Remove processed entries from new_data

    start+=end
    end+=end    
    
# # Evaluate anomaly detection and return accuracy
# accuracy = evaluate_anomaly_detection(df['anomaly'], df['predicted_anomalies'])
# print(accuracy)

This is start point: 0
This is end point: 20
this is the df:
               Timestamp     Value  anomaly  predicted_anomalies
0   2024-06-05 12:01:57  29.57400        0                    0
1   2024-06-05 12:02:02  23.42330        0                    0
2   2024-06-05 12:02:07  20.96727        0                    0
3   2024-06-05 12:02:12  29.47260        0                    0
4   2024-06-05 12:02:17  29.47484        0                    0
5   2024-06-05 12:02:22  29.47484        0                    0
6   2024-06-05 12:02:27  29.54237        0                    0
7   2024-06-05 12:02:32  29.54237        0                    0
8   2024-06-05 12:02:37  23.40051        0                    0
9   2024-06-05 12:02:42  29.50646        0                    0
10  2024-06-05 12:02:47  25.15984        0                    0
11  2024-06-05 12:02:52  22.69858        0                    0
12  2024-06-05 12:02:57  25.59037        0                    0
13  2024-06-05 12:03:02  23.12026        0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_anomalies'] = 0


In [39]:
df.iloc[0:2]

Unnamed: 0,Timestamp,Value,anomaly,predicted_anomalies
0,2024-06-05 12:01:57,29.574,0,0
1,2024-06-05 12:02:02,23.4233,0,0
