In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def correlation_based(df, streams, start_date, end_date, threshold=None, verbose=True):
    """
    Flag streams whose average correlation with the other streams is unusually low.

    For every stream, the pairwise correlations against all *other* selected
    streams (self-correlation excluded) are averaged over the requested time
    window. A stream is flagged when that average is strictly below the threshold.

    Parameters:
    - df: DataFrame whose index supports label slicing with start_date/end_date
      (the 'created_at' column is expected to already be set as the index).
    - streams: List of column names (streams) to analyze (at least 3 streams).
    - start_date: Start time (str or datetime), inclusive label slice bound.
    - end_date: End time (str or datetime), inclusive label slice bound.
    - threshold: Threshold to determine an outlier. If not provided, the value
      (mean - std) of the average correlations is used.
    - verbose: When True (default) print the per-stream averages, the threshold
      and the suspected outliers. Pass False to use the function silently.

    Returns:
    - A dictionary with keys as stream names and values as a dict containing the
      average correlation ('avg_corr', a Python float) and the flag 'is_outlier'
      (a Python bool).

    Raises:
    - ValueError: if fewer than 3 streams are given.
    """

    # Check the number of streams
    if len(streams) < 3:
        raise ValueError("At least 3 streams are required to analyze outliers.")

    # Restrict the data to the requested window and columns
    df_period = df.loc[start_date:end_date, streams]

    # Pairwise correlation matrix between the selected streams
    corr_matrix = df_period.corr()

    # Average correlation of each stream with the others; the stream's own
    # entry (self-correlation, always 1) is dropped before averaging.
    avg_corr_series = pd.Series(
        {stream: corr_matrix.loc[stream, streams].drop(stream).mean() for stream in streams}
    )

    # If no threshold is provided, use mean - std of the average correlations
    if threshold is None:
        threshold = avg_corr_series.mean() - avg_corr_series.std()

    # Computed once and reused for both the report and the returned flags.
    # A NaN average compares False here, so such a stream is never flagged.
    is_outlier = avg_corr_series < threshold

    if verbose:
        print("Average correlation of each stream:")
        print(avg_corr_series)
        print("\nOutlier threshold:", threshold)
        print("\nSuspected outlier streams:")
        print(avg_corr_series[is_outlier])

    # Convert NumPy scalars to built-in types so the result prints cleanly and
    # can be serialized (json.dumps rejects numpy.bool_).
    return {
        stream: {
            "avg_corr": float(avg_corr_series[stream]),
            "is_outlier": bool(is_outlier[stream]),
        }
        for stream in streams
    }

In [24]:
def mean_based(df, streams, start_date, end_date, threshold=None, verbose=True):
    """
    Detect outliers based on the mean value of each stream.

    Each stream is evaluated by its average value over the specified time period.
    A stream is flagged as an outlier when its average is strictly lower than the
    threshold. Only unusually *low* means are flagged; high means are not.

    Note: For consistency with the other detectors, the computed average is
    stored under the key 'avg_corr' even though it is not a correlation.

    Parameters:
    - df: DataFrame whose index supports label slicing with start_date/end_date
      ('created_at' is expected to be the index).
    - streams: List of column names (streams) to analyze (at least 3 streams).
    - start_date: Start time (str or datetime), inclusive label slice bound.
    - end_date: End time (str or datetime), inclusive label slice bound.
    - threshold: Threshold to determine an outlier. If not provided, the value
      (mean - std) of the per-stream averages is used.
    - verbose: When True (default) print the per-stream means, the threshold and
      the suspected outliers. Pass False to use the function silently.

    Returns:
    - A dictionary with keys as stream names and values as a dict containing:
      - avg_corr: The average value of the stream (Python float).
      - is_outlier: True if the stream is detected as an outlier (Python bool).

    Raises:
    - ValueError: if fewer than 3 streams are given.
    """

    if len(streams) < 3:
        raise ValueError("At least 3 streams are required for analysis.")

    # Filter data for the specified time period and streams
    df_period = df.loc[start_date:end_date, streams]

    # Column-wise mean: one value per stream
    avg_values = df_period.mean()

    # If threshold not provided, calculate it as (mean - std) of the average values
    if threshold is None:
        threshold = avg_values.mean() - avg_values.std()

    # Computed once and reused for both the report and the returned flags.
    # A NaN mean compares False here, so such a stream is never flagged.
    is_outlier = avg_values < threshold

    if verbose:
        print("Mean value of each stream:")
        print(avg_values)
        print("\nOutlier threshold:", threshold)
        print("\nSuspected outlier streams (low mean):")
        print(avg_values[is_outlier])

    # Convert NumPy scalars to built-in types so the result prints cleanly and
    # can be serialized (json.dumps rejects numpy.bool_).
    return {
        stream: {
            "avg_corr": float(avg_values[stream]),
            "is_outlier": bool(is_outlier[stream]),
        }
        for stream in streams
    }


In [26]:
def volatility_based(df, streams, start_date, end_date, threshold=None, verbose=True):
    """
    Detect outliers based on the volatility (standard deviation) of each stream.

    The standard deviation of each stream is calculated over the specified period
    and negated, so that a more volatile stream has a *lower* metric. A stream is
    flagged as an outlier when its metric is strictly lower than the threshold,
    i.e. only unusually high volatility is flagged.

    Note: The computed metric is stored under the key 'avg_corr' to keep the
    output format consistent with the other detectors.

    Parameters:
    - df: DataFrame whose index supports label slicing with start_date/end_date
      ('created_at' is expected to be the index).
    - streams: List of column names (streams) to analyze (at least 3 streams).
    - start_date: Start time (str or datetime), inclusive label slice bound.
    - end_date: End time (str or datetime), inclusive label slice bound.
    - threshold: Threshold to determine an outlier, on the negated-std scale.
      If not provided, the value (mean - std) of the metric is used.
    - verbose: When True (default) print the per-stream metric, the threshold and
      the suspected outliers. Pass False to use the function silently.

    Returns:
    - A dictionary with keys as stream names and values as a dict containing:
      - avg_corr: The negative standard deviation of the stream (Python float).
      - is_outlier: True if the stream is detected as an outlier (Python bool).

    Raises:
    - ValueError: if fewer than 3 streams are given.
    """

    if len(streams) < 3:
        raise ValueError("At least 3 streams are required for analysis.")

    # Filter data for the specified time period and streams
    df_period = df.loc[start_date:end_date, streams]

    # Negated column-wise standard deviation: lower value means higher volatility,
    # matching the "lower value indicates anomaly" rule of the other detectors.
    volatility_metric = -df_period.std()

    # If threshold not provided, calculate it as (mean - std) of the volatility metric
    if threshold is None:
        threshold = volatility_metric.mean() - volatility_metric.std()

    # Computed once and reused for both the report and the returned flags.
    # A NaN metric compares False here, so such a stream is never flagged.
    is_outlier = volatility_metric < threshold

    if verbose:
        print("Volatility (std) of each stream (inverted):")
        print(volatility_metric)
        print("\nOutlier threshold:", threshold)
        print("\nSuspected outlier streams (abnormal volatility):")
        print(volatility_metric[is_outlier])

    # Convert NumPy scalars to built-in types so the result prints cleanly and
    # can be serialized (json.dumps rejects numpy.bool_).
    return {
        stream: {
            "avg_corr": float(volatility_metric[stream]),
            "is_outlier": bool(is_outlier[stream]),
        }
        for stream in streams
    }


In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from algorithm import correlation_based

# --- Load and prepare the sensor data ---
# Assumed CSV layout (not verified here):
# created_at,entry_id,field1,field2,field3,field4,field5,field6,field7,field8
# The rows are ordered by timestamp, indexed by 'created_at' so the detectors
# can slice by time, and gaps are filled by interpolation.
df = (
    pd.read_csv('datasets/2881821.csv', parse_dates=['created_at'])
    .sort_values(by='created_at')
    .set_index('created_at')
    .interpolate()
)

# --- Example: volatility-based outlier detection ---
# The detectors require at least 3 streams; four are compared here.
streams_to_check = ['field1', 'field2', 'field3', 'field6']
start_date = '2025-03-18 06:54:00'
end_date = '2025-03-18 06:58:00'

results = volatility_based(df, streams_to_check, start_date, end_date)

print("\nOutlier detection results:")
print(results)


Volatility (std) of each stream (inverted):
field1    -1.709701
field2    -1.231456
field3    -1.010277
field6   -37.640944
dtype: float64

Outlier threshold: -28.562339587121162

Suspected outlier streams (abnormal volatility):
field6   -37.640944
dtype: float64

Outlier detection results:
{'field1': {'avg_corr': -1.7097008285302207, 'is_outlier': False}, 'field2': {'avg_corr': -1.231455852429764, 'is_outlier': False}, 'field3': {'avg_corr': -1.0102768633693147, 'is_outlier': False}, 'field6': {'avg_corr': -37.640943922020064, 'is_outlier': True}}
