In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def correlation_based(df, streams, start_date, end_date, threshold=None, verbose=True):
    """
    Flag streams whose average correlation with the other streams is unusually low.

    For every stream, the pairwise correlations with all *other* selected streams
    (inside the requested time window) are averaged. A stream is flagged as an
    outlier when that average is strictly lower than the threshold.

    Parameters:
    - df: DataFrame whose index supports label slicing with start_date/end_date
      (e.g. the 'created_at' timestamp column already set as the index).
    - streams: Column names (streams) to analyze. Repeated names are ignored;
      at least 3 distinct streams are required.
    - start_date: Start of the window (str or datetime), passed to df.loc slicing.
    - end_date: End of the window (str or datetime), passed to df.loc slicing.
    - threshold: Threshold to determine an outlier. If not provided, the value
      (mean - std) of the average correlations is used.
    - verbose: When True (default) the per-stream averages, the threshold and the
      suspected outliers are printed. Pass False to use the function silently.

    Returns:
    - A dictionary keyed by stream name (in first-seen order). Each value is a dict
      with 'avg_corr' (Python float) and 'is_outlier' (Python bool).

    Raises:
    - ValueError: if fewer than 3 distinct streams are given, or if the selected
      window contains fewer than 2 rows.

    Notes:
    - A stream whose average correlation is NaN is never flagged, because the
      comparison NaN < threshold is False. (Presumably this happens for a stream
      that is constant inside the window -- confirm against pandas' corr().)
    """

    # Drop repeated names but keep first-seen order. With duplicates the
    # correlation matrix gets repeated labels, so selecting one stream's row is
    # no longer a Series and the "drop self-correlation" step stops working.
    streams = list(dict.fromkeys(streams))

    # Check the number of (distinct) streams
    if len(streams) < 3:
        raise ValueError("At least 3 streams are required to analyze outliers.")

    # Filter the data for the given time period
    df_period = df.loc[start_date:end_date, streams]

    # With fewer than 2 rows every correlation is NaN, and every stream would be
    # silently reported as "not an outlier". Fail loudly instead.
    if len(df_period) < 2:
        raise ValueError(
            "At least 2 rows are required between start_date and end_date "
            "to compute correlations."
        )

    # Calculate the correlation matrix between the streams
    corr_matrix = df_period.corr()

    # Average correlation of each stream with the others; the stream's own
    # entry (self-correlation) is dropped before averaging.
    avg_corr_series = pd.Series(
        {stream: corr_matrix.loc[stream, streams].drop(stream).mean() for stream in streams}
    )

    # If no threshold is provided, use mean - std of the average correlations
    if threshold is None:
        threshold = avg_corr_series.mean() - avg_corr_series.std()

    # Streams with average correlation lower than the threshold (suspected anomaly)
    outlier_mask = avg_corr_series < threshold

    if verbose:
        print("Average correlation of each stream:")
        print(avg_corr_series)
        print("\nOutlier threshold:", threshold)
        print("\nSuspected outlier streams:")
        print(avg_corr_series[outlier_mask])

    # Convert NumPy scalars to native Python types so the flag really is
    # True/False (identity checks and JSON serialization both work).
    return {
        stream: {
            "avg_corr": float(avg_corr_series[stream]),
            "is_outlier": bool(outlier_mask[stream]),
        }
        for stream in streams
    }

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from algorithm import correlation_based

# --- Load and prepare the sample data ---
# The CSV is assumed to have the following structure:
# created_at,entry_id,field1,field2,field3,field4,field5,field6,field7,field8
# Rows are ordered by timestamp, the timestamp becomes the index, and gaps
# are filled by interpolation.
df = (
    pd.read_csv('datasets/2881821.csv', parse_dates=['created_at'])
    .sort_values(by='created_at')
    .set_index('created_at')
    .interpolate()
)

# --- Example usage of the correlation_based function ---
# Time window to analyze
start_date = '2025-03-18 06:54:00'
end_date = '2025-03-18 06:58:00'
# Choose at least 3 streams for analysis, here: field1, field2, field3, field6
streams_to_check = ['field1', 'field2', 'field3', 'field6']

results = correlation_based(df, streams_to_check, start_date, end_date)

print("\nOutlier detection results:")
print(results)


Average correlation of each stream:
field1    0.255689
field2    0.195153
field3    0.478965
field6    0.077404
dtype: float64

Outlier threshold: 0.08323831743258805

Suspected outlier streams:
field6    0.077404
dtype: float64

Outlier detection results:
{'field1': {'avg_corr': 0.25568939693046017, 'is_outlier': False}, 'field2': {'avg_corr': 0.19515307026170856, 'is_outlier': False}, 'field3': {'avg_corr': 0.4789650787495647, 'is_outlier': False}, 'field6': {'avg_corr': 0.07740383050347768, 'is_outlier': True}}
