Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit. Hold shift + click to select a range.
8cbed6e
Updated outlier explainer to support adding text from external source…
YuvalUner Mar 25, 2025
91c538a
Fixed division by zero error.
YuvalUner Mar 30, 2025
a18ae18
Added WIP MetaInsight explainer (migrated from pd-explain).
YuvalUner May 5, 2025
3623479
Continued progress on MetaInsight explainer. Added most required func…
YuvalUner May 6, 2025
1d94cb4
Finished adding the MetaInsight mining process, and it seems to be ab…
YuvalUner May 6, 2025
c2a7324
Changed impact computing to a universal one, fixed some bugs.
YuvalUner May 9, 2025
5e38db6
Changed unimodality evaluation to a better fitting one.
YuvalUner May 9, 2025
39c6e1d
Changed all evaluation methods to more sound ones, added more caching.
YuvalUner May 10, 2025
82df133
Fixed bugs in some pattern evaluation, added visualizations for trend…
YuvalUner May 10, 2025
22af013
Added plotting of all currently supported patterns. Added binning whe…
YuvalUner May 12, 2025
c23a220
Optimized runtimes - with expected use-cases, code now takes about 1-…
YuvalUner May 12, 2025
45f1d7b
Added visualization of meta-insights as a whole (also for groups of m…
YuvalUner May 13, 2025
f274073
Fixed issue where NaN values were kept prior to pattern evaluation (c…
YuvalUner May 13, 2025
c202464
Visualization updates.
YuvalUner May 14, 2025
2795279
Completely redid visualization for more comprehensible visualization.…
YuvalUner May 18, 2025
d78fb7a
Improved visualizations to the point they are now almost entirely coh…
YuvalUner May 20, 2025
d42b9fc
Metainsight visualizations possibly done
YuvalUner May 20, 2025
5cf1837
Fixed bugs in computations and visualizations, added more caching for…
YuvalUner May 22, 2025
0727964
Changed so NONE patterns are included and taken into account, changed…
YuvalUner May 23, 2025
18dcd49
Fixed spacing and location issues in visualizations, fixed bugs in co…
YuvalUner May 27, 2025
cf74ff0
Added __str__ method to MetaInsight class.
YuvalUner May 31, 2025
31c55fa
Changed caching to work globally using a singleton LRU cache class.
YuvalUner Jun 2, 2025
44693c2
Added string representation function for exceptions in meta insights.
YuvalUner Jun 8, 2025
4ed7772
Fixed issue causing visualization to sometimes fail with multi-index …
YuvalUner Jun 9, 2025
62c8a76
Added support for MetaInsights to add text to the figure, allowing fo…
YuvalUner Jun 24, 2025
01c071e
Changed visualization of trend pattern to use the mean over the distr…
YuvalUner Jun 26, 2025
55f89ca
Fixed bug causing a crash when the subspace value was not a string.
YuvalUner Jun 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
numpy~=2.1.3
pandas~=2.2.3
matplotlib~=3.9.2
matplotlib~=3.9.2
diptest
scipy
scikit-learn
pymannkendall
cydets
Empty file.
118 changes: 118 additions & 0 deletions src/external_explainers/metainsight_explainer/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from collections import OrderedDict
from typing import Any

from singleton_decorator import singleton

# Upper bounds on the number of entries in each LRU cache; once a cache
# grows past its bound, the least recently used entry is evicted.
PATTERN_CACHE_MAX_SIZE = 40000
DATASCOPE_CACHE_MAX_SIZE = 40000
PATTERN_EVAL_CACHE_MAX_SIZE = 40000
GROUPBY_CACHE_MAX_SIZE = 5000

@singleton
class Cache:
    """
    A singleton class to hold various caches used in the MetaInsight explainer.
    This helps in avoiding redundant computations and speeds up the evaluation process.
    We use a singleton pattern to make the cache:
    1. Global across the application.
    2. Persistent throughout the lifetime of the application.
    This cache is a simple LRU (Least Recently Used) cache implementation, removing the least recently used items when the cache exceeds its maximum size.
    The caches in this class are:
    - pattern_cache: Stores the data pattern objects evaluated for different data scopes and patterns.
    - datascope_cache: Stores the scores for different data scopes.
    - groupby_cache: Stores the results of groupby operations.
    - pattern_eval_cache: Stores the results of pattern evaluations on series.
    """

    def __init__(self):
        # OrderedDict provides the two O(1) primitives an LRU needs:
        # move_to_end() on access and popitem(last=False) for eviction.
        self._pattern_cache = OrderedDict()
        self._datascope_cache = OrderedDict()
        self._groupby_cache = OrderedDict()
        self._pattern_eval_cache = OrderedDict()
        # Per-cache size limits. These instance attributes are the single
        # source of truth for the limits used by the add_to_* methods.
        self.pattern_cache_max_size = PATTERN_CACHE_MAX_SIZE
        self.datascope_cache_max_size = DATASCOPE_CACHE_MAX_SIZE
        self.groupby_cache_max_size = GROUPBY_CACHE_MAX_SIZE
        self.pattern_eval_cache_max_size = PATTERN_EVAL_CACHE_MAX_SIZE

    def _add_to_cache(self, cache: OrderedDict, key, value, max_size: int) -> None:
        """
        Adds a key-value pair to the specified cache.
        If the cache exceeds its maximum size, it removes the least recently used item.

        :param cache: The OrderedDict backing the cache.
        :param key: Hashable cache key.
        :param value: Value to store under the key.
        :param max_size: Maximum number of entries allowed in this cache.
        """
        if key in cache:
            # Update the value and mark as recently used
            cache.move_to_end(key)
        cache[key] = value
        if len(cache) > max_size:
            # Pop the first item (least recently used)
            cache.popitem(last=False)

    def _get_from_cache(self, cache: OrderedDict, key) -> Any:
        """
        Retrieves a value from the specified cache by key.
        If the key exists, it marks the key as recently used.

        :param cache: The OrderedDict backing the cache.
        :param key: Hashable cache key.
        :return: The cached value, or None if the key is absent.
        """
        if key in cache:
            # Move the accessed item to the end to mark it as recently used
            cache.move_to_end(key)
            return cache[key]
        return None

    def add_to_pattern_cache(self, key, value) -> None:
        """
        Adds a key-value pair to the pattern cache.
        If the cache exceeds its maximum size, it removes the least recently used item.
        """
        # Use the instance limit set in __init__ rather than the module
        # constant, so the attribute is actually honored if tuned.
        self._add_to_cache(self._pattern_cache, key, value, self.pattern_cache_max_size)

    def add_to_datascope_cache(self, key, value) -> None:
        """
        Adds a key-value pair to the datascope cache.
        If the cache exceeds its maximum size, it removes the least recently used item.
        """
        self._add_to_cache(self._datascope_cache, key, value, self.datascope_cache_max_size)

    def add_to_groupby_cache(self, key, value) -> None:
        """
        Adds a key-value pair to the groupby cache.
        If the cache exceeds its maximum size, it removes the least recently used item.
        """
        self._add_to_cache(self._groupby_cache, key, value, self.groupby_cache_max_size)

    def add_to_pattern_eval_cache(self, key, value) -> None:
        """
        Adds a key-value pair to the pattern evaluation cache.
        If the cache exceeds its maximum size, it removes the least recently used item.
        """
        self._add_to_cache(self._pattern_eval_cache, key, value, self.pattern_eval_cache_max_size)

    def get_from_pattern_cache(self, key) -> Any:
        """
        Retrieves a value from the pattern cache by key.
        If the key exists, it marks the key as recently used.
        """
        return self._get_from_cache(self._pattern_cache, key)

    def get_from_datascope_cache(self, key) -> Any:
        """
        Retrieves a value from the datascope cache by key.
        If the key exists, it marks the key as recently used.
        """
        return self._get_from_cache(self._datascope_cache, key)

    def get_from_groupby_cache(self, key) -> Any:
        """
        Retrieves a value from the groupby cache by key.
        If the key exists, it marks the key as recently used.
        """
        return self._get_from_cache(self._groupby_cache, key)

    def get_from_pattern_eval_cache(self, key) -> Any:
        """
        Retrieves a value from the pattern evaluation cache by key.
        If the key exists, it marks the key as recently used.
        """
        return self._get_from_cache(self._pattern_eval_cache, key)
226 changes: 226 additions & 0 deletions src/external_explainers/metainsight_explainer/data_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
import typing

import pandas as pd
from typing import Dict, List, Tuple

from external_explainers.metainsight_explainer.data_scope import DataScope, HomogenousDataScope
from external_explainers.metainsight_explainer.pattern_evaluations import PatternEvaluator, PatternType
from external_explainers.metainsight_explainer.patterns import PatternInterface
from external_explainers.metainsight_explainer.cache import Cache


class BasicDataPattern:
    """
    A data pattern, as defined in the MetaInsight paper.
    Contains 3 elements: data scope, type (interpretation type) and highlight.
    """
    # Shared singleton cache so evaluated patterns are reused across instances.
    cache = Cache()

    def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: PatternInterface | None):
        """
        Initialize the BasicDataPattern with the provided data scope, type and highlight.

        :param data_scope: The data scope of the pattern. A DataScope object.
        :param pattern_type: The pattern type, as a PatternType enum member
            (e.g. Unimodality, Trend, Other Pattern, No Pattern).
        :param highlight: Depends on type, e.g., ('April', 'Valley') for Unimodality.
            May be None (e.g. for a 'No Pattern' result).
        """
        self.data_scope = data_scope
        self.pattern_type = pattern_type
        self.highlight = highlight
        # Lazily computed and memoized in __hash__, since hashing the data
        # scope can be relatively expensive.
        self.hash = None

    def __eq__(self, other):
        if not isinstance(other, BasicDataPattern):
            return False
        return self.pattern_type == other.pattern_type and \
               self.highlight == other.highlight and \
               self.data_scope == other.data_scope

    def sim(self, other) -> bool:
        """
        Computes the similarity between two BasicDataPattern objects.
        They are similar if they have the same pattern type and highlight, as well as neither having
        a pattern type of NONE or OTHER.

        :param other: The other BasicDataPattern object to compare with.
        :return: True if similar, False otherwise.
        """
        if not isinstance(other, BasicDataPattern):
            return False
        # There is no REAL need to check that both don't have NONE or OTHER pattern types, since if one
        # has it but the other doesn't, the equality will be false anyway. If they both have it, then
        # the equality conditions will be true but the inequality conditions will be false.
        return self.pattern_type == other.pattern_type and self.highlight == other.highlight and \
               self.pattern_type != PatternType.NONE and self.pattern_type != PatternType.OTHER

    def __hash__(self):
        if self.hash is not None:
            return self.hash
        self.hash = hash((hash(self.data_scope), self.pattern_type, self.highlight))
        return self.hash

    def __repr__(self):
        return f"BasicDataPattern(ds={self.data_scope}, type='{self.pattern_type}', highlight={self.highlight})"

    @staticmethod
    def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: PatternType) -> List['BasicDataPattern']:
        """
        Evaluates a specific pattern type for the data distribution of a data scope.

        :param data_scope: The data scope to evaluate.
        :param df: The DataFrame containing the data.
        :param pattern_type: The type of the pattern to evaluate.
        :return: A list of BasicDataPattern objects. If the requested pattern is not
            present but another concrete pattern is, that is reported as OTHER;
            if nothing is found (or evaluation is impossible), a single NONE pattern
            is returned.
        """
        # Apply subspace filters
        filtered_df = data_scope.apply_subspace()

        # Group by breakdown dimension and aggregate measure
        if any(dim not in filtered_df.columns for dim in data_scope.breakdown):
            # Cannot group by breakdown if it's not in the filtered data
            return [BasicDataPattern(data_scope, PatternType.NONE, None)]

        measure_col, agg_func = data_scope.measure
        if measure_col not in filtered_df.columns:
            # Cannot aggregate if measure column is not in the data
            return [BasicDataPattern(data_scope, PatternType.NONE, None)]

        try:
            # Perform the aggregation
            if agg_func != "std":
                aggregated_series = filtered_df.groupby(data_scope.breakdown)[measure_col].agg(agg_func)
            else:
                # For standard deviation, we need to use the std function directly
                aggregated_series = filtered_df.groupby(data_scope.breakdown)[measure_col].std(ddof=1)
        except Exception as e:
            # Best-effort: a failed aggregation is reported and treated as
            # "no pattern" rather than crashing the whole mining run.
            print(f"Error during aggregation for {data_scope}: {e}")
            return [BasicDataPattern(data_scope, PatternType.NONE, None)]

        # Ensure series is sortable if breakdown is temporal.
        # BUG FIX: the original check was all([True for dim in ... if <cond>]),
        # which is always True (dims failing <cond> merely shrink the list), so
        # the series was sorted unconditionally. Now we sort only when every
        # breakdown dimension has an orderable dtype kind
        # (int/uint/datetime/timedelta/float/complex).
        if all(df[dim].dtype.kind in 'iuMmfc' for dim in data_scope.breakdown):
            # If the breakdown is temporal or at-least can be sorted, sort the series
            aggregated_series = aggregated_series.sort_index()

        # Evaluate the specific pattern type
        returned_patterns = []
        pattern_evaluator = PatternEvaluator()
        is_valid, highlight = pattern_evaluator(aggregated_series, pattern_type)
        if is_valid:
            # A returned highlight can contain multiple highlights, for example, if a peak and a valley are found
            # in the same series.
            for hl in highlight:
                returned_patterns.append(BasicDataPattern(data_scope, pattern_type, hl))
        else:
            # The requested type was not found; check whether any other concrete
            # pattern type fits, and if so record it with the OTHER type.
            for other_type in PatternType:
                if other_type == PatternType.OTHER or other_type == PatternType.NONE:
                    continue
                if other_type != pattern_type:
                    other_is_valid, highlight = pattern_evaluator(aggregated_series, other_type)
                    if other_is_valid:
                        for hl in highlight:
                            returned_patterns.append(BasicDataPattern(data_scope, PatternType.OTHER, hl))

        if len(returned_patterns) == 0:
            # If no pattern is found, return a 'No Pattern' type
            return [BasicDataPattern(data_scope, PatternType.NONE, None)]

        return returned_patterns

    def create_hdp(self, pattern_type: PatternType,
                   hds: List[DataScope] = None, group_by_dims: List[List[str]] = None,
                   measures: List[Tuple[str, str]] = None, n_bins: int = 10,
                   extend_by_measure: bool = False, extend_by_breakdown: bool = False) -> 'HomogenousDataPattern':
        """
        Generates a Homogenous Data Pattern (HDP) either from a given HDS or from the current DataScope.

        :param pattern_type: The type of the pattern (e.g., 'Unimodality', 'Trend', etc.), provided as a PatternType enum.
        :param hds: A list of DataScopes to create the HDP from. If None, it will be created from the current DataScope.
        :param group_by_dims: The temporal dimensions to extend the breakdown with. Expected as a list of lists of strings.
        :param measures: The measures to extend the measure with. Expected as a list of
            (measure_column, aggregate_function) tuples. Only needed if hds is None.
        :param n_bins: The number of bins to use for numeric columns. Defaults to 10.
        :param extend_by_measure: Whether to extend the hds by measure. Defaults to False.
        :param extend_by_breakdown: Whether to extend the hds by breakdown. Defaults to False.
        :return: The HomogenousDataPattern object containing the evaluated patterns.
        """
        if hds is None or len(hds) == 0:
            hds = self.data_scope.create_hds(dims=group_by_dims, measures=measures,
                                             n_bins=n_bins, extend_by_measure=extend_by_measure,
                                             extend_by_breakdown=extend_by_breakdown)
        # All the data scopes in the HDS should have the same source_df, and it should be
        # the same as the source_df of the current DataScope (otherwise, this pattern should not be
        # the one producing the HDP with this HDS).
        source_df = self.data_scope.source_df

        # Create the HDP
        hdp = []
        for ds in hds:
            if ds != self.data_scope:
                # Check pattern cache first
                cache_key = (hash(ds), pattern_type)
                cache_result = self.cache.get_from_pattern_cache(cache_key)
                if cache_result is not None:
                    dp = cache_result
                else:
                    # Evaluate the pattern if not in cache, and add to cache
                    dp = self.evaluate_pattern(ds, source_df, pattern_type)
                    self.cache.add_to_pattern_cache(cache_key, dp)

                # Some evaluation functions can return multiple patterns, so it is simpler to just
                # convert it to a list and then treat it as an iterable.
                if not isinstance(dp, typing.Iterable):
                    dp = [dp]

                # Add all patterns, including 'No Pattern', since it is important to know that we had a 'No Pattern'.
                # BUG FIX: the guard used to be `if dp is not None` (the list, always
                # truthy here), which let None elements slip into the HDP.
                for d in dp:
                    if d is not None:
                        hdp.append(d)

        if self.pattern_type != PatternType.NONE:
            # Add the current pattern to the HDP
            hdp.append(self)
        hdp = HomogenousDataPattern(hdp)

        return hdp


class HomogenousDataPattern(HomogenousDataScope):
    """
    A homogenous data pattern.
    A list of data patterns induced by the same pattern type on a homogenous data scope.
    """

    def __init__(self, data_patterns: List[BasicDataPattern]):
        """
        Initialize the HomogenousDataPattern with the provided data patterns.

        :param data_patterns: A non-empty list of BasicDataPattern objects.
        :raises ValueError: If data_patterns is empty or None.
        """
        if not data_patterns:
            raise ValueError("data_patterns cannot be empty.")
        # The underlying homogenous data scope is built from each pattern's scope.
        scopes = [pattern.data_scope for pattern in data_patterns]
        super().__init__(scopes)
        self.data_patterns = data_patterns

    def __iter__(self):
        """Iterate over the contained data patterns."""
        yield from self.data_patterns

    def __len__(self):
        """Return the number of contained data patterns."""
        return len(self.data_patterns)

    def __getitem__(self, item):
        """Index (or slice) into the contained data patterns."""
        return self.data_patterns[item]

    def __repr__(self):
        return f"HomogenousDataPattern(#Patterns={len(self.data_patterns)})"
Loading