From 8cbed6e796e19d0b1b1e556f47452b507576fb64 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 25 Mar 2025 18:42:13 +0200 Subject: [PATCH 01/27] Updated outlier explainer to support adding text from external source and delay drawing the plot. --- .../outlier_explainer/outlier_explainer.py | 102 +++++++++++++----- 1 file changed, 78 insertions(+), 24 deletions(-) diff --git a/src/external_explainers/outlier_explainer/outlier_explainer.py b/src/external_explainers/outlier_explainer/outlier_explainer.py index 0a2dfc1..79061ca 100644 --- a/src/external_explainers/outlier_explainer/outlier_explainer.py +++ b/src/external_explainers/outlier_explainer/outlier_explainer.py @@ -253,8 +253,31 @@ def compute_predicates_per_attribute(self, attr: str, df_in: DataFrame, g_att: s return predicates + + def pred_to_human_readable(self, non_formatted_pred): + explanation = f'This outlier is not as significant when excluding rows with:\n' + for_wizard = '' + for a, bins in non_formatted_pred.items(): + for b in bins: + if type(b[0]) is tuple: + pred = f"{b[0][0]} < {a} < {b[0][1]}" + inter_exp = r'$\bf{{{}}}$'.format(utils.to_valid_latex(pred)) + else: + pred = f"{a}={b[0]}" + inter_exp = r'$\bf{{{}}}$'.format(utils.to_valid_latex(pred)) + if b[1] is not None: + if b[1] <= 5: + inter_exp = inter_exp + '-' + r'$\bf{low}$' + elif b[1] >= 25: + inter_exp = inter_exp + '-' + r'$\bf{high}$' + inter_exp += '\n' + for_wizard += inter_exp + explanation += inter_exp + + return explanation, for_wizard + def draw_bar_plot(self, df_agg: DataFrame | Series, final_df: DataFrame, g_att: str, g_agg: str, final_pred_by_attr: dict, - target: str, agg_title: str) -> None: + target: str, agg_title: str, added_text: dict = None) -> None: """ Draw a bar plot to visualize the influence of predicates on the target attribute. 
@@ -269,10 +292,11 @@ def draw_bar_plot(self, df_agg: DataFrame | Series, final_df: DataFrame, g_att: :param final_pred_by_attr: Dictionary containing the final predicates grouped by attribute. :param target: The target attribute for which the influence is being visualized. :param agg_title: Title for the aggregation method used in the plot. + :param added_text: Additional text to add to the plot. Optional. Expected: dict with 'text' and 'position' keys. :return: None. Displays the bar plot. """ - fig, ax = plt.subplots(layout='constrained', figsize=(5, 5)) + fig, ax = plt.subplots(figsize=(5, 5)) x1 = list(df_agg.index) ind1 = np.arange(len(x1)) y1 = df_agg.values @@ -281,24 +305,7 @@ def draw_bar_plot(self, df_agg: DataFrame | Series, final_df: DataFrame, g_att: ind2 = np.arange(len(x2)) y2 = final_df.values - explanation = f'This outlier is not as significant when excluding rows with:\n' - for_wizard = '' - for a, bins in final_pred_by_attr.items(): - for b in bins: - if type(b[0]) is tuple: - pred = f"{b[0][0]} < {a} < {b[0][1]}" - inter_exp = r'$\bf{{{}}}$'.format(utils.to_valid_latex(pred)) - else: - pred = f"{a}={b[0]}" - inter_exp = r'$\bf{{{}}}$'.format(utils.to_valid_latex(pred)) - if b[1] is not None: - if b[1] <= 5: - inter_exp = inter_exp + '-' + r'$\bf{low}$' - elif b[1] >= 25: - inter_exp = inter_exp + '-' + r'$\bf{high}$' - inter_exp += '\n' - for_wizard += inter_exp - explanation += inter_exp + explanation, for_wizard = self.pred_to_human_readable(final_pred_by_attr) bar1 = ax.bar(ind1 - 0.2, y1, 0.4, alpha=1., label='All') bar2 = ax.bar(ind2 + 0.2, y2, 0.4, alpha=1., label=f'without\n{for_wizard}') @@ -314,11 +321,52 @@ def draw_bar_plot(self, df_agg: DataFrame | Series, final_df: DataFrame, g_att: bar2[x2.index(target)].set_linewidth(2) ax.get_xticklabels()[x1.index(target)].set_color('tab:green') + plt.tight_layout() + + if added_text is not None: + # Draw the plot first to establish the bounding boxes. 
+ plt.draw() + text = added_text['text'] + position = added_text['position'] + renderer = ax.figure.canvas.get_renderer() + max_label_height = 0 + + for label in ax.get_xticklabels() + [ax.xaxis.get_label()]: + bbox = label.get_window_extent(renderer=renderer) + if bbox.height > max_label_height: + max_label_height = bbox.height + + if position == "bottom": + offset_in_points = -(max_label_height + 10) + + ax.annotate( + text, + xy=(0.5, 0), # anchor at the bottom of the axes + xycoords='axes fraction', + xytext=(0, offset_in_points), + textcoords='offset points', + ha='center', va='top', + fontsize=16 + ) + elif position == "top": + offset_in_points = max_label_height + 10 + + ax.annotate( + text, + xy=(0.5, 1), # anchor at the top of the axes + xycoords='axes fraction', + xytext=(0, offset_in_points), + textcoords='offset points', + ha='center', va='bottom', + fontsize=16 + ) + plt.show() def explain(self, df_agg: DataFrame, df_in: DataFrame, g_att: str, g_agg: str, agg_method: str, target: str, - dir: int, control=None, hold_out: List = [], k: int = 1) -> str | None: + dir: int, control=None, hold_out: List = None, k: int = 1, draw_plot: bool = True) \ + -> str | None | Tuple: """ Explain the outlier in the given DataFrame. @@ -340,6 +388,9 @@ def explain(self, df_agg: DataFrame, df_in: DataFrame, g_att: str, g_agg: str, a :return: None. Will generate a plot with the explanation for the outlier. """ + if hold_out is None: + hold_out = [] + # Get the attributes from the input DataFrame and remove the hold-out attributes. attrs = df_in.columns attrs = [a for a in attrs if a not in hold_out + [g_att, g_agg]] @@ -390,6 +441,9 @@ def explain(self, df_agg: DataFrame, df_in: DataFrame, g_att: str, g_agg: str, a final_pred_by_attr[a] = [] final_pred_by_attr[a].append((i, rank)) - # Create a plot to display the explanation for the outlier. 
- self.draw_bar_plot(df_agg, final_df, g_att, g_agg, final_pred_by_attr, target, agg_title) - return None + # Create a plot to display the explanation for the outlier, or return everything needed to draw the plot later. + if draw_plot: + self.draw_bar_plot(df_agg, final_df, g_att, g_agg, final_pred_by_attr, target, agg_title) + return None + else: + return df_agg, final_df, g_att, g_agg, final_pred_by_attr, target, agg_title From 91c538a065124be3e348f3e9d15fd2924fb369b4 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Sun, 30 Mar 2025 20:11:16 +0300 Subject: [PATCH 02/27] Fixed division by zero error. --- .../outlier_explainer/outlier_explainer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/external_explainers/outlier_explainer/outlier_explainer.py b/src/external_explainers/outlier_explainer/outlier_explainer.py index 79061ca..3e7cf91 100644 --- a/src/external_explainers/outlier_explainer/outlier_explainer.py +++ b/src/external_explainers/outlier_explainer/outlier_explainer.py @@ -66,7 +66,11 @@ def calc_influence_pred(self, df_before: DataFrame, df_after: DataFrame, target: try: # Compute target influence - the ratio between the change in the output and the number of # tuples that satisfy the predicate, multiplied by the direction factor. - target_inf = ((df_before[target] - df_after[target]) * dir) / (df_before[target] + df_after[target]) + denominator = df_before[target] + df_after[target] + # We may have a try catch here, but division by zero is still causing a runtime warning. + if denominator == 0: + return -1 + target_inf = ((df_before[target] - df_after[target]) * dir) / denominator except: return -1 From a18ae18fde2c72147b7c59fa14094ae1bb85220f Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Mon, 5 May 2025 23:08:43 +0300 Subject: [PATCH 03/27] Added WIP MetaInsight explainer (migrated from pd-explain). 
--- requirements.txt | 5 +- .../metainsight_explainer/__init__.py | 0 .../commoness_and_exceptions.py | 34 ++++ .../metainsight_explainer/data_pattern.py | 189 ++++++++++++++++++ .../metainsight_explainer/data_scope.py | 147 ++++++++++++++ .../metainsight_explainer/meta_insight.py | 141 +++++++++++++ .../pattern_evaluations.py | 152 ++++++++++++++ 7 files changed, 667 insertions(+), 1 deletion(-) create mode 100644 src/external_explainers/metainsight_explainer/__init__.py create mode 100644 src/external_explainers/metainsight_explainer/commoness_and_exceptions.py create mode 100644 src/external_explainers/metainsight_explainer/data_pattern.py create mode 100644 src/external_explainers/metainsight_explainer/data_scope.py create mode 100644 src/external_explainers/metainsight_explainer/meta_insight.py create mode 100644 src/external_explainers/metainsight_explainer/pattern_evaluations.py diff --git a/requirements.txt b/requirements.txt index 62768ef..b87196b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ numpy~=2.1.3 pandas~=2.2.3 -matplotlib~=3.9.2 \ No newline at end of file +matplotlib~=3.9.2 +diptest +scipy +scikit-learn \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/__init__.py b/src/external_explainers/metainsight_explainer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/external_explainers/metainsight_explainer/commoness_and_exceptions.py b/src/external_explainers/metainsight_explainer/commoness_and_exceptions.py new file mode 100644 index 0000000..f4ec6c8 --- /dev/null +++ b/src/external_explainers/metainsight_explainer/commoness_and_exceptions.py @@ -0,0 +1,34 @@ +from collections import defaultdict + +EXCEPTION_CATEGORY_COUNT = 3 + +def categorize_exceptions(commonness_set, exceptions): + """ + Categorizes exceptions based on differences from commonness highlights/types. 
+ Simplified categorization: Highlight-Change, Type-Change, No-Pattern (though No-Pattern + should ideally not be in the exceptions list generated by generate_hdp). + Returns a dictionary mapping category names to lists of exception patterns. + """ + categorized = defaultdict(list) + commonness_highlights = set() + for commonness in commonness_set: + if commonness: # Ensure commonness is not empty + commonness_highlights.add(commonness[0].highlight) # Assume all in commonness have same highlight + + for exc_dp in exceptions: + if exc_dp.type == 'Other Pattern': + categorized['Type-Change'].append(exc_dp) + elif exc_dp.type == 'No Pattern': + # This case should ideally not happen if generate_hdp filters 'No Pattern' + categorized['No-Pattern'].append(exc_dp) + elif exc_dp.highlight not in commonness_highlights: + categorized['Highlight-Change'].append(exc_dp) + + # Keeping this commented out, since I couldn't figure out what to do with something in this catch-all category. + # For now it will be ignored, but it could maybe be useful. + # else: + # # Exception has a valid pattern type and highlight, but didn't meet commonness threshold + # # This could be another category or grouped with Highlight-Change + # categorized['Other-Exception'].append(exc_dp) # Add a catch-all category + + return categorized \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py new file mode 100644 index 0000000..e5a88aa --- /dev/null +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -0,0 +1,189 @@ +import pandas as pd +from typing import Dict, List, Tuple + +from external_explainers.metainsight_explainer.data_scope import DataScope, HomogenousDataScope +from external_explainers.metainsight_explainer.pattern_evaluations import PatternEvaluator, PatternType + +class BasicDataPattern: + """ + A data pattern, as defined in the MetaInsight paper. 
+ Contains 3 elements: data scope, type (interpretation type) and highlight. + """ + + def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: str | None): + """ + Initialize the BasicDataPattern with the provided data scope, type and highlight. + + :param data_scope: The data scope of the pattern. a DataScope object. + :param pattern_type: str, e.g., 'Unimodality', 'Trend', 'Other Pattern', 'No Pattern' + :param highlight: depends on type, e.g., ('April', 'Valley') for Unimodality + """ + self.data_scope = data_scope + self.pattern_type = pattern_type + self.highlight = highlight + self.pattern_cache = {} + + def __eq__(self, other): + if not isinstance(other, BasicDataPattern): + return False + return self.pattern_type == other.pattern_type and \ + self.highlight == other.highlight and \ + self.data_scope == other.data_scope + + + def sim(self, other) -> bool: + """ + Computes the similarity between two BasicDataPattern objects. + They are similar if they have the same pattern type and highlight, as well as neither having + a pattern type of NONE or OTHER. + + :param other: The other BasicDataPattern object to compare with. + :return: True if similar, False otherwise. + """ + if not isinstance(other, BasicDataPattern): + return False + # There is no REAL need to check that both don't have NONE or OTHER pattern types, since if one + # has it but the other doesn't, the equality will be false anyway. If they both have it, then + # the equality conditions will be true but the inequality conditions will be false. 
+ return self.pattern_type == other.pattern_type and self.highlight == other.highlight and \ + self.pattern_type != PatternType.NONE and self.pattern_type != PatternType.OTHER + + def __hash__(self): + return hash((self.data_scope, self.pattern_type, self.highlight)) + + def __repr__(self): + return f"BasicDataPattern(ds={self.data_scope}, type='{self.pattern_type}', highlight={self.highlight})" + + @staticmethod + def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: PatternType) -> 'BasicDataPattern': + """ + Evaluates a specific pattern type for the data distribution of a data scope. + :param data_scope: The data scope to evaluate. + :param df: The DataFrame containing the data. + :param pattern_type: The type of the pattern to evaluate. + """ + # Apply subspace filters + filtered_df = df.copy() + for dim, value in data_scope.subspace.items(): + if value != '*': + filtered_df = filtered_df[filtered_df[dim] == value] + + # Group by breakdown dimension and aggregate measure + if data_scope.breakdown not in filtered_df.columns: + # Cannot group by breakdown if it's not in the filtered data + return BasicDataPattern(data_scope, PatternType.NONE, None) + + measure_col, agg_func = data_scope.measure + if measure_col not in filtered_df.columns: + # Cannot aggregate if measure column is not in the data + return BasicDataPattern(data_scope, PatternType.NONE, None) + + try: + # Perform the aggregation + aggregated_series = filtered_df.groupby(data_scope.breakdown)[measure_col].agg(agg_func) + except Exception as e: + print(f"Error during aggregation for {data_scope}: {e}") + return BasicDataPattern(data_scope, PatternType.NONE, None) + + # Ensure series is sortable if breakdown is temporal + if df[data_scope.breakdown].dtype in ['datetime64[ns]', 'period[M]', 'int64']: + aggregated_series = aggregated_series.sort_index() + + # Evaluate the specific pattern type + pattern_evaluator = PatternEvaluator() + is_valid, highlight = 
pattern_evaluator(aggregated_series, pattern_type) + if is_valid: + return BasicDataPattern(data_scope, pattern_type, highlight) + else: + # Check for other pattern types + for other_type in PatternType: + if other_type != pattern_type: + other_is_valid, _ = pattern_evaluator(aggregated_series, other_type) + if other_is_valid: + return BasicDataPattern(data_scope, PatternType.OTHER, None) + + # If no pattern is found, return a 'No Pattern' type + return BasicDataPattern(data_scope, PatternType.NONE, None) + + def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, + hds: List[DataScope] = None, temporal_dimensions: List[str] = None, + measures: Dict[str,str] = None) -> Tuple[List['BasicDataPattern'], Dict]: + """ + Generates a Homogenous Data Pattern (HDP) either from a given HDS or from the current DataScope. + + :param pattern_type: The type of the pattern (e.g., 'Unimodality', 'Trend', etc.), provided as a PatternType enum. + :param pattern_cache: A cache for the pattern, if available. + :param hds: A list of DataScopes to create the HDP from. If None, it will be created from the current DataScope. + :param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. Only needed if hds is None. + :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. Only needed if hds is None. + """ + if hds is None or len(hds) == 0: + hds = self.data_scope.create_hds(temporal_dimensions=temporal_dimensions, measures=measures) + # All the data scopes in the HDS should have the same source_df, and it should be + # the same as the source_df of the current DataScope (otherwise, this pattern should not be + # the one producing the HDP with this HDS). 
+ source_df = self.data_scope.source_df + if not all(ds.source_df == source_df for ds in hds): + raise ValueError("All DataScopes in the HDS must have the same source_df.") + + # Append the existing cache if available + if pattern_cache is None: + pattern_cache = {} + pattern_cache.update(self.pattern_cache) + + # Create the HDP + hdp = [] + for ds in hds: + # Check pattern cache first + cache_key = (ds, pattern_type) + if cache_key in pattern_cache: + dp = pattern_cache[cache_key] + else: + # Evaluate the pattern if not in cache + dp = self.evaluate_pattern(ds, source_df, pattern_type) + pattern_cache[cache_key] = dp # Store in cache + + # Only add patterns that are not 'No Pattern' to the HDP for MetaInsight evaluation + if dp.type != PatternType.NONE: + hdp.append(dp) + + self.pattern_cache = pattern_cache + + return hdp, pattern_cache + +class HomogenousDataPattern: + + """ + A homogenous data pattern. + A list of data patterns induced by the same pattern type on a homogenous data scope. + """ + + def __init__(self, data_patterns: List[BasicDataPattern]): + """ + Initialize the HomogenousDataPattern with the provided data patterns. + + :param data_patterns: A list of BasicDataPattern objects. + """ + self.data_patterns = data_patterns + self.source_df = data_patterns[0].data_scope.source_df if data_patterns else None + + def __iter__(self): + """ + Allows iteration over the data patterns. + """ + return iter(self.data_patterns) + + def __len__(self): + """ + Returns the number of data patterns. + """ + return len(self.data_patterns) + + def __repr__(self): + return f"HomogenousDataPattern(#Patterns={len(self.data_patterns)})" + + def __getitem__(self, item): + """ + Allows indexing into the data patterns. 
+ """ + return self.data_patterns[item] \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py new file mode 100644 index 0000000..4d3a07a --- /dev/null +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -0,0 +1,147 @@ +import pandas as pd +from typing import Dict, List, Tuple + +class DataScope: + """ + A data scope, as defined in the MetaInsight paper. + Contains 3 elements: subspace, breakdown and measure. + Example: for the query SELECT Month, SUM(Sales) FROM DATASET WHERE City==“Los Angeles” GROUP BY Month + The subspace is {City: Los Angeles, Month: *}, the breakdown is {Month} and the measure is {SUM(Sales)}. + """ + + def __init__(self, source_df: pd.DataFrame, subspace: Dict[str, str], breakdown: str, measure: tuple): + """ + Initialize the DataScope with the provided subspace, breakdown and measure. + + :param source_df: The DataFrame containing the data. + :param subspace: dict of filters, e.g., {'City': 'Los Angeles', 'Month': '*'} + :param breakdown: str, the dimension for group-by + :param measure: tuple, (measure_column_name, aggregate_function_name) + """ + self.source_df = source_df + self.subspace = subspace + self.breakdown = breakdown + self.measure = measure + + def __hash__(self): + # Need a hashable representation of subspace for hashing + subspace_tuple = tuple(sorted(self.subspace.items())) if isinstance(self.subspace, dict) else tuple( + self.subspace) + return hash((subspace_tuple, self.breakdown, self.measure)) + + def __repr__(self): + return f"DataScope(subspace={self.subspace}, breakdown='{self.breakdown}', measure={self.measure})" + + def _subspace_extend(self) -> List['DataScope']: + """ + Extends the subspace of the DataScope into its sibling group by the dimension dim_to_extend. + Subspaces with the same sibling group only differ from each other in 1 non-empty filter. 
+ + :return: A list of new DataScope objects with the extended subspace. + """ + new_ds = [] + if isinstance(self.subspace, dict): + for dim_to_extend in self.subspace.keys(): + unique_values = self.source_df[dim_to_extend].dropna().unique() + for value in unique_values: + # Ensure it's a sibling + if self.subspace.get(dim_to_extend) != value: + # Add the new DataScope with the extended subspace + new_subspace = self.subspace.copy() + new_subspace[dim_to_extend] = value + new_ds.append(DataScope(self.source_df, new_subspace, self.breakdown, self.measure)) + return new_ds + + def _measure_extend(self, measures: Dict[str,str]) -> List['DataScope']: + """ + Extends the measure of the DataScope while keeping the same breakdown and subspace. + + :param measures: The measures to extend. + :return: A list of new DataScope objects with the extended measure. + """ + new_ds = [] + for measure_col, agg_func in measures.items(): + if (measure_col, agg_func) != self.measure: + new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) + return new_ds + + def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope']: + """ + Extends the breakdown of the DataScope while keeping the same subspace and measure. + + :param temporal_dimensions: The temporal dimensions to extend the breakdown with. + :return: A list of new DataScope objects with the extended breakdown. 
+ """ + new_ds = [] + + temporal_dimensions = [d for d in temporal_dimensions if + self.source_df[d].dtype in ['datetime64[ns]', 'period[M]', 'int64']] + for breakdown_dim in temporal_dimensions: + if breakdown_dim != self.breakdown: + new_ds.append(DataScope(self.source_df, self.subspace, breakdown_dim, self.measure)) + return new_ds + + def create_hds(self, temporal_dimensions: List[str] = None, measures: Dict[str, str] = None) -> 'HomogenousDataScope': + """ + Generates a Homogeneous Data Scope (HDS) from a base data scope, using subspace, measure and breakdown + extensions as defined in the MetaInsight paper. + + :param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. + :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. + + :return: A HDS in the form of a list of DataScope objects. + """ + hds = [] + if temporal_dimensions is None: + temporal_dimensions = [] + if measures is None: + measures = {} + + # Subspace Extending + hds.extend(self._subspace_extend()) + + # Measure Extending + hds.extend(self._measure_extend(measures)) + + # Breakdown Extending + hds.extend(self._breakdown_extend(temporal_dimensions)) + + return HomogenousDataScope(hds) + + +class HomogenousDataScope: + """ + A homogenous data scope. + A list of data scopes that are all from the same source_df, and are all created using + one of the 3 extension methods of the DataScope class. + """ + + def __init__(self, data_scopes: List[DataScope]): + """ + Initialize the HomogenousDataScope with the provided data scopes. + + :param data_scopes: A list of DataScope objects. + """ + self.data_scopes = data_scopes + self.source_df = data_scopes[0].source_df if data_scopes else None + + def __iter__(self): + """ + Allows iteration over the data scopes. + """ + return iter(self.data_scopes) + + def __len__(self): + """ + Returns the number of data scopes. 
+ """ + return len(self.data_scopes) + + def __getitem__(self, item): + """ + Allows indexing into the data scopes. + """ + return self.data_scopes[item] + + def __repr__(self): + return f"HomogenousDataScope(#DataScopes={len(self.data_scopes)})" \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py new file mode 100644 index 0000000..ed28fcd --- /dev/null +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -0,0 +1,141 @@ +from collections import defaultdict +from typing import List, Dict + +import math + +from external_explainers.metainsight_explainer.commoness_and_exceptions import categorize_exceptions, EXCEPTION_CATEGORY_COUNT +from external_explainers.metainsight_explainer.data_pattern import HomogenousDataPattern +from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern + +COMMONNESS_THRESHOLD = 0.5 +BALANCE_PARAMETER = 1 +ACTIONABILITY_REGULARIZER_PARAM = 0.1 + +class MetaInsight: + """ + Represents a MetaInsight (HDP, commonness_set, exceptions). 
+ """ + + def __init__(self, hdp: HomogenousDataPattern, + commonness_set: Dict[BasicDataPattern, List[BasicDataPattern]], + exceptions: Dict[str, List[BasicDataPattern]], score=0): + """ + :param hdp: list of BasicDataPattern objects + :param commonness_set: A dictionary mapping commonness patterns to lists of BasicDataPattern objects + :param exceptions: A dictionary mapping exception categories to lists of BasicDataPattern objects + """ + self.hdp = hdp + self.commonness_set = commonness_set + self.exceptions = exceptions + self.score = score + + def __repr__(self): + return f"MetaInsight(score={self.score:.4f}, #HDP={len(self.hdp)}, #Commonness={len(self.commonness_set)}, #Exceptions={len(self.exceptions)})" + + @staticmethod + def create_meta_insight(hdp: HomogenousDataPattern) -> 'MetaInsight' | None: + """ + Evaluates the HDP and creates a MetaInsight object. + :param hdp: A HomogenousDataPattern object. + :return: A MetaInsight object if possible, None otherwise. + """ + if len(hdp) == 0: + return None + + # Group patterns by similarity + similarity_groups = defaultdict(list) + for dp in hdp: + found_group = False + for key in similarity_groups: + # Check similarity with the first element of an existing group + if dp.sim(similarity_groups[key][0]): + similarity_groups[key].append(dp) + found_group = True + break + if not found_group: + # Create a new group with this pattern as the first element (key) + similarity_groups[dp].append(dp) + + # Identify commonness(es) based on the threshold + commonness_set = [] + exceptions = [] + total_patterns_in_hdp = len(hdp) + + # Need to iterate through the original HDP to ensure all patterns are considered + # and assigned to either commonness or exceptions exactly once. 
+ processed_patterns = set() + for dp in hdp: + if dp in processed_patterns: + continue + + is_commonness = False + for key, group in similarity_groups.items(): + if dp in group: + # An equivalence class is a commonness if it contains more than COMMONNESS_THRESHOLD of the HDP + if len(group) / total_patterns_in_hdp > COMMONNESS_THRESHOLD: + commonness_set.append(group) + for pattern in group: + processed_patterns.add(pattern) + is_commonness = True + break # Found the group for this pattern + + if not is_commonness: + # If the pattern wasn't part of a commonness, add it to exceptions + exceptions.append(dp) + processed_patterns.add(dp) + + # A valid MetaInsight requires at least one commonness + if not commonness_set: + return None + + # Categorize exceptions (optional for basic MetaInsight object, but needed for scoring) + categorized_exceptions = categorize_exceptions(commonness_set, exceptions) + + return MetaInsight(hdp, commonness_set, categorized_exceptions) + + def calculate_conciseness(self) -> float: + """ + Calculates the conciseness score of a MetaInsight. + Based on the entropy of category proportions. 
+ """ + n = len(self.hdp) + if n == 0: + return 0 + + # Calculate entropy + S = 0 + commonness_proportions = [] + for group, patterns in self.commonness_set.items(): + if len(patterns) > 0: + proportion = len(patterns) / n + S += proportion * math.log2(proportion) + commonness_proportions.append(proportion) + + exception_proportions = [] + for category, patterns in self.exceptions.items(): + if len(patterns) > 0: + proportion = len(patterns) / n + S += BALANCE_PARAMETER * (proportion * math.log2(proportion)) + exception_proportions.append(proportion) + + # Convert to positive entropy + S = -S + + # Compute S* (the upper bound of S) + threshold = ((1 - COMMONNESS_THRESHOLD) * math.e) / (math.pow(COMMONNESS_THRESHOLD, 1 / BALANCE_PARAMETER)) + if EXCEPTION_CATEGORY_COUNT > threshold: + S_star = -math.log2(COMMONNESS_THRESHOLD) + (BALANCE_PARAMETER * EXCEPTION_CATEGORY_COUNT + * math.pow(COMMONNESS_THRESHOLD, 1 / BALANCE_PARAMETER) + * math.log2(math.e)) + else: + S_star = - COMMONNESS_THRESHOLD * math.log(COMMONNESS_THRESHOLD) - ( + BALANCE_PARAMETER * (1 - COMMONNESS_THRESHOLD) * math.log2((1 - COMMONNESS_THRESHOLD) / EXCEPTION_CATEGORY_COUNT) + ) + + + + indicator_value = 1 if len(exception_proportions) == 0 else 0 + conciseness = 1 - ((S + ACTIONABILITY_REGULARIZER_PARAM * indicator_value) / S_star) + + # Ensure conciseness is within a reasonable range, e.g., [0, 1] + return conciseness \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py new file mode 100644 index 0000000..83ba86e --- /dev/null +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -0,0 +1,152 @@ +from enum import Enum +from typing import List, Dict, Tuple +import pandas as pd +import numpy as np +from diptest import diptest +from scipy.stats import gaussian_kde, zscore +from sklearn.linear_model import LinearRegression +from sklearn.cluster import 
class PatternType(Enum):
    """
    An enumeration of the types of patterns a series can exhibit.
    """
    NONE = 0          # no recognizable pattern
    OTHER = 1         # a pattern other than the one being tested for
    UNIMODALITY = 2
    TREND = 3
    OUTLIER = 4


class PatternEvaluator:
    """
    A class to evaluate different patterns (unimodality, trend, outliers) in a series.
    """

    OUTLIER_ZSCORE_THRESHOLD = 2.0  # Z-score threshold for outlier detection
    TREND_SLOPE_THRESHOLD = 0.01  # Minimum absolute slope for trend detection

    @staticmethod
    def unimodality(series: pd.Series) -> (bool, Tuple[str, str]):
        """
        Evaluates if the series is unimodal using Hartigan's Dip test and returns the highlight.

        :param series: The series to evaluate.
        :return: (True, (location, 'Peak'|'Valley')) if unimodal, (False, (None, None)) otherwise.
        """
        # Hartigan's dip test on the NaN-free values; p > 0.05 -> cannot reject unimodality.
        clean = series.dropna()
        dip_statistic, p_value = diptest(clean.values)
        if not p_value > 0.05:
            return False, (None, None)

        # Kernel Density Estimation to locate the peak / valley.
        # BUGFIX: fit the KDE on the same NaN-free values used by the dip test;
        # the original fitted on the raw series, which fails when NaNs are present.
        kde = gaussian_kde(clean)

        # Evaluate the KDE over a range covering the data span.
        x_range = np.linspace(clean.min(), clean.max(), 1000)
        density_values = kde(x_range)

        # Map the extreme-density indices back to data values.
        peak_location = x_range[np.argmax(density_values)]
        valley_location = x_range[np.argmin(density_values)]

        # Return whichever of the two is the bigger outlier, i.e. furthest from the mean.
        if abs(peak_location - clean.mean()) > abs(valley_location - clean.mean()):
            return True, (peak_location, 'Peak')
        return True, (valley_location, 'Valley')

    @staticmethod
    def trend(series: pd.Series) -> (bool, Tuple[str, str]):
        """
        Evaluates if a time series exhibits a significant trend (upward or downward).
        Uses linear regression over the positional index to find the slope.

        :param series: The series to evaluate.
        :return: (True, (slope, 'Upward'|'Downward')) if a trend is detected,
                 (False, (None, None)) otherwise.
        """
        if len(series) < 2:
            return False, (None, None)

        # Regress the values against their positional (time) index.
        X = np.arange(len(series)).reshape(-1, 1)  # independent variable (time index)
        y = series.values  # dependent variable (data values)

        model = LinearRegression()
        model.fit(X, y)
        slope = model.coef_[0]

        # A trend exists only if the slope is significant.
        if abs(slope) > PatternEvaluator.TREND_SLOPE_THRESHOLD:
            trend_direction = 'Upward' if slope > 0 else 'Downward'
            return True, (slope, trend_direction)
        return False, (None, None)

    @staticmethod
    def outlier(series: pd.Series) -> (bool, Tuple[str, str]):
        """
        Evaluates if a series contains significant outliers using the Z-score method.

        :param series: The series to evaluate.
        :return: (True, (outlier_values_or_cluster_means, None)) if outliers are detected,
                 (False, (None, None)) otherwise.
        """
        if len(series) < 2:
            return False, (None, None)

        # Z-scores are computed on the NaN-free values, so outlier positions must be
        # resolved against the same cleaned series.
        clean = series.dropna()
        z_scores = np.abs(zscore(clean))

        # Positions where the Z-score exceeds the threshold.
        outlier_positions = np.where(z_scores > PatternEvaluator.OUTLIER_ZSCORE_THRESHOLD)[0]
        if len(outlier_positions) == 0:
            return False, (None, None)

        # BUGFIX: np.where yields *positions*, so positional (.iloc) indexing is required;
        # the original `series[positions]` used label-based indexing, which is wrong for any
        # series whose index is not a default RangeIndex, and ignored the dropna alignment.
        outlier_data_points = clean.iloc[outlier_positions].values.tolist()

        if len(outlier_data_points) > 1:
            # With multiple outliers, cluster them and return the cluster means.
            # This is more informative and easier to interpret than a list of raw values.
            points = np.array(outlier_data_points).reshape(-1, 1)
            clustered = DBSCAN().fit_predict(points)
            cluster_means = []
            for cluster in np.unique(clustered):
                if cluster != -1:
                    cluster_means.append(np.mean(points[clustered == cluster]))
            # DBSCAN labels noise as -1. To us those are not noise but genuine isolated
            # outliers, so report them individually (unlike the clustered points, their
            # mean could be meaningless because they might be very far apart).
            noise_points = points[clustered == -1]
            if len(noise_points) > 0:
                cluster_means.extend(noise_points.flatten().tolist())
            # "Outliers around these values."
            return True, (cluster_means, None)

        return True, (outlier_data_points, None)

    def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str):
        """
        Calls the appropriate pattern evaluation method based on the pattern type.

        :param series: The series to evaluate.
        :param pattern_type: The type of the pattern to evaluate.
        :return: (is_valid, highlight)
        :raises ValueError: For unsupported pattern types.
        """
        if pattern_type == PatternType.UNIMODALITY:
            return self.unimodality(series)
        elif pattern_type == PatternType.TREND:
            return self.trend(series)
        elif pattern_type == PatternType.OUTLIER:
            return self.outlier(series)
        else:
            raise ValueError(f"Unsupported pattern type: {pattern_type}")
--- .../commoness_and_exceptions.py | 34 ----- .../metainsight_explainer/data_pattern.py | 6 +- .../metainsight_explainer/data_scope.py | 69 +++++++++- .../metainsight_explainer/meta_insight.py | 118 +++++++++++++++--- .../metainsight_mining.py | 74 +++++++++++ 5 files changed, 248 insertions(+), 53 deletions(-) delete mode 100644 src/external_explainers/metainsight_explainer/commoness_and_exceptions.py create mode 100644 src/external_explainers/metainsight_explainer/metainsight_mining.py diff --git a/src/external_explainers/metainsight_explainer/commoness_and_exceptions.py b/src/external_explainers/metainsight_explainer/commoness_and_exceptions.py deleted file mode 100644 index f4ec6c8..0000000 --- a/src/external_explainers/metainsight_explainer/commoness_and_exceptions.py +++ /dev/null @@ -1,34 +0,0 @@ -from collections import defaultdict - -EXCEPTION_CATEGORY_COUNT = 3 - -def categorize_exceptions(commonness_set, exceptions): - """ - Categorizes exceptions based on differences from commonness highlights/types. - Simplified categorization: Highlight-Change, Type-Change, No-Pattern (though No-Pattern - should ideally not be in the exceptions list generated by generate_hdp). - Returns a dictionary mapping category names to lists of exception patterns. 
- """ - categorized = defaultdict(list) - commonness_highlights = set() - for commonness in commonness_set: - if commonness: # Ensure commonness is not empty - commonness_highlights.add(commonness[0].highlight) # Assume all in commonness have same highlight - - for exc_dp in exceptions: - if exc_dp.type == 'Other Pattern': - categorized['Type-Change'].append(exc_dp) - elif exc_dp.type == 'No Pattern': - # This case should ideally not happen if generate_hdp filters 'No Pattern' - categorized['No-Pattern'].append(exc_dp) - elif exc_dp.highlight not in commonness_highlights: - categorized['Highlight-Change'].append(exc_dp) - - # Keeping this commented out, since I couldn't figure out what to do with something in this catch-all category. - # For now it will be ignored, but it could maybe be useful. - # else: - # # Exception has a valid pattern type and highlight, but didn't meet commonness threshold - # # This could be another category or grouped with Highlight-Change - # categorized['Other-Exception'].append(exc_dp) # Add a catch-all category - - return categorized \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index e5a88aa..764ed77 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -151,7 +151,7 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, return hdp, pattern_cache -class HomogenousDataPattern: +class HomogenousDataPattern(HomogenousDataScope): """ A homogenous data pattern. @@ -164,8 +164,10 @@ def __init__(self, data_patterns: List[BasicDataPattern]): :param data_patterns: A list of BasicDataPattern objects. 
""" + if not data_patterns: + raise ValueError("data_patterns cannot be empty.") + super(HomogenousDataPattern, self).__init__([dp.data_scope for dp in data_patterns]) self.data_patterns = data_patterns - self.source_df = data_patterns[0].data_scope.source_df if data_patterns else None def __iter__(self): """ diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 4d3a07a..8ce5ed6 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -108,6 +108,55 @@ def create_hds(self, temporal_dimensions: List[str] = None, measures: Dict[str, return HomogenousDataScope(hds) + def compute_impact(self, impact_measure: Tuple = None, precomputed_total_impact: float = None) -> Tuple[ + float, float]: + """ + Computes the impact of the data scope based on the provided impact measure. + Impact is defined as the ratio of the measure in the current data scope to the total measure in the source DataFrame. + + :param impact_measure: A tuple representing the impact measure. Optional. If not provided, the data scope's + measure will be used. + :param precomputed_total_impact: A precomputed total impact value. Optional. If provided, it will be used instead of + computing the total impact again. Used for performance optimization. + :return: The computed impact. 
+ """ + if impact_measure is None: + impact_measure = self.measure + impact_col, agg_func = impact_measure + if impact_col not in self.source_df.columns: + raise ValueError(f"Impact column '{impact_col}' not found in source DataFrame.") + + total_impact = precomputed_total_impact + # If we are not using a precomputed total impact, compute it + if precomputed_total_impact is None: + try: + total_impact = self.source_df[impact_col].agg(agg_func) + except Exception as e: + print(f"Error during aggregation for {self}: {e}") + return 0, 0 + + # Avoid division by zero + if total_impact == 0: + return 0, 0 + + # Compute the impact for the current data scope + filtered_df = self.source_df + for dim, value in self.subspace.items(): + if value != '*': + filtered_df = filtered_df[filtered_df[dim] == value] + + if impact_col not in filtered_df.columns: + return 0, total_impact + else: + # Perform the aggregation + try: + impact = filtered_df[impact_col].agg(agg_func) + impact = impact / total_impact + return impact, total_impact + except Exception as e: + print(f"Error during aggregation for {self}: {e}") + return 0, total_impact + class HomogenousDataScope: """ @@ -144,4 +193,22 @@ def __getitem__(self, item): return self.data_scopes[item] def __repr__(self): - return f"HomogenousDataScope(#DataScopes={len(self.data_scopes)})" \ No newline at end of file + return f"HomogenousDataScope(#DataScopes={len(self.data_scopes)})" + + + def compute_impact(self, impact_measure) -> float: + """ + Computes the impact of the HDS. This is the sum of the impacts of all data scopes in the HDS. + :param impact_measure: + :return: The total impact of the HDS. 
+ """ + impact = 0 + total_impact = 0 + if len(self.data_scopes) > 0: + impact, total_impact = self.data_scopes[0].compute_impact(impact_measure) + else: + return 0 + for ds in self.data_scopes[1:]: + ds_impact, _ = ds.compute_impact(impact_measure, total_impact) + impact += ds_impact + return impact \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index ed28fcd..26a03d2 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -3,13 +3,13 @@ import math -from external_explainers.metainsight_explainer.commoness_and_exceptions import categorize_exceptions, EXCEPTION_CATEGORY_COUNT from external_explainers.metainsight_explainer.data_pattern import HomogenousDataPattern from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern COMMONNESS_THRESHOLD = 0.5 BALANCE_PARAMETER = 1 ACTIONABILITY_REGULARIZER_PARAM = 0.1 +EXCEPTION_CATEGORY_COUNT = 3 class MetaInsight: """ @@ -17,8 +17,12 @@ class MetaInsight: """ def __init__(self, hdp: HomogenousDataPattern, - commonness_set: Dict[BasicDataPattern, List[BasicDataPattern]], - exceptions: Dict[str, List[BasicDataPattern]], score=0): + commonness_set: List[List[BasicDataPattern]], + exceptions: Dict[str, List[BasicDataPattern]], score=0, + commonness_threshold: float = COMMONNESS_THRESHOLD, + balance_parameter: float = BALANCE_PARAMETER, + actionability_regularizer_param: float = ACTIONABILITY_REGULARIZER_PARAM, + ): """ :param hdp: list of BasicDataPattern objects :param commonness_set: A dictionary mapping commonness patterns to lists of BasicDataPattern objects @@ -28,15 +32,52 @@ def __init__(self, hdp: HomogenousDataPattern, self.commonness_set = commonness_set self.exceptions = exceptions self.score = score + self.commonness_threshold = commonness_threshold + self.balance_parameter = 
balance_parameter + self.actionability_regularizer_param = actionability_regularizer_param def __repr__(self): return f"MetaInsight(score={self.score:.4f}, #HDP={len(self.hdp)}, #Commonness={len(self.commonness_set)}, #Exceptions={len(self.exceptions)})" + @staticmethod - def create_meta_insight(hdp: HomogenousDataPattern) -> 'MetaInsight' | None: + def categorize_exceptions(commonness_set, exceptions): + """ + Categorizes exceptions based on differences from commonness highlights/types. + Simplified categorization: Highlight-Change, Type-Change, No-Pattern (though No-Pattern + should ideally not be in the exceptions list generated by generate_hdp). + Returns a dictionary mapping category names to lists of exception patterns. + """ + categorized = defaultdict(list) + commonness_highlights = set() + for commonness in commonness_set: + if commonness: # Ensure commonness is not empty + commonness_highlights.add(commonness[0].highlight) # Assume all in commonness have same highlight + + for exc_dp in exceptions: + if exc_dp.type == 'Other Pattern': + categorized['Type-Change'].append(exc_dp) + elif exc_dp.type == 'No Pattern': + # This case should ideally not happen if generate_hdp filters 'No Pattern' + categorized['No-Pattern'].append(exc_dp) + elif exc_dp.highlight not in commonness_highlights: + categorized['Highlight-Change'].append(exc_dp) + + # Keeping this commented out, since I couldn't figure out what to do with something in this catch-all category. + # For now it will be ignored, but it could maybe be useful. 
+ # else: + # # Exception has a valid pattern type and highlight, but didn't meet commonness threshold + # # This could be another category or grouped with Highlight-Change + # categorized['Other-Exception'].append(exc_dp) # Add a catch-all category + + return categorized + + @staticmethod + def create_meta_insight(hdp: HomogenousDataPattern, commonness_threshold=COMMONNESS_THRESHOLD) -> 'MetaInsight' | None: """ Evaluates the HDP and creates a MetaInsight object. :param hdp: A HomogenousDataPattern object. + :param commonness_threshold: The threshold for commonness. :return: A MetaInsight object if possible, None otherwise. """ if len(hdp) == 0: @@ -72,7 +113,7 @@ def create_meta_insight(hdp: HomogenousDataPattern) -> 'MetaInsight' | None: for key, group in similarity_groups.items(): if dp in group: # An equivalence class is a commonness if it contains more than COMMONNESS_THRESHOLD of the HDP - if len(group) / total_patterns_in_hdp > COMMONNESS_THRESHOLD: + if len(group) / total_patterns_in_hdp > commonness_threshold: commonness_set.append(group) for pattern in group: processed_patterns.add(pattern) @@ -89,9 +130,9 @@ def create_meta_insight(hdp: HomogenousDataPattern) -> 'MetaInsight' | None: return None # Categorize exceptions (optional for basic MetaInsight object, but needed for scoring) - categorized_exceptions = categorize_exceptions(commonness_set, exceptions) + categorized_exceptions = MetaInsight.categorize_exceptions(commonness_set, exceptions) - return MetaInsight(hdp, commonness_set, categorized_exceptions) + return MetaInsight(hdp, commonness_set, categorized_exceptions, commonness_threshold=commonness_threshold) def calculate_conciseness(self) -> float: """ @@ -105,7 +146,7 @@ def calculate_conciseness(self) -> float: # Calculate entropy S = 0 commonness_proportions = [] - for group, patterns in self.commonness_set.items(): + for patterns in self.commonness_set: if len(patterns) > 0: proportion = len(patterns) / n S += proportion * 
math.log2(proportion) @@ -115,27 +156,72 @@ def calculate_conciseness(self) -> float: for category, patterns in self.exceptions.items(): if len(patterns) > 0: proportion = len(patterns) / n - S += BALANCE_PARAMETER * (proportion * math.log2(proportion)) + S += self.balance_parameter * (proportion * math.log2(proportion)) exception_proportions.append(proportion) # Convert to positive entropy S = -S # Compute S* (the upper bound of S) - threshold = ((1 - COMMONNESS_THRESHOLD) * math.e) / (math.pow(COMMONNESS_THRESHOLD, 1 / BALANCE_PARAMETER)) + threshold = ((1 - self.commonness_threshold) * math.e) / (math.pow(self.commonness_threshold, 1 / self.balance_parameter)) if EXCEPTION_CATEGORY_COUNT > threshold: - S_star = -math.log2(COMMONNESS_THRESHOLD) + (BALANCE_PARAMETER * EXCEPTION_CATEGORY_COUNT - * math.pow(COMMONNESS_THRESHOLD, 1 / BALANCE_PARAMETER) + S_star = -math.log2(self.commonness_threshold) + (self.balance_parameter * EXCEPTION_CATEGORY_COUNT + * math.pow(self.commonness_threshold, 1 / self.balance_parameter) * math.log2(math.e)) else: - S_star = - COMMONNESS_THRESHOLD * math.log(COMMONNESS_THRESHOLD) - ( - BALANCE_PARAMETER * (1 - COMMONNESS_THRESHOLD) * math.log2((1 - COMMONNESS_THRESHOLD) / EXCEPTION_CATEGORY_COUNT) + S_star = - self.commonness_threshold * math.log(self.commonness_threshold) - ( + self.balance_parameter * (1 - self.commonness_threshold) * math.log2((1 - self.commonness_threshold) / EXCEPTION_CATEGORY_COUNT) ) indicator_value = 1 if len(exception_proportions) == 0 else 0 - conciseness = 1 - ((S + ACTIONABILITY_REGULARIZER_PARAM * indicator_value) / S_star) + conciseness = 1 - ((S + self.actionability_regularizer_param * indicator_value) / S_star) # Ensure conciseness is within a reasonable range, e.g., [0, 1] - return conciseness \ No newline at end of file + return conciseness + + def compute_score(self, impact_measure = None) -> float: + """ + Computes the score of the MetaInsight. 
+ The score is the multiple of the conciseness of the MetaInsight and the impact score of the HDS + making up the HDP. + :param impact_measure: The impact measure to be used for the HDS. + :return: The score of the MetaInsight. + """ + conciseness = self.calculate_conciseness() + hds_score = self.hdp.compute_impact(impact_measure=impact_measure) + self.score = conciseness * hds_score + return self.score + + + def compute_pairwise_overlap_ratio(self, other: 'MetaInsight') -> float: + """ + Computes the pairwise overlap ratio between two MetaInsights, as the ratio between the + size of the intersection and the size of the union of their HDPs. + :param other: Another MetaInsight object to compare with. + :return: The overlap ratio between the two MetaInsights. + """ + if not isinstance(other, MetaInsight): + raise ValueError("The other object must be an instance of MetaInsight.") + hds_1 = set(self.hdp.data_scopes) + hds_2 = set(other.hdp.data_scopes) + + overlap = len(hds_1.intersection(hds_2)) + total = len(hds_1.union(hds_2)) + # Avoid division by 0 + if total == 0: + return 0.0 + return overlap / total + + def compute_pairwise_overlap_score(self, other: 'MetaInsight') -> float: + """ + Computes the pairwise overlap score between two MetaInsights. + This is computed as min(I_1.score, I_2.scor) * overlap_ratio(I_1, I_2) + :param other: Another MetaInsight object to compare with. + :return: The pairwise overlap score between the two MetaInsights. 
+ """ + if not isinstance(other, MetaInsight): + raise ValueError("The other object must be an instance of MetaInsight.") + overlap_ratio = self.compute_pairwise_overlap_ratio(other) + return min(self.score, other.score) * overlap_ratio diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py new file mode 100644 index 0000000..f508b59 --- /dev/null +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -0,0 +1,74 @@ +import itertools +from typing import List +import numpy as np + +from external_explainers.metainsight_explainer.meta_insight import (MetaInsight, + ACTIONABILITY_REGULARIZER_PARAM, + BALANCE_PARAMETER, + COMMONNESS_THRESHOLD) + +MIN_SCORE = 0.01 + +class MetaInsightMiner: + + + + """ + This class is responsible for the actual process of mining MetaInsights. + """ + def __init__(self, k=5, min_score=MIN_SCORE, min_commonness=COMMONNESS_THRESHOLD, balance_factor=BALANCE_PARAMETER, + actionability_regularizer=ACTIONABILITY_REGULARIZER_PARAM): + """ + Initialize the MetaInsightMiner with the provided parameters. + + :param min_score: The minimum score for a MetaInsight to be considered. + :param min_commonness: The minimum commonness for a MetaInsight to be considered. + :param balance_factor: The balance factor for the MetaInsight. + :param actionability_regularizer: The actionability regularizer for the MetaInsight. + """ + self.k = k + self.min_score = min_score + self.min_commonness = min_commonness + self.balance_factor = balance_factor + self.actionability_regularizer = actionability_regularizer + + def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): + """ + Rank the MetaInsights based on their scores. + + :param metainsight_candidates: A list of MetaInsights to rank. + :return: A list of the top k MetaInsights. 
+ """ + # Sort candidates by score initially (descending) + sorted_candidates = sorted(metainsight_candidates, key=lambda mi: mi.score, reverse=True) + + selected_metainsights = [] + candidate_set = set(sorted_candidates) + + # Greedy selection of MetaInsights. + # We compute the total use of the currently selected MetaInsights, then how much a candidate would add to that. + # We take the candidate that adds the most to the total use, repeating until we have k MetaInsights or no candidates left. + while len(selected_metainsights) < self.k and candidate_set: + best_candidate = None + max_gain = -np.inf + + for candidate in candidate_set: + total_use_approx = sum(mi.score for mi in selected_metainsights) - \ + sum(mi1.compute_pairwise_overlap_score(mi2) for mi1, mi2 in itertools.combinations(metainsight_candidates, 2)) + + total_use_with_candidate = total_use_approx + (candidate.score - sum(mi.compute_pairwise_overlap_score(candidate) for mi in selected_metainsights)) + + gain = total_use_with_candidate - total_use_approx + + if gain > max_gain: + max_gain = gain + best_candidate = candidate + + if best_candidate: + selected_metainsights.append(best_candidate) + candidate_set.remove(best_candidate) + else: + # No candidate provides a positive gain, or candidate_set is empty + break + + return selected_metainsights \ No newline at end of file From 1d94cb403f4362d4e3c0ae35f77819de7be5d809 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 6 May 2025 23:06:07 +0300 Subject: [PATCH 05/27] Finished adding the MetaInsight mining process, and it seems to be able to run to conclusion with possibly correct results. 
--- .../metainsight_explainer/data_pattern.py | 51 +++--- .../metainsight_explainer/data_scope.py | 27 +++- .../metainsight_explainer/meta_insight.py | 32 ++-- .../metainsight_mining.py | 151 ++++++++++++++++-- .../pattern_evaluations.py | 70 ++++++-- 5 files changed, 260 insertions(+), 71 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index 764ed77..b6ca4c0 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -4,6 +4,7 @@ from external_explainers.metainsight_explainer.data_scope import DataScope, HomogenousDataScope from external_explainers.metainsight_explainer.pattern_evaluations import PatternEvaluator, PatternType + class BasicDataPattern: """ A data pattern, as defined in the MetaInsight paper. @@ -30,7 +31,6 @@ def __eq__(self, other): self.highlight == other.highlight and \ self.data_scope == other.data_scope - def sim(self, other) -> bool: """ Computes the similarity between two BasicDataPattern objects. @@ -46,10 +46,15 @@ def sim(self, other) -> bool: # has it but the other doesn't, the equality will be false anyway. If they both have it, then # the equality conditions will be true but the inequality conditions will be false. 
return self.pattern_type == other.pattern_type and self.highlight == other.highlight and \ - self.pattern_type != PatternType.NONE and self.pattern_type != PatternType.OTHER + self.pattern_type != PatternType.NONE and self.pattern_type != PatternType.OTHER def __hash__(self): - return hash((self.data_scope, self.pattern_type, self.highlight)) + data_scope_str = "".join([f"{k}: {v}" for k, v in self.data_scope.subspace.items()]) + highlight_string = "" + if self.highlight: + for h in self.highlight: + highlight_string += f"{h} " + return hash((data_scope_str, self.pattern_type, highlight_string)) def __repr__(self): return f"BasicDataPattern(ds={self.data_scope}, type='{self.pattern_type}', highlight={self.highlight})" @@ -97,6 +102,8 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt else: # Check for other pattern types for other_type in PatternType: + if other_type == PatternType.OTHER or other_type == PatternType.NONE: + continue if other_type != pattern_type: other_is_valid, _ = pattern_evaluator(aggregated_series, other_type) if other_is_valid: @@ -107,7 +114,7 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, hds: List[DataScope] = None, temporal_dimensions: List[str] = None, - measures: Dict[str,str] = None) -> Tuple[List['BasicDataPattern'], Dict]: + measures: Dict[str, str] = None) -> Tuple['HomogenousDataPattern', Dict]: """ Generates a Homogenous Data Pattern (HDP) either from a given HDS or from the current DataScope. @@ -123,8 +130,6 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, # the same as the source_df of the current DataScope (otherwise, this pattern should not be # the one producing the HDP with this HDS). 
source_df = self.data_scope.source_df - if not all(ds.source_df == source_df for ds in hds): - raise ValueError("All DataScopes in the HDS must have the same source_df.") # Append the existing cache if available if pattern_cache is None: @@ -134,25 +139,31 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, # Create the HDP hdp = [] for ds in hds: - # Check pattern cache first - cache_key = (ds, pattern_type) - if cache_key in pattern_cache: - dp = pattern_cache[cache_key] - else: - # Evaluate the pattern if not in cache - dp = self.evaluate_pattern(ds, source_df, pattern_type) - pattern_cache[cache_key] = dp # Store in cache - - # Only add patterns that are not 'No Pattern' to the HDP for MetaInsight evaluation - if dp.type != PatternType.NONE: - hdp.append(dp) + if ds != self.data_scope: + # Check pattern cache first + cache_key = (ds, pattern_type) + if cache_key in pattern_cache: + dp = pattern_cache[cache_key] + else: + # Evaluate the pattern if not in cache + dp = self.evaluate_pattern(ds, source_df, pattern_type) + pattern_cache[cache_key] = dp # Store in cache + + # Only add patterns that are not 'No Pattern' to the HDP for MetaInsight evaluation + if dp.pattern_type != PatternType.NONE: + hdp.append(dp) self.pattern_cache = pattern_cache + if self.pattern_type != PatternType.NONE: + # Add the current pattern to the HDP + hdp.append(self) + hdp = HomogenousDataPattern(hdp) + return hdp, pattern_cache -class HomogenousDataPattern(HomogenousDataScope): +class HomogenousDataPattern(HomogenousDataScope): """ A homogenous data pattern. A list of data patterns induced by the same pattern type on a homogenous data scope. @@ -188,4 +199,4 @@ def __getitem__(self, item): """ Allows indexing into the data patterns. 
""" - return self.data_patterns[item] \ No newline at end of file + return self.data_patterns[item] diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 8ce5ed6..3dfa560 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -1,6 +1,7 @@ import pandas as pd from typing import Dict, List, Tuple + class DataScope: """ A data scope, as defined in the MetaInsight paper. @@ -52,7 +53,7 @@ def _subspace_extend(self) -> List['DataScope']: new_ds.append(DataScope(self.source_df, new_subspace, self.breakdown, self.measure)) return new_ds - def _measure_extend(self, measures: Dict[str,str]) -> List['DataScope']: + def _measure_extend(self, measures: Dict[str, str]) -> List['DataScope']: """ Extends the measure of the DataScope while keeping the same breakdown and subspace. @@ -61,8 +62,9 @@ def _measure_extend(self, measures: Dict[str,str]) -> List['DataScope']: """ new_ds = [] for measure_col, agg_func in measures.items(): - if (measure_col, agg_func) != self.measure: - new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) + for func in agg_func: + if (measure_col, func) != self.measure: + new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) return new_ds def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope']: @@ -81,7 +83,8 @@ def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope'] new_ds.append(DataScope(self.source_df, self.subspace, breakdown_dim, self.measure)) return new_ds - def create_hds(self, temporal_dimensions: List[str] = None, measures: Dict[str, str] = None) -> 'HomogenousDataScope': + def create_hds(self, temporal_dimensions: List[str] = None, + measures: Dict[str, str] = None) -> 'HomogenousDataScope': """ Generates a Homogeneous Data Scope (HDS) 
from a base data scope, using subspace, measure and breakdown extensions as defined in the MetaInsight paper. @@ -91,7 +94,7 @@ def create_hds(self, temporal_dimensions: List[str] = None, measures: Dict[str, :return: A HDS in the form of a list of DataScope objects. """ - hds = [] + hds = [self] if temporal_dimensions is None: temporal_dimensions = [] if measures is None: @@ -173,6 +176,7 @@ def __init__(self, data_scopes: List[DataScope]): """ self.data_scopes = data_scopes self.source_df = data_scopes[0].source_df if data_scopes else None + self.impact = 0 def __iter__(self): """ @@ -195,6 +199,14 @@ def __getitem__(self, item): def __repr__(self): return f"HomogenousDataScope(#DataScopes={len(self.data_scopes)})" + def __lt__(self, other): + """ + Less than comparison for sorting. + :param other: Another HomogenousDataScope object. + :return: True if this object is less than the other, False otherwise. + """ + # We use the negative impact, since we want to use a max-heap but only have min-heap available + return - self.impact < - other.impact def compute_impact(self, impact_measure) -> float: """ @@ -202,8 +214,6 @@ def compute_impact(self, impact_measure) -> float: :param impact_measure: :return: The total impact of the HDS. 
""" - impact = 0 - total_impact = 0 if len(self.data_scopes) > 0: impact, total_impact = self.data_scopes[0].compute_impact(impact_measure) else: @@ -211,4 +221,5 @@ def compute_impact(self, impact_measure) -> float: for ds in self.data_scopes[1:]: ds_impact, _ = ds.compute_impact(impact_measure, total_impact) impact += ds_impact - return impact \ No newline at end of file + self.impact = impact + return impact diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 26a03d2..0f5707c 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -5,12 +5,14 @@ from external_explainers.metainsight_explainer.data_pattern import HomogenousDataPattern from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern +from external_explainers.metainsight_explainer.pattern_evaluations import PatternType COMMONNESS_THRESHOLD = 0.5 BALANCE_PARAMETER = 1 ACTIONABILITY_REGULARIZER_PARAM = 0.1 EXCEPTION_CATEGORY_COUNT = 3 + class MetaInsight: """ Represents a MetaInsight (HDP, commonness_set, exceptions). 
@@ -39,7 +41,6 @@ def __init__(self, hdp: HomogenousDataPattern, def __repr__(self): return f"MetaInsight(score={self.score:.4f}, #HDP={len(self.hdp)}, #Commonness={len(self.commonness_set)}, #Exceptions={len(self.exceptions)})" - @staticmethod def categorize_exceptions(commonness_set, exceptions): """ @@ -52,15 +53,15 @@ def categorize_exceptions(commonness_set, exceptions): commonness_highlights = set() for commonness in commonness_set: if commonness: # Ensure commonness is not empty - commonness_highlights.add(commonness[0].highlight) # Assume all in commonness have same highlight + commonness_highlights.add(str(commonness[0].highlight)) # Assume all in commonness have same highlight for exc_dp in exceptions: - if exc_dp.type == 'Other Pattern': + if exc_dp.pattern_type == PatternType.OTHER: categorized['Type-Change'].append(exc_dp) - elif exc_dp.type == 'No Pattern': + elif exc_dp.pattern_type == PatternType.NONE: # This case should ideally not happen if generate_hdp filters 'No Pattern' categorized['No-Pattern'].append(exc_dp) - elif exc_dp.highlight not in commonness_highlights: + elif str(exc_dp.highlight) not in commonness_highlights: categorized['Highlight-Change'].append(exc_dp) # Keeping this commented out, since I couldn't figure out what to do with something in this catch-all category. @@ -73,7 +74,7 @@ def categorize_exceptions(commonness_set, exceptions): return categorized @staticmethod - def create_meta_insight(hdp: HomogenousDataPattern, commonness_threshold=COMMONNESS_THRESHOLD) -> 'MetaInsight' | None: + def create_meta_insight(hdp: HomogenousDataPattern, commonness_threshold=COMMONNESS_THRESHOLD) -> 'MetaInsight': """ Evaluates the HDP and creates a MetaInsight object. :param hdp: A HomogenousDataPattern object. 
@@ -163,25 +164,26 @@ def calculate_conciseness(self) -> float: S = -S # Compute S* (the upper bound of S) - threshold = ((1 - self.commonness_threshold) * math.e) / (math.pow(self.commonness_threshold, 1 / self.balance_parameter)) + threshold = ((1 - self.commonness_threshold) * math.e) / ( + math.pow(self.commonness_threshold, 1 / self.balance_parameter)) if EXCEPTION_CATEGORY_COUNT > threshold: S_star = -math.log2(self.commonness_threshold) + (self.balance_parameter * EXCEPTION_CATEGORY_COUNT - * math.pow(self.commonness_threshold, 1 / self.balance_parameter) - * math.log2(math.e)) + * math.pow(self.commonness_threshold, + 1 / self.balance_parameter) + * math.log2(math.e)) else: S_star = - self.commonness_threshold * math.log(self.commonness_threshold) - ( - self.balance_parameter * (1 - self.commonness_threshold) * math.log2((1 - self.commonness_threshold) / EXCEPTION_CATEGORY_COUNT) + self.balance_parameter * (1 - self.commonness_threshold) * math.log2( + (1 - self.commonness_threshold) / EXCEPTION_CATEGORY_COUNT) ) - - indicator_value = 1 if len(exception_proportions) == 0 else 0 conciseness = 1 - ((S + self.actionability_regularizer_param * indicator_value) / S_star) # Ensure conciseness is within a reasonable range, e.g., [0, 1] return conciseness - def compute_score(self, impact_measure = None) -> float: + def compute_score(self, impact_measure=None) -> float: """ Computes the score of the MetaInsight. The score is the multiple of the conciseness of the MetaInsight and the impact score of the HDS @@ -190,11 +192,11 @@ def compute_score(self, impact_measure = None) -> float: :return: The score of the MetaInsight. 
""" conciseness = self.calculate_conciseness() - hds_score = self.hdp.compute_impact(impact_measure=impact_measure) + # If the impact has already been computed, use it + hds_score = self.hdp.impact if self.hdp.impact != 0 else self.hdp.compute_impact(impact_measure=impact_measure) self.score = conciseness * hds_score return self.score - def compute_pairwise_overlap_ratio(self, other: 'MetaInsight') -> float: """ Computes the pairwise overlap ratio between two MetaInsights, as the ratio between the diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index f508b59..e3fc67d 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -1,22 +1,27 @@ import itertools -from typing import List +from typing import List, Dict, Tuple import numpy as np +from queue import PriorityQueue +import pandas as pd + +from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern from external_explainers.metainsight_explainer.meta_insight import (MetaInsight, ACTIONABILITY_REGULARIZER_PARAM, BALANCE_PARAMETER, COMMONNESS_THRESHOLD) +from external_explainers.metainsight_explainer.data_scope import DataScope +from external_explainers.metainsight_explainer.pattern_evaluations import PatternType -MIN_SCORE = 0.01 - -class MetaInsightMiner: - +MIN_IMPACT = 0.01 +class MetaInsightMiner: """ This class is responsible for the actual process of mining MetaInsights. """ - def __init__(self, k=5, min_score=MIN_SCORE, min_commonness=COMMONNESS_THRESHOLD, balance_factor=BALANCE_PARAMETER, + + def __init__(self, k=5, min_score=MIN_IMPACT, min_commonness=COMMONNESS_THRESHOLD, balance_factor=BALANCE_PARAMETER, actionability_regularizer=ACTIONABILITY_REGULARIZER_PARAM): """ Initialize the MetaInsightMiner with the provided parameters. 
@@ -39,11 +44,10 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): :param metainsight_candidates: A list of MetaInsights to rank. :return: A list of the top k MetaInsights. """ - # Sort candidates by score initially (descending) - sorted_candidates = sorted(metainsight_candidates, key=lambda mi: mi.score, reverse=True) selected_metainsights = [] - candidate_set = set(sorted_candidates) + # Sort candidates by score initially (descending) + candidate_set = sorted(list(set(metainsight_candidates)), key=lambda mi: mi.score, reverse=True) # Greedy selection of MetaInsights. # We compute the total use of the currently selected MetaInsights, then how much a candidate would add to that. @@ -52,11 +56,13 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): best_candidate = None max_gain = -np.inf - for candidate in candidate_set: - total_use_approx = sum(mi.score for mi in selected_metainsights) - \ - sum(mi1.compute_pairwise_overlap_score(mi2) for mi1, mi2 in itertools.combinations(metainsight_candidates, 2)) + total_use_approx = sum(mi.score for mi in selected_metainsights) - \ + sum(mi1.compute_pairwise_overlap_score(mi2) for mi1, mi2 in + itertools.combinations(selected_metainsights, 2)) - total_use_with_candidate = total_use_approx + (candidate.score - sum(mi.compute_pairwise_overlap_score(candidate) for mi in selected_metainsights)) + for candidate in candidate_set: + total_use_with_candidate = total_use_approx + (candidate.score - sum( + mi.compute_pairwise_overlap_score(candidate) for mi in selected_metainsights)) gain = total_use_with_candidate - total_use_approx @@ -71,4 +77,121 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): # No candidate provides a positive gain, or candidate_set is empty break - return selected_metainsights \ No newline at end of file + return selected_metainsights + + def mine_metainsights(self, source_df: pd.DataFrame, + dimensions: List[str], + measures: Dict[str, 
str], + impact_measure: Tuple[str, str]) -> List[MetaInsight]: + """ + The main function to mine MetaInsights. + Mines metainsights from the given data frame based on the provided dimensions, measures, and impact measure. + :param source_df: + :param dimensions: + :param measures: + :param impact_measure: + :return: + """ + metainsight_candidates = [] + query_cache = {} + pattern_cache = {} + hdp_queue = PriorityQueue() + + # Example: Generate data scopes with one dimension as breakdown, all '*' subspace + base_data_scopes = [] + for breakdown_dim in dimensions: + for measure_col, agg_func in measures.items(): + base_data_scopes.append( + DataScope(source_df, {}, breakdown_dim, (measure_col, agg_func))) + + # Example: Generate data scopes with one filter in subspace and one breakdown + for filter_dim in dimensions: + unique_values = source_df[filter_dim].dropna().unique() + for value in unique_values: + for breakdown_dim in dimensions: + if breakdown_dim != filter_dim: # Breakdown should be different from filter dim + for measure_col, agg_func in measures.items(): + base_data_scopes.append( + DataScope(source_df, {filter_dim: value}, breakdown_dim, (measure_col, agg_func))) + + print(f"Generated {len(base_data_scopes)} potential base data scopes.") + + # --- Pattern-Guided HDS Generation and Evaluation --- + # For each base data scope, evaluate basic patterns and generate HDSs + + for base_ds in base_data_scopes: + # Evaluate basic patterns for the base data scope for selected types + for pattern_type in PatternType: + if pattern_type == PatternType.OTHER or pattern_type == PatternType.NONE: + continue + base_dp = BasicDataPattern.evaluate_pattern(base_ds, source_df, pattern_type) + + if base_dp.pattern_type not in [PatternType.NONE, PatternType.OTHER]: + # If a valid basic pattern is found, extend the data scope to generate HDS + hdp, pattern_cache = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, + pattern_type=pattern_type, 
pattern_cache=pattern_cache) + + # Pruning 2: Discard HDS with extremely low impact + hds_impact = hdp.compute_impact(impact_measure) + if hds_impact < MIN_IMPACT: + # print(f"Pruning HDS for {base_ds} due to low impact ({hds_impact:.4f})") + continue + + # Add HDS to a queue for evaluation + hdp_queue.put((hdp, pattern_type)) + + # --- Evaluate HDSs to find MetaInsights --- + # Process HDSs from the queue (simulating priority queue by just processing in order) + + processed_hdp_count = 0 + while not hdp_queue.empty(): # and time_elapsed < time_budget: # Add time budget check + hdp, pattern_type = hdp_queue.get() + processed_hdp_count += 1 + # print(f"Processing HDS {processed_hds_count}/{len(hds_queue) + processed_hds_count} for pattern '{pattern_type}'") + + # Evaluate HDP to find MetaInsight + metainsight = MetaInsight.create_meta_insight(hdp, commonness_threshold=self.min_commonness) + + if metainsight: + # Calculate and assign the score + metainsight.compute_score() + metainsight_candidates.append(metainsight) + # print(f"Found MetaInsight with score: {metainsight.score:.4f}") + + return self.rank_metainsights(metainsight_candidates) + + +if __name__ == "__main__": + # Create a sample Pandas DataFrame (similar to the paper's example) + df = pd.read_csv("C:\\Users\\Yuval\\PycharmProjects\\pd-explain\\Examples\\Datasets\\adult.csv") + + # Define dimensions, measures, and impact measure + dimensions = ['workclass', 'education'] + measures = { + "capital-gain": ["mean"], + "capital-loss": ["mean"], + } + impact_measure = ('capital-gain', 'mean') # Using total sales as impact + + # Run the mining process + miner = MetaInsightMiner(k=5, min_score=0.01, min_commonness=0.5) + top_metainsights = miner.mine_metainsights( + df, + dimensions, + measures, + impact_measure, + ) + + print("\n--- Top MetaInsights ---") + if top_metainsights: + for i, mi in enumerate(top_metainsights): + print(f"Rank {i + 1}: {mi}") + # You can further print details about commonness and 
exceptions if needed + # print(" Commonness:") + # for c in mi.commonness_set: + # print(f" - {len(c)} patterns, Type: {c[0].type}, Highlight: {c[0].highlight}") + # print(" Exceptions:") + # for e in mi.exceptions: + # print(f" - {e}") + else: + print("No MetaInsights found.") diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index 83ba86e..c72eddb 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -7,6 +7,7 @@ from sklearn.linear_model import LinearRegression from sklearn.cluster import DBSCAN + class PatternType(Enum): """ An enumeration of the types of patterns. @@ -17,6 +18,7 @@ class PatternType(Enum): TREND = 3 OUTLIER = 4 + class PatternEvaluator: """ A class to evaluate different patterns in a series. @@ -38,30 +40,49 @@ def unimodality(series: pd.Series) -> (bool, Tuple[str, str]): if not is_unimodal: return False, (None, None) # If there is unimodality, find the valley / peak - # 2. Perform Kernel Density Estimation - kde = gaussian_kde(series) + # If a series is all 0s, then this can happen + try: + kde = gaussian_kde(series) + except np.linalg.LinAlgError: + return False, (None, None) - # 3. Evaluate the KDE over a range of values + # Evaluate the KDE over a range of values # Create a range of points covering the data span x_range = np.linspace(series.min(), series.max(), 1000) density_values = kde(x_range) - # 4. Find the index of the maximum (peak) and minimum (valley) density + # Find the index of the maximum (peak) and minimum (valley) density peak_index = np.argmax(density_values) valley_index = np.argmin(density_values) - # 5. 
Map indices back to data values to get the estimated locations + # Get the location of the peak / valley peak_location = x_range[peak_index] valley_location = x_range[valley_index] + # Get the index from the real series for which the peak and valley occurr. + # Because we are approximating, we get the index for which the values are the closest. + peak_dist = np.inf + valley_dist = np.inf + valley_index = None + peak_index = None + for idx in series.index.tolist(): + val = series[idx] + val_peak_dist = abs(val - peak_location) + val_valley_dist = abs(val - valley_location) + if val_peak_dist < peak_dist: + peak_index = idx + peak_dist = val_peak_dist + if val_valley_dist < valley_dist: + valley_index = idx + valley_dist = val_valley_dist + + # Check which of the two is the bigger outlier, and return the one that is # furthest from the mean if abs(peak_location - series.mean()) > abs(valley_location - series.mean()): - return True, (peak_location, 'Peak') + return True, (peak_index, 'Peak') else: - return True, (valley_location, 'Valley') - - + return True, (valley_index, 'Valley') @staticmethod def trend(series: pd.Series) -> (bool, Tuple[str, str]): @@ -74,6 +95,21 @@ def trend(series: pd.Series) -> (bool, Tuple[str, str]): if len(series) < 2: return False, (None, None) + # Check if the series is a time series, or just a series of numbers + # We say a series is a time series if its index is either a datetime index or an increasing integer index + is_datetime_index = isinstance(series.index, pd.DatetimeIndex) + is_numeric_index = np.issubdtype(series.index.dtype, np.number) + if is_numeric_index: + series = series.sort_index() + # Check if the index is strictly increasing + is_increasing = np.all(np.diff(series.index) > 0) + else: + is_increasing = False + + # We can't find trends in series that are not time series - + if not is_datetime_index and not is_increasing: + return False, (None, None) + # Create a simple linear model X = np.arange(len(series)).reshape(-1, 
1) # Independent variable (time index) y = series.values # Dependent variable (data values) @@ -85,7 +121,7 @@ def trend(series: pd.Series) -> (bool, Tuple[str, str]): # Check if the slope is significant if abs(slope) > PatternEvaluator.TREND_SLOPE_THRESHOLD: trend_direction = 'Upward' if slope > 0 else 'Downward' - return True, (slope, trend_direction) + return True, (None, trend_direction) else: return False, (None, None) @@ -107,7 +143,8 @@ def outlier(series: pd.Series) -> (bool, Tuple[str, str]): outlier_indices = np.where(z_scores > PatternEvaluator.OUTLIER_ZSCORE_THRESHOLD)[0] if len(outlier_indices) > 0: - outlier_data_points = series[outlier_indices].values.tolist() + outlier_data_points = series.iloc[outlier_indices].values.tolist() + outlier_index = series.index[outlier_indices].tolist() # If there are multiple outliers, use clustering and return the cluster means. # This is more informative and easier to interpret than a list of raw outlier values. if len(outlier_data_points) > 1: @@ -116,11 +153,16 @@ def outlier(series: pd.Series) -> (bool, Tuple[str, str]): # Perform clustering clustered = DBSCAN().fit_predict(outlier_data_points) cluster_means = [] + cluster_indexes = [] for cluster in np.unique(clustered): if cluster != -1: cluster_points = outlier_data_points[clustered == cluster] cluster_mean = np.mean(cluster_points) cluster_means.append(cluster_mean) + # Take the most common index of the cluster points to represent the cluster + cluster_index = outlier_index[clustered == cluster] + cluster_index = pd.Series(cluster_index).mode()[0] + cluster_indexes.append(cluster_index) # If there are noise points, they will be labeled as -1 in DBSCAN. To us though, those are # not noise points, but outliers. So we will return them as well (unlike the clustered points, # their mean may be meaningless because they might be very far apart. 
@@ -129,9 +171,9 @@ def outlier(series: pd.Series) -> (bool, Tuple[str, str]): noise_points = noise_points.flatten().tolist() cluster_means.extend(noise_points) # Return the cluster centers as the highlight meaning "outliers around these values" - return True, (cluster_means, None) + return True, (cluster_indexes, None) - return True, (outlier_data_points, None) + return True, ([outlier_index[0]], None) else: return False, (None, None) @@ -149,4 +191,4 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): elif pattern_type == PatternType.OUTLIER: return self.outlier(series) else: - raise ValueError(f"Unsupported pattern type: {pattern_type}") \ No newline at end of file + raise ValueError(f"Unsupported pattern type: {pattern_type}") From c2a7324ca37df5bb99bcdad45b2e65e29e4954e0 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Fri, 9 May 2025 20:36:25 +0300 Subject: [PATCH 06/27] Changed impact computing to a universal one, fixed some bugs. --- .../metainsight_explainer/data_pattern.py | 2 +- .../metainsight_explainer/data_scope.py | 105 +++++++++--------- .../metainsight_mining.py | 50 ++++++--- .../pattern_evaluations.py | 9 +- 4 files changed, 99 insertions(+), 67 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index b6ca4c0..18ec706 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -114,7 +114,7 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, hds: List[DataScope] = None, temporal_dimensions: List[str] = None, - measures: Dict[str, str] = None) -> Tuple['HomogenousDataPattern', Dict]: + measures: List[Tuple[str,str]] = None) -> Tuple['HomogenousDataPattern', Dict]: """ Generates a Homogenous Data Pattern (HDP) either from a 
given HDS or from the current DataScope. diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 3dfa560..89298f6 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -1,5 +1,8 @@ import pandas as pd from typing import Dict, List, Tuple +from scipy.special import kl_div +from concurrent.futures import ThreadPoolExecutor +import time class DataScope: @@ -53,7 +56,7 @@ def _subspace_extend(self) -> List['DataScope']: new_ds.append(DataScope(self.source_df, new_subspace, self.breakdown, self.measure)) return new_ds - def _measure_extend(self, measures: Dict[str, str]) -> List['DataScope']: + def _measure_extend(self, measures: List[Tuple[str, str]]) -> List['DataScope']: """ Extends the measure of the DataScope while keeping the same breakdown and subspace. @@ -61,10 +64,9 @@ def _measure_extend(self, measures: Dict[str, str]) -> List['DataScope']: :return: A list of new DataScope objects with the extended measure. 
""" new_ds = [] - for measure_col, agg_func in measures.items(): - for func in agg_func: - if (measure_col, func) != self.measure: - new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) + for measure_col, agg_func in measures: + if (measure_col, agg_func) != self.measure: + new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) return new_ds def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope']: @@ -84,7 +86,7 @@ def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope'] return new_ds def create_hds(self, temporal_dimensions: List[str] = None, - measures: Dict[str, str] = None) -> 'HomogenousDataScope': + measures: List[Tuple[str,str]] = None) -> 'HomogenousDataScope': """ Generates a Homogeneous Data Scope (HDS) from a base data scope, using subspace, measure and breakdown extensions as defined in the MetaInsight paper. @@ -111,54 +113,53 @@ def create_hds(self, temporal_dimensions: List[str] = None, return HomogenousDataScope(hds) - def compute_impact(self, impact_measure: Tuple = None, precomputed_total_impact: float = None) -> Tuple[ - float, float]: + def compute_impact(self, precomputed_source_df: pd.DataFrame = None) -> float: """ Computes the impact of the data scope based on the provided impact measure. - Impact is defined as the ratio of the measure in the current data scope to the total measure in the source DataFrame. - - :param impact_measure: A tuple representing the impact measure. Optional. If not provided, the data scope's - measure will be used. - :param precomputed_total_impact: A precomputed total impact value. Optional. If provided, it will be used instead of - computing the total impact again. Used for performance optimization. - :return: The computed impact. + We define impact as the proportion of rows between the data scope and the total date scope, multiplied + by their KL divergence. 
""" - if impact_measure is None: - impact_measure = self.measure - impact_col, agg_func = impact_measure + if len(self.subspace) == 0: + # No subspace, no impact + return 0 + # Use the provided impact measure or default to the data scope's measure + impact_col, agg_func = self.measure if impact_col not in self.source_df.columns: raise ValueError(f"Impact column '{impact_col}' not found in source DataFrame.") - total_impact = precomputed_total_impact - # If we are not using a precomputed total impact, compute it - if precomputed_total_impact is None: - try: - total_impact = self.source_df[impact_col].agg(agg_func) - except Exception as e: - print(f"Error during aggregation for {self}: {e}") - return 0, 0 - - # Avoid division by zero - if total_impact == 0: - return 0, 0 - - # Compute the impact for the current data scope - filtered_df = self.source_df + # Perform subspace filtering + filtered_df = self.source_df.copy() for dim, value in self.subspace.items(): if value != '*': filtered_df = filtered_df[filtered_df[dim] == value] - + # Group by breakdown dimension and aggregate measure + if self.breakdown not in filtered_df.columns: + # Cannot group by breakdown if it's not in the filtered data + return 0 if impact_col not in filtered_df.columns: - return 0, total_impact - else: + # Cannot aggregate if measure column is not in the data + return 0 + try: + numeric_columns = filtered_df.select_dtypes(include=['number']).columns.tolist() # Perform the aggregation - try: - impact = filtered_df[impact_col].agg(agg_func) - impact = impact / total_impact - return impact, total_impact - except Exception as e: - print(f"Error during aggregation for {self}: {e}") - return 0, total_impact + aggregated_series = filtered_df.groupby(impact_col)[numeric_columns].agg(agg_func) + if precomputed_source_df is None: + aggregated_source = self.source_df.groupby(impact_col)[numeric_columns].agg(agg_func) + else: + aggregated_source = 
precomputed_source_df.groupby(impact_col)[[numeric_columns]].agg(agg_func) + except Exception as e: + print(f"Error during aggregation for {self}: {e}") + return 0 + + kl_divergence = kl_div(aggregated_series, aggregated_source).mean() + # If it is still a series, then the first mean was on a dataframe and not a series, and thus we need + # to take the mean to get a float. + if isinstance(kl_divergence, pd.Series): + kl_divergence = kl_divergence.mean() + row_proportion = len(filtered_df.index.to_list()) / len(self.source_df.index.to_list()) + impact = row_proportion * kl_divergence + return impact + class HomogenousDataScope: @@ -208,18 +209,18 @@ def __lt__(self, other): # We use the negative impact, since we want to use a max-heap but only have min-heap available return - self.impact < - other.impact - def compute_impact(self, impact_measure) -> float: + def compute_impact(self) -> float: """ Computes the impact of the HDS. This is the sum of the impacts of all data scopes in the HDS. - :param impact_measure: :return: The total impact of the HDS. 
""" - if len(self.data_scopes) > 0: - impact, total_impact = self.data_scopes[0].compute_impact(impact_measure) - else: - return 0 - for ds in self.data_scopes[1:]: - ds_impact, _ = ds.compute_impact(impact_measure, total_impact) - impact += ds_impact + impact = 0 + # with ThreadPoolExecutor() as executor: + # # Compute the impact of each data scope in parallel + # futures = [executor.submit(ds.compute_impact) for ds in self.data_scopes] + # for future in futures: + # impact += future.result() + for ds in self.data_scopes: + impact += ds.compute_impact() self.impact = impact return impact diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index e3fc67d..8257a43 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -4,8 +4,9 @@ from queue import PriorityQueue import pandas as pd +from concurrent.futures import ThreadPoolExecutor -from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern +from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern, HomogenousDataPattern from external_explainers.metainsight_explainer.meta_insight import (MetaInsight, ACTIONABILITY_REGULARIZER_PARAM, BALANCE_PARAMETER, @@ -79,10 +80,34 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): return selected_metainsights + + def _create_hdp(self, base_ds: DataScope, source_df: pd.DataFrame, + dimensions: List[str], measures: Dict[str, str], + pattern_cache: Dict[Tuple[DataScope, PatternType], BasicDataPattern]) -> List[HomogenousDataPattern]: + hdps = [] + for pattern_type in PatternType: + if pattern_type == PatternType.OTHER or pattern_type == PatternType.NONE: + continue + base_dp = BasicDataPattern.evaluate_pattern(base_ds, source_df, pattern_type) + + if base_dp.pattern_type not in [PatternType.NONE, 
PatternType.OTHER]: + # If a valid basic pattern is found, extend the data scope to generate HDS + hdp, _ = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, + pattern_type=pattern_type, pattern_cache=pattern_cache) + + + # Pruning: Discard HDS with extremely low impact + hds_impact = hdp.compute_impact() + if hds_impact < MIN_IMPACT: + # print(f"Pruning HDS for {base_ds} due to low impact ({hds_impact:.4f})") + continue + + + return hdps + def mine_metainsights(self, source_df: pd.DataFrame, dimensions: List[str], - measures: Dict[str, str], - impact_measure: Tuple[str, str]) -> List[MetaInsight]: + measures: List[Tuple[str,str]]) -> List[MetaInsight]: """ The main function to mine MetaInsights. Mines metainsights from the given data frame based on the provided dimensions, measures, and impact measure. @@ -100,7 +125,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, # Example: Generate data scopes with one dimension as breakdown, all '*' subspace base_data_scopes = [] for breakdown_dim in dimensions: - for measure_col, agg_func in measures.items(): + for measure_col, agg_func in measures: base_data_scopes.append( DataScope(source_df, {}, breakdown_dim, (measure_col, agg_func))) @@ -110,7 +135,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, for value in unique_values: for breakdown_dim in dimensions: if breakdown_dim != filter_dim: # Breakdown should be different from filter dim - for measure_col, agg_func in measures.items(): + for measure_col, agg_func in measures: base_data_scopes.append( DataScope(source_df, {filter_dim: value}, breakdown_dim, (measure_col, agg_func))) @@ -131,8 +156,12 @@ def mine_metainsights(self, source_df: pd.DataFrame, hdp, pattern_cache = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, pattern_type=pattern_type, pattern_cache=pattern_cache) + # Pruning 1 - if the HDP is unlikely to form a commonness, discard it + if len(hdp) < len(hdp.data_scopes) * self.min_commonness: + 
continue + # Pruning 2: Discard HDS with extremely low impact - hds_impact = hdp.compute_impact(impact_measure) + hds_impact = hdp.compute_impact() if hds_impact < MIN_IMPACT: # print(f"Pruning HDS for {base_ds} due to low impact ({hds_impact:.4f})") continue @@ -156,7 +185,6 @@ def mine_metainsights(self, source_df: pd.DataFrame, # Calculate and assign the score metainsight.compute_score() metainsight_candidates.append(metainsight) - # print(f"Found MetaInsight with score: {metainsight.score:.4f}") return self.rank_metainsights(metainsight_candidates) @@ -164,14 +192,11 @@ def mine_metainsights(self, source_df: pd.DataFrame, if __name__ == "__main__": # Create a sample Pandas DataFrame (similar to the paper's example) df = pd.read_csv("C:\\Users\\Yuval\\PycharmProjects\\pd-explain\\Examples\\Datasets\\adult.csv") + df = df.sample(5000, random_state=42) # Sample 5000 rows for testing # Define dimensions, measures, and impact measure dimensions = ['workclass', 'education'] - measures = { - "capital-gain": ["mean"], - "capital-loss": ["mean"], - } - impact_measure = ('capital-gain', 'mean') # Using total sales as impact + measures = [('age', 'mean'), ('capital-gain', 'mean'), ('capital-loss', 'mean')] # Run the mining process miner = MetaInsightMiner(k=5, min_score=0.01, min_commonness=0.5) @@ -179,7 +204,6 @@ def mine_metainsights(self, source_df: pd.DataFrame, df, dimensions, measures, - impact_measure, ) print("\n--- Top MetaInsights ---") diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index c72eddb..1b2badd 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -34,8 +34,15 @@ def unimodality(series: pd.Series) -> (bool, Tuple[str, str]): :param series: The series to evaluate. 
:return: (is_unimodal, highlight) """ + if isinstance(series, pd.Series): + series = series.sort_values() + else: + return False, (None, None) + vals = series.values + if len(vals) < 4: + return False, (None, None) # Perform Hartigan's Dip test - dip_statistic, p_value = diptest(series.dropna().values) + dip_statistic, p_value = diptest(vals) is_unimodal = p_value > 0.05 if not is_unimodal: return False, (None, None) From 5e38db60e1f723a9eeed7e2ad091d205d86e1741 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Fri, 9 May 2025 21:15:04 +0300 Subject: [PATCH 07/27] Changed unimodality evaluation to a better fitting one. Began a more OOP approach to patterns to better streamline their usage. --- .../metainsight_mining.py | 33 ++----- .../pattern_evaluations.py | 74 ++++++--------- .../metainsight_explainer/patterns.py | 93 +++++++++++++++++++ 3 files changed, 126 insertions(+), 74 deletions(-) create mode 100644 src/external_explainers/metainsight_explainer/patterns.py diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 8257a43..641ad8b 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -80,31 +80,6 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): return selected_metainsights - - def _create_hdp(self, base_ds: DataScope, source_df: pd.DataFrame, - dimensions: List[str], measures: Dict[str, str], - pattern_cache: Dict[Tuple[DataScope, PatternType], BasicDataPattern]) -> List[HomogenousDataPattern]: - hdps = [] - for pattern_type in PatternType: - if pattern_type == PatternType.OTHER or pattern_type == PatternType.NONE: - continue - base_dp = BasicDataPattern.evaluate_pattern(base_ds, source_df, pattern_type) - - if base_dp.pattern_type not in [PatternType.NONE, PatternType.OTHER]: - # If a valid basic pattern is found, extend the data 
scope to generate HDS - hdp, _ = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, - pattern_type=pattern_type, pattern_cache=pattern_cache) - - - # Pruning: Discard HDS with extremely low impact - hds_impact = hdp.compute_impact() - if hds_impact < MIN_IMPACT: - # print(f"Pruning HDS for {base_ds} due to low impact ({hds_impact:.4f})") - continue - - - return hdps - def mine_metainsights(self, source_df: pd.DataFrame, dimensions: List[str], measures: List[Tuple[str,str]]) -> List[MetaInsight]: @@ -192,19 +167,23 @@ def mine_metainsights(self, source_df: pd.DataFrame, if __name__ == "__main__": # Create a sample Pandas DataFrame (similar to the paper's example) df = pd.read_csv("C:\\Users\\Yuval\\PycharmProjects\\pd-explain\\Examples\\Datasets\\adult.csv") - df = df.sample(5000, random_state=42) # Sample 5000 rows for testing + df = df.sample(2500, random_state=42) # Sample 5000 rows for testing # Define dimensions, measures, and impact measure - dimensions = ['workclass', 'education'] + dimensions = ['age', 'education-num'] measures = [('age', 'mean'), ('capital-gain', 'mean'), ('capital-loss', 'mean')] # Run the mining process + import time + start_time = time.time() miner = MetaInsightMiner(k=5, min_score=0.01, min_commonness=0.5) top_metainsights = miner.mine_metainsights( df, dimensions, measures, ) + end_time = time.time() + print(f"Time taken: {end_time - start_time:.2f} seconds") print("\n--- Top MetaInsights ---") if top_metainsights: diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index 1b2badd..f1a0a41 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -6,6 +6,7 @@ from scipy.stats import gaussian_kde, zscore from sklearn.linear_model import LinearRegression from sklearn.cluster import DBSCAN +from 
external_explainers.metainsight_explainer.patterns import UnimodalityPattern class PatternType(Enum): @@ -28,68 +29,47 @@ class PatternEvaluator: TREND_SLOPE_THRESHOLD = 0.01 # Minimum absolute slope for trend detection @staticmethod - def unimodality(series: pd.Series) -> (bool, Tuple[str, str]): + def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): """ Evaluates if the series is unimodal using Hartigan's Dip test and returns the highlight. :param series: The series to evaluate. :return: (is_unimodal, highlight) """ if isinstance(series, pd.Series): - series = series.sort_values() + series = series.sort_index() else: - return False, (None, None) + return False, None vals = series.values if len(vals) < 4: - return False, (None, None) + return False, None # Perform Hartigan's Dip test dip_statistic, p_value = diptest(vals) is_unimodal = p_value > 0.05 if not is_unimodal: - return False, (None, None) + return False, None # If there is unimodality, find the valley / peak - # If a series is all 0s, then this can happen - try: - kde = gaussian_kde(series) - except np.linalg.LinAlgError: - return False, (None, None) - - # Evaluate the KDE over a range of values - # Create a range of points covering the data span - x_range = np.linspace(series.min(), series.max(), 1000) - density_values = kde(x_range) - - # Find the index of the maximum (peak) and minimum (valley) density - peak_index = np.argmax(density_values) - valley_index = np.argmin(density_values) - - # Get the location of the peak / valley - peak_location = x_range[peak_index] - valley_location = x_range[valley_index] - - # Get the index from the real series for which the peak and valley occurr. - # Because we are approximating, we get the index for which the values are the closest. 
- peak_dist = np.inf - valley_dist = np.inf - valley_index = None - peak_index = None - for idx in series.index.tolist(): - val = series[idx] - val_peak_dist = abs(val - peak_location) - val_valley_dist = abs(val - valley_location) - if val_peak_dist < peak_dist: - peak_index = idx - peak_dist = val_peak_dist - if val_valley_dist < valley_dist: - valley_index = idx - valley_dist = val_valley_dist - - - # Check which of the two is the bigger outlier, and return the one that is - # furthest from the mean - if abs(peak_location - series.mean()) > abs(valley_location - series.mean()): - return True, (peak_index, 'Peak') + max_value = series.max() + min_value = series.min() + # Check to make sure either the max or min happens only once, and is not at the start or end of the series + peaks = series[series == max_value] + valleys = series[series == min_value] + if len(peaks) > 1 and len(valleys) > 1: + return False, None + max_value_index = peaks.index[0] if len(peaks) == 1 else None + min_value_index = valleys.index[0] if len(valleys) == 1 else None + # If both are at the edges, we can't use them + if (max_value_index is not None and (max_value_index == series.index[0] or max_value_index == series.index[-1])) or \ + (min_value_index is not None and (min_value_index == series.index[0] or min_value_index == series.index[-1])): + return False, None + index_name = series.index.name + if max_value_index: + return True, UnimodalityPattern(series, 'Peak', max_value_index, index_name=index_name) + elif min_value_index: + return True, UnimodalityPattern(series, 'Valley', min_value_index, index_name=index_name) else: - return True, (valley_index, 'Valley') + return False, None + + @staticmethod def trend(series: pd.Series) -> (bool, Tuple[str, str]): diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py new file mode 100644 index 0000000..3e0df61 --- /dev/null +++ 
b/src/external_explainers/metainsight_explainer/patterns.py @@ -0,0 +1,93 @@ +from abc import ABC, abstractmethod +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +class PatternInterface(ABC): + """ + Abstract base class for defining patterns. + """ + + @abstractmethod + def visualize(self, ax): + """ + Visualize the pattern. + """ + raise NotImplementedError("Subclasses must implement this method.") + + @abstractmethod + def create_explanation_string(self, commonness_set, exceptions): + """ + Create an explanation string for the pattern. + :param commonness_set: The commonness set, where the pattern is common. + :param exceptions: The exceptions dict, where the pattern is different from the commonness set or did not occur. + :return: + """ + raise NotImplementedError("Subclasses must implement this method.") + + @abstractmethod + def __eq__(self, other): + """ + Check if two patterns are equal + :param other: Another pattern of the same type + :return: + """ + + +class UnimodalityPattern(PatternInterface): + + def __init__(self, source_series: pd.Series, type: str, index, index_name: str = None): + """ + Initialize the UnimodalityPattern with the provided parameters. + + :param source_series: The source series to evaluate. + :param type: The type of the pattern. + :param location: The location of the pattern. + """ + self.source_series = source_series + self.type = type + self.index = index + self.index_name = index_name + + def visualize(self, ax): + """ + Visualize the unimodality pattern. + :return: + """ + ax.plot(self.source_series, label='Unimodality Pattern') + ax.axvline(x=self.index, color='r', linestyle='--', label='Location') + ax.legend() + ax.set_title(f'Unimodality Pattern: {self.type}') + + + def create_explanation_string(self, commonness_set, exceptions): + """ + Create an explanation string for the unimodality pattern. + :param commonness_set: The commonness set, where the pattern is common. 
+ :param exceptions: The exceptions dict, where the pattern is different from the commonness set or did not occur. + :return: + """ + return f"Unimodality Pattern: {commonness_set}, Exceptions: {exceptions}" + + def __eq__(self, other): + """ + Check if two UnimodalityPattern objects are equal. + :param other: Another UnimodalityPattern object. + :return: True if they are equal, False otherwise. + """ + if not isinstance(other, UnimodalityPattern): + return False + return (self.source_series.equals(other.source_series) and + self.type == other.type and + self.index == other.index and + self.index_name == other.index_name) + + + +if __name__ == '__main__': + data = np.random.normal(0, 1, 1000) + series = pd.Series(data) + pattern = UnimodalityPattern(series, 'Peak', 500) + fig, ax = plt.subplots() + pattern.visualize(ax) + plt.show() \ No newline at end of file From 39c6e1dc0e0c4ae0c53d1ceb0d26b2625e598de5 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Sat, 10 May 2025 15:32:31 +0300 Subject: [PATCH 08/27] Changed all evaluation methods to more sound ones, added more caching. 
--- requirements.txt | 4 +- .../metainsight_explainer/data_scope.py | 15 +- .../metainsight_mining.py | 4 +- .../pattern_evaluations.py | 150 +++++++++--------- .../metainsight_explainer/patterns.py | 76 ++++++++- 5 files changed, 166 insertions(+), 83 deletions(-) diff --git a/requirements.txt b/requirements.txt index b87196b..444a30c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ pandas~=2.2.3 matplotlib~=3.9.2 diptest scipy -scikit-learn \ No newline at end of file +scikit-learn +pymannkendall +cydets \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 89298f6..136f439 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -209,18 +209,19 @@ def __lt__(self, other): # We use the negative impact, since we want to use a max-heap but only have min-heap available return - self.impact < - other.impact - def compute_impact(self) -> float: + def compute_impact(self, cache) -> float: """ Computes the impact of the HDS. This is the sum of the impacts of all data scopes in the HDS. :return: The total impact of the HDS. 
""" impact = 0 - # with ThreadPoolExecutor() as executor: - # # Compute the impact of each data scope in parallel - # futures = [executor.submit(ds.compute_impact) for ds in self.data_scopes] - # for future in futures: - # impact += future.result() for ds in self.data_scopes: - impact += ds.compute_impact() + if ds in cache: + # Use the cached impact if available to avoid recomputation, since computing the impact + # is the single most expensive operation in the entire pipeline + impact += cache[ds] + else: + impact += ds.compute_impact() + cache[ds] = impact self.impact = impact return impact diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 641ad8b..386ea42 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -93,7 +93,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, :return: """ metainsight_candidates = [] - query_cache = {} + datascope_cache = {} pattern_cache = {} hdp_queue = PriorityQueue() @@ -136,7 +136,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, continue # Pruning 2: Discard HDS with extremely low impact - hds_impact = hdp.compute_impact() + hds_impact = hdp.compute_impact(datascope_cache) if hds_impact < MIN_IMPACT: # print(f"Pruning HDS for {base_ds} due to low impact ({hds_impact:.4f})") continue diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index f1a0a41..fddc80e 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -4,9 +4,10 @@ import numpy as np from diptest import diptest from scipy.stats import gaussian_kde, zscore -from sklearn.linear_model import LinearRegression -from sklearn.cluster import DBSCAN -from 
external_explainers.metainsight_explainer.patterns import UnimodalityPattern +from external_explainers.metainsight_explainer.patterns import UnimodalityPattern, TrendPattern, OutlierPattern, \ + CyclePattern +import pymannkendall as mk +from cydets.algorithm import detect_cycles class PatternType(Enum): @@ -18,6 +19,7 @@ class PatternType(Enum): UNIMODALITY = 2 TREND = 3 OUTLIER = 4 + CYCLE = 5 class PatternEvaluator: @@ -28,12 +30,33 @@ class PatternEvaluator: OUTLIER_ZSCORE_THRESHOLD = 2.0 # Z-score threshold for outlier detection TREND_SLOPE_THRESHOLD = 0.01 # Minimum absolute slope for trend detection + @staticmethod + def _is_time_series(series: pd.Series) -> bool: + """ + Checks if the series is a time series. + We consider a series to be a time series if its index is either a datetime index or an increasing integer index. + The second case is not always accurate, since an ordered series of numbers may not be a time series, but + we also can not discard the possibility that it is a time series. + :param series: The series to check. + :return: True if the series is a time series, False otherwise. + """ + if isinstance(series.index, pd.DatetimeIndex): + return True + elif np.issubdtype(series.index.dtype, np.number): + # Sort the index first, just in case the series it is not sorted, but it does have meaningful time intervals + series.sort_index(inplace=True) + # Check if the index is strictly increasing + return np.all(np.diff(series.index) > 0) + else: + return False + @staticmethod def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): """ - Evaluates if the series is unimodal using Hartigan's Dip test and returns the highlight. + Evaluates if the series is unimodal using Hartigan's Dip test. + If it is, finds the peak or valley. :param series: The series to evaluate. 
- :return: (is_unimodal, highlight) + :return: Tuple (is_unimodal, UnimodalityPattern or None if not unimodal) """ if isinstance(series, pd.Series): series = series.sort_index() @@ -72,48 +95,34 @@ def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): @staticmethod - def trend(series: pd.Series) -> (bool, Tuple[str, str]): + def trend(series: pd.Series) -> (bool, TrendPattern | None): """ Evaluates if a time series exhibits a significant trend (upward or downward). - Uses linear regression to find the slope. - Returns (True, highlight) if a trend is detected, (False, None) otherwise. - Highlight is (slope, 'Upward' or 'Downward'). + Uses the Mann-Kendall test to check for monotonic trends. + + :param series: The series to evaluate. + :return: Tuple (trend_detected, a Trend pattern object or None. None if no trend is detected) """ if len(series) < 2: - return False, (None, None) + return False, None - # Check if the series is a time series, or just a series of numbers - # We say a series is a time series if its index is either a datetime index or an increasing integer index - is_datetime_index = isinstance(series.index, pd.DatetimeIndex) - is_numeric_index = np.issubdtype(series.index.dtype, np.number) - if is_numeric_index: - series = series.sort_index() - # Check if the index is strictly increasing - is_increasing = np.all(np.diff(series.index) > 0) - else: - is_increasing = False + # Check if the series is a time series + if not PatternEvaluator._is_time_series(series): + return False, None - # We can't find trends in series that are not time series - - if not is_datetime_index and not is_increasing: - return False, (None, None) + # Use the Mann Kendall test to check for trend. 
+ mk_result = mk.original_test(series) + p_val = mk_result.p + # Reject or accept the null hypothesis + if p_val > 0.05 or mk_result.trend == 'no trend': + return False, None + else: + return True, TrendPattern(series, type=mk_result.trend, slope=mk_result.slope, intercept=mk_result.intercept) - # Create a simple linear model - X = np.arange(len(series)).reshape(-1, 1) # Independent variable (time index) - y = series.values # Dependent variable (data values) - model = LinearRegression() - model.fit(X, y) - slope = model.coef_[0] - - # Check if the slope is significant - if abs(slope) > PatternEvaluator.TREND_SLOPE_THRESHOLD: - trend_direction = 'Upward' if slope > 0 else 'Downward' - return True, (None, trend_direction) - else: - return False, (None, None) @staticmethod - def outlier(series: pd.Series) -> (bool, Tuple[str, str]): + def outlier(series: pd.Series) -> (bool, OutlierPattern): """ Evaluates if a series contains significant outliers. Uses the Z-score method. @@ -128,41 +137,38 @@ def outlier(series: pd.Series) -> (bool, Tuple[str, str]): # Find indices where Z-score exceeds the threshold outlier_indices = np.where(z_scores > PatternEvaluator.OUTLIER_ZSCORE_THRESHOLD)[0] + if len(outlier_indices) == 0: + return False, None + outlier_values = series.iloc[outlier_indices] + outlier_indexes = series.index[outlier_indices] + return True, OutlierPattern(series, outlier_indexes=outlier_indexes, outlier_values=outlier_values) + + + @staticmethod + def cycle(series: pd.Series) -> (bool, CyclePattern): + """ + Evaluates if a series exhibits cyclical patterns. + Uses the Cydets library to detect cycles. + :param series: The series to evaluate. 
+ :return: Tuple (is_cyclical, CyclePattern or None) + """ + if len(series) < 2: + return False, None + + # Check if the series is a time series + if not PatternEvaluator._is_time_series(series): + return False, None + + # Detect cycles using Cydets + try: + cycle_info = detect_cycles(series) + return True, CyclePattern(series, cycle_info) + # For some godforsaken reason, Cydets throws a ValueError when it fails to detect cycles, instead of + # returning None like it should. And so, we have this incredibly silly try/except block. + except ValueError: + return False, None + - if len(outlier_indices) > 0: - outlier_data_points = series.iloc[outlier_indices].values.tolist() - outlier_index = series.index[outlier_indices].tolist() - # If there are multiple outliers, use clustering and return the cluster means. - # This is more informative and easier to interpret than a list of raw outlier values. - if len(outlier_data_points) > 1: - # Reshape for clustering - outlier_data_points = np.array(outlier_data_points).reshape(-1, 1) - # Perform clustering - clustered = DBSCAN().fit_predict(outlier_data_points) - cluster_means = [] - cluster_indexes = [] - for cluster in np.unique(clustered): - if cluster != -1: - cluster_points = outlier_data_points[clustered == cluster] - cluster_mean = np.mean(cluster_points) - cluster_means.append(cluster_mean) - # Take the most common index of the cluster points to represent the cluster - cluster_index = outlier_index[clustered == cluster] - cluster_index = pd.Series(cluster_index).mode()[0] - cluster_indexes.append(cluster_index) - # If there are noise points, they will be labeled as -1 in DBSCAN. To us though, those are - # not noise points, but outliers. So we will return them as well (unlike the clustered points, - # their mean may be meaningless because they might be very far apart. 
- noise_points = outlier_data_points[clustered == -1] - if len(noise_points) > 0: - noise_points = noise_points.flatten().tolist() - cluster_means.extend(noise_points) - # Return the cluster centers as the highlight meaning "outliers around these values" - return True, (cluster_indexes, None) - - return True, ([outlier_index[0]], None) - else: - return False, (None, None) def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): """ @@ -177,5 +183,7 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): return self.trend(series) elif pattern_type == PatternType.OUTLIER: return self.outlier(series) + elif pattern_type == PatternType.CYCLE: + return self.cycle(series) else: raise ValueError(f"Unsupported pattern type: {pattern_type}") diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 3e0df61..06289f5 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -2,6 +2,7 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np +from typing import Literal class PatternInterface(ABC): """ @@ -36,12 +37,12 @@ def __eq__(self, other): class UnimodalityPattern(PatternInterface): - def __init__(self, source_series: pd.Series, type: str, index, index_name: str = None): + def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], index, index_name: str = None): """ Initialize the UnimodalityPattern with the provided parameters. :param source_series: The source series to evaluate. - :param type: The type of the pattern. + :param type: The type of the pattern. Either 'Peak' or 'Valley' is expected. :param location: The location of the pattern. 
""" self.source_series = source_series @@ -84,6 +85,77 @@ def __eq__(self, other): +class TrendPattern(PatternInterface): + + def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decreasing'], slope: float, intercept: float = 0): + """ + Initialize the Trend pattern with the provided parameters. + + :param source_series: The source series to evaluate. + :param type: The type of the pattern. + :param slope: The slope of the trend. + """ + self.source_series = source_series + self.type = type + self.slope = slope + self.intercept = intercept + + def visualize(self, ax): + pass + + def create_explanation_string(self, commonness_set, exceptions): + pass + + def __eq__(self, other): + pass + + +class OutlierPattern(PatternInterface): + + def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_values: pd.Series): + """ + Initialize the Outlier pattern with the provided parameters. + + :param source_series: The source series to evaluate. + :param outlier_indexes: The indexes of the outliers. + :param outlier_values: The values of the outliers. + """ + self.source_series = source_series + self.outlier_indexes = outlier_indexes + self.outlier_values = outlier_values + + def visualize(self, ax): + pass + + def create_explanation_string(self, commonness_set, exceptions): + pass + + def __eq__(self, other): + pass + + +class CyclePattern(PatternInterface): + + def __init__(self, source_series: pd.Series, cycles: pd.DataFrame): + """ + Initialize the Cycle pattern with the provided parameters. + + :param source_series: The source series to evaluate. + :param cycle_length: The length of the cycle. 
+ """ + self.source_series = source_series + self.cycles = cycles + + def visualize(self, ax): + pass + + def create_explanation_string(self, commonness_set, exceptions): + pass + + def __eq__(self, other): + pass + + if __name__ == '__main__': data = np.random.normal(0, 1, 1000) series = pd.Series(data) From 82df1339860c42223603747a29ac8b9c9a9fb601 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Sat, 10 May 2025 16:09:57 +0300 Subject: [PATCH 09/27] Fixed bugs in some pattern evaluation, added visualizations for trends and unimodalities. --- .../pattern_evaluations.py | 4 +- .../metainsight_explainer/patterns.py | 74 +++++++++++++------ 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index fddc80e..08bfad9 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -80,8 +80,8 @@ def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): return False, None max_value_index = peaks.index[0] if len(peaks) == 1 else None min_value_index = valleys.index[0] if len(valleys) == 1 else None - # If both are at the edges, we can't use them - if (max_value_index is not None and (max_value_index == series.index[0] or max_value_index == series.index[-1])) or \ + # If both are at the edges, this is more likely a trend than a unimodal pattern + if (max_value_index is not None and (max_value_index == series.index[0] or max_value_index == series.index[-1])) and \ (min_value_index is not None and (min_value_index == series.index[0] or min_value_index == series.index[-1])): return False, None index_name = series.index.name diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 06289f5..1ccfffe 100644 --- 
a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -10,7 +10,7 @@ class PatternInterface(ABC): """ @abstractmethod - def visualize(self, ax): + def visualize(self, plt_ax): """ Visualize the pattern. """ @@ -37,28 +37,35 @@ def __eq__(self, other): class UnimodalityPattern(PatternInterface): - def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], index, index_name: str = None): + def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], highlight_index): """ Initialize the UnimodalityPattern with the provided parameters. :param source_series: The source series to evaluate. :param type: The type of the pattern. Either 'Peak' or 'Valley' is expected. - :param location: The location of the pattern. + :param highlight_index: The index of the peak or valley. + :param index_name: The name of the index. """ self.source_series = source_series self.type = type - self.index = index - self.index_name = index_name + self.highlight_index = highlight_index + self.index_name = source_series.index.name if source_series.index.name else 'Index' - def visualize(self, ax): + def visualize(self, plt_ax): """ Visualize the unimodality pattern. 
:return: """ - ax.plot(self.source_series, label='Unimodality Pattern') - ax.axvline(x=self.index, color='r', linestyle='--', label='Location') - ax.legend() - ax.set_title(f'Unimodality Pattern: {self.type}') + plt_ax.plot(self.source_series) + plt_ax.set_xlabel(self.index_name) + plt_ax.set_ylabel('Value') + # Emphasize the peak or valley + if self.type.lower() == 'peak': + plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'ro', label='Peak') + elif self.type.lower() == 'valley': + plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'bo', label='Valley') + plt_ax.legend() + plt_ax.set_title(f'Unimodality Pattern: {self.type}') def create_explanation_string(self, commonness_set, exceptions): @@ -74,20 +81,21 @@ def __eq__(self, other): """ Check if two UnimodalityPattern objects are equal. :param other: Another UnimodalityPattern object. - :return: True if they are equal, False otherwise. + :return: True if they are equal, False otherwise. They are considered equal if they have the same type, + the same highlight index, and are on the same index. """ if not isinstance(other, UnimodalityPattern): return False - return (self.source_series.equals(other.source_series) and - self.type == other.type and - self.index == other.index and - self.index_name == other.index_name) + return (self.type == other.type and + self.highlight_index == other.highlight_index and + self.source_series.index == other.source_series.index) class TrendPattern(PatternInterface): - def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decreasing'], slope: float, intercept: float = 0): + def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decreasing'], + slope: float, intercept: float = 0): """ Initialize the Trend pattern with the provided parameters. 
@@ -100,14 +108,38 @@ def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decrea self.slope = slope self.intercept = intercept - def visualize(self, ax): - pass + def visualize(self, plt_ax): + """ + Visualize the trend pattern. + :param plt_ax: + :return: + """ + plt_ax.plot(self.source_series) + plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') + plt_ax.set_ylabel('Value') + x_numeric = np.arange(len(self.source_series)) + # Emphasize the trend + plt_ax.plot(self.source_series.index, self.slope * x_numeric + self.intercept, 'g--', + linewidth=2, + label='Increasing Trend' if self.type.lower() == 'increasing' else 'Decreasing Trend') + plt_ax.legend() + plt_ax.set_title(f'Trend Pattern: {self.type}') def create_explanation_string(self, commonness_set, exceptions): pass def __eq__(self, other): - pass + """ + Check if two TrendPattern objects are equal. + :param other: Another TrendPattern object. + :return: True if they are equal, False otherwise. They are considered equal if they have the same type + (increasing / decreasing) and are on the same index.
+ """ + if not isinstance(other, TrendPattern): + return False + # We do not compare the slope and intercept - we only care about the trend's direction (type) + return self.source_series.index == other.source_series.index and \ + self.type == other.type class OutlierPattern(PatternInterface): @@ -124,7 +156,7 @@ def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_ self.outlier_indexes = outlier_indexes self.outlier_values = outlier_values - def visualize(self, ax): + def visualize(self, plt_ax): pass def create_explanation_string(self, commonness_set, exceptions): @@ -146,7 +178,7 @@ def __init__(self, source_series: pd.Series, cycles: pd.DataFrame): self.source_series = source_series self.cycles = cycles - def visualize(self, ax): + def visualize(self, plt_ax): pass def create_explanation_string(self, commonness_set, exceptions): From 22af0139287953360c2e55e82e42fcf4ce6d027f Mon Sep 17 00:00:00 2001 From: Yuval Uner <yuval.uner@gmail.com> Date: Mon, 12 May 2025 19:45:34 +0300 Subject: [PATCH 10/27] Added plotting of all currently supported patterns. Added binning when there are too many values in a column. Added caching and runtime improvements.
--- .../metainsight_explainer/data_pattern.py | 16 +- .../metainsight_explainer/data_scope.py | 55 +++- .../metainsight_explainer/meta_insight.py | 16 +- .../metainsight_mining.py | 34 +-- .../pattern_evaluations.py | 8 +- .../metainsight_explainer/patterns.py | 238 ++++++++++++++---- 6 files changed, 277 insertions(+), 90 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index 18ec706..68c70cc 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -23,6 +23,7 @@ def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: self.pattern_type = pattern_type self.highlight = highlight self.pattern_cache = {} + self.hash = None def __eq__(self, other): if not isinstance(other, BasicDataPattern): @@ -49,12 +50,10 @@ def sim(self, other) -> bool: self.pattern_type != PatternType.NONE and self.pattern_type != PatternType.OTHER def __hash__(self): - data_scope_str = "".join([f"{k}: {v}" for k, v in self.data_scope.subspace.items()]) - highlight_string = "" - if self.highlight: - for h in self.highlight: - highlight_string += f"{h} " - return hash((data_scope_str, self.pattern_type, highlight_string)) + if self.hash is not None: + return self.hash + self.hash = hash((hash(self.data_scope), self.pattern_type, self.highlight)) + return self.hash def __repr__(self): return f"BasicDataPattern(ds={self.data_scope}, type='{self.pattern_type}', highlight={self.highlight})" @@ -68,10 +67,7 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt :param pattern_type: The type of the pattern to evaluate. 
""" # Apply subspace filters - filtered_df = df.copy() - for dim, value in data_scope.subspace.items(): - if value != '*': - filtered_df = filtered_df[filtered_df[dim] == value] + filtered_df = data_scope.apply_subspace() # Group by breakdown dimension and aggregate measure if data_scope.breakdown not in filtered_df.columns: diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 136f439..ef4720b 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -3,6 +3,7 @@ from scipy.special import kl_div from concurrent.futures import ThreadPoolExecutor import time +import re class DataScope: @@ -26,16 +27,45 @@ def __init__(self, source_df: pd.DataFrame, subspace: Dict[str, str], breakdown: self.subspace = subspace self.breakdown = breakdown self.measure = measure + self.hash = None def __hash__(self): + if self.hash is not None: + return self.hash # Need a hashable representation of subspace for hashing subspace_tuple = tuple(sorted(self.subspace.items())) if isinstance(self.subspace, dict) else tuple( self.subspace) - return hash((subspace_tuple, self.breakdown, self.measure)) + self.hash = hash((subspace_tuple, self.breakdown, self.measure)) + return self.hash def __repr__(self): return f"DataScope(subspace={self.subspace}, breakdown='{self.breakdown}', measure={self.measure})" + def __eq__(self, other): + if not isinstance(other, DataScope): + return False + return (self.subspace == other.subspace and + self.breakdown == other.breakdown and + self.measure == other.measure) + + def apply_subspace(self) -> pd.DataFrame: + """ + Applies the subspace filters to the source DataFrame and returns the filtered DataFrame. 
+ """ + filtered_df = self.source_df.copy() + for dim, value in self.subspace.items(): + if value != '*': + pattern = rf"^.+<= {dim} <= .+$" + pattern_matched = re.match(pattern, str(value)) + if pattern_matched: + # If the value is a range, split it and filter accordingly + split = re.split(r"<=|>=|<|>", value) + lower_bound, dim, upper_bound = float(split[0].strip()), split[1].strip(), float(split[2].strip()) + filtered_df = filtered_df[(filtered_df[dim] >= lower_bound) & (filtered_df[dim] <= upper_bound)] + else: + filtered_df = filtered_df[filtered_df[dim] == value] + return filtered_df + def _subspace_extend(self) -> List['DataScope']: """ Extends the subspace of the DataScope into its sibling group by the dimension dim_to_extend. @@ -47,6 +77,17 @@ def _subspace_extend(self) -> List['DataScope']: if isinstance(self.subspace, dict): for dim_to_extend in self.subspace.keys(): unique_values = self.source_df[dim_to_extend].dropna().unique() + # If there are too many unique values, we bin them if it's a numeric column, or only choose the + # top 10 most frequent values if it's a categorical column + if len(unique_values) > 10: + if self.source_df[dim_to_extend].dtype in ['int64', 'float64']: + # Bin the numeric column + bins = pd.cut(self.source_df[dim_to_extend], bins=10, retbins=True)[1] + unique_values = [f"{bins[i]} <= {dim_to_extend} <= {bins[i + 1]}" for i in range(len(bins) - 1)] + else: + # Choose the top 10 most frequent values + top_values = self.source_df[dim_to_extend].value_counts().nlargest(10).index.tolist() + unique_values = [v for v in unique_values if v in top_values] for value in unique_values: # Ensure it's a sibling if self.subspace.get(dim_to_extend) != value: @@ -128,10 +169,7 @@ def compute_impact(self, precomputed_source_df: pd.DataFrame = None) -> float: raise ValueError(f"Impact column '{impact_col}' not found in source DataFrame.") # Perform subspace filtering - filtered_df = self.source_df.copy() - for dim, value in 
self.subspace.items(): - if value != '*': - filtered_df = filtered_df[filtered_df[dim] == value] + filtered_df = self.apply_subspace() # Group by breakdown dimension and aggregate measure if self.breakdown not in filtered_df.columns: # Cannot group by breakdown if it's not in the filtered data @@ -219,9 +257,10 @@ def compute_impact(self, cache) -> float: if ds in cache: # Use the cached impact if available to avoid recomputation, since computing the impact # is the single most expensive operation in the entire pipeline - impact += cache[ds] + ds_impact = cache[ds] else: - impact += ds.compute_impact() - cache[ds] = impact + ds_impact = ds.compute_impact() + cache[ds] = ds_impact + impact += ds_impact self.impact = impact return impact diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 0f5707c..f9bef47 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -183,7 +183,7 @@ def calculate_conciseness(self) -> float: # Ensure conciseness is within a reasonable range, e.g., [0, 1] return conciseness - def compute_score(self, impact_measure=None) -> float: + def compute_score(self, cache) -> float: """ Computes the score of the MetaInsight. 
The score is the multiple of the conciseness of the MetaInsight and the impact score of the HDS @@ -193,7 +193,7 @@ def compute_score(self, impact_measure=None) -> float: """ conciseness = self.calculate_conciseness() # If the impact has already been computed, use it - hds_score = self.hdp.impact if self.hdp.impact != 0 else self.hdp.compute_impact(impact_measure=impact_measure) + hds_score = self.hdp.impact if self.hdp.impact != 0 else self.hdp.compute_impact(cache) self.score = conciseness * hds_score return self.score @@ -227,3 +227,15 @@ def compute_pairwise_overlap_score(self, other: 'MetaInsight') -> float: raise ValueError("The other object must be an instance of MetaInsight.") overlap_ratio = self.compute_pairwise_overlap_ratio(other) return min(self.score, other.score) * overlap_ratio + + def visualize(self, plt_ax) -> None: + """ + Visualize the metainsight - both its commonness and exceptions. + :param plt_ax: The matplotlib axis to plot on. + :return: + """ + len_commoness = len(self.commonness_set) + len_exceptions = len(self.exceptions) + # Split the axes in 2: one for commonness, one for exceptions + if len_commoness >= 1 and len_exceptions >= 1: + pass diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 386ea42..a02c6c9 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -1,10 +1,9 @@ import itertools -from typing import List, Dict, Tuple +from typing import List, Tuple import numpy as np from queue import PriorityQueue import pandas as pd -from concurrent.futures import ThreadPoolExecutor from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern, HomogenousDataPattern from external_explainers.metainsight_explainer.meta_insight import (MetaInsight, @@ -86,10 +85,9 @@ def mine_metainsights(self, source_df: 
pd.DataFrame, """ The main function to mine MetaInsights. Mines metainsights from the given data frame based on the provided dimensions, measures, and impact measure. - :param source_df: - :param dimensions: - :param measures: - :param impact_measure: + :param source_df: The source DataFrame to mine MetaInsights from. + :param dimensions: The dimensions to consider for mining. + :param measures: The measures to consider for mining. :return: """ metainsight_candidates = [] @@ -107,6 +105,17 @@ def mine_metainsights(self, source_df: pd.DataFrame, # Example: Generate data scopes with one filter in subspace and one breakdown for filter_dim in dimensions: unique_values = source_df[filter_dim].dropna().unique() + # If there are too many unique values, we bin them if it's a numeric column, or only choose the + # top 10 most frequent values if it's a categorical column + if len(unique_values) > 10: + if source_df[filter_dim].dtype in ['int64', 'float64']: + # Bin the numeric column + bins = pd.cut(source_df[filter_dim], bins=10, retbins=True)[1] + unique_values = [f"{bins[i]} <= {filter_dim} <= {bins[i + 1]}" for i in range(len(bins) - 1)] + else: + # Choose the top 10 most frequent values + top_values = source_df[filter_dim].value_counts().nlargest(10).index.tolist() + unique_values = [v for v in unique_values if v in top_values] for value in unique_values: for breakdown_dim in dimensions: if breakdown_dim != filter_dim: # Breakdown should be different from filter dim @@ -158,7 +167,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, if metainsight: # Calculate and assign the score - metainsight.compute_score() + metainsight.compute_score(datascope_cache) metainsight_candidates.append(metainsight) return self.rank_metainsights(metainsight_candidates) @@ -169,9 +178,9 @@ def mine_metainsights(self, source_df: pd.DataFrame, df = pd.read_csv("C:\\Users\\Yuval\\PycharmProjects\\pd-explain\\Examples\\Datasets\\adult.csv") df = df.sample(2500, random_state=42) # Sample 
5000 rows for testing - # Define dimensions, measures, and impact measure + # Define dimensions, measures dimensions = ['age', 'education-num'] - measures = [('age', 'mean'), ('capital-gain', 'mean'), ('capital-loss', 'mean')] + measures = [('capital-gain', 'mean'), ('capital-loss', 'mean')] # Run the mining process import time @@ -189,12 +198,5 @@ def mine_metainsights(self, source_df: pd.DataFrame, if top_metainsights: for i, mi in enumerate(top_metainsights): print(f"Rank {i + 1}: {mi}") - # You can further print details about commonness and exceptions if needed - # print(" Commonness:") - # for c in mi.commonness_set: - # print(f" - {len(c)} patterns, Type: {c[0].type}, Highlight: {c[0].highlight}") - # print(" Exceptions:") - # for e in mi.exceptions: - # print(f" - {e}") else: print("No MetaInsights found.") diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index 08bfad9..0c9b332 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -86,9 +86,9 @@ def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): return False, None index_name = series.index.name if max_value_index: - return True, UnimodalityPattern(series, 'Peak', max_value_index, index_name=index_name) + return True, UnimodalityPattern(series, 'Peak', max_value_index) elif min_value_index: - return True, UnimodalityPattern(series, 'Valley', min_value_index, index_name=index_name) + return True, UnimodalityPattern(series, 'Valley', min_value_index) else: return False, None @@ -162,7 +162,9 @@ def cycle(series: pd.Series) -> (bool, CyclePattern): # Detect cycles using Cydets try: cycle_info = detect_cycles(series) - return True, CyclePattern(series, cycle_info) + if cycle_info is not None and len(cycle_info) > 0: + return True, CyclePattern(series, cycle_info) + return False, None # 
For some godforsaken reason, Cydets throws a ValueError when it fails to detect cycles, instead of # returning None like it should. And so, we have this incredibly silly try/except block. except ValueError: diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 1ccfffe..1eeaaf8 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -10,29 +10,43 @@ class PatternInterface(ABC): """ @abstractmethod - def visualize(self, plt_ax): + def visualize(self, plt_ax) -> None: """ Visualize the pattern. """ raise NotImplementedError("Subclasses must implement this method.") @abstractmethod - def create_explanation_string(self, commonness_set, exceptions): + def __eq__(self, other) -> bool: """ - Create an explanation string for the pattern. - :param commonness_set: The commonness set, where the pattern is common. - :param exceptions: The exceptions dict, where the pattern is different from the commonness set or did not occur. + Check if two patterns are equal + :param other: Another pattern of the same type :return: """ raise NotImplementedError("Subclasses must implement this method.") @abstractmethod - def __eq__(self, other): + def __repr__(self) -> str: """ - Check if two patterns are equal - :param other: Another pattern of the same type - :return: + String representation of the pattern. + """ + raise NotImplementedError("Subclasses must implement this method.") + + + @abstractmethod + def __str__(self) -> str: + """ + String representation of the pattern. """ + raise NotImplementedError("Subclasses must implement this method.") + + + @abstractmethod + def __hash__(self) -> int: + """ + Hash representation of the pattern. 
+ """ + raise NotImplementedError("Subclasses must implement this method.") class UnimodalityPattern(PatternInterface): @@ -50,8 +64,9 @@ def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], hi self.type = type self.highlight_index = highlight_index self.index_name = source_series.index.name if source_series.index.name else 'Index' + self.hash = None - def visualize(self, plt_ax): + def visualize(self, plt_ax) -> None: """ Visualize the unimodality pattern. :return: @@ -65,30 +80,44 @@ def visualize(self, plt_ax): elif self.type.lower() == 'valley': plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'bo', label='Valley') plt_ax.legend() - plt_ax.set_title(f'Unimodality Pattern: {self.type}') - - def create_explanation_string(self, commonness_set, exceptions): - """ - Create an explanation string for the unimodality pattern. - :param commonness_set: The commonness set, where the pattern is common. - :param exceptions: The exceptions dict, where the pattern is different from the commonness set or did not occur. - :return: - """ - return f"Unimodality Pattern: {commonness_set}, Exceptions: {exceptions}" - def __eq__(self, other): + def __eq__(self, other) -> bool: """ Check if two UnimodalityPattern objects are equal. :param other: Another UnimodalityPattern object. :return: True if they are equal, False otherwise. They are considered equal if they have the same type, - the same highlight index, and are on the same index. + the same highlight index. """ if not isinstance(other, UnimodalityPattern): return False return (self.type == other.type and - self.highlight_index == other.highlight_index and - self.source_series.index == other.source_series.index) + self.highlight_index == other.highlight_index) + + + def __repr__(self) -> str: + """ + String representation of the UnimodalityPattern. + :return: A string representation of the UnimodalityPattern. 
+ """ + return f"UnimodalityPattern(type={self.type}, highlight_index={self.highlight_index})" + + def __str__(self) -> str: + """ + String representation of the UnimodalityPattern. + :return: A string representation of the UnimodalityPattern. + """ + return f"UnimodalityPattern(type={self.type}, highlight_index={self.highlight_index})" + + def __hash__(self) -> int: + """ + Hash representation of the UnimodalityPattern. + :return: A hash representation of the UnimodalityPattern. + """ + if self.hash is not None: + return self.hash + self.hash = hash(f"UnimodalityPattern(type={self.type}, highlight_index={self.highlight_index})") + return self.hash @@ -107,8 +136,9 @@ def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decrea self.type = type self.slope = slope self.intercept = intercept + self.hash = None - def visualize(self, plt_ax): + def visualize(self, plt_ax) -> None: """ Visualize the trend pattern. :param plt_ax: @@ -123,23 +153,43 @@ def visualize(self, plt_ax): linewidth=2, label='Increasing Trend' if self.type.lower() == 'Increasing' else 'Decreasing Trend') plt_ax.legend() - plt_ax.set_title(f'Trend Pattern: {self.type}') - - def create_explanation_string(self, commonness_set, exceptions): - pass - def __eq__(self, other): + def __eq__(self, other) -> bool: """ Check if two TrendPattern objects are equal. :param other: Another TrendPattern object. :return: True if they are equal, False otherwise. They are considered equal if they have the same type - (increasing / decreasing) and are on the same index. + (increasing / decreasing) (we trust that comparisons will be done on the same series). """ if not isinstance(other, TrendPattern): return False # We do not compare the slope and intercept - we only ca - return self.source_series.index == other.source_series.index and \ - self.type == other.type + return self.type == other.type + + + def __repr__(self) -> str: + """ + String representation of the TrendPattern. 
+ :return: A string representation of the TrendPattern. + """ + return f"TrendPattern(type={self.type})" + + def __str__(self) -> str: + """ + String representation of the TrendPattern. + :return: A string representation of the TrendPattern. + """ + return f"TrendPattern(type={self.type})" + + def __hash__(self) -> int: + """ + Hash representation of the TrendPattern. + :return: A hash representation of the TrendPattern. + """ + if self.hash is not None: + return self.hash + self.hash = hash(f"TrendPattern(type={self.type})") + return self.hash class OutlierPattern(PatternInterface): @@ -155,15 +205,57 @@ def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_ self.source_series = source_series self.outlier_indexes = outlier_indexes self.outlier_values = outlier_values + self.hash = None - def visualize(self, plt_ax): - pass + def visualize(self, plt_ax) -> None: + """ + Visualize the outlier pattern. + :param plt_ax: + :return: + """ + plt_ax.scatter(self.source_series.index, self.source_series, label='Regular Data Point') + plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') + plt_ax.set_ylabel('Value') + # Emphasize the outliers + plt_ax.scatter(self.outlier_indexes, self.outlier_values, color='red', label='Outliers') + plt_ax.legend() - def create_explanation_string(self, commonness_set, exceptions): - pass def __eq__(self, other): - pass + """ + Check if two OutlierPattern objects are equal. + :param other: Another OutlierPattern object. + :return: True if they are equal, False otherwise. They are considered equal if the index set of one is a subset + of the other or vice versa. + """ + if not isinstance(other, OutlierPattern): + return False + return self.outlier_indexes.isin(other.outlier_indexes).all() or \ + other.outlier_indexes.isin(self.outlier_indexes).all() + + def __repr__(self) -> str: + """ + String representation of the OutlierPattern. 
+ :return: A string representation of the OutlierPattern. + """ + return f"OutlierPattern(outlier_indexes={self.outlier_indexes})" + + def __str__(self) -> str: + """ + String representation of the OutlierPattern. + :return: A string representation of the OutlierPattern. + """ + return f"OutlierPattern(outlier_indexes={self.outlier_indexes})" + + def __hash__(self) -> int: + """ + Hash representation of the OutlierPattern. + :return: A hash representation of the OutlierPattern. + """ + if self.hash is not None: + return self.hash + self.hash = hash(f"OutlierPattern(outlier_indexes={self.outlier_indexes})") + return self.hash class CyclePattern(PatternInterface): @@ -173,25 +265,69 @@ def __init__(self, source_series: pd.Series, cycles: pd.DataFrame): Initialize the Cycle pattern with the provided parameters. :param source_series: The source series to evaluate. - :param cycle_length: The length of the cycle. + :param cycles: The cycles detected in the series. """ self.source_series = source_series + # Cycles is a dataframe with the columns: t_start, t_end, t_minimum, doc, duration self.cycles = cycles + self.hash = None def visualize(self, plt_ax): - pass - - def create_explanation_string(self, commonness_set, exceptions): - pass + """ + Visualize the cycle pattern. + :param plt_ax: + :return: + """ + plt_ax.plot(self.source_series) + plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') + plt_ax.set_ylabel('Value') + i = 1 + # Emphasize the cycles, and alternate colors + colors = ['red', 'blue', 'green', 'orange', 'purple'] + color_index = 0 + for _, cycle in self.cycles.iterrows(): + plt_ax.axvspan(cycle['t_start'], cycle['t_end'], color=colors[color_index], alpha=0.5, label=f'Cycle {i}') + i += 1 + color_index = (color_index + 1) % len(colors) + plt_ax.legend() def __eq__(self, other): - pass + """ + Check if two CyclePattern objects are equal. + :param other: + :return: True if they are equal, False otherwise. 
They are considered equal if the cycles of one are a + subset of the other or vice versa. + """ + if not isinstance(other, CyclePattern): + return False + return self.cycles.isin(other.cycles).all().all() or \ + other.cycles.isin(self.cycles).all().all() + def __repr__(self) -> str: + """ + String representation of the CyclePattern. + :return: A string representation of the CyclePattern. + """ + return f"CyclePattern(cycles={self.cycles})" -if __name__ == '__main__': - data = np.random.normal(0, 1, 1000) - series = pd.Series(data) - pattern = UnimodalityPattern(series, 'Peak', 500) - fig, ax = plt.subplots() - pattern.visualize(ax) - plt.show() \ No newline at end of file + def __str__(self) -> str: + """ + String representation of the CyclePattern. + :return: A string representation of the CyclePattern. + """ + return f"CyclePattern(cycles={self.cycles})" + + def __hash__(self) -> int: + """ + Hash representation of the CyclePattern. + :return: A hash representation of the CyclePattern. + """ + if self.hash is not None: + return self.hash + # Create a hashable representation of the key cycle properties + if len(self.cycles) == 0: + return hash("empty_cycle") + # Use a tuple of tuples for cycle start/end times + cycle_tuples = tuple((row['t_start'], row['t_end']) for _, row in self.cycles.iterrows()) + self.hash = hash(cycle_tuples) + return self.hash \ No newline at end of file From c23a220e6f6b7694a2e45f00d2856e480d6c08a7 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Mon, 12 May 2025 21:22:10 +0300 Subject: [PATCH 11/27] Optimized runtimes - with expected use-cases, code now takes about 1-3 seconds to run (compared to previous over 30). 
--- .../metainsight_explainer/data_scope.py | 2 - .../metainsight_mining.py | 4 +- .../pattern_evaluations.py | 68 +++++++++++++------ .../metainsight_explainer/patterns.py | 6 +- 4 files changed, 53 insertions(+), 27 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index ef4720b..fe8e42d 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -1,8 +1,6 @@ import pandas as pd from typing import Dict, List, Tuple from scipy.special import kl_div -from concurrent.futures import ThreadPoolExecutor -import time import re diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index a02c6c9..3fbde04 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -176,10 +176,10 @@ def mine_metainsights(self, source_df: pd.DataFrame, if __name__ == "__main__": # Create a sample Pandas DataFrame (similar to the paper's example) df = pd.read_csv("C:\\Users\\Yuval\\PycharmProjects\\pd-explain\\Examples\\Datasets\\adult.csv") - df = df.sample(2500, random_state=42) # Sample 5000 rows for testing + df = df.sample(5000, random_state=42) # Sample 5000 rows for testing # Define dimensions, measures - dimensions = ['age', 'education-num'] + dimensions = ['age', 'education-num', 'marital-status'] measures = [('capital-gain', 'mean'), ('capital-loss', 'mean')] # Run the mining process diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index 0c9b332..d911ba3 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ 
-1,13 +1,13 @@ from enum import Enum -from typing import List, Dict, Tuple import pandas as pd import numpy as np from diptest import diptest -from scipy.stats import gaussian_kde, zscore +from scipy.stats import zscore from external_explainers.metainsight_explainer.patterns import UnimodalityPattern, TrendPattern, OutlierPattern, \ CyclePattern import pymannkendall as mk from cydets.algorithm import detect_cycles +from singleton_decorator import singleton class PatternType(Enum): @@ -22,16 +22,20 @@ class PatternType(Enum): CYCLE = 5 +@singleton class PatternEvaluator: """ A class to evaluate different patterns in a series. """ - OUTLIER_ZSCORE_THRESHOLD = 2.0 # Z-score threshold for outlier detection - TREND_SLOPE_THRESHOLD = 0.01 # Minimum absolute slope for trend detection + def __init__(self): + self.pattern_cache = {} + self.OUTLIER_ZSCORE_THRESHOLD = 2.0 # Z-score threshold for outlier detection + self.TREND_SLOPE_THRESHOLD = 0.01 # Minimum absolute slope for trend detection - @staticmethod - def _is_time_series(series: pd.Series) -> bool: + + + def _is_time_series(self, series: pd.Series) -> bool: """ Checks if the series is a time series. We consider a series to be a time series if its index is either a datetime index or an increasing integer index. @@ -50,8 +54,8 @@ def _is_time_series(series: pd.Series) -> bool: else: return False - @staticmethod - def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): + + def unimodality(self, series: pd.Series) -> (bool, UnimodalityPattern | None): """ Evaluates if the series is unimodal using Hartigan's Dip test. If it is, finds the peak or valley. @@ -94,8 +98,7 @@ def unimodality(series: pd.Series) -> (bool, UnimodalityPattern | None): - @staticmethod - def trend(series: pd.Series) -> (bool, TrendPattern | None): + def trend(self, series: pd.Series) -> (bool, TrendPattern | None): """ Evaluates if a time series exhibits a significant trend (upward or downward). 
Uses the Mann-Kendall test to check for monotonic trends. @@ -107,7 +110,7 @@ def trend(series: pd.Series) -> (bool, TrendPattern | None): return False, None # Check if the series is a time series - if not PatternEvaluator._is_time_series(series): + if not self._is_time_series(series): return False, None # Use the Mann Kendall test to check for trend. @@ -121,8 +124,7 @@ def trend(series: pd.Series) -> (bool, TrendPattern | None): - @staticmethod - def outlier(series: pd.Series) -> (bool, OutlierPattern): + def outlier(self, series: pd.Series) -> (bool, OutlierPattern): """ Evaluates if a series contains significant outliers. Uses the Z-score method. @@ -136,7 +138,7 @@ def outlier(series: pd.Series) -> (bool, OutlierPattern): z_scores = np.abs(zscore(series.dropna())) # Find indices where Z-score exceeds the threshold - outlier_indices = np.where(z_scores > PatternEvaluator.OUTLIER_ZSCORE_THRESHOLD)[0] + outlier_indices = np.where(z_scores > self.OUTLIER_ZSCORE_THRESHOLD)[0] if len(outlier_indices) == 0: return False, None outlier_values = series.iloc[outlier_indices] @@ -144,8 +146,7 @@ def outlier(series: pd.Series) -> (bool, OutlierPattern): return True, OutlierPattern(series, outlier_indexes=outlier_indexes, outlier_values=outlier_values) - @staticmethod - def cycle(series: pd.Series) -> (bool, CyclePattern): + def cycle(self, series: pd.Series) -> (bool, CyclePattern): """ Evaluates if a series exhibits cyclical patterns. Uses the Cydets library to detect cycles. 
@@ -155,8 +156,25 @@ def cycle(series: pd.Series) -> (bool, CyclePattern): if len(series) < 2: return False, None + # Ensure the series has enough variability to detect cycles + if series.std() < 1e-10 or (series.max() - series.min()) < 1e-8: + return False, None + + # Quick pre-filtering using autocorrelation (much faster than full detection) + # Suppress the specific divide-by-zero warnings during autocorrelation calculation + with np.errstate(divide='ignore', invalid='ignore'): + # Quick pre-filtering using autocorrelation + if len(series) >= 20: + # Handle possible NaN results from autocorrelation + try: + autocorr = pd.Series(series.values).autocorr(lag=len(series) // 4) + if pd.isna(autocorr) or abs(autocorr) < 0.3: # Check for NaN and low correlation + return False, None + except (ValueError, ZeroDivisionError): + return False, None + # Check if the series is a time series - if not PatternEvaluator._is_time_series(series): + if not self._is_time_series(series): return False, None # Detect cycles using Cydets @@ -179,13 +197,21 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): :param pattern_type: The type of the pattern to evaluate. 
:return: (is_valid, highlight) """ + series_hash = hash(tuple(series.values)) + cache_key = (series_hash, pattern_type) + + if cache_key in self.pattern_cache: + return self.pattern_cache[cache_key] + if pattern_type == PatternType.UNIMODALITY: - return self.unimodality(series) + result = self.unimodality(series) elif pattern_type == PatternType.TREND: - return self.trend(series) + result = self.trend(series) elif pattern_type == PatternType.OUTLIER: - return self.outlier(series) + result = self.outlier(series) elif pattern_type == PatternType.CYCLE: - return self.cycle(series) + result = self.cycle(series) else: raise ValueError(f"Unsupported pattern type: {pattern_type}") + self.pattern_cache[cache_key] = result + return result diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 1eeaaf8..f1074ba 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -271,6 +271,7 @@ def __init__(self, source_series: pd.Series, cycles: pd.DataFrame): # Cycles is a dataframe with the columns: t_start, t_end, t_minimum, doc, duration self.cycles = cycles self.hash = None + self._cycle_tuples = frozenset((row['t_start'], row['t_end']) for _, row in cycles.iterrows()) def visualize(self, plt_ax): """ @@ -300,8 +301,9 @@ def __eq__(self, other): """ if not isinstance(other, CyclePattern): return False - return self.cycles.isin(other.cycles).all().all() or \ - other.cycles.isin(self.cycles).all().all() + + # Use precomputed cycle tuples instead of computing them each time + return self._cycle_tuples.issubset(other._cycle_tuples) or other._cycle_tuples.issubset(self._cycle_tuples) def __repr__(self) -> str: """ From 45f1d7b7ef033c63271acc733f5b3bb4aff9bfb0 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 13 May 2025 22:49:00 +0300 Subject: [PATCH 12/27] Added visualization of meta-insights as a whole (also for groups 
of meta-insights). --- .../metainsight_explainer/data_pattern.py | 3 +- .../metainsight_explainer/data_scope.py | 27 +++ .../metainsight_explainer/meta_insight.py | 155 +++++++++++++++++- .../metainsight_mining.py | 67 +++++++- .../pattern_evaluations.py | 13 +- .../metainsight_explainer/patterns.py | 33 ++-- 6 files changed, 260 insertions(+), 38 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index 68c70cc..6382207 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -3,6 +3,7 @@ from external_explainers.metainsight_explainer.data_scope import DataScope, HomogenousDataScope from external_explainers.metainsight_explainer.pattern_evaluations import PatternEvaluator, PatternType +from external_explainers.metainsight_explainer.patterns import PatternInterface class BasicDataPattern: @@ -11,7 +12,7 @@ class BasicDataPattern: Contains 3 elements: data scope, type (interpretation type) and highlight. """ - def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: str | None): + def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: PatternInterface | None): """ Initialize the BasicDataPattern with the provided data scope, type and highlight. diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index fe8e42d..bc199cf 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -196,6 +196,33 @@ def compute_impact(self, precomputed_source_df: pd.DataFrame = None) -> float: impact = row_proportion * kl_divergence return impact + def create_query_string(self, df_name: str = None) -> str: + """ + Create a query string for the data scope. 
+ :param df_name: The name of the DataFrame to use in the query string. + :return: + """ + if df_name is None: + df_name = self.source_df.name if self.source_df.name else "df" + subspace_where_string = [] + for dim, value in self.subspace.items(): + # If the value is a range, we can just add it as is + pattern = rf"^.+<= {dim} <= .+$" + pattern_matched = re.match(pattern, str(value)) + if pattern_matched: + subspace_where_string.append(value) + else: + # Otherwise, we need to add it as an equality string + subspace_where_string.append(f"{dim} == '{value}'") + subspace_where_string = 'WHERE ' + ' AND '.join(subspace_where_string) + measures_select_string = f'SELECT {self.measure[1].upper()}({self.measure[0]})' + breakdown_groupby_string = f"GROUP BY {self.breakdown}" + query_string = f"{measures_select_string} FROM {df_name} {subspace_where_string} {breakdown_groupby_string}" + return query_string + + + + class HomogenousDataScope: diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index f9bef47..579a267 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -1,6 +1,10 @@ from collections import defaultdict from typing import List, Dict +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec +import textwrap + import math from external_explainers.metainsight_explainer.data_pattern import HomogenousDataPattern @@ -24,6 +28,7 @@ def __init__(self, hdp: HomogenousDataPattern, commonness_threshold: float = COMMONNESS_THRESHOLD, balance_parameter: float = BALANCE_PARAMETER, actionability_regularizer_param: float = ACTIONABILITY_REGULARIZER_PARAM, + source_name: str = None, ): """ :param hdp: list of BasicDataPattern objects @@ -37,10 +42,27 @@ def __init__(self, hdp: HomogenousDataPattern, self.commonness_threshold = commonness_threshold self.balance_parameter = 
balance_parameter self.actionability_regularizer_param = actionability_regularizer_param + self.source_name = source_name if source_name else "df" + self.hash = None def __repr__(self): return f"MetaInsight(score={self.score:.4f}, #HDP={len(self.hdp)}, #Commonness={len(self.commonness_set)}, #Exceptions={len(self.exceptions)})" + def __hash__(self): + if self.hash is not None: + return self.hash + self.hash = 0 + for commonness in self.commonness_set: + for pattern in commonness: + self.hash += pattern.__hash__() + return self.hash + + + def __eq__(self, other): + if not isinstance(other, MetaInsight): + return False + return self.commonness_set == other.commonness_set + @staticmethod def categorize_exceptions(commonness_set, exceptions): """ @@ -228,14 +250,129 @@ def compute_pairwise_overlap_score(self, other: 'MetaInsight') -> float: overlap_ratio = self.compute_pairwise_overlap_ratio(other) return min(self.score, other.score) * overlap_ratio - def visualize(self, plt_ax) -> None: + + def _create_commonness_set_title(self, commonness_set: List[BasicDataPattern]) -> str: """ - Visualize the metainsight - both its commonness and exceptions. - :param plt_ax: The matplotlib axis to plot on. - :return: + Create a title for the commonness set based on the patterns it contains. + :param commonness_set: A list of BasicDataPattern objects. + :return: A string representing the title for the commonness set. """ - len_commoness = len(self.commonness_set) - len_exceptions = len(self.exceptions) - # Split the axes in 2: one for commonness, one for exceptions - if len_commoness >= 1 and len_exceptions >= 1: - pass + if not commonness_set: + return "No Patterns" + title = "" + # Check the type of the first pattern in the set. All patterns in the set should be of the same type. 
+ pattern_type = commonness_set[0].pattern_type + if pattern_type == PatternType.UNIMODALITY: + title += "Common unimodality detected " + elif pattern_type == PatternType.TREND: + title += "Common trend detected " + elif pattern_type == PatternType.OUTLIER: + title += "Common outliers detected " + elif pattern_type == PatternType.CYCLE: + title += "Common cycles detected " + # Find the common subspace of the patterns in the set + # First, get the data scope of all of the patterns in the set + data_scopes = [pattern.data_scope for pattern in commonness_set] + subspaces = [datascope.subspace for datascope in data_scopes] + # Now, find the common subspace they share. + shared_subspace = set(subspaces[0].keys()) + for subspace in subspaces[1:]: + shared_subspace.intersection_update(subspace.keys()) + title += f"for over {self.commonness_threshold * 100}% of values of {', '.join(shared_subspace)}, " + breakdowns = set([datascope.breakdown for datascope in data_scopes]) + measures = set([datascope.measure for datascope in data_scopes]) + measures_str = [] + for measure in measures: + if isinstance(measure, tuple): + measures_str.append(f"{{{measure[0]}: {measure[1]}}}") + else: + measures_str.append(measure) + title += f"when grouping by {', '.join(breakdowns)} and aggregating by {', '.join(measures_str)}" + title = textwrap.wrap(title, 70) + title = "\n".join(title) + return title + + def visualize_commonesses(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: + """ + Visualize only the commonness sets of the metainsight, with each set in its own column. + Within each column, patterns are arranged in a grid with at most 3 patterns per column. 
+ + :param fig: Optional figure to plot on (or create a new one if None) + :param subplot_spec: Optional subplot specification to plot within + :param figsize: Figure size if creating a new figure + :return: The figure with visualization + """ + # Create figure if not provided + if fig is None: + fig = plt.figure(figsize=figsize) + + # Only proceed if there are commonness sets + if not self.commonness_set: + return fig + + # Create the main grid with one column per commonness set + num_commonness_sets = len(self.commonness_set) + + if subplot_spec is not None: + # Use the provided subplot area + outer_grid = gridspec.GridSpecFromSubplotSpec(1, num_commonness_sets, + subplot_spec=subplot_spec, + wspace=0.6, hspace=0.4) + else: + # Use the entire figure + outer_grid = gridspec.GridSpec(1, num_commonness_sets, figure=fig, wspace=0.6, hspace=0.4) + + # For each commonness set + for i, patterns in enumerate(self.commonness_set): + # Calculate how many sub-columns needed for this set + num_patterns = len(patterns) + num_cols = math.ceil(num_patterns / 3) # At most 3 patterns per column + max_patterns_per_col = min(3, math.ceil(num_patterns / num_cols)) + + # Create a sub-grid for this commonness set's title and patterns + set_grid = gridspec.GridSpecFromSubplotSpec( + max_patterns_per_col + 1, # Title row + pattern rows + num_cols, + subplot_spec=outer_grid[i], + height_ratios=[0.2] + [1] * max_patterns_per_col, # Title row smaller + hspace=1.5, # Increased spacing between rows + wspace=0.5, # Increased spacing between columns + ) + + # Add the set title spanning all columns in the first row + title_ax = fig.add_subplot(set_grid[0, :]) + set_title = self._create_commonness_set_title(patterns) + title_ax.text(0.5, 0.5, set_title, + ha='center', va='center', + fontsize=12, fontweight='bold') + title_ax.axis('off') # Hide axis for the title + + # Plot each pattern + j = 0 + for pattern in patterns: + # Visualize the pattern + if hasattr(pattern, 'highlight') and 
pattern.highlight is not None: + # Calculate which column and row this pattern should be in + col = j // max_patterns_per_col + row = (j % max_patterns_per_col) + 1 # +1 to skip title row + # Create subplot for this pattern + ax = fig.add_subplot(set_grid[row, col]) + + pattern.highlight.visualize(ax) + + # Rotate x-axis tick labels + plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + + # Instead of setting title, add text box for query below the plot + query_text = pattern.data_scope.create_query_string(df_name=self.source_name) + query_text = textwrap.fill(query_text, width=40) + + # Add text box with query string instead of title + props = dict(boxstyle='round', facecolor='wheat', alpha=0.3) + ax.text(0.5, 1.5, query_text, transform=ax.transAxes, fontsize=9, + ha='center', va='top', bbox=props) + + j += 1 + + return fig + diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 3fbde04..e2b66dc 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -4,6 +4,7 @@ from queue import PriorityQueue import pandas as pd +from matplotlib import pyplot as plt, gridspec from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern, HomogenousDataPattern from external_explainers.metainsight_explainer.meta_insight import (MetaInsight, @@ -37,6 +38,37 @@ def __init__(self, k=5, min_score=MIN_IMPACT, min_commonness=COMMONNESS_THRESHOL self.balance_factor = balance_factor self.actionability_regularizer = actionability_regularizer + def _compute_variety_factor(self, metainsight: MetaInsight, included_pattern_types_count: dict) -> float: + """ + Compute the variety factor for a given MetaInsight based on the pattern types + already present in the selected set. + + :param metainsight: The MetaInsight object to compute the variety factor for. 
+ :param included_pattern_types_count: Dictionary tracking count of selected pattern types. + :return: The variety factor between 0 and 1. + """ + # Get pattern types in this metainsight + candidate_pattern_types = [commonness[0].pattern_type for commonness in metainsight.commonness_set] + + if not candidate_pattern_types: + return 0.0 + + # Calculate how many of this metainsight's pattern types are already included + pattern_repetition = [included_pattern_types_count.get(pt, 0) for pt in candidate_pattern_types] + if any(pt == 0 for pt in pattern_repetition): + return 1 + pattern_repetition = sum(pattern_repetition) + + # Normalize by the number of pattern types in this metainsight + avg_repetition = pattern_repetition / len(candidate_pattern_types) + + # Exponential decay: variety_factor decreases as pattern repetition increases + # The 0.5 constant controls how quickly the penalty grows + variety_factor = np.exp(-0.5 * avg_repetition) + + return variety_factor + + def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): """ Rank the MetaInsights based on their scores. @@ -49,6 +81,11 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): # Sort candidates by score initially (descending) candidate_set = sorted(list(set(metainsight_candidates)), key=lambda mi: mi.score, reverse=True) + included_pattern_types_count = { + pattern_type: 0 + for pattern_type in PatternType if pattern_type != PatternType.NONE and pattern_type != PatternType.OTHER + } + # Greedy selection of MetaInsights. # We compute the total use of the currently selected MetaInsights, then how much a candidate would add to that. # We take the candidate that adds the most to the total use, repeating until we have k MetaInsights or no candidates left. 
@@ -65,6 +102,9 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): mi.compute_pairwise_overlap_score(candidate) for mi in selected_metainsights)) gain = total_use_with_candidate - total_use_approx + # Added penalty for repeating the same pattern types + variety_factor = self._compute_variety_factor(candidate, included_pattern_types_count) + gain *= variety_factor if gain > max_gain: max_gain = gain @@ -73,6 +113,11 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): if best_candidate: selected_metainsights.append(best_candidate) candidate_set.remove(best_candidate) + # Store a counter for the pattern types of the selected candidates + candidate_pattern_types = [commonness[0].pattern_type for commonness in best_candidate.commonness_set] + for pattern_type in candidate_pattern_types: + if pattern_type in included_pattern_types_count: + included_pattern_types_count[pattern_type] += 1 else: # No candidate provides a positive gain, or candidate_set is empty break @@ -90,7 +135,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, :param measures: The measures to consider for mining. 
:return: """ - metainsight_candidates = [] + metainsight_candidates = set() datascope_cache = {} pattern_cache = {} hdp_queue = PriorityQueue() @@ -168,9 +213,9 @@ def mine_metainsights(self, source_df: pd.DataFrame, if metainsight: # Calculate and assign the score metainsight.compute_score(datascope_cache) - metainsight_candidates.append(metainsight) + metainsight_candidates.add(metainsight) - return self.rank_metainsights(metainsight_candidates) + return self.rank_metainsights(list(metainsight_candidates)) if __name__ == "__main__": @@ -179,7 +224,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, df = df.sample(5000, random_state=42) # Sample 5000 rows for testing # Define dimensions, measures - dimensions = ['age', 'education-num', 'marital-status'] + dimensions = ['marital-status', 'workclass'] measures = [('capital-gain', 'mean'), ('capital-loss', 'mean')] # Run the mining process @@ -195,8 +240,12 @@ def mine_metainsights(self, source_df: pd.DataFrame, print(f"Time taken: {end_time - start_time:.2f} seconds") print("\n--- Top MetaInsights ---") - if top_metainsights: - for i, mi in enumerate(top_metainsights): - print(f"Rank {i + 1}: {mi}") - else: - print("No MetaInsights found.") + fig = plt.figure(figsize=(30, 25)) + main_grid = gridspec.GridSpec(2, 2, figure=fig, wspace=0.2, hspace=0.3) + + for i, mi in enumerate(top_metainsights[:4]): + row, col = i // 2, i % 2 + mi.visualize_commonesses(fig=fig, subplot_spec=main_grid[row, col]) + + # plt.tight_layout() + plt.show() diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index d911ba3..84bb216 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -90,9 +90,9 @@ def unimodality(self, series: pd.Series) -> (bool, UnimodalityPattern | None): return False, None index_name = series.index.name if 
max_value_index: - return True, UnimodalityPattern(series, 'Peak', max_value_index) + return True, UnimodalityPattern(series, 'Peak', max_value_index, value_name=series.name) elif min_value_index: - return True, UnimodalityPattern(series, 'Valley', min_value_index) + return True, UnimodalityPattern(series, 'Valley', min_value_index, value_name=series.name) else: return False, None @@ -120,7 +120,8 @@ def trend(self, series: pd.Series) -> (bool, TrendPattern | None): if p_val > 0.05 or mk_result.trend == 'no trend': return False, None else: - return True, TrendPattern(series, type=mk_result.trend, slope=mk_result.slope, intercept=mk_result.intercept) + return True, TrendPattern(series, type=mk_result.trend, + slope=mk_result.slope, intercept=mk_result.intercept, value_name=series.name) @@ -143,7 +144,9 @@ def outlier(self, series: pd.Series) -> (bool, OutlierPattern): return False, None outlier_values = series.iloc[outlier_indices] outlier_indexes = series.index[outlier_indices] - return True, OutlierPattern(series, outlier_indexes=outlier_indexes, outlier_values=outlier_values) + return True, OutlierPattern(series, outlier_indexes=outlier_indexes, + outlier_values=outlier_values, value_name=series.name + ) def cycle(self, series: pd.Series) -> (bool, CyclePattern): @@ -181,7 +184,7 @@ def cycle(self, series: pd.Series) -> (bool, CyclePattern): try: cycle_info = detect_cycles(series) if cycle_info is not None and len(cycle_info) > 0: - return True, CyclePattern(series, cycle_info) + return True, CyclePattern(series, cycle_info, value_name=series.name) return False, None # For some godforsaken reason, Cydets throws a ValueError when it fails to detect cycles, instead of # returning None like it should. And so, we have this incredibly silly try/except block. 
diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index f1074ba..43a0e61 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -51,19 +51,20 @@ def __hash__(self) -> int: class UnimodalityPattern(PatternInterface): - def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], highlight_index): + def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], highlight_index, value_name: str=None): """ Initialize the UnimodalityPattern with the provided parameters. :param source_series: The source series to evaluate. :param type: The type of the pattern. Either 'Peak' or 'Valley' is expected. :param highlight_index: The index of the peak or valley. - :param index_name: The name of the index. + :param value_name: The name of the value to display. """ self.source_series = source_series self.type = type self.highlight_index = highlight_index self.index_name = source_series.index.name if source_series.index.name else 'Index' + self.value_name = value_name if value_name else 'Value' self.hash = None def visualize(self, plt_ax) -> None: @@ -73,13 +74,13 @@ def visualize(self, plt_ax) -> None: """ plt_ax.plot(self.source_series) plt_ax.set_xlabel(self.index_name) - plt_ax.set_ylabel('Value') + plt_ax.set_ylabel(self.value_name) # Emphasize the peak or valley if self.type.lower() == 'peak': plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'ro', label='Peak') elif self.type.lower() == 'valley': plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'bo', label='Valley') - plt_ax.legend() + plt_ax.legend(loc="upper left") def __eq__(self, other) -> bool: @@ -124,7 +125,7 @@ def __hash__(self) -> int: class TrendPattern(PatternInterface): def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decreasing'], - slope: 
float, intercept: float = 0): + slope: float, intercept: float = 0, value_name: str = None): """ Initialize the Trend pattern with the provided parameters. @@ -136,6 +137,7 @@ def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decrea self.type = type self.slope = slope self.intercept = intercept + self.value_name = value_name if value_name else 'Value' self.hash = None def visualize(self, plt_ax) -> None: @@ -146,13 +148,13 @@ def visualize(self, plt_ax) -> None: """ plt_ax.plot(self.source_series) plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') - plt_ax.set_ylabel('Value') + plt_ax.set_ylabel(self.value_name) x_numeric = np.arange(len(self.source_series)) # Emphasize the trend plt_ax.plot(self.source_series.index, self.slope * x_numeric + self.intercept, 'g--', linewidth=2, - label='Increasing Trend' if self.type.lower() == 'Increasing' else 'Decreasing Trend') - plt_ax.legend() + label='Increasing Trend' if self.type.lower() == 'increasing' else 'Decreasing Trend') + plt_ax.legend(loc="upper left") def __eq__(self, other) -> bool: """ @@ -194,7 +196,8 @@ def __hash__(self) -> int: class OutlierPattern(PatternInterface): - def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_values: pd.Series): + def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_values: pd.Series, + value_name: str = None): """ Initialize the Outlier pattern with the provided parameters. 
@@ -205,6 +208,7 @@ def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_ self.source_series = source_series self.outlier_indexes = outlier_indexes self.outlier_values = outlier_values + self.value_name = value_name if value_name else 'Value' self.hash = None def visualize(self, plt_ax) -> None: @@ -215,10 +219,10 @@ def visualize(self, plt_ax) -> None: """ plt_ax.scatter(self.source_series.index, self.source_series, label='Regular Data Point') plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') - plt_ax.set_ylabel('Value') + plt_ax.set_ylabel(self.value_name) # Emphasize the outliers plt_ax.scatter(self.outlier_indexes, self.outlier_values, color='red', label='Outliers') - plt_ax.legend() + plt_ax.legend(loc="upper left") def __eq__(self, other): @@ -260,7 +264,7 @@ def __hash__(self) -> int: class CyclePattern(PatternInterface): - def __init__(self, source_series: pd.Series, cycles: pd.DataFrame): + def __init__(self, source_series: pd.Series, cycles: pd.DataFrame, value_name: str = None): """ Initialize the Cycle pattern with the provided parameters. 
@@ -272,6 +276,7 @@ def __init__(self, source_series: pd.Series, cycles: pd.DataFrame): self.cycles = cycles self.hash = None self._cycle_tuples = frozenset((row['t_start'], row['t_end']) for _, row in cycles.iterrows()) + self.value_name = value_name if value_name else 'Value' def visualize(self, plt_ax): """ @@ -281,7 +286,7 @@ def visualize(self, plt_ax): """ plt_ax.plot(self.source_series) plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') - plt_ax.set_ylabel('Value') + plt_ax.set_ylabel(self.value_name) i = 1 # Emphasize the cycles, and alternate colors colors = ['red', 'blue', 'green', 'orange', 'purple'] @@ -290,7 +295,7 @@ def visualize(self, plt_ax): plt_ax.axvspan(cycle['t_start'], cycle['t_end'], color=colors[color_index], alpha=0.5, label=f'Cycle {i}') i += 1 color_index = (color_index + 1) % len(colors) - plt_ax.legend() + plt_ax.legend(loc="upper left") def __eq__(self, other): """ From f27407395cf23fe3d08cf4065a633cd7cf7dfd3d Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 13 May 2025 22:58:44 +0300 Subject: [PATCH 13/27] Fixed issue where NaN values were kept prior to pattern evaluation (causing exceptions). 
--- .../metainsight_mining.py | 24 +++++++------------ .../pattern_evaluations.py | 2 ++ 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index e2b66dc..32760d8 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -140,14 +140,14 @@ def mine_metainsights(self, source_df: pd.DataFrame, pattern_cache = {} hdp_queue = PriorityQueue() - # Example: Generate data scopes with one dimension as breakdown, all '*' subspace + # Generate data scopes with one dimension as breakdown, all '*' subspace base_data_scopes = [] for breakdown_dim in dimensions: for measure_col, agg_func in measures: base_data_scopes.append( DataScope(source_df, {}, breakdown_dim, (measure_col, agg_func))) - # Example: Generate data scopes with one filter in subspace and one breakdown + # Generate data scopes with one filter in subspace and one breakdown for filter_dim in dimensions: unique_values = source_df[filter_dim].dropna().unique() # If there are too many unique values, we bin them if it's a numeric column, or only choose the @@ -168,10 +168,6 @@ def mine_metainsights(self, source_df: pd.DataFrame, base_data_scopes.append( DataScope(source_df, {filter_dim: value}, breakdown_dim, (measure_col, agg_func))) - print(f"Generated {len(base_data_scopes)} potential base data scopes.") - - # --- Pattern-Guided HDS Generation and Evaluation --- - # For each base data scope, evaluate basic patterns and generate HDSs for base_ds in base_data_scopes: # Evaluate basic patterns for the base data scope for selected types @@ -192,20 +188,15 @@ def mine_metainsights(self, source_df: pd.DataFrame, # Pruning 2: Discard HDS with extremely low impact hds_impact = hdp.compute_impact(datascope_cache) if hds_impact < MIN_IMPACT: - # print(f"Pruning HDS for {base_ds} 
due to low impact ({hds_impact:.4f})") continue # Add HDS to a queue for evaluation hdp_queue.put((hdp, pattern_type)) - # --- Evaluate HDSs to find MetaInsights --- - # Process HDSs from the queue (simulating priority queue by just processing in order) - processed_hdp_count = 0 - while not hdp_queue.empty(): # and time_elapsed < time_budget: # Add time budget check + while not hdp_queue.empty(): hdp, pattern_type = hdp_queue.get() processed_hdp_count += 1 - # print(f"Processing HDS {processed_hds_count}/{len(hds_queue) + processed_hds_count} for pattern '{pattern_type}'") # Evaluate HDP to find MetaInsight metainsight = MetaInsight.create_meta_insight(hdp, commonness_threshold=self.min_commonness) @@ -224,13 +215,15 @@ def mine_metainsights(self, source_df: pd.DataFrame, df = df.sample(5000, random_state=42) # Sample 5000 rows for testing # Define dimensions, measures - dimensions = ['marital-status', 'workclass'] - measures = [('capital-gain', 'mean'), ('capital-loss', 'mean')] + dimensions = ['marital-status', 'workclass', 'age', 'education-num'] + measures = [('capital-gain', 'mean'), ('capital-loss', 'mean'), + ('hours-per-week', 'mean'), ('hours-per-week', 'std'), + ('fnlwgt', 'mean'), ('fnlwgt', 'std')] # Run the mining process import time start_time = time.time() - miner = MetaInsightMiner(k=5, min_score=0.01, min_commonness=0.5) + miner = MetaInsightMiner(k=4, min_score=0.01, min_commonness=0.5) top_metainsights = miner.mine_metainsights( df, dimensions, @@ -239,7 +232,6 @@ def mine_metainsights(self, source_df: pd.DataFrame, end_time = time.time() print(f"Time taken: {end_time - start_time:.2f} seconds") - print("\n--- Top MetaInsights ---") fig = plt.figure(figsize=(30, 25)) main_grid = gridspec.GridSpec(2, 2, figure=fig, wspace=0.2, hspace=0.3) diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index 84bb216..d999b92 100644 --- 
a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -206,6 +206,8 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): if cache_key in self.pattern_cache: return self.pattern_cache[cache_key] + series = series[~series.isna()] # Remove NaN values + if pattern_type == PatternType.UNIMODALITY: result = self.unimodality(series) elif pattern_type == PatternType.TREND: From c2024644ff6f71b1e935ea89131bc70a53a13070 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Wed, 14 May 2025 21:26:38 +0300 Subject: [PATCH 14/27] Visualization updates. --- .../metainsight_explainer/meta_insight.py | 31 +++++++++++++++++-- .../metainsight_mining.py | 11 ++++--- .../metainsight_explainer/patterns.py | 5 +-- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 579a267..f574bea 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -263,11 +263,36 @@ def _create_commonness_set_title(self, commonness_set: List[BasicDataPattern]) - # Check the type of the first pattern in the set. All patterns in the set should be of the same type. 
pattern_type = commonness_set[0].pattern_type if pattern_type == PatternType.UNIMODALITY: - title += "Common unimodality detected " + title += "Common unimodality detected - " + umimodality = commonness_set[0].highlight + type = umimodality.type + index = umimodality.highlight_index + title += f"common {type} at index {index} " elif pattern_type == PatternType.TREND: - title += "Common trend detected " + trend = commonness_set[0].highlight + trend_type = trend.type + title += f"Common {trend_type} trend detected " elif pattern_type == PatternType.OUTLIER: title += "Common outliers detected " + outliers = [pattern.highlight for pattern in commonness_set] + common_outlier_indexes = {} + # Create a counter for the outlier indexes + for outlier in outliers: + if outlier.outlier_indexes is not None: + for idx in outlier.outlier_indexes: + if idx in common_outlier_indexes: + common_outlier_indexes[idx] += 1 + else: + common_outlier_indexes[idx] = 1 + # Sort the outlier indexes by their count + common_outlier_indexes = sorted(common_outlier_indexes.items(), key=lambda x: x[1], reverse=True) + # Take the top 5 most common outlier indexes + num_outliers = len(common_outlier_indexes) + common_outlier_indexes = list(dict(common_outlier_indexes).keys()) + # If there are more than 5, truncate the list and add "..." 
+ if num_outliers > 5: + common_outlier_indexes.append("...") + title += f"at indexes {' / '.join(map(str, common_outlier_indexes))}: " elif pattern_type == PatternType.CYCLE: title += "Common cycles detected " # Find the common subspace of the patterns in the set @@ -287,7 +312,7 @@ def _create_commonness_set_title(self, commonness_set: List[BasicDataPattern]) - measures_str.append(f"{{{measure[0]}: {measure[1]}}}") else: measures_str.append(measure) - title += f"when grouping by {', '.join(breakdowns)} and aggregating by {', '.join(measures_str)}" + title += f"when grouping by {', '.join(breakdowns)} and aggregating by {' or '.join(measures_str)}" title = textwrap.wrap(title, 70) title = "\n".join(title) return title diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 32760d8..0a10876 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -20,6 +20,8 @@ class MetaInsightMiner: """ This class is responsible for the actual process of mining MetaInsights. + The full process is described in the paper " MetaInsight: Automatic Discovery of Structured Knowledge for + Exploratory Data Analysis" by Ma et al. (2021). """ def __init__(self, k=5, min_score=MIN_IMPACT, min_commonness=COMMONNESS_THRESHOLD, balance_factor=BALANCE_PARAMETER, @@ -163,7 +165,9 @@ def mine_metainsights(self, source_df: pd.DataFrame, unique_values = [v for v in unique_values if v in top_values] for value in unique_values: for breakdown_dim in dimensions: - if breakdown_dim != filter_dim: # Breakdown should be different from filter dim + # Prevents the same breakdown dimension from being used as filter. This is because it + # is generally not very useful to groupby the same dimension as the filter dimension. 
+ if breakdown_dim != filter_dim: for measure_col, agg_func in measures: base_data_scopes.append( DataScope(source_df, {filter_dim: value}, breakdown_dim, (measure_col, agg_func))) @@ -215,10 +219,9 @@ def mine_metainsights(self, source_df: pd.DataFrame, df = df.sample(5000, random_state=42) # Sample 5000 rows for testing # Define dimensions, measures - dimensions = ['marital-status', 'workclass', 'age', 'education-num'] + dimensions = ['marital-status', 'workclass', 'education-num'] measures = [('capital-gain', 'mean'), ('capital-loss', 'mean'), - ('hours-per-week', 'mean'), ('hours-per-week', 'std'), - ('fnlwgt', 'mean'), ('fnlwgt', 'std')] + ('hours-per-week', 'mean')] # Run the mining process import time diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 43a0e61..3349fc5 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -151,9 +151,10 @@ def visualize(self, plt_ax) -> None: plt_ax.set_ylabel(self.value_name) x_numeric = np.arange(len(self.source_series)) # Emphasize the trend + label = f"y={self.slope:.2f}x + {self.intercept:.2f}" plt_ax.plot(self.source_series.index, self.slope * x_numeric + self.intercept, 'g--', linewidth=2, - label='Increasing Trend' if self.type.lower() == 'increasing' else 'Decreasing Trend') + label=label) plt_ax.legend(loc="upper left") def __eq__(self, other) -> bool: @@ -165,7 +166,7 @@ def __eq__(self, other) -> bool: """ if not isinstance(other, TrendPattern): return False - # We do not compare the slope and intercept - we only ca + # We do not compare the slope and intercept - we only care about the type of trend return self.type == other.type From 27952796569f10136ec96003c3d291d81e648815 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Sun, 18 May 2025 22:52:25 +0300 Subject: [PATCH 15/27] Completely redid visualization for more comprehensible visualization. 
Still need to fix size and clipping issues. --- .../metainsight_explainer/data_pattern.py | 10 +- .../metainsight_explainer/data_scope.py | 13 +- .../metainsight_explainer/meta_insight.py | 217 +++++++++- .../metainsight_mining.py | 25 +- .../pattern_evaluations.py | 3 +- .../metainsight_explainer/patterns.py | 373 +++++++++++++++++- 6 files changed, 614 insertions(+), 27 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index 6382207..b02283d 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -102,16 +102,16 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt if other_type == PatternType.OTHER or other_type == PatternType.NONE: continue if other_type != pattern_type: - other_is_valid, _ = pattern_evaluator(aggregated_series, other_type) + other_is_valid, highlight = pattern_evaluator(aggregated_series, other_type) if other_is_valid: - return BasicDataPattern(data_scope, PatternType.OTHER, None) + return BasicDataPattern(data_scope, PatternType.OTHER, highlight) # If no pattern is found, return a 'No Pattern' type return BasicDataPattern(data_scope, PatternType.NONE, None) def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, hds: List[DataScope] = None, temporal_dimensions: List[str] = None, - measures: List[Tuple[str,str]] = None) -> Tuple['HomogenousDataPattern', Dict]: + measures: List[Tuple[str,str]] = None, n_bins: int = 10) -> Tuple['HomogenousDataPattern', Dict]: """ Generates a Homogenous Data Pattern (HDP) either from a given HDS or from the current DataScope. @@ -120,9 +120,11 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, :param hds: A list of DataScopes to create the HDP from. If None, it will be created from the current DataScope. 
:param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. Only needed if hds is None. :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. Only needed if hds is None. + :param n_bins: The number of bins to use for numeric columns. Defaults to 10. + :return: A tuple containing the created HomogenousDataPattern and the updated pattern cache. """ if hds is None or len(hds) == 0: - hds = self.data_scope.create_hds(temporal_dimensions=temporal_dimensions, measures=measures) + hds = self.data_scope.create_hds(temporal_dimensions=temporal_dimensions, measures=measures, n_bins=n_bins) # All the data scopes in the HDS should have the same source_df, and it should be # the same as the source_df of the current DataScope (otherwise, this pattern should not be # the one producing the HDP with this HDS). diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index bc199cf..2daa8d9 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -64,11 +64,13 @@ def apply_subspace(self) -> pd.DataFrame: filtered_df = filtered_df[filtered_df[dim] == value] return filtered_df - def _subspace_extend(self) -> List['DataScope']: + def _subspace_extend(self, n_bins: int = 10) -> List['DataScope']: """ Extends the subspace of the DataScope into its sibling group by the dimension dim_to_extend. Subspaces with the same sibling group only differ from each other in 1 non-empty filter. + :param n_bins: The number of bins to use for numeric columns. Defaults to 10. + :return: A list of new DataScope objects with the extended subspace. 
""" new_ds = [] @@ -77,10 +79,10 @@ def _subspace_extend(self) -> List['DataScope']: unique_values = self.source_df[dim_to_extend].dropna().unique() # If there are too many unique values, we bin them if it's a numeric column, or only choose the # top 10 most frequent values if it's a categorical column - if len(unique_values) > 10: + if len(unique_values) > n_bins: if self.source_df[dim_to_extend].dtype in ['int64', 'float64']: # Bin the numeric column - bins = pd.cut(self.source_df[dim_to_extend], bins=10, retbins=True)[1] + bins = pd.cut(self.source_df[dim_to_extend], bins=n_bins, retbins=True)[1] unique_values = [f"{bins[i]} <= {dim_to_extend} <= {bins[i + 1]}" for i in range(len(bins) - 1)] else: # Choose the top 10 most frequent values @@ -125,13 +127,14 @@ def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope'] return new_ds def create_hds(self, temporal_dimensions: List[str] = None, - measures: List[Tuple[str,str]] = None) -> 'HomogenousDataScope': + measures: List[Tuple[str,str]] = None, n_bins: int = 10) -> 'HomogenousDataScope': """ Generates a Homogeneous Data Scope (HDS) from a base data scope, using subspace, measure and breakdown extensions as defined in the MetaInsight paper. :param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. + :param n_bins: The number of bins to use for numeric columns. Defaults to 10. :return: A HDS in the form of a list of DataScope objects. 
""" @@ -142,7 +145,7 @@ def create_hds(self, temporal_dimensions: List[str] = None, measures = {} # Subspace Extending - hds.extend(self._subspace_extend()) + hds.extend(self._subspace_extend(n_bins=n_bins)) # Measure Extending hds.extend(self._measure_extend(measures)) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index f574bea..c6dd0af 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -317,7 +317,7 @@ def _create_commonness_set_title(self, commonness_set: List[BasicDataPattern]) - title = "\n".join(title) return title - def visualize_commonesses(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: + def visualize_commonesses_individually(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: """ Visualize only the commonness sets of the metainsight, with each set in its own column. Within each column, patterns are arranged in a grid with at most 3 patterns per column. @@ -401,3 +401,218 @@ def visualize_commonesses(self, fig=None, subplot_spec=None, figsize=(15, 10)) - return fig + def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: + """ + Visualize the metainsight, showing commonness sets on the left and exceptions on the right. + + :param fig: Matplotlib figure to plot on. If None, a new figure is created. + :param subplot_spec: GridSpec to plot on. If None, a new GridSpec is created. + :param figsize: Size of the figure if a new one is created. 
+ """ + # Create a new figure if not provided + n_cols = 2 if self.exceptions and len(self.exceptions) > 0 else 1 + if fig is None: + fig = plt.figure(figsize=figsize) + outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.4) + else: + if subplot_spec is None: + outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.4) + else: + outer_grid = gridspec.GridSpecFromSubplotSpec(1, n_cols, width_ratios=[1, 1], + subplot_spec=subplot_spec, wspace=0.4) + + # Set up the left side for commonness sets + left_grid = gridspec.GridSpecFromSubplotSpec(1, len(self.commonness_set) or 1, + subplot_spec=outer_grid[0, 0], wspace=0.3) + + # Plot each commonness set in its own column + for i, commonness_set in enumerate(self.commonness_set): + if not commonness_set: # Skip empty sets + continue + + # Create a subplot for this commonness set + ax = fig.add_subplot(left_grid[0, i]) + + # Add light orange background to commonness sets + ax.set_facecolor((1.0, 0.9, 0.8, 0.2)) # Light orange with alpha + + # Get the pattern type from the first pattern (all should be the same type) + pattern_type = commonness_set[0].pattern_type + + # Get the highlights for visualization + highlights = [pattern.highlight for pattern in commonness_set] + + # Create labels based on subspace and measure + labels = [] + for pattern in commonness_set: + # Format the subspace part + subspace_str = ", ".join([f"{key}={val}" for key, val in pattern.data_scope.subspace.items()]) + + # Format the measure part + measure = pattern.data_scope.measure + if isinstance(measure, tuple): + measure_str = f"{measure[0]}({measure[1]})" + else: + measure_str = str(measure) + + labels.append(f"{subspace_str}, {measure_str}") + + # Create title for this commonness set + title = self._create_commonness_set_title(commonness_set) + # Wrap title to prevent overflowing + title = textwrap.fill(title, width=40) + + # Call the appropriate visualize_many function based on pattern 
type + if highlights: + # Create a custom version of visualize_many that places the legend at the bottom + # instead of the side to prevent clipping + orig_visualize_many = highlights[0].visualize_many + + def modified_visualize_many(plt_ax, patterns, labels, title): + orig_visualize_many(plt_ax, patterns, labels, title) + # Move legend to bottom to prevent clipping into right side + if len(labels) > 3: + # For many items, use a horizontal layout at the bottom + plt_ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.3), + ncol=min(3, len(labels)), fontsize=8) + else: + # For fewer items, keep at the bottom right + plt_ax.legend(loc='lower right', fontsize=9) + + # Use our modified version + if hasattr(highlights[0], "visualize_many"): + orig_visualize = highlights[0].visualize_many + highlights[0].visualize_many = modified_visualize_many + highlights[0].visualize_many(plt_ax=ax, patterns=highlights, labels=labels, title=title) + highlights[0].visualize_many = orig_visualize + else: + ax.set_title(title) + + # Handle exceptions area if there are any + if self.exceptions and n_cols > 1: + # Set up the right side for exceptions with one row per exception type + right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, + subplot_spec=outer_grid[0, 1], + hspace=0.4) # Add more vertical space + + # Process each exception category + for i, (category, exception_patterns) in enumerate(self.exceptions.items()): + if not exception_patterns: # Skip empty categories + continue + + # For "None" category, just skip it. It may be a good idea to add text saying + # "Nothing found for...", but we already have an issue with visual clutter and + # clipping everywhere. 
+ if category.lower() == "none" or category.lower() == "no-pattern": + continue + + # For "highlight change" category, visualize all in one plot + if category.lower() == "highlight-change" or category.lower() == "highlight change": + ax = fig.add_subplot(right_grid[i, 0]) + ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) # Light blue with alpha + + # Get the highlights for visualization + highlights = [pattern.highlight for pattern in exception_patterns] + + # Create labels based on subspace and measure + labels = [] + for pattern in exception_patterns: + subspace_str = "" + for key, val in pattern.data_scope.subspace.items(): + split = val.split("<=") + if len(split) > 1: + subspace_str += f"{val}" + else: + subspace_str += f"{key} = {val}, " + measure = pattern.data_scope.measure + if isinstance(measure, tuple): + measure_str = f"{measure[0]}({measure[1]})" + else: + measure_str = str(measure) + + labels.append(f"{subspace_str}, {measure_str}") + + title = f"Same pattern, different highlights ({len(exception_patterns)})" + + if highlights and hasattr(highlights[0], "visualize_many"): + highlights[0].visualize_many(plt_ax=ax, patterns=highlights, labels=labels, title=title) + + # For "type change" or other categories, create a nested grid + elif category.lower() == "type-change" or category.lower() == "type change": + # Make sure there are highlights to visualize + highlights = [pattern.highlight for pattern in exception_patterns] + if all(highlight is None for highlight in highlights): + continue + + # Create a nested grid for this row with more space + type_grid = gridspec.GridSpecFromSubplotSpec(2, 1, + subplot_spec=right_grid[i, 0], + height_ratios=[1, 5], hspace=0.3, wspace=0.3) + + # Add title for the category in the first row + title_ax = fig.add_subplot(type_grid[0, 0]) + title_ax.axis('off') + title_ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) + title_ax.text(0.5, 0.5, + f"Different patterns types detected ({len(exception_patterns)})", + horizontalalignment='center', + 
verticalalignment='center', + fontsize=12, + fontweight='bold') + + # Create subplots for each pattern in the second row + num_patterns = len(exception_patterns) + # At most 2 patterns per row + n_cols = 2 + n_rows = math.ceil(num_patterns / n_cols) + pattern_grid = gridspec.GridSpecFromSubplotSpec(n_rows, n_cols, + subplot_spec=type_grid[1, 0], + wspace=0.4) # More horizontal space + + + for j, pattern in enumerate(exception_patterns): + col_index = j % n_cols + row_index = j // n_cols + ax = fig.add_subplot(pattern_grid[row_index, col_index]) + ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) # Light blue with alpha + + # Format labels for title + subspace_str = ", ".join([f"{key}={val}" for key, val in pattern.data_scope.subspace.items()]) + measure = pattern.data_scope.measure + if isinstance(measure, tuple): + measure_str = f"{measure[0]}({measure[1]})" + else: + measure_str = str(measure) + + title = "" + if pattern.pattern_type == PatternType.UNIMODALITY: + title += "Unimodality found for " + if pattern.pattern_type == PatternType.TREND: + title += "Trend found for " + if pattern.pattern_type == PatternType.OUTLIER: + title += "Outliers found for " + if pattern.pattern_type == PatternType.CYCLE: + title += "Cycles found for " + + title += f"{subspace_str}, {measure_str}" + title = textwrap.fill(title, 30) # Wrap title to prevent overflow + + # Visualize the individual pattern with internal legend + if pattern.highlight: + # Custom visualization with compact legend + def individual_exception_visualize(plt_ax): + pattern.highlight.visualize(plt_ax=plt_ax) + if hasattr(plt_ax, 'legend'): + plt_ax.legend(loc='lower center', fontsize=7) + + individual_exception_visualize(ax) + ax.set_title(title, fontsize=9) + + # # Add a main title with score information + # fig.suptitle(f"MetaInsight (Score: {self.score:.4f})", fontsize=16, y=0.98) + + # Allow more space for the figure elements + plt.subplots_adjust(bottom=0.15, top=0.9) # Adjust bottom and top margins + + return fig + 
diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 0a10876..2628c3c 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -6,7 +6,7 @@ import pandas as pd from matplotlib import pyplot as plt, gridspec -from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern, HomogenousDataPattern +from external_explainers.metainsight_explainer.data_pattern import BasicDataPattern from external_explainers.metainsight_explainer.meta_insight import (MetaInsight, ACTIONABILITY_REGULARIZER_PARAM, BALANCE_PARAMETER, @@ -128,13 +128,14 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): def mine_metainsights(self, source_df: pd.DataFrame, dimensions: List[str], - measures: List[Tuple[str,str]]) -> List[MetaInsight]: + measures: List[Tuple[str,str]], n_bins: int = 10) -> List[MetaInsight]: """ The main function to mine MetaInsights. Mines metainsights from the given data frame based on the provided dimensions, measures, and impact measure. :param source_df: The source DataFrame to mine MetaInsights from. :param dimensions: The dimensions to consider for mining. :param measures: The measures to consider for mining. + :param n_bins: The number of bins to use for numeric columns. 
:return: """ metainsight_candidates = set() @@ -154,10 +155,10 @@ def mine_metainsights(self, source_df: pd.DataFrame, unique_values = source_df[filter_dim].dropna().unique() # If there are too many unique values, we bin them if it's a numeric column, or only choose the # top 10 most frequent values if it's a categorical column - if len(unique_values) > 10: + if len(unique_values) > n_bins: if source_df[filter_dim].dtype in ['int64', 'float64']: # Bin the numeric column - bins = pd.cut(source_df[filter_dim], bins=10, retbins=True)[1] + bins = pd.cut(source_df[filter_dim], bins=n_bins, retbins=True)[1] unique_values = [f"{bins[i]} <= {filter_dim} <= {bins[i + 1]}" for i in range(len(bins) - 1)] else: # Choose the top 10 most frequent values @@ -235,12 +236,18 @@ def mine_metainsights(self, source_df: pd.DataFrame, end_time = time.time() print(f"Time taken: {end_time - start_time:.2f} seconds") - fig = plt.figure(figsize=(30, 25)) - main_grid = gridspec.GridSpec(2, 2, figure=fig, wspace=0.2, hspace=0.3) + nrows = 4 // 4 + ncols = 4 // 4 - for i, mi in enumerate(top_metainsights[:4]): - row, col = i // 2, i % 2 - mi.visualize_commonesses(fig=fig, subplot_spec=main_grid[row, col]) + fig_len = 20 * ncols + fig_height = 15 * nrows + + fig = plt.figure(figsize=(fig_len, fig_height)) + main_grid = gridspec.GridSpec(nrows, ncols, figure=fig, wspace=0.2, hspace=0.3) + + for i, mi in enumerate(top_metainsights[:1]): + row, col = i // nrows, i % ncols + mi.visualize(fig=fig, subplot_spec=main_grid[row, col]) # plt.tight_layout() plt.show() diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index d999b92..a736dd2 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -47,8 +47,6 @@ def _is_time_series(self, series: pd.Series) -> bool: if isinstance(series.index, 
pd.DatetimeIndex): return True elif np.issubdtype(series.index.dtype, np.number): - # Sort the index first, just in case the series it is not sorted, but it does have meaningful time intervals - series.sort_index(inplace=True) # Check if the index is strictly increasing return np.all(np.diff(series.index) > 0) else: @@ -207,6 +205,7 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): return self.pattern_cache[cache_key] series = series[~series.isna()] # Remove NaN values + series = series.sort_index() # Sort the series by index if pattern_type == PatternType.UNIMODALITY: result = self.unimodality(series) diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 3349fc5..ce11ee9 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -2,7 +2,8 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np -from typing import Literal +from typing import Literal, List + class PatternInterface(ABC): """ @@ -10,7 +11,7 @@ class PatternInterface(ABC): """ @abstractmethod - def visualize(self, plt_ax) -> None: + def visualize(self, plt_ax, title: str = None) -> None: """ Visualize the pattern. """ @@ -49,8 +50,60 @@ def __hash__(self) -> int: raise NotImplementedError("Subclasses must implement this method.") + @staticmethod + @abstractmethod + def visualize_many(plt_ax, patterns: List['PatternInterface'], labels:List[str], title: str = None) -> None: + """ + Visualize many patterns of the same type on the same plot. + :param plt_ax: The matplotlib axes to plot on + :param patterns: The patterns to plot + :param labels: The labels to display in the legend. 
+ :param title: The title of the plot + """ + raise NotImplementedError("Subclasses must implement this method.") + + class UnimodalityPattern(PatternInterface): + @staticmethod + def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[str], title: str = None) -> None: + """ + Visualize multiple unimodality patterns on a single plot. + + :param plt_ax: Matplotlib axes to plot on + :param patterns: List of UnimodalityPattern objects + :param labels: List of labels for each pattern (e.g. data scope descriptions) + """ + # Define a color cycle for lines + colors = plt.cm.tab10.colors + + for i, (pattern, label) in enumerate(zip(patterns, labels)): + color = colors[i % len(colors)] + + # Plot the series with a unique color + plt_ax.plot(pattern.source_series, color=color, alpha=0.7, label=label) + + # Highlight the peak or valley with a marker + if pattern.type.lower() == 'peak': + plt_ax.plot(pattern.highlight_index, pattern.source_series[pattern.highlight_index], + 'o', color=color, markersize=8, markeredgecolor='black') + elif pattern.type.lower() == 'valley': + plt_ax.plot(pattern.highlight_index, pattern.source_series[pattern.highlight_index], + 'v', color=color, markersize=8, markeredgecolor='black') + + # Set labels and title + plt_ax.set_xlabel(patterns[0].index_name if patterns else 'Index') + plt_ax.set_ylabel(patterns[0].value_name if patterns else 'Value') + plt_ax.set_title(f"Multiple {patterns[0].type if patterns else 'Unimodality'} Patterns" if title is None else title) + + # Add legend outside the plot + plt_ax.legend(loc='upper left') + + #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend + + # Rotate x-axis tick labels if needed + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], highlight_index, value_name: str=None): """ Initialize the UnimodalityPattern with the provided parameters. 
@@ -67,7 +120,7 @@ def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], hi self.value_name = value_name if value_name else 'Value' self.hash = None - def visualize(self, plt_ax) -> None: + def visualize(self, plt_ax, title: str = None) -> None: """ Visualize the unimodality pattern. :return: @@ -81,6 +134,8 @@ def visualize(self, plt_ax) -> None: elif self.type.lower() == 'valley': plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'bo', label='Valley') plt_ax.legend(loc="upper left") + if title is not None: + plt_ax.set_title(title) def __eq__(self, other) -> bool: @@ -124,6 +179,78 @@ def __hash__(self) -> int: class TrendPattern(PatternInterface): + @staticmethod + def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], title: str = None, + show_data: bool = True, alpha_data: float = 0.6) -> None: + """ + Visualize multiple trend patterns on a single plot. + + :param plt_ax: Matplotlib axes to plot on + :param patterns: List of TrendPattern objects + :param labels: List of labels for each pattern + :param title: Optional custom title for the plot + :param show_data: Whether to show the raw data points (can be set to False if too cluttered) + :param alpha_data: Opacity of the raw data (lower value reduces visual clutter) + """ + # Define a color cycle for lines + colors = plt.cm.tab10.colors + + # Define line styles for additional differentiation. 
This is taken from matplotlib's + # docs: https://matplotlib.org/stable/gallery/lines_bars_and_markers/linestyles.html + line_styles = [ + ('loosely dotted', (0, (1, 10))), + ('dotted', (0, (1, 5))), + ('densely dotted', (0, (1, 1))), + + ('long dash with offset', (5, (10, 3))), + ('loosely dashed', (0, (5, 10))), + ('dashed', (0, (5, 5))), + ('densely dashed', (0, (5, 1))), + + ('loosely dashdotted', (0, (3, 10, 1, 10))), + ('dashdotted', (0, (3, 5, 1, 5))), + ('densely dashdotted', (0, (3, 1, 1, 1))), + + ('dashdotdotted', (0, (3, 5, 1, 5, 1, 5))), + ('loosely dashdotdotted', (0, (3, 10, 1, 10, 1, 10))), + ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1)))] + + for i, (pattern, label) in enumerate(zip(patterns, labels)): + color = colors[i % len(colors)] + line_style = line_styles[i % len(line_styles)][1] + + # Plot the raw data with reduced opacity if requested + if show_data: + plt_ax.plot(pattern.source_series, color=color, alpha=alpha_data, linewidth=1) + + # Get x range for trend line + x_numeric = np.arange(len(pattern.source_series)) + + # Plot the trend line + trend_label = f"{label}" + plt_ax.plot(pattern.source_series.index, pattern.slope * x_numeric + pattern.intercept, + linestyle=line_style, color=color, linewidth=2, label=trend_label) + + # Set labels and title + if patterns: + plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') + plt_ax.set_ylabel(patterns[0].value_name if patterns[0].value_name else 'Value') + + default_title = f"Multiple Trend Patterns" + plt_ax.set_title(title if title is not None else default_title) + + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + + # First, adjust the subplot parameters to make room for the legend + #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend + + # Place legend outside the plot + plt_ax.legend(loc='upper left') + + # Ensure bottom margin for x-labels + 
plt_ax.figure.subplots_adjust(bottom=0.15) + def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decreasing'], slope: float, intercept: float = 0, value_name: str = None): """ @@ -140,7 +267,7 @@ def __init__(self, source_series: pd.Series, type: Literal['Increasing', 'Decrea self.value_name = value_name if value_name else 'Value' self.hash = None - def visualize(self, plt_ax) -> None: + def visualize(self, plt_ax, title: str = None) -> None: """ Visualize the trend pattern. :param plt_ax: @@ -156,6 +283,8 @@ def visualize(self, plt_ax) -> None: linewidth=2, label=label) plt_ax.legend(loc="upper left") + if title is not None: + plt_ax.set_title(title) def __eq__(self, other) -> bool: """ @@ -197,6 +326,92 @@ def __hash__(self) -> int: class OutlierPattern(PatternInterface): + @staticmethod + def visualize_many(plt_ax, patterns: List['OutlierPattern'], labels: List[str], title: str = None, + show_regular: bool = True, alpha_regular: float = 0.5, alpha_outliers: float = 0.9) -> None: + """ + Visualize multiple outlier patterns on a single plot. 
+ + :param plt_ax: Matplotlib axes to plot on + :param patterns: List of OutlierPattern objects + :param labels: List of labels for each pattern + :param title: Optional custom title for the plot + :param show_regular: Whether to show regular (non-outlier) data points + :param alpha_regular: Opacity for regular data points + :param alpha_outliers: Opacity for outlier points + """ + # Define a color cycle for different datasets + colors = plt.cm.tab10.colors + + # Define marker styles + regular_marker = 'o' # Circle for regular points + outlier_marker = 'X' # X mark for outliers + + # Create a legend handle for the outlier explanation + from matplotlib.lines import Line2D + custom_lines = [Line2D([0], [0], marker=outlier_marker, color='black', + markerfacecolor='black', markersize=10, linestyle='')] + custom_labels = ['Outliers (marked with X)'] + + # Plot each dataset + for i, (pattern, label) in enumerate(zip(patterns, labels)): + color = colors[i % len(colors)] + + # Plot regular data points if requested + if show_regular: + plt_ax.scatter( + pattern.source_series.index, + pattern.source_series, + color=color, + alpha=alpha_regular, + marker=regular_marker, + s=30, # Size + label=label + ) + else: + # Still add to legend even if not showing points + plt_ax.scatter([], [], color=color, marker=regular_marker, s=30, label=label) + + # Plot outliers with the same color but a different marker + if pattern.outlier_indexes is not None and len(pattern.outlier_indexes) > 0: + plt_ax.scatter( + pattern.outlier_indexes, + pattern.outlier_values, + color=color, + alpha=alpha_outliers, + marker=outlier_marker, + s=100, # Larger size for outliers + edgecolors='black', # Black edge for visibility + linewidth=1.5 + ) + + # Set labels and title + if patterns: + plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') + plt_ax.set_ylabel(patterns[0].value_name if patterns[0].value_name else 'Value') + + default_title = "Multiple 
Outlier Patterns" + plt_ax.set_title(title if title is not None else default_title) + + # Rotate x-axis tick labels if needed + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + + # Get the current handles and labels + handles, labels_current = plt_ax.get_legend_handles_labels() + + # Combine with custom outlier explanation + all_handles = handles + custom_lines + all_labels = labels_current + custom_labels + + # Adjust subplot parameters to make room for the legend + #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 30% of width for legend + + # Place legend outside the plot with combined handles/labels + plt_ax.legend(all_handles, all_labels, loc='upper left') + + # Ensure bottom margin for x-labels + plt_ax.figure.subplots_adjust(bottom=0.15) + def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_values: pd.Series, value_name: str = None): """ @@ -212,7 +427,7 @@ def __init__(self, source_series: pd.Series, outlier_indexes: pd.Index, outlier_ self.value_name = value_name if value_name else 'Value' self.hash = None - def visualize(self, plt_ax) -> None: + def visualize(self, plt_ax, title: str = None) -> None: """ Visualize the outlier pattern. :param plt_ax: @@ -224,6 +439,8 @@ def visualize(self, plt_ax) -> None: # Emphasize the outliers plt_ax.scatter(self.outlier_indexes, self.outlier_values, color='red', label='Outliers') plt_ax.legend(loc="upper left") + if title is not None: + plt_ax.set_title(title) def __eq__(self, other): @@ -265,6 +482,148 @@ def __hash__(self) -> int: class CyclePattern(PatternInterface): + @staticmethod + def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], title: str = None, + alpha_cycles: float = 0.3, line_alpha: float = 0.8) -> None: + """ + Visualize multiple cycle patterns on a single plot with common cycles highlighted. 
+ + :param plt_ax: Matplotlib axes to plot on + :param patterns: List of CyclePattern objects + :param labels: List of labels for each pattern + :param title: Optional custom title for the plot + :param alpha_cycles: Opacity for the highlighted cycle regions + :param line_alpha: Opacity for the time series lines + """ + import numpy as np + import pandas as pd + + # Define a color cycle for lines + colors = plt.cm.tab10.colors + + # Color for common cycles + common_cycle_color = 'darkviolet' + + # Plot each dataset and collect legend handles + legend_handles = [] + legend_labels = [] + + # First, identify time ranges covered by cycles for each pattern + all_cycle_data = [] + + for pattern in patterns: + if hasattr(pattern, 'cycles') and not pattern.cycles.empty: + for _, cycle in pattern.cycles.iterrows(): + all_cycle_data.append((cycle['t_start'], cycle['t_end'])) + + # Find common cycle periods + common_periods = [] + if len(patterns) > 1 and all_cycle_data: + # Handle datetime objects by creating a time_points array differently + # Get all unique timestamps from starts and ends + all_timestamps = sorted(list(set([t for start, end in all_cycle_data for t in [start, end]]))) + + # Create additional points between timestamps if needed + if len(all_timestamps) > 1: + time_points = [] + for i in range(len(all_timestamps) - 1): + # Add the current timestamp + time_points.append(all_timestamps[i]) + + # Add intermediate points if the gap is large enough + curr = pd.Timestamp(all_timestamps[i]) + next_ts = pd.Timestamp(all_timestamps[i + 1]) + if (next_ts - curr).total_seconds() > 60: # If gap is more than a minute + # Add 10 intermediate points + delta = (next_ts - curr) / 11 + for j in range(1, 11): + time_points.append(curr + delta * j) + + # Add the last timestamp + time_points.append(all_timestamps[-1]) + else: + time_points = all_timestamps + + # For each time point, check if it falls within a cycle for each pattern + overlap_counts = np.zeros(len(time_points)) + 
+ for pattern in patterns: + if hasattr(pattern, 'cycles') and not pattern.cycles.empty: + pattern_mask = np.zeros(len(time_points), dtype=bool) + for _, cycle in pattern.cycles.iterrows(): + start, end = cycle['t_start'], cycle['t_end'] + pattern_mask = pattern_mask | ( + (np.array(time_points) >= start) & (np.array(time_points) <= end)) + overlap_counts += pattern_mask + + # Find regions where all patterns have a cycle + common_mask = overlap_counts == len(patterns) + + # Find contiguous regions of common cycles + if np.any(common_mask): + changes = np.diff(np.concatenate(([0], common_mask.astype(int), [0]))) + start_indices = np.where(changes == 1)[0] + end_indices = np.where(changes == -1)[0] - 1 + + for start_idx, end_idx in zip(start_indices, end_indices): + common_periods.append((time_points[start_idx], time_points[end_idx])) + + # Plot each pattern + for i, (pattern, label) in enumerate(zip(patterns, labels)): + color = colors[i % len(colors)] + + # Plot the time series + line, = plt_ax.plot(pattern.source_series, color=color, alpha=line_alpha, linewidth=2) + legend_handles.append(line) + legend_labels.append(label) + + # Highlight each cycle with a semi-transparent fill + if hasattr(pattern, 'cycles') and not pattern.cycles.empty: + # Add individual cycle legend element + cycle_patch = plt.Rectangle((0, 0), 1, 1, color=color, alpha=alpha_cycles) + + for _, cycle in pattern.cycles.iterrows(): + # Highlight the cycle only if it is not in the common cycles - we highlight those later. 
+ if not any( + start <= cycle['t_start'] <= end and start <= cycle['t_end'] <= end + for start, end in common_periods + ): + t_start = cycle['t_start'] + t_end = cycle['t_end'] + + # Highlight the cycle region + plt_ax.axvspan(t_start, t_end, color=color, alpha=alpha_cycles) + + # Highlight common cycles + if common_periods: + for start, end in common_periods: + plt_ax.axvspan(start, end, color=common_cycle_color, alpha=alpha_cycles * 1.5, zorder=-1) + + # Add legend item for common cycles + common_patch = plt.Rectangle((0, 0), 1, 1, color=common_cycle_color, alpha=alpha_cycles * 1.5) + legend_handles.append(common_patch) + legend_labels.append('Common cycles (all patterns)') + + # Set labels and title + if patterns: + plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') + plt_ax.set_ylabel(patterns[0].value_name if patterns[0].value_name else 'Value') + + default_title = "Multiple Cycle Patterns" + plt_ax.set_title(title if title is not None else default_title) + + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + + # Adjust subplot parameters to make room for the legend + #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend + + # Place legend outside the plot + plt_ax.legend(legend_handles, legend_labels, loc='upper left') + + # Ensure bottom margin for x-labels + plt_ax.figure.subplots_adjust(bottom=0.15) + def __init__(self, source_series: pd.Series, cycles: pd.DataFrame, value_name: str = None): """ Initialize the Cycle pattern with the provided parameters. @@ -279,7 +638,7 @@ def __init__(self, source_series: pd.Series, cycles: pd.DataFrame, value_name: s self._cycle_tuples = frozenset((row['t_start'], row['t_end']) for _, row in cycles.iterrows()) self.value_name = value_name if value_name else 'Value' - def visualize(self, plt_ax): + def visualize(self, plt_ax, title: str = None): """ Visualize the cycle pattern. 
:param plt_ax: @@ -297,6 +656,8 @@ def visualize(self, plt_ax): i += 1 color_index = (color_index + 1) % len(colors) plt_ax.legend(loc="upper left") + if title is not None: + plt_ax.set_title(title) def __eq__(self, other): """ From d78fb7ac5011aa612b42adb7d589f5a893a6a23a Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 20 May 2025 17:41:12 +0300 Subject: [PATCH 16/27] Improved visualizations to the point they are now almost entirely coherent and legible when using them in pd-explain. --- .../metainsight_explainer/data_pattern.py | 36 +++-- .../metainsight_explainer/data_scope.py | 29 ++-- .../metainsight_explainer/meta_insight.py | 126 ++++++++---------- .../metainsight_mining.py | 45 ++++--- .../pattern_evaluations.py | 31 +++-- .../metainsight_explainer/patterns.py | 24 ++-- 6 files changed, 161 insertions(+), 130 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index b02283d..2594b43 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -1,3 +1,5 @@ +import typing + import pandas as pd from typing import Dict, List, Tuple @@ -60,7 +62,7 @@ def __repr__(self): return f"BasicDataPattern(ds={self.data_scope}, type='{self.pattern_type}', highlight={self.highlight})" @staticmethod - def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: PatternType) -> 'BasicDataPattern': + def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: PatternType) -> List['BasicDataPattern']: """ Evaluates a specific pattern type for the data distribution of a data scope. :param data_scope: The data scope to evaluate. 
@@ -92,10 +94,14 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt aggregated_series = aggregated_series.sort_index() # Evaluate the specific pattern type + returned_patterns = [] pattern_evaluator = PatternEvaluator() is_valid, highlight = pattern_evaluator(aggregated_series, pattern_type) if is_valid: - return BasicDataPattern(data_scope, pattern_type, highlight) + # A returned highlight can contain multiple highlights, for example, if a peak and a valley are found + # in the same series. + for hl in highlight: + returned_patterns.append(BasicDataPattern(data_scope, pattern_type, hl)) else: # Check for other pattern types for other_type in PatternType: @@ -104,14 +110,19 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt if other_type != pattern_type: other_is_valid, highlight = pattern_evaluator(aggregated_series, other_type) if other_is_valid: - return BasicDataPattern(data_scope, PatternType.OTHER, highlight) + for hl in highlight: + returned_patterns.append(BasicDataPattern(data_scope, PatternType.OTHER, hl)) + + if len(returned_patterns) == 0: + # If no pattern is found, return a 'No Pattern' type + return [BasicDataPattern(data_scope, PatternType.NONE, None)] - # If no pattern is found, return a 'No Pattern' type - return BasicDataPattern(data_scope, PatternType.NONE, None) + return returned_patterns def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, hds: List[DataScope] = None, temporal_dimensions: List[str] = None, - measures: List[Tuple[str,str]] = None, n_bins: int = 10) -> Tuple['HomogenousDataPattern', Dict]: + measures: List[Tuple[str,str]] = None, n_bins: int = 10, + extend_by_measure: bool = False) -> Tuple['HomogenousDataPattern', Dict]: """ Generates a Homogenous Data Pattern (HDP) either from a given HDS or from the current DataScope. 
@@ -124,7 +135,8 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, :return: A tuple containing the created HomogenousDataPattern and the updated pattern cache. """ if hds is None or len(hds) == 0: - hds = self.data_scope.create_hds(temporal_dimensions=temporal_dimensions, measures=measures, n_bins=n_bins) + hds = self.data_scope.create_hds(dims=temporal_dimensions, measures=measures, + n_bins=n_bins, extend_by_measure=extend_by_measure) # All the data scopes in the HDS should have the same source_df, and it should be # the same as the source_df of the current DataScope (otherwise, this pattern should not be # the one producing the HDP with this HDS). @@ -148,9 +160,15 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, dp = self.evaluate_pattern(ds, source_df, pattern_type) pattern_cache[cache_key] = dp # Store in cache + # Some evaluation functions can return multiple patterns, so it is simpler to just + # convert it to a list and then treat it as an iterable. 
+ if not isinstance(dp, typing.Iterable): + dp = [dp] + # Only add patterns that are not 'No Pattern' to the HDP for MetaInsight evaluation - if dp.pattern_type != PatternType.NONE: - hdp.append(dp) + for d in dp: + if d is not None and d.pattern_type != PatternType.NONE: + hdp.append(d) self.pattern_cache = pattern_cache diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 2daa8d9..97f31b2 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -110,48 +110,51 @@ def _measure_extend(self, measures: List[Tuple[str, str]]) -> List['DataScope']: new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) return new_ds - def _breakdown_extend(self, temporal_dimensions: List[str]) -> List['DataScope']: + def _breakdown_extend(self, dims: List[str]) -> List['DataScope']: """ Extends the breakdown of the DataScope while keeping the same subspace and measure. - :param temporal_dimensions: The temporal dimensions to extend the breakdown with. + :param dims: The dimensions to extend the breakdown with. :return: A list of new DataScope objects with the extended breakdown. 
""" new_ds = [] - temporal_dimensions = [d for d in temporal_dimensions if - self.source_df[d].dtype in ['datetime64[ns]', 'period[M]', 'int64']] - for breakdown_dim in temporal_dimensions: + for breakdown_dim in dims: if breakdown_dim != self.breakdown: new_ds.append(DataScope(self.source_df, self.subspace, breakdown_dim, self.measure)) return new_ds - def create_hds(self, temporal_dimensions: List[str] = None, - measures: List[Tuple[str,str]] = None, n_bins: int = 10) -> 'HomogenousDataScope': + def create_hds(self, dims: List[str] = None, + measures: List[Tuple[str,str]] = None, n_bins: int = 10, + extend_by_measure: bool = False) -> 'HomogenousDataScope': """ Generates a Homogeneous Data Scope (HDS) from a base data scope, using subspace, measure and breakdown extensions as defined in the MetaInsight paper. - :param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. + :param dims: The temporal dimensions to extend the breakdown with. Expected as a list of strings. :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. :param n_bins: The number of bins to use for numeric columns. Defaults to 10. + :param extend_by_measure: Whether to use measure extension or not. Defaults to False. Setting this to true + can lead to metainsights with mixed aggregation functions, which may often be undesirable. :return: A HDS in the form of a list of DataScope objects. """ hds = [self] - if temporal_dimensions is None: - temporal_dimensions = [] + if dims is None: + dims = [] if measures is None: measures = {} # Subspace Extending hds.extend(self._subspace_extend(n_bins=n_bins)) - # Measure Extending - hds.extend(self._measure_extend(measures)) + # Measure Extending. + # We may not want to do it though, if we want our HDS to only contain the original measure. 
+ if extend_by_measure: + hds.extend(self._measure_extend(measures)) # Breakdown Extending - hds.extend(self._breakdown_extend(temporal_dimensions)) + hds.extend(self._breakdown_extend(dims)) return HomogenousDataScope(hds) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index c6dd0af..187ffa4 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -59,9 +59,30 @@ def __hash__(self): def __eq__(self, other): + """ + Compares two MetaInsight objects for equality. + Two MetaInsight objects are considered equal if they have the same commonness sets. + :param other: + :return: + """ if not isinstance(other, MetaInsight): return False - return self.commonness_set == other.commonness_set + # If the commonness sets are not the same size, they are not equal + if len(self.commonness_set) != len(other.commonness_set): + return False + all_equal = True + for self_commonness in self.commonness_set: + for other_commonness in other.commonness_set: + # Check if the commonness sets are equal + if len(self_commonness) != len(other_commonness): + all_equal = False + break + for pattern in self_commonness: + if pattern not in other_commonness: + all_equal = False + break + + return all_equal @staticmethod def categorize_exceptions(commonness_set, exceptions): @@ -401,6 +422,26 @@ def visualize_commonesses_individually(self, fig=None, subplot_spec=None, figsiz return fig + + def _create_labels(self, patterns: List[BasicDataPattern]) -> List[str]: + """ + Create labels for the patterns in a commonness set. + :param patterns: A list of BasicDataPattern objects. + :return: A list of strings representing the labels for the patterns. 
+ """ + labels = [] + for pattern in patterns: + subspace_str = "" + for key, val in pattern.data_scope.subspace.items(): + split = val.split("<=") + if len(split) > 1: + subspace_str += f"{val}" + else: + subspace_str += f"{key} = {val}, " + + labels.append(f"{subspace_str}") + return labels + def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: """ Visualize the metainsight, showing commonness sets on the left and exceptions on the right. @@ -413,13 +454,13 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: n_cols = 2 if self.exceptions and len(self.exceptions) > 0 else 1 if fig is None: fig = plt.figure(figsize=figsize) - outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.4) + outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.2) else: if subplot_spec is None: - outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.4) + outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.2) else: outer_grid = gridspec.GridSpecFromSubplotSpec(1, n_cols, width_ratios=[1, 1], - subplot_spec=subplot_spec, wspace=0.4) + subplot_spec=subplot_spec, wspace=0.2) # Set up the left side for commonness sets left_grid = gridspec.GridSpecFromSubplotSpec(1, len(self.commonness_set) or 1, @@ -434,28 +475,13 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: ax = fig.add_subplot(left_grid[0, i]) # Add light orange background to commonness sets - ax.set_facecolor((1.0, 0.9, 0.8, 0.2)) # Light orange with alpha - - # Get the pattern type from the first pattern (all should be the same type) - pattern_type = commonness_set[0].pattern_type + # ax.set_facecolor((1.0, 0.9, 0.8, 0.2)) # Light orange with alpha # Get the highlights for visualization highlights = [pattern.highlight for pattern in commonness_set] - # Create labels based on subspace and measure - labels = [] - for pattern in 
commonness_set: - # Format the subspace part - subspace_str = ", ".join([f"{key}={val}" for key, val in pattern.data_scope.subspace.items()]) - - # Format the measure part - measure = pattern.data_scope.measure - if isinstance(measure, tuple): - measure_str = f"{measure[0]}({measure[1]})" - else: - measure_str = str(measure) - - labels.append(f"{subspace_str}, {measure_str}") + # Create labels based on subspace + labels = self._create_labels(commonness_set) # Create title for this commonness set title = self._create_commonness_set_title(commonness_set) @@ -464,27 +490,8 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Call the appropriate visualize_many function based on pattern type if highlights: - # Create a custom version of visualize_many that places the legend at the bottom - # instead of the side to prevent clipping - orig_visualize_many = highlights[0].visualize_many - - def modified_visualize_many(plt_ax, patterns, labels, title): - orig_visualize_many(plt_ax, patterns, labels, title) - # Move legend to bottom to prevent clipping into right side - if len(labels) > 3: - # For many items, use a horizontal layout at the bottom - plt_ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.3), - ncol=min(3, len(labels)), fontsize=8) - else: - # For fewer items, keep at the bottom right - plt_ax.legend(loc='lower right', fontsize=9) - - # Use our modified version if hasattr(highlights[0], "visualize_many"): - orig_visualize = highlights[0].visualize_many - highlights[0].visualize_many = modified_visualize_many highlights[0].visualize_many(plt_ax=ax, patterns=highlights, labels=labels, title=title) - highlights[0].visualize_many = orig_visualize else: ax.set_title(title) @@ -509,28 +516,13 @@ def modified_visualize_many(plt_ax, patterns, labels, title): # For "highlight change" category, visualize all in one plot if category.lower() == "highlight-change" or category.lower() == "highlight change": ax = fig.add_subplot(right_grid[i, 
0]) - ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) # Light blue with alpha + # ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) # Light blue with alpha # Get the highlights for visualization highlights = [pattern.highlight for pattern in exception_patterns] # Create labels based on subspace and measure - labels = [] - for pattern in exception_patterns: - subspace_str = "" - for key, val in pattern.data_scope.subspace.items(): - split = val.split("<=") - if len(split) > 1: - subspace_str += f"{val}" - else: - subspace_str += f"{key} = {val}, " - measure = pattern.data_scope.measure - if isinstance(measure, tuple): - measure_str = f"{measure[0]}({measure[1]})" - else: - measure_str = str(measure) - - labels.append(f"{subspace_str}, {measure_str}") + labels = self._create_labels(exception_patterns) title = f"Same pattern, different highlights ({len(exception_patterns)})" @@ -547,7 +539,7 @@ def modified_visualize_many(plt_ax, patterns, labels, title): # Create a nested grid for this row with more space type_grid = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=right_grid[i, 0], - height_ratios=[1, 5], hspace=0.3, wspace=0.3) + height_ratios=[1, 5], hspace=0.5, wspace=0.3) # Add title for the category in the first row title_ax = fig.add_subplot(type_grid[0, 0]) @@ -574,15 +566,10 @@ def modified_visualize_many(plt_ax, patterns, labels, title): col_index = j % n_cols row_index = j // n_cols ax = fig.add_subplot(pattern_grid[row_index, col_index]) - ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) # Light blue with alpha + # ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) # Light blue with alpha # Format labels for title subspace_str = ", ".join([f"{key}={val}" for key, val in pattern.data_scope.subspace.items()]) - measure = pattern.data_scope.measure - if isinstance(measure, tuple): - measure_str = f"{measure[0]}({measure[1]})" - else: - measure_str = str(measure) title = "" if pattern.pattern_type == PatternType.UNIMODALITY: @@ -594,7 +581,7 @@ def modified_visualize_many(plt_ax, patterns, labels, 
title): if pattern.pattern_type == PatternType.CYCLE: title += "Cycles found for " - title += f"{subspace_str}, {measure_str}" + title += f"{subspace_str}," title = textwrap.fill(title, 30) # Wrap title to prevent overflow # Visualize the individual pattern with internal legend @@ -603,13 +590,10 @@ def modified_visualize_many(plt_ax, patterns, labels, title): def individual_exception_visualize(plt_ax): pattern.highlight.visualize(plt_ax=plt_ax) if hasattr(plt_ax, 'legend'): - plt_ax.legend(loc='lower center', fontsize=7) + plt_ax.legend(loc='lower center', fontsize=10) individual_exception_visualize(ax) - ax.set_title(title, fontsize=9) - - # # Add a main title with score information - # fig.suptitle(f"MetaInsight (Score: {self.score:.4f})", fontsize=16, y=0.98) + ax.set_title(title, fontsize=10) # Allow more space for the figure elements plt.subplots_adjust(bottom=0.15, top=0.9) # Adjust bottom and top margins diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 2628c3c..cef2144 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -25,7 +25,8 @@ class MetaInsightMiner: """ def __init__(self, k=5, min_score=MIN_IMPACT, min_commonness=COMMONNESS_THRESHOLD, balance_factor=BALANCE_PARAMETER, - actionability_regularizer=ACTIONABILITY_REGULARIZER_PARAM): + actionability_regularizer=ACTIONABILITY_REGULARIZER_PARAM + ): """ Initialize the MetaInsightMiner with the provided parameters. 
@@ -128,7 +129,9 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): def mine_metainsights(self, source_df: pd.DataFrame, dimensions: List[str], - measures: List[Tuple[str,str]], n_bins: int = 10) -> List[MetaInsight]: + measures: List[Tuple[str,str]], n_bins: int = 10, + extend_by_measure: bool = False + ) -> List[MetaInsight]: """ The main function to mine MetaInsights. Mines metainsights from the given data frame based on the provided dimensions, measures, and impact measure. @@ -179,24 +182,26 @@ def mine_metainsights(self, source_df: pd.DataFrame, for pattern_type in PatternType: if pattern_type == PatternType.OTHER or pattern_type == PatternType.NONE: continue - base_dp = BasicDataPattern.evaluate_pattern(base_ds, source_df, pattern_type) + base_dps = BasicDataPattern.evaluate_pattern(base_ds, source_df, pattern_type) - if base_dp.pattern_type not in [PatternType.NONE, PatternType.OTHER]: - # If a valid basic pattern is found, extend the data scope to generate HDS - hdp, pattern_cache = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, - pattern_type=pattern_type, pattern_cache=pattern_cache) + for base_dp in base_dps: + if base_dp.pattern_type not in [PatternType.NONE, PatternType.OTHER]: + # If a valid basic pattern is found, extend the data scope to generate HDS + hdp, pattern_cache = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, + pattern_type=pattern_type, pattern_cache=pattern_cache, + extend_by_measure=extend_by_measure) - # Pruning 1 - if the HDP is unlikely to form a commonness, discard it - if len(hdp) < len(hdp.data_scopes) * self.min_commonness: - continue + # Pruning 1 - if the HDP is unlikely to form a commonness, discard it + if len(hdp) < len(hdp.data_scopes) * self.min_commonness: + continue - # Pruning 2: Discard HDS with extremely low impact - hds_impact = hdp.compute_impact(datascope_cache) - if hds_impact < MIN_IMPACT: - continue + # Pruning 2: Discard HDS with extremely 
low impact + hds_impact = hdp.compute_impact(datascope_cache) + if hds_impact < MIN_IMPACT: + continue - # Add HDS to a queue for evaluation - hdp_queue.put((hdp, pattern_type)) + # Add HDS to a queue for evaluation + hdp_queue.put((hdp, pattern_type)) processed_hdp_count = 0 while not hdp_queue.empty(): @@ -236,8 +241,8 @@ def mine_metainsights(self, source_df: pd.DataFrame, end_time = time.time() print(f"Time taken: {end_time - start_time:.2f} seconds") - nrows = 4 // 4 - ncols = 4 // 4 + nrows = 4 + ncols = 1 fig_len = 20 * ncols fig_height = 15 * nrows @@ -245,8 +250,8 @@ def mine_metainsights(self, source_df: pd.DataFrame, fig = plt.figure(figsize=(fig_len, fig_height)) main_grid = gridspec.GridSpec(nrows, ncols, figure=fig, wspace=0.2, hspace=0.3) - for i, mi in enumerate(top_metainsights[:1]): - row, col = i // nrows, i % ncols + for i, mi in enumerate(top_metainsights[:4]): + row, col = i, 0 mi.visualize(fig=fig, subplot_spec=main_grid[row, col]) # plt.tight_layout() diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index a736dd2..3523f0a 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -1,10 +1,13 @@ +import typing from enum import Enum +from typing import List + import pandas as pd import numpy as np from diptest import diptest from scipy.stats import zscore from external_explainers.metainsight_explainer.patterns import UnimodalityPattern, TrendPattern, OutlierPattern, \ - CyclePattern + CyclePattern, PatternInterface import pymannkendall as mk from cydets.algorithm import detect_cycles from singleton_decorator import singleton @@ -53,7 +56,7 @@ def _is_time_series(self, series: pd.Series) -> bool: return False - def unimodality(self, series: pd.Series) -> (bool, UnimodalityPattern | None): + def unimodality(self, series: pd.Series) -> (bool, 
List[UnimodalityPattern] | None): """ Evaluates if the series is unimodal using Hartigan's Dip test. If it is, finds the peak or valley. @@ -86,13 +89,15 @@ def unimodality(self, series: pd.Series) -> (bool, UnimodalityPattern | None): if (max_value_index is not None and (max_value_index == series.index[0] or max_value_index == series.index[-1])) and \ (min_value_index is not None and (min_value_index == series.index[0] or min_value_index == series.index[-1])): return False, None - index_name = series.index.name + to_return = [] + # If both a peak and a valley exists, we can return both. If none exists, we return None. if max_value_index: - return True, UnimodalityPattern(series, 'Peak', max_value_index, value_name=series.name) + to_return.append(UnimodalityPattern(series, 'Peak', max_value_index, value_name=series.name)) elif min_value_index: - return True, UnimodalityPattern(series, 'Valley', min_value_index, value_name=series.name) - else: + to_return.append(UnimodalityPattern(series, 'Valley', min_value_index, value_name=series.name)) + if len(to_return) == 0: return False, None + return True, frozenset(to_return) @@ -191,7 +196,7 @@ def cycle(self, series: pd.Series) -> (bool, CyclePattern): - def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): + def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, frozenset[PatternInterface] | None): """ Calls the appropriate pattern evaluation method based on the pattern type. :param series: The series to evaluate. 
@@ -217,5 +222,13 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, str): result = self.cycle(series) else: raise ValueError(f"Unsupported pattern type: {pattern_type}") - self.pattern_cache[cache_key] = result - return result + is_valid = result[0] if isinstance(result, tuple) else False + patterns = result[1] if isinstance(result, tuple) else None + # If the returned patterns are not a frozenset, convert them to one + if not isinstance(patterns, frozenset): + if not isinstance(patterns, typing.Iterable): + patterns = frozenset([patterns]) + else: + patterns = frozenset(patterns) + self.pattern_cache[cache_key] = (is_valid, patterns) + return is_valid, patterns diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index ce11ee9..bfc20d3 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -97,12 +97,12 @@ def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[st plt_ax.set_title(f"Multiple {patterns[0].type if patterns else 'Unimodality'} Patterns" if title is None else title) # Add legend outside the plot - plt_ax.legend(loc='upper left') + plt_ax.legend() #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend # Rotate x-axis tick labels if needed - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], highlight_index, value_name: str=None): """ @@ -134,6 +134,8 @@ def visualize(self, plt_ax, title: str = None) -> None: elif self.type.lower() == 'valley': plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'bo', label='Valley') plt_ax.legend(loc="upper left") + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, 
ha='right', fontsize=12) if title is not None: plt_ax.set_title(title) @@ -240,13 +242,13 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti plt_ax.set_title(title if title is not None else default_title) # Rotate x-axis tick labels - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) # First, adjust the subplot parameters to make room for the legend #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend # Place legend outside the plot - plt_ax.legend(loc='upper left') + plt_ax.legend() # Ensure bottom margin for x-labels plt_ax.figure.subplots_adjust(bottom=0.15) @@ -283,6 +285,8 @@ def visualize(self, plt_ax, title: str = None) -> None: linewidth=2, label=label) plt_ax.legend(loc="upper left") + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) if title is not None: plt_ax.set_title(title) @@ -394,7 +398,7 @@ def visualize_many(plt_ax, patterns: List['OutlierPattern'], labels: List[str], plt_ax.set_title(title if title is not None else default_title) # Rotate x-axis tick labels if needed - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) # Get the current handles and labels handles, labels_current = plt_ax.get_legend_handles_labels() @@ -407,7 +411,7 @@ def visualize_many(plt_ax, patterns: List['OutlierPattern'], labels: List[str], #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 30% of width for legend # Place legend outside the plot with combined handles/labels - plt_ax.legend(all_handles, all_labels, loc='upper left') + plt_ax.legend(all_handles, all_labels) # Ensure bottom margin for x-labels plt_ax.figure.subplots_adjust(bottom=0.15) @@ -439,6 +443,8 @@ def visualize(self, plt_ax, title: str = None) -> None: # Emphasize the outliers 
plt_ax.scatter(self.outlier_indexes, self.outlier_values, color='red', label='Outliers') plt_ax.legend(loc="upper left") + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) if title is not None: plt_ax.set_title(title) @@ -613,13 +619,13 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti plt_ax.set_title(title if title is not None else default_title) # Rotate x-axis tick labels - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=8) + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) # Adjust subplot parameters to make room for the legend #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend # Place legend outside the plot - plt_ax.legend(legend_handles, legend_labels, loc='upper left') + plt_ax.legend(legend_handles, legend_labels) # Ensure bottom margin for x-labels plt_ax.figure.subplots_adjust(bottom=0.15) @@ -656,6 +662,8 @@ def visualize(self, plt_ax, title: str = None): i += 1 color_index = (color_index + 1) % len(colors) plt_ax.legend(loc="upper left") + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) if title is not None: plt_ax.set_title(title) From d42b9fc72005d594fdb0e559d6a4c9e51f675322 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 20 May 2025 22:07:00 +0300 Subject: [PATCH 17/27] Metainsight visualizations possibly done --- .../metainsight_explainer/data_pattern.py | 13 +- .../metainsight_explainer/data_scope.py | 9 +- .../metainsight_explainer/meta_insight.py | 42 +-- .../metainsight_mining.py | 20 +- .../metainsight_explainer/patterns.py | 352 ++++++++++++------ 5 files changed, 288 insertions(+), 148 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index 2594b43..d4a431f 100644 --- 
a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -75,19 +75,19 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt # Group by breakdown dimension and aggregate measure if data_scope.breakdown not in filtered_df.columns: # Cannot group by breakdown if it's not in the filtered data - return BasicDataPattern(data_scope, PatternType.NONE, None) + return [BasicDataPattern(data_scope, PatternType.NONE, None)] measure_col, agg_func = data_scope.measure if measure_col not in filtered_df.columns: # Cannot aggregate if measure column is not in the data - return BasicDataPattern(data_scope, PatternType.NONE, None) + return [BasicDataPattern(data_scope, PatternType.NONE, None)] try: # Perform the aggregation aggregated_series = filtered_df.groupby(data_scope.breakdown)[measure_col].agg(agg_func) except Exception as e: print(f"Error during aggregation for {data_scope}: {e}") - return BasicDataPattern(data_scope, PatternType.NONE, None) + return [BasicDataPattern(data_scope, PatternType.NONE, None)] # Ensure series is sortable if breakdown is temporal if df[data_scope.breakdown].dtype in ['datetime64[ns]', 'period[M]', 'int64']: @@ -122,7 +122,7 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, hds: List[DataScope] = None, temporal_dimensions: List[str] = None, measures: List[Tuple[str,str]] = None, n_bins: int = 10, - extend_by_measure: bool = False) -> Tuple['HomogenousDataPattern', Dict]: + extend_by_measure: bool = False, extend_by_breakdown: bool = False) -> Tuple['HomogenousDataPattern', Dict]: """ Generates a Homogenous Data Pattern (HDP) either from a given HDS or from the current DataScope. 
@@ -132,11 +132,14 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, :param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. Only needed if hds is None. :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. Only needed if hds is None. :param n_bins: The number of bins to use for numeric columns. Defaults to 10. + :param extend_by_measure: Whether to extend the hds by measure. Defaults to False. + :param extend_by_breakdown: Whether to extend the hds by breakdown. Defaults to False. :return: A tuple containing the created HomogenousDataPattern and the updated pattern cache. """ if hds is None or len(hds) == 0: hds = self.data_scope.create_hds(dims=temporal_dimensions, measures=measures, - n_bins=n_bins, extend_by_measure=extend_by_measure) + n_bins=n_bins, extend_by_measure=extend_by_measure, + extend_by_breakdown=extend_by_breakdown) # All the data scopes in the HDS should have the same source_df, and it should be # the same as the source_df of the current DataScope (otherwise, this pattern should not be # the one producing the HDP with this HDS). 
diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 97f31b2..af26645 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -126,7 +126,9 @@ def _breakdown_extend(self, dims: List[str]) -> List['DataScope']: def create_hds(self, dims: List[str] = None, measures: List[Tuple[str,str]] = None, n_bins: int = 10, - extend_by_measure: bool = False) -> 'HomogenousDataScope': + extend_by_measure: bool = False, + extend_by_breakdown: bool = False, + ) -> 'HomogenousDataScope': """ Generates a Homogeneous Data Scope (HDS) from a base data scope, using subspace, measure and breakdown extensions as defined in the MetaInsight paper. @@ -136,6 +138,8 @@ def create_hds(self, dims: List[str] = None, :param n_bins: The number of bins to use for numeric columns. Defaults to 10. :param extend_by_measure: Whether to use measure extension or not. Defaults to False. Setting this to true can lead to metainsights with mixed aggregation functions, which may often be undesirable. + :param extend_by_breakdown: Whether to use breakdown extension or not. Defaults to False. Setting this to True + can lead to metainsights with several disjoint indexes, which may often be undesirable. :return: A HDS in the form of a list of DataScope objects. 
""" @@ -154,7 +158,8 @@ def create_hds(self, dims: List[str] = None, hds.extend(self._measure_extend(measures)) # Breakdown Extending - hds.extend(self._breakdown_extend(dims)) + if extend_by_breakdown: + hds.extend(self._breakdown_extend(dims)) return HomogenousDataScope(hds) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 187ffa4..03a4f39 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -333,7 +333,7 @@ def _create_commonness_set_title(self, commonness_set: List[BasicDataPattern]) - measures_str.append(f"{{{measure[0]}: {measure[1]}}}") else: measures_str.append(measure) - title += f"when grouping by {', '.join(breakdowns)} and aggregating by {' or '.join(measures_str)}" + title += f"when grouping by {' or '.join(breakdowns)} and aggregating by {' or '.join(measures_str)}" title = textwrap.wrap(title, 70) title = "\n".join(title) return title @@ -342,6 +342,8 @@ def visualize_commonesses_individually(self, fig=None, subplot_spec=None, figsiz """ Visualize only the commonness sets of the metainsight, with each set in its own column. Within each column, patterns are arranged in a grid with at most 3 patterns per column. + This was the initial visualization method, but it was too cluttered and not very useful, so it was renamed and + replaced with the more compact and informative visualize method. 
:param fig: Optional figure to plot on (or create a new one if None) :param subplot_spec: Optional subplot specification to plot within @@ -500,7 +502,7 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Set up the right side for exceptions with one row per exception type right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, subplot_spec=outer_grid[0, 1], - hspace=0.4) # Add more vertical space + hspace=0.5) # Add more vertical space # Process each exception category for i, (category, exception_patterns) in enumerate(self.exceptions.items()): @@ -539,18 +541,19 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Create a nested grid for this row with more space type_grid = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=right_grid[i, 0], - height_ratios=[1, 5], hspace=0.5, wspace=0.3) + height_ratios=[1, 15], hspace=0.5, wspace=0.3) # Add title for the category in the first row title_ax = fig.add_subplot(type_grid[0, 0]) title_ax.axis('off') title_ax.set_facecolor((0.8, 0.9, 1.0, 0.2)) - title_ax.text(0.5, 0.5, - f"Different patterns types detected ({len(exception_patterns)})", + title_ax.text(0.5, 0, + s=f"Different patterns types detected ({len(exception_patterns)})", horizontalalignment='center', verticalalignment='center', - fontsize=12, - fontweight='bold') + fontsize=16, + fontweight='bold' + ) # Create subplots for each pattern in the second row num_patterns = len(exception_patterns) @@ -559,7 +562,7 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: n_rows = math.ceil(num_patterns / n_cols) pattern_grid = gridspec.GridSpecFromSubplotSpec(n_rows, n_cols, subplot_spec=type_grid[1, 0], - wspace=0.4) # More horizontal space + wspace=0.4, hspace=0.4) # More horizontal space for j, pattern in enumerate(exception_patterns): @@ -571,29 +574,12 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Format labels for title subspace_str 
= ", ".join([f"{key}={val}" for key, val in pattern.data_scope.subspace.items()]) - title = "" - if pattern.pattern_type == PatternType.UNIMODALITY: - title += "Unimodality found for " - if pattern.pattern_type == PatternType.TREND: - title += "Trend found for " - if pattern.pattern_type == PatternType.OUTLIER: - title += "Outliers found for " - if pattern.pattern_type == PatternType.CYCLE: - title += "Cycles found for " - - title += f"{subspace_str}," - title = textwrap.fill(title, 30) # Wrap title to prevent overflow + title = f"{pattern.highlight.__name__} when {subspace_str}" + title = "\n".join(textwrap.wrap(title, 30)) # Wrap title to prevent overflow # Visualize the individual pattern with internal legend if pattern.highlight: - # Custom visualization with compact legend - def individual_exception_visualize(plt_ax): - pattern.highlight.visualize(plt_ax=plt_ax) - if hasattr(plt_ax, 'legend'): - plt_ax.legend(loc='lower center', fontsize=10) - - individual_exception_visualize(ax) - ax.set_title(title, fontsize=10) + pattern.highlight.visualize(ax, title=title) # Allow more space for the figure elements plt.subplots_adjust(bottom=0.15, top=0.9) # Adjust bottom and top margins diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index cef2144..a3fdbbe 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -130,7 +130,8 @@ def rank_metainsights(self, metainsight_candidates: List[MetaInsight]): def mine_metainsights(self, source_df: pd.DataFrame, dimensions: List[str], measures: List[Tuple[str,str]], n_bins: int = 10, - extend_by_measure: bool = False + extend_by_measure: bool = False, + extend_by_breakdown: bool = False ) -> List[MetaInsight]: """ The main function to mine MetaInsights. 
@@ -139,9 +140,13 @@ def mine_metainsights(self, source_df: pd.DataFrame, :param dimensions: The dimensions to consider for mining. :param measures: The measures to consider for mining. :param n_bins: The number of bins to use for numeric columns. + :param extend_by_measure: Whether to extend the data scope by measure. Settings this to true can cause strange results, + because we will consider multiple aggregation functions on the same filter dimension. + :param extend_by_breakdown: Whether to extend the data scope by breakdown. Settings this to true can cause strange results, + because we will consider multiple different groupby dimensions on the same filter dimension, which can lead to + having a metainsight on 2 disjoint sets of indexes. :return: """ - metainsight_candidates = set() datascope_cache = {} pattern_cache = {} hdp_queue = PriorityQueue() @@ -189,7 +194,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, # If a valid basic pattern is found, extend the data scope to generate HDS hdp, pattern_cache = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, pattern_type=pattern_type, pattern_cache=pattern_cache, - extend_by_measure=extend_by_measure) + extend_by_measure=extend_by_measure, extend_by_breakdown=extend_by_breakdown) # Pruning 1 - if the HDP is unlikely to form a commonness, discard it if len(hdp) < len(hdp.data_scopes) * self.min_commonness: @@ -204,6 +209,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, hdp_queue.put((hdp, pattern_type)) processed_hdp_count = 0 + metainsight_candidates = {} while not hdp_queue.empty(): hdp, pattern_type = hdp_queue.get() processed_hdp_count += 1 @@ -214,7 +220,13 @@ def mine_metainsights(self, source_df: pd.DataFrame, if metainsight: # Calculate and assign the score metainsight.compute_score(datascope_cache) - metainsight_candidates.add(metainsight) + if metainsight in metainsight_candidates: + other_metainsight = metainsight_candidates[metainsight] + if metainsight.score > 
other_metainsight.score: + # If the new metainsight is better, replace the old one + metainsight_candidates[metainsight] = metainsight + else: + metainsight_candidates[metainsight] = metainsight return self.rank_metainsights(list(metainsight_candidates)) diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index bfc20d3..1526a4e 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -49,6 +49,28 @@ def __hash__(self) -> int: """ raise NotImplementedError("Subclasses must implement this method.") + @staticmethod + def prepare_patterns_for_visualization(patterns): + """ + Prepare patterns for visualization by creating a consistent numeric position mapping. + Returns a mapping of original indices to numeric positions for plotting. + + :param patterns: List of pattern objects with source_series attribute + :return: Dictionary mapping original indices to positions and sorted unique indices + """ + # Collect all unique indices from all patterns + all_indices = set() + for pattern in patterns: + all_indices.update(pattern.source_series.index) + + # Sort indices in their natural order - this works for dates, numbers, etc. + sorted_indices = sorted(list(all_indices)) + + # Create mapping from original index to position (0, 1, 2, ...) 
+ index_to_position = {idx: pos for pos, idx in enumerate(sorted_indices)} + + return index_to_position, sorted_indices + @staticmethod @abstractmethod @@ -62,9 +84,13 @@ def visualize_many(plt_ax, patterns: List['PatternInterface'], labels:List[str], """ raise NotImplementedError("Subclasses must implement this method.") + __name__ = "PatternInterface" + class UnimodalityPattern(PatternInterface): + __name__ = "Unimodality pattern" + @staticmethod def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[str], title: str = None) -> None: """ @@ -77,6 +103,25 @@ def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[st # Define a color cycle for lines colors = plt.cm.tab10.colors + # Get a union of the indexes of all patterns. We do this because some patterns may be missing + # some of the indexes due to filters, which can cause missing x axis labels as a result. + all_indexes = set() + for pattern in patterns: + # Convert all indexes to strings, to avoid issues with type mismatches causing exceptions + all_indexes.update(pattern.source_series.index) + + all_indexes = list(all_indexes) + all_indexes.sort() + + # Add the missing parts of the index + for pattern in patterns: + new_series = pd.Series(index=all_indexes, dtype=pattern.source_series.dtype) + for idx in all_indexes: + if idx in pattern.source_series.index: + new_series[idx] = pattern.source_series[idx] + pattern.source_series = new_series + + for i, (pattern, label) in enumerate(zip(patterns, labels)): color = colors[i % len(colors)] @@ -85,10 +130,10 @@ def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[st # Highlight the peak or valley with a marker if pattern.type.lower() == 'peak': - plt_ax.plot(pattern.highlight_index, pattern.source_series[pattern.highlight_index], + plt_ax.plot(pattern.highlight_index, pattern.source_series.loc[pattern.highlight_index], 'o', color=color, markersize=8, markeredgecolor='black') elif 
pattern.type.lower() == 'valley': - plt_ax.plot(pattern.highlight_index, pattern.source_series[pattern.highlight_index], + plt_ax.plot(pattern.highlight_index, pattern.source_series.loc[pattern.highlight_index], 'v', color=color, markersize=8, markeredgecolor='black') # Set labels and title @@ -104,6 +149,67 @@ def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[st # Rotate x-axis tick labels if needed plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) + @staticmethod + def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[str], title: str = None) -> None: + """ + Visualize multiple unimodality patterns on a single plot. + + :param plt_ax: Matplotlib axes to plot on + :param patterns: List of UnimodalityPattern objects + :param labels: List of labels for each pattern (e.g. data scope descriptions) + """ + # Define a color cycle for lines + colors = plt.cm.tab10.colors + + # Prepare patterns with consistent numeric positions + index_to_position, sorted_indices = PatternInterface.prepare_patterns_for_visualization(patterns) + + # Plot each pattern + for i, (pattern, label) in enumerate(zip(patterns, labels)): + color = colors[i % len(colors)] + + # Map series to numeric positions for plotting + x_positions = [index_to_position[idx] for idx in pattern.source_series.index] + values = pattern.source_series.values + + # Plot the series with a unique color + plt_ax.plot(x_positions, values, color=color, alpha=0.7, label=label) + + # Highlight the peak or valley with a marker + if pattern.type.lower() == 'peak' and pattern.highlight_index in pattern.source_series.index: + highlight_pos = index_to_position[pattern.highlight_index] + plt_ax.plot(highlight_pos, pattern.source_series.loc[pattern.highlight_index], + 'o', color=color, markersize=8, markeredgecolor='black') + elif pattern.type.lower() == 'valley' and pattern.highlight_index in pattern.source_series.index: + highlight_pos = 
index_to_position[pattern.highlight_index] + plt_ax.plot(highlight_pos, pattern.source_series.loc[pattern.highlight_index], + 'v', color=color, markersize=8, markeredgecolor='black') + + # Set x-ticks to show original index values + if sorted_indices: + # For large datasets, show fewer tick labels + step = max(1, len(sorted_indices) // 10) + positions = list(range(0, len(sorted_indices), step)) + tick_labels = [str(sorted_indices[pos]) for pos in positions] + + plt_ax.set_xticks(positions) + plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + + # Set labels and title + plt_ax.set_xlabel(patterns[0].index_name if patterns else 'Index') + plt_ax.set_ylabel(patterns[0].value_name if patterns else 'Value') + plt_ax.set_title( + f"Multiple {patterns[0].type if patterns else 'Unimodality'} Patterns" if title is None else title) + + # Add legend + plt_ax.legend() + + # Rotate x-axis tick labels + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) + + # Ensure bottom margin for x-labels + plt_ax.figure.subplots_adjust(bottom=0.15) + def __init__(self, source_series: pd.Series, type: Literal['Peak', 'Valley'], highlight_index, value_name: str=None): """ Initialize the UnimodalityPattern with the provided parameters. @@ -181,6 +287,8 @@ def __hash__(self) -> int: class TrendPattern(PatternInterface): + __name__ = "Trend pattern" + @staticmethod def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], title: str = None, show_data: bool = True, alpha_data: float = 0.6) -> None: @@ -197,42 +305,54 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti # Define a color cycle for lines colors = plt.cm.tab10.colors - # Define line styles for additional differentiation. This is taken from matplotlib's - # docs: https://matplotlib.org/stable/gallery/lines_bars_and_markers/linestyles.html + # Define line styles for additional differentiation. + # Taken from the matplotlib docs. 
line_styles = [ ('loosely dotted', (0, (1, 10))), ('dotted', (0, (1, 5))), ('densely dotted', (0, (1, 1))), - ('long dash with offset', (5, (10, 3))), ('loosely dashed', (0, (5, 10))), ('dashed', (0, (5, 5))), ('densely dashed', (0, (5, 1))), - ('loosely dashdotted', (0, (3, 10, 1, 10))), ('dashdotted', (0, (3, 5, 1, 5))), ('densely dashdotted', (0, (3, 1, 1, 1))), - ('dashdotdotted', (0, (3, 5, 1, 5, 1, 5))), ('loosely dashdotdotted', (0, (3, 10, 1, 10, 1, 10))), ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1)))] + # Prepare patterns with consistent numeric positions + index_to_position, sorted_indices = PatternInterface.prepare_patterns_for_visualization(patterns) + for i, (pattern, label) in enumerate(zip(patterns, labels)): color = colors[i % len(colors)] - line_style = line_styles[i % len(line_styles)][1] + line_style = line_styles[i % len(line_styles)][1] + + # Map series to numeric positions for plotting + x_positions = [index_to_position[idx] for idx in pattern.source_series.index] + values = pattern.source_series.values # Plot the raw data with reduced opacity if requested if show_data: - plt_ax.plot(pattern.source_series, color=color, alpha=alpha_data, linewidth=1) - - # Get x range for trend line - x_numeric = np.arange(len(pattern.source_series)) + plt_ax.plot(x_positions, values, color=color, alpha=alpha_data, linewidth=1) - # Plot the trend line + # Plot the trend line using numeric positions trend_label = f"{label}" - plt_ax.plot(pattern.source_series.index, pattern.slope * x_numeric + pattern.intercept, + x_range = np.arange(len(sorted_indices)) + plt_ax.plot(x_range, pattern.slope * x_range + pattern.intercept, linestyle=line_style, color=color, linewidth=2, label=trend_label) + # Set x-ticks to show original index values + if sorted_indices: + # For large datasets, show fewer tick labels + step = max(1, len(sorted_indices) // 10) + positions = list(range(0, len(sorted_indices), step)) + tick_labels = [str(sorted_indices[pos]) for pos in 
positions] + + plt_ax.set_xticks(positions) + plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + # Set labels and title if patterns: plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') @@ -244,10 +364,7 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti # Rotate x-axis tick labels plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) - # First, adjust the subplot parameters to make room for the legend - #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend - - # Place legend outside the plot + # Add legend plt_ax.legend() # Ensure bottom margin for x-labels @@ -330,89 +447,96 @@ def __hash__(self) -> int: class OutlierPattern(PatternInterface): + __name__ = "Outlier pattern" + @staticmethod def visualize_many(plt_ax, patterns: List['OutlierPattern'], labels: List[str], title: str = None, show_regular: bool = True, alpha_regular: float = 0.5, alpha_outliers: float = 0.9) -> None: """ Visualize multiple outlier patterns on a single plot. 
- - :param plt_ax: Matplotlib axes to plot on - :param patterns: List of OutlierPattern objects - :param labels: List of labels for each pattern - :param title: Optional custom title for the plot - :param show_regular: Whether to show regular (non-outlier) data points - :param alpha_regular: Opacity for regular data points - :param alpha_outliers: Opacity for outlier points """ - # Define a color cycle for different datasets colors = plt.cm.tab10.colors + regular_marker = 'o' + outlier_marker = 'X' - # Define marker styles - regular_marker = 'o' # Circle for regular points - outlier_marker = 'X' # X mark for outliers + # Prepare patterns with consistent numeric positions + index_to_position, sorted_indices = PatternInterface.prepare_patterns_for_visualization(patterns) - # Create a legend handle for the outlier explanation - from matplotlib.lines import Line2D - custom_lines = [Line2D([0], [0], marker=outlier_marker, color='black', - markerfacecolor='black', markersize=10, linestyle='')] - custom_labels = ['Outliers (marked with X)'] - - # Plot each dataset + # Plot each pattern for i, (pattern, label) in enumerate(zip(patterns, labels)): color = colors[i % len(colors)] - # Plot regular data points if requested + # Plot regular data points if show_regular: + # Get positions and values for plotting + positions = [index_to_position[idx] for idx in pattern.source_series.index] + values = pattern.source_series.values + plt_ax.scatter( - pattern.source_series.index, - pattern.source_series, + positions, + values, color=color, alpha=alpha_regular, marker=regular_marker, - s=30, # Size + s=30, label=label ) else: - # Still add to legend even if not showing points plt_ax.scatter([], [], color=color, marker=regular_marker, s=30, label=label) - # Plot outliers with the same color but a different marker + # Plot outliers if pattern.outlier_indexes is not None and len(pattern.outlier_indexes) > 0: + # Map outliers to positions + outlier_positions = [] + outlier_values = [] + + 
for idx in pattern.outlier_indexes: + if idx in pattern.source_series.index: + outlier_positions.append(index_to_position[idx]) + outlier_values.append(pattern.source_series.loc[idx]) + plt_ax.scatter( - pattern.outlier_indexes, - pattern.outlier_values, + outlier_positions, + outlier_values, color=color, alpha=alpha_outliers, marker=outlier_marker, - s=100, # Larger size for outliers - edgecolors='black', # Black edge for visibility + s=100, + edgecolors='black', linewidth=1.5 ) + # Set x-ticks to show original index values + if sorted_indices: + # For large datasets, show fewer tick labels + step = max(1, len(sorted_indices) // 10) + positions = list(range(0, len(sorted_indices), step)) + labels = [str(sorted_indices[pos]) for pos in positions] + + plt_ax.set_xticks(positions) + plt_ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=16) + + # Setup the rest of the plot + from matplotlib.lines import Line2D + custom_lines = [Line2D([0], [0], marker=outlier_marker, color='black', + markerfacecolor='black', markersize=10, linestyle='')] + custom_labels = ['Outliers (marked with X)'] + # Set labels and title if patterns: plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') plt_ax.set_ylabel(patterns[0].value_name if patterns[0].value_name else 'Value') - default_title = "Multiple Outlier Patterns" - plt_ax.set_title(title if title is not None else default_title) - - # Rotate x-axis tick labels if needed - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) + plt_ax.set_title(title if title is not None else "Multiple Outlier Patterns") - # Get the current handles and labels + # Setup legend handles, labels_current = plt_ax.get_legend_handles_labels() - - # Combine with custom outlier explanation all_handles = handles + custom_lines all_labels = labels_current + custom_labels - - # Adjust subplot parameters to make room for the legend - #plt_ax.figure.subplots_adjust(right=0.5) # 
Reserve 30% of width for legend - - # Place legend outside the plot with combined handles/labels plt_ax.legend(all_handles, all_labels) + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) + # Ensure bottom margin for x-labels plt_ax.figure.subplots_adjust(bottom=0.15) @@ -488,6 +612,8 @@ def __hash__(self) -> int: class CyclePattern(PatternInterface): + __name__ = "Cycle pattern" + @staticmethod def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], title: str = None, alpha_cycles: float = 0.3, line_alpha: float = 0.8) -> None: @@ -502,11 +628,13 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti :param line_alpha: Opacity for the time series lines """ import numpy as np - import pandas as pd # Define a color cycle for lines colors = plt.cm.tab10.colors + # Prepare patterns with consistent numeric positions + index_to_position, sorted_indices = PatternInterface.prepare_patterns_for_visualization(patterns) + # Color for common cycles common_cycle_color = 'darkviolet' @@ -520,46 +648,36 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti for pattern in patterns: if hasattr(pattern, 'cycles') and not pattern.cycles.empty: for _, cycle in pattern.cycles.iterrows(): - all_cycle_data.append((cycle['t_start'], cycle['t_end'])) + # Map to numeric positions + t_start_pos = index_to_position.get(cycle['t_start'], None) + t_end_pos = index_to_position.get(cycle['t_end'], None) + if t_start_pos is not None and t_end_pos is not None: + all_cycle_data.append((t_start_pos, t_end_pos)) - # Find common cycle periods + # Find common cycle periods (using numeric positions) common_periods = [] if len(patterns) > 1 and all_cycle_data: - # Handle datetime objects by creating a time_points array differently - # Get all unique timestamps from starts and ends - all_timestamps = sorted(list(set([t for start, end in all_cycle_data for t in [start, end]]))) - - # Create additional 
points between timestamps if needed - if len(all_timestamps) > 1: - time_points = [] - for i in range(len(all_timestamps) - 1): - # Add the current timestamp - time_points.append(all_timestamps[i]) - - # Add intermediate points if the gap is large enough - curr = pd.Timestamp(all_timestamps[i]) - next_ts = pd.Timestamp(all_timestamps[i + 1]) - if (next_ts - curr).total_seconds() > 60: # If gap is more than a minute - # Add 10 intermediate points - delta = (next_ts - curr) / 11 - for j in range(1, 11): - time_points.append(curr + delta * j) - - # Add the last timestamp - time_points.append(all_timestamps[-1]) + # Get all unique numeric positions from starts and ends + all_positions = sorted(list(set([pos for start, end in all_cycle_data for pos in [start, end]]))) + + # Create additional points between positions if needed + if len(all_positions) > 1: + position_points = np.linspace(min(all_positions), max(all_positions), 100) else: - time_points = all_timestamps + position_points = all_positions - # For each time point, check if it falls within a cycle for each pattern - overlap_counts = np.zeros(len(time_points)) + # For each position point, check if it falls within a cycle for each pattern + overlap_counts = np.zeros(len(position_points)) for pattern in patterns: if hasattr(pattern, 'cycles') and not pattern.cycles.empty: - pattern_mask = np.zeros(len(time_points), dtype=bool) + pattern_mask = np.zeros(len(position_points), dtype=bool) for _, cycle in pattern.cycles.iterrows(): - start, end = cycle['t_start'], cycle['t_end'] - pattern_mask = pattern_mask | ( - (np.array(time_points) >= start) & (np.array(time_points) <= end)) + t_start_pos = index_to_position.get(cycle['t_start'], None) + t_end_pos = index_to_position.get(cycle['t_end'], None) + if t_start_pos is not None and t_end_pos is not None: + pattern_mask = pattern_mask | ( + (position_points >= t_start_pos) & (position_points <= t_end_pos)) overlap_counts += pattern_mask # Find regions where all patterns 
have a cycle @@ -572,14 +690,18 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti end_indices = np.where(changes == -1)[0] - 1 for start_idx, end_idx in zip(start_indices, end_indices): - common_periods.append((time_points[start_idx], time_points[end_idx])) + common_periods.append((position_points[start_idx], position_points[end_idx])) # Plot each pattern for i, (pattern, label) in enumerate(zip(patterns, labels)): color = colors[i % len(colors)] + # Map series to numeric positions for plotting + x_positions = [index_to_position[idx] for idx in pattern.source_series.index] + values = pattern.source_series.values + # Plot the time series - line, = plt_ax.plot(pattern.source_series, color=color, alpha=line_alpha, linewidth=2) + line, = plt_ax.plot(x_positions, values, color=color, alpha=line_alpha, linewidth=2, label=label) legend_handles.append(line) legend_labels.append(label) @@ -589,16 +711,22 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti cycle_patch = plt.Rectangle((0, 0), 1, 1, color=color, alpha=alpha_cycles) for _, cycle in pattern.cycles.iterrows(): - # Highlight the cycle only if it is not in the common cycles - we highlight those later. 
- if not any( - start <= cycle['t_start'] <= end and start <= cycle['t_end'] <= end - for start, end in common_periods - ): - t_start = cycle['t_start'] - t_end = cycle['t_end'] + t_start_pos = index_to_position.get(cycle['t_start'], None) + t_end_pos = index_to_position.get(cycle['t_end'], None) + + if t_start_pos is None or t_end_pos is None: + continue + + # Check if this cycle overlaps with common cycles + is_common = any( + start <= t_start_pos <= end and start <= t_end_pos <= end + for start, end in common_periods + ) + # Highlight the cycle only if it is not in the common cycles + if not is_common: # Highlight the cycle region - plt_ax.axvspan(t_start, t_end, color=color, alpha=alpha_cycles) + plt_ax.axvspan(t_start_pos, t_end_pos, color=color, alpha=alpha_cycles) # Highlight common cycles if common_periods: @@ -610,6 +738,16 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti legend_handles.append(common_patch) legend_labels.append('Common cycles (all patterns)') + # Set x-ticks to show original index values + if sorted_indices: + # For large datasets, show fewer tick labels + step = max(1, len(sorted_indices) // 10) + positions = list(range(0, len(sorted_indices), step)) + tick_labels = [str(sorted_indices[pos]) for pos in positions] + + plt_ax.set_xticks(positions) + plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + # Set labels and title if patterns: plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') @@ -618,15 +756,11 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti default_title = "Multiple Cycle Patterns" plt_ax.set_title(title if title is not None else default_title) - # Rotate x-axis tick labels - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) - - # Adjust subplot parameters to make room for the legend - #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for 
legend - - # Place legend outside the plot + # Add legend plt_ax.legend(legend_handles, legend_labels) + plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) + # Ensure bottom margin for x-labels plt_ax.figure.subplots_adjust(bottom=0.15) From 5cf183735fe812f7aca04833076d53b93a8e0725 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Thu, 22 May 2025 22:29:09 +0300 Subject: [PATCH 18/27] Fixed bugs in computations and visualizations, added more caching for improved performance. --- .../metainsight_explainer/data_pattern.py | 17 ++++--- .../metainsight_explainer/data_scope.py | 49 +++++++++++++------ .../metainsight_explainer/meta_insight.py | 14 +++--- .../metainsight_mining.py | 47 +++++++++++++----- 4 files changed, 86 insertions(+), 41 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index d4a431f..b9e0a83 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -73,7 +73,7 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt filtered_df = data_scope.apply_subspace() # Group by breakdown dimension and aggregate measure - if data_scope.breakdown not in filtered_df.columns: + if any([dim for dim in data_scope.breakdown if dim not in filtered_df.columns]): # Cannot group by breakdown if it's not in the filtered data return [BasicDataPattern(data_scope, PatternType.NONE, None)] @@ -84,13 +84,18 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt try: # Perform the aggregation - aggregated_series = filtered_df.groupby(data_scope.breakdown)[measure_col].agg(agg_func) + if agg_func != "std": + aggregated_series = filtered_df.groupby(data_scope.breakdown)[measure_col].agg(agg_func) + else: + # For standard deviation, we need to use the std function directly + aggregated_series = 
filtered_df.groupby(data_scope.breakdown)[measure_col].std(ddof=1) except Exception as e: print(f"Error during aggregation for {data_scope}: {e}") return [BasicDataPattern(data_scope, PatternType.NONE, None)] # Ensure series is sortable if breakdown is temporal - if df[data_scope.breakdown].dtype in ['datetime64[ns]', 'period[M]', 'int64']: + if all([True for dim in data_scope.breakdown if df[dim].dtype.kind in 'iuMmfc']): + # If the breakdown is temporal or at-least can be sorted, sort the series aggregated_series = aggregated_series.sort_index() # Evaluate the specific pattern type @@ -120,7 +125,7 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt return returned_patterns def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, - hds: List[DataScope] = None, temporal_dimensions: List[str] = None, + hds: List[DataScope] = None, group_by_dims: List[List[str]] = None, measures: List[Tuple[str,str]] = None, n_bins: int = 10, extend_by_measure: bool = False, extend_by_breakdown: bool = False) -> Tuple['HomogenousDataPattern', Dict]: """ @@ -129,7 +134,7 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, :param pattern_type: The type of the pattern (e.g., 'Unimodality', 'Trend', etc.), provided as a PatternType enum. :param pattern_cache: A cache for the pattern, if available. :param hds: A list of DataScopes to create the HDP from. If None, it will be created from the current DataScope. - :param temporal_dimensions: The temporal dimensions to extend the breakdown with. Expected as a list of strings. Only needed if hds is None. + :param group_by_dims: The temporal dimensions to extend the breakdown with. Expected as a list of lists of strings. :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. Only needed if hds is None. :param n_bins: The number of bins to use for numeric columns. Defaults to 10. 
:param extend_by_measure: Whether to extend the hds by measure. Defaults to False. @@ -137,7 +142,7 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, :return: A tuple containing the created HomogenousDataPattern and the updated pattern cache. """ if hds is None or len(hds) == 0: - hds = self.data_scope.create_hds(dims=temporal_dimensions, measures=measures, + hds = self.data_scope.create_hds(dims=group_by_dims, measures=measures, n_bins=n_bins, extend_by_measure=extend_by_measure, extend_by_breakdown=extend_by_breakdown) # All the data scopes in the HDS should have the same source_df, and it should be diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index af26645..2521d02 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -12,19 +12,25 @@ class DataScope: The subspace is {City: Los Angeles, Month: *}, the breakdown is {Month} and the measure is {SUM(Sales)}. """ - def __init__(self, source_df: pd.DataFrame, subspace: Dict[str, str], breakdown: str, measure: tuple): + def __init__(self, source_df: pd.DataFrame, subspace: Dict[str, str], + breakdown: str | List[str], + measure: tuple): """ Initialize the DataScope with the provided subspace, breakdown and measure. :param source_df: The DataFrame containing the data. :param subspace: dict of filters, e.g., {'City': 'Los Angeles', 'Month': '*'} - :param breakdown: str, the dimension for group-by + :param breakdown: The dimension(s) to group by. Can be a string or a list of strings. 
:param measure: tuple, (measure_column_name, aggregate_function_name) """ + # We want to allow for multi-value groupbys, so we work with lists of strings + if isinstance(breakdown, str): + breakdown = [breakdown] self.source_df = source_df self.subspace = subspace self.breakdown = breakdown self.measure = measure + self.breakdown_frozen = frozenset(self.breakdown) self.hash = None def __hash__(self): @@ -33,7 +39,7 @@ def __hash__(self): # Need a hashable representation of subspace for hashing subspace_tuple = tuple(sorted(self.subspace.items())) if isinstance(self.subspace, dict) else tuple( self.subspace) - self.hash = hash((subspace_tuple, self.breakdown, self.measure)) + self.hash = hash((subspace_tuple, frozenset(self.breakdown), self.measure)) return self.hash def __repr__(self): @@ -110,7 +116,7 @@ def _measure_extend(self, measures: List[Tuple[str, str]]) -> List['DataScope']: new_ds.append(DataScope(self.source_df, self.subspace, self.breakdown, (measure_col, agg_func))) return new_ds - def _breakdown_extend(self, dims: List[str]) -> List['DataScope']: + def _breakdown_extend(self, dims: List[List[str]]) -> List['DataScope']: """ Extends the breakdown of the DataScope while keeping the same subspace and measure. @@ -124,7 +130,7 @@ def _breakdown_extend(self, dims: List[str]) -> List['DataScope']: new_ds.append(DataScope(self.source_df, self.subspace, breakdown_dim, self.measure)) return new_ds - def create_hds(self, dims: List[str] = None, + def create_hds(self, dims: List[List[str]] = None, measures: List[Tuple[str,str]] = None, n_bins: int = 10, extend_by_measure: bool = False, extend_by_breakdown: bool = False, @@ -163,7 +169,7 @@ def create_hds(self, dims: List[str] = None, return HomogenousDataScope(hds) - def compute_impact(self, precomputed_source_df: pd.DataFrame = None) -> float: + def compute_impact(self, groupby_cache) -> float: """ Computes the impact of the data scope based on the provided impact measure. 
We define impact as the proportion of rows between the data scope and the total data scope, multiplied
@@ -180,7 +186,7 @@ def compute_impact(self, precomputed_source_df: pd.DataFrame = None) -> float:
         # Perform subspace filtering
         filtered_df = self.apply_subspace()
         # Group by breakdown dimension and aggregate measure
-        if self.breakdown not in filtered_df.columns:
+        if any([True for dim in self.breakdown if dim not in filtered_df.columns]):
             # Cannot group by breakdown if it's not in the filtered data
             return 0
         if impact_col not in filtered_df.columns:
@@ -189,12 +195,23 @@
         try:
             numeric_columns = filtered_df.select_dtypes(include=['number']).columns.tolist()
             # Perform the aggregation
-            aggregated_series = filtered_df.groupby(impact_col)[numeric_columns].agg(agg_func)
-            if precomputed_source_df is None:
-                aggregated_source = self.source_df.groupby(impact_col)[numeric_columns].agg(agg_func)
+            if agg_func != "std":
+                aggregated_series = filtered_df.groupby(impact_col)[numeric_columns].agg(agg_func)
             else:
-                aggregated_source = precomputed_source_df.groupby(impact_col)[[numeric_columns]].agg(agg_func)
+                # If the aggregation is std, we need to manually provide ddof
+                aggregated_series = filtered_df.groupby(impact_col)[numeric_columns].std(ddof=1)
+            if (impact_col, agg_func) in groupby_cache:
+                # If the aggregation is already in the cache, use the cached result instead of recomputing
+                aggregated_source = groupby_cache[(impact_col, agg_func)]
+            else:
+                if agg_func != "std":
+                    aggregated_source = self.source_df.groupby(impact_col)[numeric_columns].agg(agg_func)
+                else:
+                    # If the aggregation is std, we need to manually provide ddof
+                    aggregated_source = self.source_df.groupby(impact_col)[numeric_columns].std(ddof=1)
+                groupby_cache[(impact_col, agg_func)] = aggregated_source
         except Exception as e:
+            # raise e
             print(f"Error during aggregation for {self}: {e}")
             return 0
@@ -283,20 +300,20 @@ def __lt__(self,
other): # We use the negative impact, since we want to use a max-heap but only have min-heap available return - self.impact < - other.impact - def compute_impact(self, cache) -> float: + def compute_impact(self, datascope_cache, groupby_cache) -> float: """ Computes the impact of the HDS. This is the sum of the impacts of all data scopes in the HDS. :return: The total impact of the HDS. """ impact = 0 for ds in self.data_scopes: - if ds in cache: + if ds in datascope_cache: # Use the cached impact if available to avoid recomputation, since computing the impact # is the single most expensive operation in the entire pipeline - ds_impact = cache[ds] + ds_impact = datascope_cache[ds] else: - ds_impact = ds.compute_impact() - cache[ds] = ds_impact + ds_impact = ds.compute_impact(groupby_cache) + datascope_cache[ds] = ds_impact impact += ds_impact self.impact = impact return impact diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 03a4f39..606377d 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -325,7 +325,7 @@ def _create_commonness_set_title(self, commonness_set: List[BasicDataPattern]) - for subspace in subspaces[1:]: shared_subspace.intersection_update(subspace.keys()) title += f"for over {self.commonness_threshold * 100}% of values of {', '.join(shared_subspace)}, " - breakdowns = set([datascope.breakdown for datascope in data_scopes]) + breakdowns = set([str(datascope.breakdown) for datascope in data_scopes]) measures = set([datascope.measure for datascope in data_scopes]) measures_str = [] for measure in measures: @@ -456,12 +456,12 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: n_cols = 2 if self.exceptions and len(self.exceptions) > 0 else 1 if fig is None: fig = plt.figure(figsize=figsize) - outer_grid = gridspec.GridSpec(1, n_cols, 
width_ratios=[1, 1], figure=fig, wspace=0.2) + outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1] * n_cols, figure=fig, wspace=0.2) else: if subplot_spec is None: - outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1, 1], figure=fig, wspace=0.2) + outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1] * n_cols, figure=fig, wspace=0.2) else: - outer_grid = gridspec.GridSpecFromSubplotSpec(1, n_cols, width_ratios=[1, 1], + outer_grid = gridspec.GridSpecFromSubplotSpec(1, n_cols, width_ratios=[1] * n_cols, subplot_spec=subplot_spec, wspace=0.2) # Set up the left side for commonness sets @@ -541,7 +541,7 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Create a nested grid for this row with more space type_grid = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=right_grid[i, 0], - height_ratios=[1, 15], hspace=0.5, wspace=0.3) + height_ratios=[1, 15], hspace=0.6, wspace=0.3) # Add title for the category in the first row title_ax = fig.add_subplot(type_grid[0, 0]) @@ -558,11 +558,11 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Create subplots for each pattern in the second row num_patterns = len(exception_patterns) # At most 2 patterns per row - n_cols = 2 + n_cols = 2 if num_patterns >= 2 else 1 n_rows = math.ceil(num_patterns / n_cols) pattern_grid = gridspec.GridSpecFromSubplotSpec(n_rows, n_cols, subplot_spec=type_grid[1, 0], - wspace=0.4, hspace=0.4) # More horizontal space + wspace=0.4, hspace=0.6) # More horizontal space for j, pattern in enumerate(exception_patterns): diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index a3fdbbe..3059100 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -128,17 +128,19 @@ def rank_metainsights(self, metainsight_candidates: 
List[MetaInsight]):
         return selected_metainsights

     def mine_metainsights(self, source_df: pd.DataFrame,
-                          dimensions: List[str],
+                          filter_dimensions: List[str],
                           measures: List[Tuple[str,str]],
                           n_bins: int = 10,
                           extend_by_measure: bool = False,
-                          extend_by_breakdown: bool = False
+                          extend_by_breakdown: bool = False,
+                          breakdown_dimensions: List[List[str]] = None,
                           ) -> List[MetaInsight]:
         """
         The main function to mine MetaInsights.
         Mines metainsights from the given data frame based on the provided dimensions, measures, and impact measure.

         :param source_df: The source DataFrame to mine MetaInsights from.
-        :param dimensions: The dimensions to consider for mining.
-        :param measures: The measures to consider for mining.
+        :param breakdown_dimensions: The dimensions to consider for breakdown (groupby).
+        :param filter_dimensions: The dimensions to consider for applying filters on.
+        :param measures: The measures (aggregations) to consider for mining.
         :param n_bins: The number of bins to use for numeric columns.
         :param extend_by_measure: Whether to extend the data scope by measure. Setting this to true can cause
         strange results, because we will consider multiple aggregation functions on the same filter dimension.
@@ -151,15 +153,18 @@ def mine_metainsights(self, source_df: pd.DataFrame, pattern_cache = {} hdp_queue = PriorityQueue() + if breakdown_dimensions is None: + breakdown_dimensions = filter_dimensions + # Generate data scopes with one dimension as breakdown, all '*' subspace base_data_scopes = [] - for breakdown_dim in dimensions: + for breakdown_dim in breakdown_dimensions: for measure_col, agg_func in measures: base_data_scopes.append( DataScope(source_df, {}, breakdown_dim, (measure_col, agg_func))) # Generate data scopes with one filter in subspace and one breakdown - for filter_dim in dimensions: + for filter_dim in filter_dimensions: unique_values = source_df[filter_dim].dropna().unique() # If there are too many unique values, we bin them if it's a numeric column, or only choose the # top 10 most frequent values if it's a categorical column @@ -173,7 +178,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, top_values = source_df[filter_dim].value_counts().nlargest(10).index.tolist() unique_values = [v for v in unique_values if v in top_values] for value in unique_values: - for breakdown_dim in dimensions: + for breakdown_dim in breakdown_dimensions: # Prevents the same breakdown dimension from being used as filter. This is because it # is generally not very useful to groupby the same dimension as the filter dimension. if breakdown_dim != filter_dim: @@ -181,6 +186,19 @@ def mine_metainsights(self, source_df: pd.DataFrame, base_data_scopes.append( DataScope(source_df, {filter_dim: value}, breakdown_dim, (measure_col, agg_func))) + # The source dataframe with a groupby on various dimensions and measures can be precomputed, + # instead of computed each time we need it. 
+ groupby_cache = {} + numeric_columns = source_df.select_dtypes(include=[np.number]).columns.tolist() + for col, agg_func in measures: + groupby_key = (col, agg_func) + if groupby_key not in groupby_cache: + # Handle 'std' aggregation specially + if agg_func == 'std': + groupby_cache[groupby_key] = source_df.groupby(col)[numeric_columns].std(ddof=1) + else: + groupby_cache[groupby_key] = source_df.groupby(col)[numeric_columns].agg(agg_func) + for base_ds in base_data_scopes: # Evaluate basic patterns for the base data scope for selected types @@ -192,7 +210,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, for base_dp in base_dps: if base_dp.pattern_type not in [PatternType.NONE, PatternType.OTHER]: # If a valid basic pattern is found, extend the data scope to generate HDS - hdp, pattern_cache = base_dp.create_hdp(temporal_dimensions=dimensions, measures=measures, + hdp, pattern_cache = base_dp.create_hdp(group_by_dims=breakdown_dimensions, measures=measures, pattern_type=pattern_type, pattern_cache=pattern_cache, extend_by_measure=extend_by_measure, extend_by_breakdown=extend_by_breakdown) @@ -201,7 +219,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, continue # Pruning 2: Discard HDS with extremely low impact - hds_impact = hdp.compute_impact(datascope_cache) + hds_impact = hdp.compute_impact(datascope_cache, groupby_cache) if hds_impact < MIN_IMPACT: continue @@ -235,9 +253,13 @@ def mine_metainsights(self, source_df: pd.DataFrame, # Create a sample Pandas DataFrame (similar to the paper's example) df = pd.read_csv("C:\\Users\\Yuval\\PycharmProjects\\pd-explain\\Examples\\Datasets\\adult.csv") df = df.sample(5000, random_state=42) # Sample 5000 rows for testing + print(df.columns) # Define dimensions, measures dimensions = ['marital-status', 'workclass', 'education-num'] + breakdown_dimensions = [['race', 'marital-status'], + ['native-country', 'label'], + ['race', 'label']] measures = [('capital-gain', 'mean'), ('capital-loss', 'mean'), 
('hours-per-week', 'mean')] @@ -246,9 +268,10 @@ def mine_metainsights(self, source_df: pd.DataFrame, start_time = time.time() miner = MetaInsightMiner(k=4, min_score=0.01, min_commonness=0.5) top_metainsights = miner.mine_metainsights( - df, - dimensions, - measures, + source_df=df, + filter_dimensions=dimensions, + measures=measures, + breakdown_dimensions=breakdown_dimensions, ) end_time = time.time() print(f"Time taken: {end_time - start_time:.2f} seconds") From 07279643ff23c490defdfeac1ba2b688117d5d39 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Fri, 23 May 2025 18:56:58 +0300 Subject: [PATCH 19/27] Changed so NONE patterns are included and taken into account, changed so HDS only bin numeric columns and use all columns from categorical columns, added None patterns to visualizations. --- .../metainsight_explainer/data_pattern.py | 9 +++- .../metainsight_explainer/data_scope.py | 10 ++--- .../metainsight_explainer/meta_insight.py | 43 ++++++++++++++++--- .../metainsight_mining.py | 2 - 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index b9e0a83..c675a01 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -173,9 +173,14 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, if not isinstance(dp, typing.Iterable): dp = [dp] - # Only add patterns that are not 'No Pattern' to the HDP for MetaInsight evaluation + # # Only add patterns that are not 'No Pattern' to the HDP for MetaInsight evaluation + # for d in dp: + # if d is not None and d.pattern_type != PatternType.NONE: + # hdp.append(d) + + # Add all patterns, including 'No Pattern', since it is important to know that we had a 'No Pattern'. 
for d in dp: - if d is not None and d.pattern_type != PatternType.NONE: + if dp is not None: hdp.append(d) self.pattern_cache = pattern_cache diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 2521d02..00a2922 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -86,14 +86,14 @@ def _subspace_extend(self, n_bins: int = 10) -> List['DataScope']: # If there are too many unique values, we bin them if it's a numeric column, or only choose the # top 10 most frequent values if it's a categorical column if len(unique_values) > n_bins: - if self.source_df[dim_to_extend].dtype in ['int64', 'float64']: + if self.source_df[dim_to_extend].dtype.kind in 'biufcmM': # Bin the numeric column bins = pd.cut(self.source_df[dim_to_extend], bins=n_bins, retbins=True)[1] unique_values = [f"{bins[i]} <= {dim_to_extend} <= {bins[i + 1]}" for i in range(len(bins) - 1)] - else: - # Choose the top 10 most frequent values - top_values = self.source_df[dim_to_extend].value_counts().nlargest(10).index.tolist() - unique_values = [v for v in unique_values if v in top_values] + # else: + # # Choose the top 10 most frequent values + # top_values = self.source_df[dim_to_extend].value_counts().nlargest(10).index.tolist() + # unique_values = [v for v in unique_values if v in top_values] for value in unique_values: # Ensure it's a sibling if self.subspace.get(dim_to_extend) != value: diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 606377d..6b479a1 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -499,22 +499,53 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Handle exceptions area if there are any if 
self.exceptions and n_cols > 1: + none_patterns_exist = self.exceptions.get("No-Pattern", None) is not None # Set up the right side for exceptions with one row per exception type - right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, - subplot_spec=outer_grid[0, 1], - hspace=0.5) # Add more vertical space + if len(self.exceptions) == 1 and none_patterns_exist: + right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, + subplot_spec=outer_grid[0, 1], + hspace=1) # Add more vertical space + else: + # If there are None exceptions, place them at the bottom with very little space + right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, + subplot_spec=outer_grid[0, 1], + height_ratios=[10] * (len(self.exceptions) - 1) + [1], + hspace=1) # Add more vertical space + # Get the None patterns and "summarize" them in a dictionary + exception_patterns = self.exceptions.get("No-Pattern", []) + non_exceptions = [pattern for pattern in exception_patterns if pattern.pattern_type == PatternType.NONE] + non_exceptions_subspaces = [pattern.data_scope.subspace for pattern in non_exceptions] + non_exceptions_dict = defaultdict(list) + for subspace in non_exceptions_subspaces: + for key, val in subspace.items(): + non_exceptions_dict[key].append(val) + # Create a title for the None patterns + title = f"No patterns detected ({len(non_exceptions)})" + title = textwrap.fill(title, width=40) + # Create text saying all the values for which no patterns were detected + no_patterns_text = "" + for key, val in non_exceptions_dict.items(): + no_patterns_text += f"{key} = {val}\n" + no_patterns_text = textwrap.fill(no_patterns_text, width=40) + # Create a subplot for the None patterns + ax = fig.add_subplot(right_grid[len(self.exceptions) - 1, 0]) + # Add title and text + ax.set_title(title, y=0.1, fontsize=18, fontweight='bold') + ax.text(0.3, -0.2, no_patterns_text, + ha='center', va='center', + fontsize=18) + ax.axis('off') # Hide axis for the 
title # Process each exception category for i, (category, exception_patterns) in enumerate(self.exceptions.items()): if not exception_patterns: # Skip empty categories continue - # For "None" category, just skip it. It may be a good idea to add text saying - # "Nothing found for...", but we already have an issue with visual clutter and - # clipping everywhere. + # For "None" category, already handled it above if category.lower() == "none" or category.lower() == "no-pattern": continue + # For "highlight change" category, visualize all in one plot if category.lower() == "highlight-change" or category.lower() == "highlight change": ax = fig.add_subplot(right_grid[i, 0]) diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 3059100..a92a260 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -226,11 +226,9 @@ def mine_metainsights(self, source_df: pd.DataFrame, # Add HDS to a queue for evaluation hdp_queue.put((hdp, pattern_type)) - processed_hdp_count = 0 metainsight_candidates = {} while not hdp_queue.empty(): hdp, pattern_type = hdp_queue.get() - processed_hdp_count += 1 # Evaluate HDP to find MetaInsight metainsight = MetaInsight.create_meta_insight(hdp, commonness_threshold=self.min_commonness) From 18dcd49057b7f9d538bf34fc87ebd0c36aa00d42 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 27 May 2025 22:42:44 +0300 Subject: [PATCH 20/27] Fixed spacing and location issues in visualizations, fixed bugs in computation that could cause crashes in cases where multi-index series existed. 
---
 .../metainsight_explainer/meta_insight.py | 36 +++++---
 .../metainsight_explainer/patterns.py | 89 ++++++-------------
 2 files changed, 53 insertions(+), 72 deletions(-)

diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py
index 6b479a1..00c6a28 100644
--- a/src/external_explainers/metainsight_explainer/meta_insight.py
+++ b/src/external_explainers/metainsight_explainer/meta_insight.py
@@ -453,7 +453,10 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None:
         :param figsize: Size of the figure if a new one is created.
         """
         # Create a new figure if not provided
-        n_cols = 2 if self.exceptions and len(self.exceptions) > 0 else 1
+        # n_cols = 2 if self.exceptions and len(self.exceptions) > 0 else 1
+        # Above line makes it so the plot of the commonness sets takes up the entire figure if there are no exceptions.
+        # However, this can potentially make for some confusion, so I elected to always use 2 columns.
+        n_cols = 2
         if fig is None:
             fig = plt.figure(figsize=figsize)
             outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1] * n_cols, figure=fig, wspace=0.2)
@@ -501,16 +504,19 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None:
         if self.exceptions and n_cols > 1:
             none_patterns_exist = self.exceptions.get("No-Pattern", None) is not None
             # Set up the right side for exceptions with one row per exception type
-            if len(self.exceptions) == 1 and none_patterns_exist:
+            # If there are no None exceptions, we create a grid with equal height ratios for each exception type.
+            # Else, we create a grid where the last row is smaller if there are None exceptions.
+ if not none_patterns_exist: right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, subplot_spec=outer_grid[0, 1], - hspace=1) # Add more vertical space + hspace=1.2) # Add more vertical space else: - # If there are None exceptions, place them at the bottom with very little space + # If there are None exceptions, place them at the bottom with very little space, since it just text + height_ratios = [10] * (len(self.exceptions) - 1) + [1] if len(self.exceptions) > 1 else [1] right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, subplot_spec=outer_grid[0, 1], - height_ratios=[10] * (len(self.exceptions) - 1) + [1], - hspace=1) # Add more vertical space + height_ratios=height_ratios, + hspace=1.4) # Add more vertical space # Get the None patterns and "summarize" them in a dictionary exception_patterns = self.exceptions.get("No-Pattern", []) non_exceptions = [pattern for pattern in exception_patterns if pattern.pattern_type == PatternType.NONE] @@ -526,18 +532,26 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: no_patterns_text = "" for key, val in non_exceptions_dict.items(): no_patterns_text += f"{key} = {val}\n" - no_patterns_text = textwrap.fill(no_patterns_text, width=40) + no_patterns_text = textwrap.fill(no_patterns_text, width=60) # Create a subplot for the None patterns ax = fig.add_subplot(right_grid[len(self.exceptions) - 1, 0]) # Add title and text - ax.set_title(title, y=0.1, fontsize=18, fontweight='bold') - ax.text(0.3, -0.2, no_patterns_text, + if len(self.exceptions) == 1: + title_y = None + text_y = 0.9 + else: + title_y = -0.3 + text_y = -1.1 + text_x = 0.5 + ax.set_title(title, y=title_y, fontsize=18, fontweight='bold') + ax.text(text_x, text_y, no_patterns_text, ha='center', va='center', fontsize=18) ax.axis('off') # Hide axis for the title # Process each exception category - for i, (category, exception_patterns) in enumerate(self.exceptions.items()): + i = 0 + for category, 
exception_patterns in self.exceptions.items(): if not exception_patterns: # Skip empty categories continue @@ -612,6 +626,8 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: if pattern.highlight: pattern.highlight.visualize(ax, title=title) + i += 1 + # Allow more space for the figure elements plt.subplots_adjust(bottom=0.15, top=0.9) # Adjust bottom and top margins diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 1526a4e..1df0813 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -91,64 +91,6 @@ class UnimodalityPattern(PatternInterface): __name__ = "Unimodality pattern" - @staticmethod - def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[str], title: str = None) -> None: - """ - Visualize multiple unimodality patterns on a single plot. - - :param plt_ax: Matplotlib axes to plot on - :param patterns: List of UnimodalityPattern objects - :param labels: List of labels for each pattern (e.g. data scope descriptions) - """ - # Define a color cycle for lines - colors = plt.cm.tab10.colors - - # Get a union of the indexes of all patterns. We do this because some patterns may be missing - # some of the indexes due to filters, which can cause missing x axis labels as a result. 
- all_indexes = set() - for pattern in patterns: - # Convert all indexes to strings, to avoid issues with type mismatches causing exceptions - all_indexes.update(pattern.source_series.index) - - all_indexes = list(all_indexes) - all_indexes.sort() - - # Add the missing parts of the index - for pattern in patterns: - new_series = pd.Series(index=all_indexes, dtype=pattern.source_series.dtype) - for idx in all_indexes: - if idx in pattern.source_series.index: - new_series[idx] = pattern.source_series[idx] - pattern.source_series = new_series - - - for i, (pattern, label) in enumerate(zip(patterns, labels)): - color = colors[i % len(colors)] - - # Plot the series with a unique color - plt_ax.plot(pattern.source_series, color=color, alpha=0.7, label=label) - - # Highlight the peak or valley with a marker - if pattern.type.lower() == 'peak': - plt_ax.plot(pattern.highlight_index, pattern.source_series.loc[pattern.highlight_index], - 'o', color=color, markersize=8, markeredgecolor='black') - elif pattern.type.lower() == 'valley': - plt_ax.plot(pattern.highlight_index, pattern.source_series.loc[pattern.highlight_index], - 'v', color=color, markersize=8, markeredgecolor='black') - - # Set labels and title - plt_ax.set_xlabel(patterns[0].index_name if patterns else 'Index') - plt_ax.set_ylabel(patterns[0].value_name if patterns else 'Value') - plt_ax.set_title(f"Multiple {patterns[0].type if patterns else 'Unimodality'} Patterns" if title is None else title) - - # Add legend outside the plot - plt_ax.legend() - - #plt_ax.figure.subplots_adjust(right=0.5) # Reserve 50% of width for legend - - # Rotate x-axis tick labels if needed - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=16) - @staticmethod def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[str], title: str = None) -> None: """ @@ -255,6 +197,8 @@ def __eq__(self, other) -> bool: """ if not isinstance(other, UnimodalityPattern): return False + if not 
type(self.highlight_index) == type(other.highlight_index): + return False return (self.type == other.type and self.highlight_index == other.highlight_index) @@ -561,14 +505,32 @@ def visualize(self, plt_ax, title: str = None) -> None: :param plt_ax: :return: """ - plt_ax.scatter(self.source_series.index, self.source_series, label='Regular Data Point') + index_to_position, sorted_indices = PatternInterface.prepare_patterns_for_visualization([self]) + positions = [index_to_position[idx] for idx in self.source_series.index] + values = self.source_series.values + plt_ax.scatter(positions, values, label='Regular Data Point') plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') plt_ax.set_ylabel(self.value_name) # Emphasize the outliers - plt_ax.scatter(self.outlier_indexes, self.outlier_values, color='red', label='Outliers') + # Map outliers to positions + outlier_positions = [] + outlier_values = [] + + for idx in self.outlier_indexes: + if idx in self.source_series.index: + outlier_positions.append(index_to_position[idx]) + outlier_values.append(self.source_series.loc[idx]) + plt_ax.scatter(outlier_positions, outlier_values, color='red', label='Outliers') plt_ax.legend(loc="upper left") - # Rotate x-axis tick labels - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) + # Set x-ticks to show original index values + if sorted_indices: + # For large datasets, show fewer tick labels + step = max(1, len(sorted_indices) // 10) + positions = list(range(0, len(sorted_indices), step)) + labels = [str(sorted_indices[pos]) for pos in positions] + + plt_ax.set_xticks(positions) + plt_ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=16) if title is not None: plt_ax.set_title(title) @@ -582,6 +544,9 @@ def __eq__(self, other): """ if not isinstance(other, OutlierPattern): return False + # If one index is a multi-index and the other is not, for example, they cannot be equal + if not 
type(self.outlier_indexes) == type(other.outlier_indexes): + return False return self.outlier_indexes.isin(other.outlier_indexes).all() or \ other.outlier_indexes.isin(self.outlier_indexes).all() From cf74ff0dcadd8ed07e4c9e9f04aa3f649fb12e60 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Sat, 31 May 2025 19:13:10 +0300 Subject: [PATCH 21/27] Added __str__ method to MetaInsight class. --- .../metainsight_explainer/meta_insight.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 00c6a28..92e81af 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -84,6 +84,16 @@ def __eq__(self, other): return all_equal + + def __str__(self): + """ + :return: A string representation of the MetaInsight, describing all of the commonnesses in it. + """ + ret_str = "" + for commonness in self.commonness_set: + ret_str += self._create_commonness_set_title(commonness) + return ret_str + @staticmethod def categorize_exceptions(commonness_set, exceptions): """ From 31c55fab6e78edfd12e67b335681d5672950dbbe Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Mon, 2 Jun 2025 17:46:10 +0300 Subject: [PATCH 22/27] Changed caching to work globally using a singleton LRU cache class. 
--- .../metainsight_explainer/cache.py | 118 ++++++++++++++++++ .../metainsight_explainer/data_pattern.py | 30 ++--- .../metainsight_explainer/data_scope.py | 30 +++-- .../metainsight_explainer/meta_insight.py | 4 +- .../metainsight_mining.py | 23 ++-- .../pattern_evaluations.py | 12 +- 6 files changed, 169 insertions(+), 48 deletions(-) create mode 100644 src/external_explainers/metainsight_explainer/cache.py diff --git a/src/external_explainers/metainsight_explainer/cache.py b/src/external_explainers/metainsight_explainer/cache.py new file mode 100644 index 0000000..87bf162 --- /dev/null +++ b/src/external_explainers/metainsight_explainer/cache.py @@ -0,0 +1,118 @@ +from singleton_decorator import singleton +from collections import OrderedDict + +PATTERN_CACHE_MAX_SIZE = 40000 +DATASCOPE_CACHE_MAX_SIZE = 40000 +PATTERN_EVAL_CACHE_MAX_SIZE = 40000 +GROUPBY_CACHE_MAX_SIZE = 5000 + +@singleton +class Cache: + """ + A singleton class to hold various caches used in the MetaInsight explainer. + This helps in avoiding redundant computations and speeds up the evaluation process. + We use a singleton pattern to make the cache: + 1. Global across the application. + 2. Persistent throughout the lifetime of the application. + This cache is a simple LRU (Least Recently Used) cache implementation, removing the least recently used items when the cache exceeds its maximum size. + The caches in this class are: + - pattern_cache: Stores the data pattern objects evaluated for different data scopes and patterns. + - datascope_cache: Stores the scores for different data scopes. + - groupby_cache: Stores the results of groupby operations. + - pattern_eval_cache: Stores the results of pattern evaluations on series. 
+ """ + + def __init__(self): + self._pattern_cache = OrderedDict() + self._datascope_cache = OrderedDict() + self._groupby_cache = OrderedDict() + self._pattern_eval_cache = OrderedDict() + self.pattern_cache_max_size = PATTERN_CACHE_MAX_SIZE + self.datascope_cache_max_size = DATASCOPE_CACHE_MAX_SIZE + self.groupby_cache_max_size = GROUPBY_CACHE_MAX_SIZE + self.pattern_eval_cache_max_size = PATTERN_EVAL_CACHE_MAX_SIZE + + + def _add_to_cache(self, cache, key, value, max_size) -> None: + """ + Adds a key-value pair to the specified cache. + If the cache exceeds its maximum size, it removes the least recently used item. + """ + if key in cache: + # Update the value and mark as recently used + cache.move_to_end(key) + cache[key] = value + if len(cache) > max_size: + # Pop the first item (least recently used) + cache.popitem(last=False) + + + def _get_from_cache(self, cache, key) -> any: + """ + Retrieves a value from the specified cache by key. + If the key exists, it marks the key as recently used. + """ + if key in cache: + # Move the accessed item to the end to mark it as recently used + cache.move_to_end(key) + return cache[key] + return None + + + def add_to_pattern_cache(self, key, value) -> None: + """ + Adds a key-value pair to the pattern cache. + If the cache exceeds its maximum size, it removes the least recently used item. + """ + self._add_to_cache(self._pattern_cache, key, value, PATTERN_CACHE_MAX_SIZE) + + + def add_to_datascope_cache(self, key, value) -> None: + """ + Adds a key-value pair to the datascope cache. + If the cache exceeds its maximum size, it removes the least recently used item. + """ + self._add_to_cache(self._datascope_cache, key, value, DATASCOPE_CACHE_MAX_SIZE) + + def add_to_groupby_cache(self, key, value): + """ + Adds a key-value pair to the groupby cache. + If the cache exceeds its maximum size, it removes the least recently used item. 
+ """ + self._add_to_cache(self._groupby_cache, key, value, GROUPBY_CACHE_MAX_SIZE) + + def add_to_pattern_eval_cache(self, key, value) -> None: + """ + Adds a key-value pair to the pattern evaluation cache. + If the cache exceeds its maximum size, it removes the least recently used item. + """ + self._add_to_cache(self._pattern_eval_cache, key, value, PATTERN_EVAL_CACHE_MAX_SIZE) + + + def get_from_pattern_cache(self, key): + """ + Retrieves a value from the pattern cache by key. + If the key exists, it marks the key as recently used. + """ + return self._get_from_cache(self._pattern_cache, key) + + def get_from_datascope_cache(self, key): + """ + Retrieves a value from the datascope cache by key. + If the key exists, it marks the key as recently used. + """ + return self._get_from_cache(self._datascope_cache, key) + + def get_from_groupby_cache(self, key): + """ + Retrieves a value from the groupby cache by key. + If the key exists, it marks the key as recently used. + """ + return self._get_from_cache(self._groupby_cache, key) + + def get_from_pattern_eval_cache(self, key): + """ + Retrieves a value from the pattern evaluation cache by key. + If the key exists, it marks the key as recently used. 
+ """ + return self._get_from_cache(self._pattern_eval_cache, key) \ No newline at end of file diff --git a/src/external_explainers/metainsight_explainer/data_pattern.py b/src/external_explainers/metainsight_explainer/data_pattern.py index c675a01..f10d909 100644 --- a/src/external_explainers/metainsight_explainer/data_pattern.py +++ b/src/external_explainers/metainsight_explainer/data_pattern.py @@ -6,6 +6,7 @@ from external_explainers.metainsight_explainer.data_scope import DataScope, HomogenousDataScope from external_explainers.metainsight_explainer.pattern_evaluations import PatternEvaluator, PatternType from external_explainers.metainsight_explainer.patterns import PatternInterface +from external_explainers.metainsight_explainer.cache import Cache class BasicDataPattern: @@ -13,6 +14,7 @@ class BasicDataPattern: A data pattern, as defined in the MetaInsight paper. Contains 3 elements: data scope, type (interpretation type) and highlight. """ + cache = Cache() def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: PatternInterface | None): """ @@ -25,7 +27,6 @@ def __init__(self, data_scope: DataScope, pattern_type: PatternType, highlight: self.data_scope = data_scope self.pattern_type = pattern_type self.highlight = highlight - self.pattern_cache = {} self.hash = None def __eq__(self, other): @@ -124,22 +125,21 @@ def evaluate_pattern(data_scope: DataScope, df: pd.DataFrame, pattern_type: Patt return returned_patterns - def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, + def create_hdp(self, pattern_type: PatternType, hds: List[DataScope] = None, group_by_dims: List[List[str]] = None, measures: List[Tuple[str,str]] = None, n_bins: int = 10, - extend_by_measure: bool = False, extend_by_breakdown: bool = False) -> Tuple['HomogenousDataPattern', Dict]: + extend_by_measure: bool = False, extend_by_breakdown: bool = False) -> 'HomogenousDataPattern': """ Generates a Homogenous Data Pattern (HDP) either from a given 
HDS or from the current DataScope. :param pattern_type: The type of the pattern (e.g., 'Unimodality', 'Trend', etc.), provided as a PatternType enum. - :param pattern_cache: A cache for the pattern, if available. :param hds: A list of DataScopes to create the HDP from. If None, it will be created from the current DataScope. :param group_by_dims: The temporal dimensions to extend the breakdown with. Expected as a list of lists of strings. :param measures: The measures to extend the measure with. Expected to be a dict {measure_column: aggregate_function}. Only needed if hds is None. :param n_bins: The number of bins to use for numeric columns. Defaults to 10. :param extend_by_measure: Whether to extend the hds by measure. Defaults to False. :param extend_by_breakdown: Whether to extend the hds by breakdown. Defaults to False. - :return: A tuple containing the created HomogenousDataPattern and the updated pattern cache. + :return: The HomogenousDataPattern object containing the evaluated patterns. """ if hds is None or len(hds) == 0: hds = self.data_scope.create_hds(dims=group_by_dims, measures=measures, @@ -150,23 +150,19 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, # the one producing the HDP with this HDS). 
source_df = self.data_scope.source_df - # Append the existing cache if available - if pattern_cache is None: - pattern_cache = {} - pattern_cache.update(self.pattern_cache) - # Create the HDP hdp = [] for ds in hds: if ds != self.data_scope: # Check pattern cache first - cache_key = (ds, pattern_type) - if cache_key in pattern_cache: - dp = pattern_cache[cache_key] + cache_key = (ds.__hash__(), pattern_type) + cache_result = self.cache.get_from_pattern_cache(cache_key) + if cache_result is not None: + dp = cache_result else: - # Evaluate the pattern if not in cache + # Evaluate the pattern if not in cache, and add to cache dp = self.evaluate_pattern(ds, source_df, pattern_type) - pattern_cache[cache_key] = dp # Store in cache + self.cache.add_to_pattern_cache(cache_key, dp) # Some evaluation functions can return multiple patterns, so it is simpler to just # convert it to a list and then treat it as an iterable. @@ -183,14 +179,12 @@ def create_hdp(self, pattern_type: PatternType, pattern_cache: Dict = None, if dp is not None: hdp.append(d) - self.pattern_cache = pattern_cache - if self.pattern_type != PatternType.NONE: # Add the current pattern to the HDP hdp.append(self) hdp = HomogenousDataPattern(hdp) - return hdp, pattern_cache + return hdp class HomogenousDataPattern(HomogenousDataScope): diff --git a/src/external_explainers/metainsight_explainer/data_scope.py b/src/external_explainers/metainsight_explainer/data_scope.py index 00a2922..7d5d265 100644 --- a/src/external_explainers/metainsight_explainer/data_scope.py +++ b/src/external_explainers/metainsight_explainer/data_scope.py @@ -2,7 +2,9 @@ from typing import Dict, List, Tuple from scipy.special import kl_div import re +from external_explainers.metainsight_explainer.cache import Cache +cache = Cache() class DataScope: """ @@ -12,6 +14,7 @@ class DataScope: The subspace is {City: Los Angeles, Month: *}, the breakdown is {Month} and the measure is {SUM(Sales)}. 
""" + def __init__(self, source_df: pd.DataFrame, subspace: Dict[str, str], breakdown: str | List[str], measure: tuple): @@ -169,7 +172,7 @@ def create_hds(self, dims: List[List[str]] = None, return HomogenousDataScope(hds) - def compute_impact(self, groupby_cache) -> float: + def compute_impact(self) -> float: """ Computes the impact of the data scope based on the provided impact measure. We define impact as the proportion of rows between the data scope and the total date scope, multiplied @@ -200,16 +203,18 @@ def compute_impact(self, groupby_cache) -> float: else: # If the aggregation is std, we need to manually provide ddof aggregated_series = filtered_df.groupby(impact_col)[numeric_columns].std(ddof=1) - if (impact_col, agg_func) in groupby_cache: - # If the aggregation is not in the cache, compute it and add it to the cache - aggregated_source = groupby_cache[(impact_col, agg_func)] + cache_result = cache.get_from_groupby_cache((impact_col, agg_func)) + if cache_result is not None: + # If the aggregation is in the cache, use it + aggregated_source = cache_result else: if agg_func != "std": aggregated_source = self.source_df.groupby(impact_col)[numeric_columns].agg(agg_func) else: # If the aggregation is std, we need to manually provide ddof aggregated_source = self.source_df.groupby(impact_col)[numeric_columns].std(ddof=1) - groupby_cache[(impact_col, agg_func)] = aggregated_source + # Cache the result of the groupby operation + cache.add_to_groupby_cache((impact_col, agg_func), aggregated_source) except Exception as e: # raise e print(f"Error during aggregation for {self}: {e}") @@ -300,20 +305,21 @@ def __lt__(self, other): # We use the negative impact, since we want to use a max-heap but only have min-heap available return - self.impact < - other.impact - def compute_impact(self, datascope_cache, groupby_cache) -> float: + def compute_impact(self) -> float: """ Computes the impact of the HDS. This is the sum of the impacts of all data scopes in the HDS. 
:return: The total impact of the HDS. """ impact = 0 for ds in self.data_scopes: - if ds in datascope_cache: - # Use the cached impact if available to avoid recomputation, since computing the impact - # is the single most expensive operation in the entire pipeline - ds_impact = datascope_cache[ds] + # Use the cached impact if available to avoid recomputation, since computing the impact + # is the single most expensive operation in the entire pipeline + cache_result = cache.get_from_datascope_cache(ds.__hash__()) + if cache_result is not None: + ds_impact = cache_result else: - ds_impact = ds.compute_impact(groupby_cache) - datascope_cache[ds] = ds_impact + ds_impact = ds.compute_impact() + cache.add_to_datascope_cache(ds.__hash__(), ds_impact) impact += ds_impact self.impact = impact return impact diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 92e81af..57fe9af 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -236,7 +236,7 @@ def calculate_conciseness(self) -> float: # Ensure conciseness is within a reasonable range, e.g., [0, 1] return conciseness - def compute_score(self, cache) -> float: + def compute_score(self) -> float: """ Computes the score of the MetaInsight. 
The score is the multiple of the conciseness of the MetaInsight and the impact score of the HDS @@ -246,7 +246,7 @@ def compute_score(self, cache) -> float: """ conciseness = self.calculate_conciseness() # If the impact has already been computed, use it - hds_score = self.hdp.impact if self.hdp.impact != 0 else self.hdp.compute_impact(cache) + hds_score = self.hdp.impact if self.hdp.impact != 0 else self.hdp.compute_impact() self.score = conciseness * hds_score return self.score diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index a92a260..8f93bc8 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -13,6 +13,7 @@ COMMONNESS_THRESHOLD) from external_explainers.metainsight_explainer.data_scope import DataScope from external_explainers.metainsight_explainer.pattern_evaluations import PatternType +from external_explainers.metainsight_explainer.cache import Cache MIN_IMPACT = 0.01 @@ -149,8 +150,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, having a metainsight on 2 disjoint sets of indexes. :return: """ - datascope_cache = {} - pattern_cache = {} + cache = Cache() hdp_queue = PriorityQueue() if breakdown_dimensions is None: @@ -188,16 +188,16 @@ def mine_metainsights(self, source_df: pd.DataFrame, # The source dataframe with a groupby on various dimensions and measures can be precomputed, # instead of computed each time we need it. 
- groupby_cache = {} numeric_columns = source_df.select_dtypes(include=[np.number]).columns.tolist() for col, agg_func in measures: groupby_key = (col, agg_func) - if groupby_key not in groupby_cache: + cache_result = cache.get_from_groupby_cache(groupby_key) + if cache_result is not None: # Handle 'std' aggregation specially if agg_func == 'std': - groupby_cache[groupby_key] = source_df.groupby(col)[numeric_columns].std(ddof=1) + cache.add_to_groupby_cache(groupby_key, source_df.groupby(col)[numeric_columns].std(ddof=1)) else: - groupby_cache[groupby_key] = source_df.groupby(col)[numeric_columns].agg(agg_func) + cache.add_to_groupby_cache(groupby_key, source_df.groupby(col)[numeric_columns].agg(agg_func)) for base_ds in base_data_scopes: @@ -210,8 +210,8 @@ def mine_metainsights(self, source_df: pd.DataFrame, for base_dp in base_dps: if base_dp.pattern_type not in [PatternType.NONE, PatternType.OTHER]: # If a valid basic pattern is found, extend the data scope to generate HDS - hdp, pattern_cache = base_dp.create_hdp(group_by_dims=breakdown_dimensions, measures=measures, - pattern_type=pattern_type, pattern_cache=pattern_cache, + hdp = base_dp.create_hdp(group_by_dims=breakdown_dimensions, measures=measures, + pattern_type=pattern_type, extend_by_measure=extend_by_measure, extend_by_breakdown=extend_by_breakdown) # Pruning 1 - if the HDP is unlikely to form a commonness, discard it @@ -219,7 +219,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, continue # Pruning 2: Discard HDS with extremely low impact - hds_impact = hdp.compute_impact(datascope_cache, groupby_cache) + hds_impact = hdp.compute_impact() if hds_impact < MIN_IMPACT: continue @@ -235,7 +235,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, if metainsight: # Calculate and assign the score - metainsight.compute_score(datascope_cache) + metainsight.compute_score() if metainsight in metainsight_candidates: other_metainsight = metainsight_candidates[metainsight] if metainsight.score > 
other_metainsight.score: @@ -258,8 +258,7 @@ def mine_metainsights(self, source_df: pd.DataFrame, breakdown_dimensions = [['race', 'marital-status'], ['native-country', 'label'], ['race', 'label']] - measures = [('capital-gain', 'mean'), ('capital-loss', 'mean'), - ('hours-per-week', 'mean')] + measures = [('capital-gain', 'mean'), ('capital-loss', 'mean'),('hours-per-week', 'mean')] # Run the mining process import time diff --git a/src/external_explainers/metainsight_explainer/pattern_evaluations.py b/src/external_explainers/metainsight_explainer/pattern_evaluations.py index 3523f0a..89091b3 100644 --- a/src/external_explainers/metainsight_explainer/pattern_evaluations.py +++ b/src/external_explainers/metainsight_explainer/pattern_evaluations.py @@ -11,6 +11,7 @@ import pymannkendall as mk from cydets.algorithm import detect_cycles from singleton_decorator import singleton +from external_explainers.metainsight_explainer.cache import Cache class PatternType(Enum): @@ -32,7 +33,7 @@ class PatternEvaluator: """ def __init__(self): - self.pattern_cache = {} + self.cache = Cache() self.OUTLIER_ZSCORE_THRESHOLD = 2.0 # Z-score threshold for outlier detection self.TREND_SLOPE_THRESHOLD = 0.01 # Minimum absolute slope for trend detection @@ -206,8 +207,10 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, froze series_hash = hash(tuple(series.values)) cache_key = (series_hash, pattern_type) - if cache_key in self.pattern_cache: - return self.pattern_cache[cache_key] + cache_result = self.cache.get_from_pattern_eval_cache(cache_key) + if cache_result is not None: + # If the result is already cached, return it + return cache_result series = series[~series.isna()] # Remove NaN values series = series.sort_index() # Sort the series by index @@ -230,5 +233,6 @@ def __call__(self, series: pd.Series, pattern_type: PatternType) -> (bool, froze patterns = frozenset([patterns]) else: patterns = frozenset(patterns) - self.pattern_cache[cache_key] = 
(is_valid, patterns) + # Add the result to the cache + self.cache.add_to_pattern_eval_cache(cache_key, (is_valid, patterns)) return is_valid, patterns From 44693c2be3e5f5ee0a322c0f60e5a0c22acdf554 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Sun, 8 Jun 2025 23:20:26 +0300 Subject: [PATCH 23/27] Added string representation function for exceptions in meta insights. --- .../metainsight_explainer/meta_insight.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 57fe9af..6b17edb 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -94,6 +94,55 @@ def __str__(self): ret_str += self._create_commonness_set_title(commonness) return ret_str + + def _write_exceptions_list_string(self, category: PatternType, patterns: List[BasicDataPattern], category_name: str) -> str: + """ + Helper function to create a string representation of a list of exception patterns. + :param category: The category of the exceptions. + :param patterns: The list of BasicDataPattern objects in this category. + :param category_name: The name of the category. + :return: A string representation of the exceptions in this category. 
+ """ + if not patterns: + return "" + if category_name.lower() not in ["no pattern", "no-pattern", "none", "highlight-change", "highlight change"]: + # If the category is "No Pattern" or "Highlight Change", we don't need to write anything + exceptions = [pattern for pattern in patterns if pattern.pattern_type not in [PatternType.NONE, PatternType.OTHER]] + else: + exceptions = [pattern for pattern in patterns if pattern.pattern_type == category] + subspaces = [pattern.data_scope.subspace for pattern in exceptions] + subspace_dict = defaultdict(list) + for subspace in subspaces: + for key, val in subspace.items(): + subspace_dict[key].append(val) + out_str = f"Exceptions in category '{category_name}' ({len(exceptions)}): [" + for key, val in subspace_dict.items(): + out_str += f"{key} = {val}, " + out_str = out_str[:-2] + "]\n" + return out_str + + def get_exceptions_string(self): + """ + A string representation of the list of exception categories. + :return: + """ + exceptions_string = "" + for category, patterns in self.exceptions.items(): + if not patterns: + continue + # No-Pattern category: create an array of + if category.lower() == "no-pattern" or category.lower() == "none": + exceptions_string += self._write_exceptions_list_string(PatternType.NONE, patterns, "No Pattern") + if category.lower() == "highlight-change" or category.lower() == "highlight change": + # Doesn't matter which PatternType we use here, so long as it is not None or PatternType.OTHER. + exceptions_string += self._write_exceptions_list_string(PatternType.UNIMODALITY, patterns, "Same pattern, different highlight") + elif category.lower() == "type-change" or category.lower() == "type change": + exceptions_string += self._write_exceptions_list_string(PatternType.OTHER, patterns, "Pattern type change") + if not exceptions_string: + exceptions_string = "All values belong to a commonness set, no exceptions found." 
+ return exceptions_string + + @staticmethod def categorize_exceptions(commonness_set, exceptions): """ From 4ed7772c13132a77de441a6a1e6b21d4f77dd4a0 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Mon, 9 Jun 2025 21:34:40 +0300 Subject: [PATCH 24/27] Fixed issue causing visualization to sometimes fail with multi-index series due to matplotlib not expecting tuple input. --- .../metainsight_explainer/patterns.py | 125 +++++------------- 1 file changed, 34 insertions(+), 91 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 1df0813..0a0ec83 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -15,6 +15,8 @@ def visualize(self, plt_ax, title: str = None) -> None: """ Visualize the pattern. """ + # Note for all the implementations below: all of them just use the visualize_many method internally, + # because that one handles all the complex cases already and can also visualize just one pattern. raise NotImplementedError("Subclasses must implement this method.") @abstractmethod @@ -72,6 +74,22 @@ def prepare_patterns_for_visualization(patterns): return index_to_position, sorted_indices + @staticmethod + def handle_sorted_indices(plt_ax, sorted_indices): + """ + Handle setting x-ticks and labels for the plot based on sorted indices. 
+ :param plt_ax: The matplotlib axes to set ticks on + :param sorted_indices: The sorted indices to use for x-ticks + """ + # For large datasets, show fewer tick labels + step = max(1, len(sorted_indices) // 10) + positions = list(range(0, len(sorted_indices), step)) + tick_labels = [str(sorted_indices[pos]) for pos in positions] + + plt_ax.set_xticks(positions) + plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + + @staticmethod @abstractmethod def visualize_many(plt_ax, patterns: List['PatternInterface'], labels:List[str], title: str = None) -> None: @@ -129,13 +147,7 @@ def visualize_many(plt_ax, patterns: List['UnimodalityPattern'], labels: List[st # Set x-ticks to show original index values if sorted_indices: - # For large datasets, show fewer tick labels - step = max(1, len(sorted_indices) // 10) - positions = list(range(0, len(sorted_indices), step)) - tick_labels = [str(sorted_indices[pos]) for pos in positions] - - plt_ax.set_xticks(positions) - plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + PatternInterface.handle_sorted_indices(plt_ax, sorted_indices) # Set labels and title plt_ax.set_xlabel(patterns[0].index_name if patterns else 'Index') @@ -173,19 +185,11 @@ def visualize(self, plt_ax, title: str = None) -> None: Visualize the unimodality pattern. 
:return: """ - plt_ax.plot(self.source_series) - plt_ax.set_xlabel(self.index_name) - plt_ax.set_ylabel(self.value_name) - # Emphasize the peak or valley - if self.type.lower() == 'peak': - plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'ro', label='Peak') - elif self.type.lower() == 'valley': - plt_ax.plot(self.highlight_index, self.source_series[self.highlight_index], 'bo', label='Valley') - plt_ax.legend(loc="upper left") - # Rotate x-axis tick labels - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) + self.visualize_many(plt_ax, [self], [self.value_name], title=None) if title is not None: plt_ax.set_title(title) + else: + plt_ax.set_title(f"{self.type} at {self.highlight_index} in {self.value_name}") def __eq__(self, other) -> bool: @@ -289,13 +293,7 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti # Set x-ticks to show original index values if sorted_indices: - # For large datasets, show fewer tick labels - step = max(1, len(sorted_indices) // 10) - positions = list(range(0, len(sorted_indices), step)) - tick_labels = [str(sorted_indices[pos]) for pos in positions] - - plt_ax.set_xticks(positions) - plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + PatternInterface.handle_sorted_indices(plt_ax, sorted_indices) # Set labels and title if patterns: @@ -336,20 +334,11 @@ def visualize(self, plt_ax, title: str = None) -> None: :param plt_ax: :return: """ - plt_ax.plot(self.source_series) - plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') - plt_ax.set_ylabel(self.value_name) - x_numeric = np.arange(len(self.source_series)) - # Emphasize the trend - label = f"y={self.slope:.2f}x + {self.intercept:.2f}" - plt_ax.plot(self.source_series.index, self.slope * x_numeric + self.intercept, 'g--', - linewidth=2, - label=label) - plt_ax.legend(loc="upper left") - # Rotate x-axis tick labels - 
plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) + self.visualize_many(plt_ax, [self], [self.value_name], title=None) if title is not None: plt_ax.set_title(title) + else: + plt_ax.set_title(f"{self.type} trend in {self.value_name} with slope {self.slope:.2f} and intercept {self.intercept:.2f}") def __eq__(self, other) -> bool: """ @@ -452,13 +441,7 @@ def visualize_many(plt_ax, patterns: List['OutlierPattern'], labels: List[str], # Set x-ticks to show original index values if sorted_indices: - # For large datasets, show fewer tick labels - step = max(1, len(sorted_indices) // 10) - positions = list(range(0, len(sorted_indices), step)) - labels = [str(sorted_indices[pos]) for pos in positions] - - plt_ax.set_xticks(positions) - plt_ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=16) + PatternInterface.handle_sorted_indices(plt_ax, sorted_indices) # Setup the rest of the plot from matplotlib.lines import Line2D @@ -505,34 +488,11 @@ def visualize(self, plt_ax, title: str = None) -> None: :param plt_ax: :return: """ - index_to_position, sorted_indices = PatternInterface.prepare_patterns_for_visualization([self]) - positions = [index_to_position[idx] for idx in self.source_series.index] - values = self.source_series.values - plt_ax.scatter(positions, values, label='Regular Data Point') - plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') - plt_ax.set_ylabel(self.value_name) - # Emphasize the outliers - # Map outliers to positions - outlier_positions = [] - outlier_values = [] - - for idx in self.outlier_indexes: - if idx in self.source_series.index: - outlier_positions.append(index_to_position[idx]) - outlier_values.append(self.source_series.loc[idx]) - plt_ax.scatter(outlier_positions, outlier_values, color='red', label='Outliers') - plt_ax.legend(loc="upper left") - # Set x-ticks to show original index values - if sorted_indices: - # For large datasets, show fewer tick labels - step = 
max(1, len(sorted_indices) // 10) - positions = list(range(0, len(sorted_indices), step)) - labels = [str(sorted_indices[pos]) for pos in positions] - - plt_ax.set_xticks(positions) - plt_ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=16) + self.visualize_many(plt_ax, [self], [self.value_name], title=None) if title is not None: plt_ax.set_title(title) + else: + plt_ax.set_title(f"Outliers in {self.value_name} at {self.outlier_indexes.tolist()}") def __eq__(self, other): @@ -705,13 +665,7 @@ def visualize_many(plt_ax, patterns: List['CyclePattern'], labels: List[str], ti # Set x-ticks to show original index values if sorted_indices: - # For large datasets, show fewer tick labels - step = max(1, len(sorted_indices) // 10) - positions = list(range(0, len(sorted_indices), step)) - tick_labels = [str(sorted_indices[pos]) for pos in positions] - - plt_ax.set_xticks(positions) - plt_ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=16) + PatternInterface.handle_sorted_indices(plt_ax, sorted_indices) # Set labels and title if patterns: @@ -749,22 +703,11 @@ def visualize(self, plt_ax, title: str = None): :param plt_ax: :return: """ - plt_ax.plot(self.source_series) - plt_ax.set_xlabel(self.source_series.index.name if self.source_series.index.name else 'Index') - plt_ax.set_ylabel(self.value_name) - i = 1 - # Emphasize the cycles, and alternate colors - colors = ['red', 'blue', 'green', 'orange', 'purple'] - color_index = 0 - for _, cycle in self.cycles.iterrows(): - plt_ax.axvspan(cycle['t_start'], cycle['t_end'], color=colors[color_index], alpha=0.5, label=f'Cycle {i}') - i += 1 - color_index = (color_index + 1) % len(colors) - plt_ax.legend(loc="upper left") - # Rotate x-axis tick labels - plt.setp(plt_ax.get_xticklabels(), rotation=45, ha='right', fontsize=12) + self.visualize_many(plt_ax, [self], [self.value_name], title=None, alpha_cycles=0.5, line_alpha=0.8) if title is not None: plt_ax.set_title(title) + else: + 
plt_ax.set_title(f"Cycles in {self.value_name} at {self._cycle_tuples}") def __eq__(self, other): """ From 62c8a76a098a5b2c0b42f0116331f5e39197e6b9 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Tue, 24 Jun 2025 23:15:00 +0300 Subject: [PATCH 25/27] Added support for MetaInsights to add text to the figure, allowing for pd-explain to add LLM reasoning to them. --- .../metainsight_explainer/meta_insight.py | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 6b17edb..8323290 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -143,6 +143,17 @@ def get_exceptions_string(self): return exceptions_string + def to_str_full(self): + """ + :return: A full string representation of the MetaInsight, including commonness sets and exceptions. + """ + ret_str = self.__str__() + if len(self.exceptions) > 0: + ret_str += f"Exceptions to this pattern were found:\n" + ret_str += self.get_exceptions_string() + return ret_str + + @staticmethod def categorize_exceptions(commonness_set, exceptions): """ @@ -503,13 +514,14 @@ def _create_labels(self, patterns: List[BasicDataPattern]) -> List[str]: labels.append(f"{subspace_str}") return labels - def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: + def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10), additional_text: str = None) -> None: """ Visualize the metainsight, showing commonness sets on the left and exceptions on the right. :param fig: Matplotlib figure to plot on. If None, a new figure is created. :param subplot_spec: GridSpec to plot on. If None, a new GridSpec is created. :param figsize: Size of the figure if a new one is created. + :param additional_text: Optional additional text to display in the bottom-middle of the figure. 
""" # Create a new figure if not provided # n_cols = 2 if self.exceptions and len(self.exceptions) > 0 else 1 @@ -518,17 +530,28 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: n_cols = 2 if fig is None: fig = plt.figure(figsize=figsize) + if subplot_spec is None: outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1] * n_cols, figure=fig, wspace=0.2) else: - if subplot_spec is None: - outer_grid = gridspec.GridSpec(1, n_cols, width_ratios=[1] * n_cols, figure=fig, wspace=0.2) - else: - outer_grid = gridspec.GridSpecFromSubplotSpec(1, n_cols, width_ratios=[1] * n_cols, - subplot_spec=subplot_spec, wspace=0.2) + outer_grid = gridspec.GridSpecFromSubplotSpec(1, n_cols, width_ratios=[1] * n_cols, + subplot_spec=subplot_spec, wspace=0.2) + + # Wrap the existing 1x2 layout in a 2-row local GridSpec + if additional_text: + wrapper_gs = gridspec.GridSpecFromSubplotSpec( + 2, 1, subplot_spec=subplot_spec, height_ratios=[10, 1], hspace=0.8 + ) + else: + wrapper_gs = gridspec.GridSpecFromSubplotSpec( + 1, 1, subplot_spec=subplot_spec + ) + top_gs = gridspec.GridSpecFromSubplotSpec( + 1, 2, subplot_spec=wrapper_gs[0], wspace=0.2 + ) # Set up the left side for commonness sets - left_grid = gridspec.GridSpecFromSubplotSpec(1, len(self.commonness_set) or 1, - subplot_spec=outer_grid[0, 0], wspace=0.3) + left_grid = gridspec.GridSpecFromSubplotSpec(1, len(self.commonness_set), + subplot_spec=top_gs[0, 0], wspace=0.3) # Plot each commonness set in its own column for i, commonness_set in enumerate(self.commonness_set): @@ -567,7 +590,7 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: # Else, we create a grid where the last row is smaller if there are None exceptions. 
if not none_patterns_exist: right_grid = gridspec.GridSpecFromSubplotSpec(len(self.exceptions), 1, - subplot_spec=outer_grid[0, 1], + subplot_spec=top_gs[0, 1], hspace=1.2) # Add more vertical space else: # If there are None exceptions, place them at the bottom with very little space, since it just text @@ -687,6 +710,15 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10)) -> None: i += 1 + # If there is additional text, add it to the bottom middle of the grid + if additional_text: + text_ax = fig.add_subplot(wrapper_gs[1]) + text_ax.axis('off') + text_ax.text( + 0.5, 0.5, additional_text, + ha='center', va='center' + ) + # Allow more space for the figure elements plt.subplots_adjust(bottom=0.15, top=0.9) # Adjust bottom and top margins From 01c071ef2d0b16bb208f90c75fea4da82b0f11b5 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Thu, 26 Jun 2025 23:28:14 +0300 Subject: [PATCH 26/27] Changed visualization of trend pattern to use the mean over the distribution, instead of displaying raw data. 
--- .../metainsight_explainer/meta_insight.py | 2 +- .../metainsight_mining.py | 18 +++++++++---- .../metainsight_explainer/patterns.py | 27 ++++++++++++++++--- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 8323290..108cc83 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -716,7 +716,7 @@ def visualize(self, fig=None, subplot_spec=None, figsize=(15, 10), additional_te text_ax.axis('off') text_ax.text( 0.5, 0.5, additional_text, - ha='center', va='center' + ha='center', va='center', fontsize=18 ) # Allow more space for the figure elements diff --git a/src/external_explainers/metainsight_explainer/metainsight_mining.py b/src/external_explainers/metainsight_explainer/metainsight_mining.py index 8f93bc8..4971410 100644 --- a/src/external_explainers/metainsight_explainer/metainsight_mining.py +++ b/src/external_explainers/metainsight_explainer/metainsight_mining.py @@ -254,11 +254,19 @@ def mine_metainsights(self, source_df: pd.DataFrame, print(df.columns) # Define dimensions, measures - dimensions = ['marital-status', 'workclass', 'education-num'] - breakdown_dimensions = [['race', 'marital-status'], - ['native-country', 'label'], - ['race', 'label']] - measures = [('capital-gain', 'mean'), ('capital-loss', 'mean'),('hours-per-week', 'mean')] + dimensions = ['education', 'occupation', 'marital-status'] + breakdown_dimensions = [['age'], + ['education-num'], + ['occupation'], + ['marital-status'], + ] + measures = [ + ('capital-gain', 'mean'), + ('capital-loss', 'mean'), + ('hours-per-week', 'mean'), + ('income', 'count'), + ('education-num', 'mean'), + ] # Run the mining process import time diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index 
0a0ec83..d60c789 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -281,10 +281,6 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti x_positions = [index_to_position[idx] for idx in pattern.source_series.index] values = pattern.source_series.values - # Plot the raw data with reduced opacity if requested - if show_data: - plt_ax.plot(x_positions, values, color=color, alpha=alpha_data, linewidth=1) - # Plot the trend line using numeric positions trend_label = f"{label}" x_range = np.arange(len(sorted_indices)) @@ -295,6 +291,29 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti if sorted_indices: PatternInterface.handle_sorted_indices(plt_ax, sorted_indices) + # Compute the mean value across the data as a whole, and visualize that line, if show_data is True + if show_data: + # Collect all data points from all patterns + mean_dict = { + idx: [] for idx in index_to_position.keys() + } + for idx in index_to_position: + for pattern in patterns: + if idx in pattern.source_series.index: + mean_dict[idx].append(pattern.source_series.loc[idx]) + # Compute the overall mean series + overall_mean_series = pd.Series( + {idx: np.mean(values) for idx, values in mean_dict.items()}, + name='Overall Mean Data', + index=index_to_position + ) + mean_x_positions = [index_to_position.get(idx) for idx in overall_mean_series.index if + idx in index_to_position] + mean_values = [overall_mean_series.loc[idx] for idx in overall_mean_series.index if + idx in index_to_position] + plt_ax.plot(mean_x_positions, mean_values, color='gray', alpha=1, linewidth=5, + label='Mean Over Distribution') + # Set labels and title if patterns: plt_ax.set_xlabel(patterns[0].source_series.index.name if patterns[0].source_series.index.name else 'Index') From 55f89ca0928db82775593d684d66a01c09a5ddf1 Mon Sep 17 00:00:00 2001 From: Yuval Uner Date: Fri, 27 Jun 2025 
22:13:48 +0300 Subject: [PATCH 27/27] Fixed bug causing a crash when the subspace value was not a string. Fixed mean values line in trend patterns being too opaque. --- .../metainsight_explainer/meta_insight.py | 9 ++++++--- .../metainsight_explainer/patterns.py | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/external_explainers/metainsight_explainer/meta_insight.py b/src/external_explainers/metainsight_explainer/meta_insight.py index 108cc83..77aca1a 100644 --- a/src/external_explainers/metainsight_explainer/meta_insight.py +++ b/src/external_explainers/metainsight_explainer/meta_insight.py @@ -505,9 +505,12 @@ def _create_labels(self, patterns: List[BasicDataPattern]) -> List[str]: for pattern in patterns: subspace_str = "" for key, val in pattern.data_scope.subspace.items(): - split = val.split("<=") - if len(split) > 1: - subspace_str += f"{val}" + if isinstance(val, str): + split = val.split("<=") + if len(split) > 1: + subspace_str += f"{val}" + else: + subspace_str += f"{key} = {val}, " else: subspace_str += f"{key} = {val}, " diff --git a/src/external_explainers/metainsight_explainer/patterns.py b/src/external_explainers/metainsight_explainer/patterns.py index d60c789..681c5a2 100644 --- a/src/external_explainers/metainsight_explainer/patterns.py +++ b/src/external_explainers/metainsight_explainer/patterns.py @@ -239,7 +239,7 @@ class TrendPattern(PatternInterface): @staticmethod def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], title: str = None, - show_data: bool = True, alpha_data: float = 0.6) -> None: + show_data: bool = True, alpha_data: float = 0.5) -> None: """ Visualize multiple trend patterns on a single plot. 
@@ -285,7 +285,7 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti trend_label = f"{label}" x_range = np.arange(len(sorted_indices)) plt_ax.plot(x_range, pattern.slope * x_range + pattern.intercept, - linestyle=line_style, color=color, linewidth=2, label=trend_label) + linestyle=line_style, color=color, linewidth=2, label=trend_label + " (trend line)") # Set x-ticks to show original index values if sorted_indices: @@ -311,8 +311,8 @@ def visualize_many(plt_ax, patterns: List['TrendPattern'], labels: List[str], ti idx in index_to_position] mean_values = [overall_mean_series.loc[idx] for idx in overall_mean_series.index if idx in index_to_position] - plt_ax.plot(mean_x_positions, mean_values, color='gray', alpha=1, linewidth=5, - label='Mean Over Distribution') + plt_ax.plot(mean_x_positions, mean_values, color='gray', alpha=alpha_data, linewidth=5, + label='Mean Over All Data') # Set labels and title if patterns: