# Top-k Hybrid HUOPM (Phase 3)

This notebook implements a Top-k Hybrid HUOPM miner that replaces fixed thresholds with a dynamic Top-k heap and a best-first search strategy.

<details>
<summary><strong>Table of Contents</strong></summary>

1. [Imports and Setup](#imports-and-setup)
   Libraries and shared utilities.
2. [Data Structures](#data-structures)
   PUON elements/lists for best-first expansion.
3. [Database](#database)
   Quantitative database representation.
4. [Top-k Hybrid HUOPM](#top-k-hybrid-huopm)
   Threshold-free top-k mining algorithm.
5. [Example Run (Toy Data)](#example-run-toy-data)
   Sanity check run.
</details>


## Imports and Setup


In [1]:
import time
import heapq
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Set, Optional

import matplotlib.pyplot as plt

## Data Structures
We reuse PUON elements/lists from Phase 2 and add a `priority` field for best-first expansion.

In [2]:
@dataclass
class PUONElement:
    """
    Element in a PUON-list (Prefix Utility Occupancy Node list).

    Attributes:
        nid (str): Transaction identifier.
        nu (float): Utility of the itemset in the transaction.
        nru (float): Remaining utility after the last item in the itemset.
        npu (float): Prefix utility (kept for completeness).
        tu (float): Transaction utility.
        nsup (int): Support counter for this element (default 1).
        ubrem_rel (float): Upper-bound remaining relative utility (nu+nru)/tu.

    Example:
        >>> e = PUONElement('t1', nu=10, nru=5, npu=0, tu=30)
        >>> e.ubrem_rel = (e.nu + e.nru) / e.tu
        >>> print(e.nid, e.ubrem_rel)
        t1 0.5
    """
    nid: str
    nu: float
    nru: float
    npu: float
    tu: float
    nsup: int = 1
    ubrem_rel: float = 0.0


@dataclass
class PUONList:
    """
    PUON-list structure for an itemset.

    Attributes:
        itemset (tuple): The itemset represented by this list.
        elements (list): List of PUONElement objects.
        f_occ (float): Average utility occupancy of the itemset.
        f_wubocc (float): Weak upper bound of utility occupancy.
        list_ub_rem_rel (list): Cached (nu+nru)/tu values.
        priority (float): Priority for best-first expansion (hybrid upper bound).

    Example:
        >>> pl = PUONList(('a',))
        >>> pl.add_element(PUONElement('t1', 10, 5, 0, 30))
        >>> pl.compute_occ()
        0.3333333333333333
    """
    itemset: Tuple[str, ...]
    elements: List[PUONElement] = field(default_factory=list)
    f_occ: float = 0.0
    f_wubocc: float = 0.0
    list_ub_rem_rel: List[float] = field(default_factory=list)
    priority: float = 0.0

    def add_element(self, element: PUONElement):
        """
        Add a PUONElement to the list.

        Args:
            element (PUONElement): Element to append.

        Returns:
            None
        """
        self.elements.append(element)

    def get_support(self) -> int:
        """
        Get support (number of transactions containing the itemset).

        Returns:
            int: Support count.
        """
        return len(self.elements)

    def compute_occ(self) -> float:
        """
        Compute utility occupancy of the itemset.

        occ(A) = (1/supp(A)) * sum_{t in rho(A)} (u(A,t)/tu(t))

        Returns:
            float: Utility occupancy value.
        """
        if not self.elements:
            self.f_occ = 0.0
            return 0.0
        sum_urel = sum(elem.nu / elem.tu for elem in self.elements)
        self.f_occ = sum_urel / len(self.elements)
        return self.f_occ

    def compute_wubocc(self, k: int) -> float:
        """
        Compute weak upper bound of utility occupancy (wubocc).

        Uses TS(A) (transactions with possible extensions) and takes the top-k
        values of (nu+nru)/tu.

        Args:
            k (int): Top-k size (used as cutoff for bound).

        Returns:
            float: Weak upper bound on utility occupancy.
        """
        if not self.elements:
            self.f_wubocc = 0.0
            return 0.0
        ts_values = [ (e.nu + e.nru) / e.tu for e in self.elements if e.nru > 0 ]
        if not ts_values:
            self.f_wubocc = 0.0
            return 0.0
        ts_values.sort(reverse=True)
        top_values = ts_values[:min(k, len(ts_values))]
        self.f_wubocc = sum(top_values) / max(1, k)
        return self.f_wubocc

## Database

In [3]:
class QuantitativeDatabase:
    """
    Quantitative database (QDB) for utility occupancy mining.

    Attributes:
        transactions (dict): {tid: {item: utility}}
        item_profits (dict): {item: profit}
        transaction_utilities (dict): {tid: TU}
    """
    def __init__(self):
        self.transactions: Dict[str, Dict[str, float]] = {}
        self.item_profits: Dict[str, float] = {}
        self.transaction_utilities: Dict[str, float] = {}

    def set_profits(self, profits: Dict[str, float]):
        self.item_profits = profits

    def add_transaction(self, tid: str, items_quantities: Dict[str, int]):
        transaction = {}
        total_utility = 0.0
        for item, qty in items_quantities.items():
            util = qty * self.item_profits.get(item, 1.0)
            transaction[item] = util
            total_utility += util
        self.transactions[tid] = transaction
        self.transaction_utilities[tid] = total_utility

    def reduce_database(self, relevant_items: Set[str]):
        for tid in self.transactions:
            self.transactions[tid] = {
                item: util for item, util in self.transactions[tid].items()
                if item in relevant_items
            }
            self.transaction_utilities[tid] = sum(self.transactions[tid].values())

## Top-k Hybrid HUOPM

In [4]:
class TopKHybridHUOPM:
    """
    Top-k Hybrid HUOPM miner (Phase 3).

    This version replaces fixed thresholds with a dynamic top-k heap and
    explores the search space in best-first order using a hybrid upper bound.

    Args:
        k (int): Number of top patterns to return.
    """

    def __init__(self, k: int):
        self.k = k
        self.topk_heap: List[Tuple[float, Tuple[str, ...]]] = []  # (occ, itemset)
        self.min_threshold = 0.0
        self.cmap: Dict[str, Set[str]] = {}
        self.item_order: Dict[str, int] = {}
        self.puon_lists: Dict[Tuple[str, ...], PUONList] = {}
        self.threshold_trace: List[Tuple[float, float]] = []  # (time, threshold)

    def _update_topk(self, itemset: Tuple[str, ...], occ: float):
        if self.k <= 0:
            return
        if len(self.topk_heap) < self.k:
            heapq.heappush(self.topk_heap, (occ, itemset))
        else:
            if occ > self.topk_heap[0][0]:
                heapq.heapreplace(self.topk_heap, (occ, itemset))
        self.min_threshold = self.topk_heap[0][0] if self.topk_heap else 0.0

    def _compute_item_supports(self, db: QuantitativeDatabase) -> Dict[str, int]:
        supports = defaultdict(int)
        for t in db.transactions.values():
            for item in t.keys():
                supports[item] += 1
        return dict(supports)

    def _build_cmap(self, db: QuantitativeDatabase, sorted_items: List[str], ms: int):
        for i, item in enumerate(sorted_items):
            self.cmap[item] = set()
            for tid, trans in db.transactions.items():
                if item in trans:
                    for j in range(i + 1, len(sorted_items)):
                        other = sorted_items[j]
                        if other in trans:
                            self.cmap[item].add(other)
        for item in sorted_items:
            cooc = defaultdict(int)
            for tid, trans in db.transactions.items():
                if item in trans:
                    for other in self.cmap[item]:
                        if other in trans:
                            cooc[other] += 1
            self.cmap[item] = {o for o, sup in cooc.items() if sup >= ms}

    def _construct_puon_list(self, db: QuantitativeDatabase, itemset: Tuple[str, ...]) -> PUONList:
        if itemset in self.puon_lists:
            return self.puon_lists[itemset]
        pl = PUONList(itemset)
        last_item = itemset[-1]
        for tid, trans in db.transactions.items():
            if all(it in trans for it in itemset):
                nu = sum(trans[it] for it in itemset)
                nru = sum(util for it, util in trans.items() if self.item_order[it] > self.item_order[last_item])
                npu = 0.0
                tu = db.transaction_utilities[tid]
                elem = PUONElement(tid, nu, nru, npu, tu)
                elem.ubrem_rel = (nu + nru) / tu if tu > 0 else 0.0
                pl.add_element(elem)
                pl.list_ub_rem_rel.append(elem.ubrem_rel)
        pl.compute_occ()
        pl.compute_wubocc(self.k)
        self.puon_lists[itemset] = pl
        return pl

    def _construct_extension(self, db: QuantitativeDatabase, itemset: Tuple[str, ...], ext_item: str, parent: PUONList) -> Optional[PUONList]:
        ext_itemset = itemset + (ext_item,)
        pl = PUONList(ext_itemset)
        for elem in parent.elements:
            tid = elem.nid
            trans = db.transactions.get(tid, {})
            if ext_item in trans:
                nu = elem.nu + trans[ext_item]
                nru = sum(util for it, util in trans.items() if self.item_order[it] > self.item_order[ext_item])
                tu = elem.tu
                new_elem = PUONElement(tid, nu, nru, elem.npu, tu)
                new_elem.ubrem_rel = (nu + nru) / tu if tu > 0 else 0.0
                pl.add_element(new_elem)
                pl.list_ub_rem_rel.append(new_elem.ubrem_rel)
        if not pl.elements:
            return None
        pl.compute_occ()
        pl.compute_wubocc(self.k)
        return pl

    def _hybrid_upper_bound(self, pl: PUONList) -> float:
        if not pl.elements:
            return 0.0
        avg_rem = sum(e.nru / e.tu for e in pl.elements) / len(pl.elements)
        return min(1.0, pl.f_wubocc + avg_rem)

    def fit(self, db: QuantitativeDatabase, ms: int = 1) -> List[Tuple[Tuple[str, ...], float]]:
        start = time.time()
        supports = self._compute_item_supports(db)
        relevant = {i for i, s in supports.items() if s >= ms}
        db.reduce_database(relevant)
        sorted_items = sorted(relevant, key=lambda x: (supports[x], x))
        self.item_order = {item: idx for idx, item in enumerate(sorted_items)}
        self._build_cmap(db, sorted_items, ms)

        pq = []
        for item in sorted_items:
            pl = self._construct_puon_list(db, (item,))
            pl.priority = self._hybrid_upper_bound(pl)
            heapq.heappush(pq, (-pl.priority, (item,), pl))

        while pq:
            _, itemset, pl = heapq.heappop(pq)
            self._update_topk(itemset, pl.f_occ)
            self.threshold_trace.append((time.time() - start, self.min_threshold))

            if pl.priority < self.min_threshold:
                continue

            last = itemset[-1]
            for y in sorted_items:
                if self.item_order[y] <= self.item_order[last]:
                    continue
                if last in self.cmap and y not in self.cmap[last]:
                    continue

                ext_pl = self._construct_extension(db, itemset, y, pl)
                if not ext_pl:
                    continue
                ext_pl.priority = self._hybrid_upper_bound(ext_pl)
                if ext_pl.priority < self.min_threshold:
                    continue
                heapq.heappush(pq, (-ext_pl.priority, itemset + (y,), ext_pl))

        results = sorted(self.topk_heap, key=lambda x: x[0], reverse=True)
        return [(it, occ) for occ, it in results]

## Example Run (Toy Data)

In [5]:
transactions = {
    't1': {'b': 8, 'c': 7, 'd': 5, 'e': 4, 'f': 2},
    't2': {'a': 2, 'e': 3},
    't3': {'c': 2, 'e': 1, 'f': 3},
    't4': {'d': 3, 'f': 5},
    't5': {'a': 3, 'b': 3, 'c': 5, 'd': 4, 'e': 2, 'f': 1}
}
profits = {'a':2, 'b':3, 'c':1, 'd':5, 'e':11, 'f':7}

qdb = QuantitativeDatabase()
qdb.set_profits(profits)
for tid, items in transactions.items():
    qdb.add_transaction(tid, items)

miner = TopKHybridHUOPM(k=5)
results = miner.fit(qdb, ms=1)

print("Top-k results:")
for itemset, occ in results:
    print(itemset, f"occ={occ:.4f}")

Top-k results:
('b', 'c', 'd', 'e') occ=0.8444
('b', 'd', 'e') occ=0.7775
('a', 'e') occ=0.7029
('c', 'd', 'e') occ=0.6739
('a', 'b', 'c', 'e') occ=0.6087
