In [1]:
# dimensions_choice.py

import random

ALL_ARGS = ["datapoints", "nbr_dims", "last_dim"]


def alternate_dim(nbr_dims, last_dim, **kwargs):
    """
    Returns the next dimension to split on.
    """
    return {"dim": (last_dim + 1) % nbr_dims}

ALTERNATE_OUT_PLUS = []


def random_dim(nbr_dims, **kwargs):
    """
    Returns a random dimension to split on.
    """
    return {"dim": random.randint(0, nbr_dims - 1)}

RANDOM_OUT_PLUS = []


def max_variance_dim(datapoints: list[list[float]], nbr_dims: int, **kwargs):
    """
    Returns the dimension with the highest variance.
    """
    max_variance = 0
    max_variance_dim = 0

    mean_val = 0

    mean_val_out = 0

    for dim in range(nbr_dims):
        variance = 0
        # calculate the mean value across all points for the current dimension
        mean_val = sum([point[dim] for point in datapoints]) / len(datapoints)

        # calculate the variance
        variance = sum([(point[dim] - mean_val) ** 2 for point in datapoints])
        variance /= len(datapoints)

        if variance > max_variance:
            max_variance = variance
            max_variance_dim = dim
            mean_val_out = mean_val
    return {"dim": max_variance_dim, "mean_val": mean_val_out}

MAX_VARIANCE_OUT_PLUS = ["mean_val"]


def widest_interval_dim(datapoints: list[list[float]], nbr_dims: int, **kwargs):
    """
    Returns the dimension with the highest maximum-minimum value.
    """
    max_range = 0
    max_range_dim = 0

    max_val = 0
    min_val = 0

    for dim in range(nbr_dims):
        max_val = max([point[dim] for point in datapoints])
        min_val = min([point[dim] for point in datapoints])
        range_val = max_val - min_val
        if range_val > max_range:
            max_range = range_val
            max_range_dim = dim
    return {"dim": max_range_dim, "max_val": max_val, "min_val": min_val}

WIDEST_INTERVAL_OUT_PLUS = ["max_val", "min_val"]

DIM_OUT_PLUS = {
    "alternate": ALTERNATE_OUT_PLUS,
    "random": RANDOM_OUT_PLUS,
    "max_variance": MAX_VARIANCE_OUT_PLUS,
    "widest_interval": WIDEST_INTERVAL_OUT_PLUS,
}


In [2]:
# seeds_choice.py

# from dimension_choice import *


def one_dim_farthest_seeds(
    datapoints: list[list[float]],
    nbr_dims: int,
    dimension_choice_alg: str = "random",
    last_dim: int = None,
    **kwargs
):
    """
    Returns the seeds that are the farthest apart from each other on a single dimension.
    """

    switcher: dict[str, function] = {
        "alternate": alternate_dim,
        "random": random_dim,
        "max_variance": max_variance_dim,
        "widest_interval": widest_interval_dim,
    }

    assert (
        dimension_choice_alg in switcher
    ), "Invalid dimension_choice_alg, choose from 'alternate', 'random', 'max_variance', 'widest_interval'"

    dimension_choice = switcher[dimension_choice_alg]

    dim_result = dimension_choice(datapoints=datapoints, nbr_dims=nbr_dims, last_dim=last_dim)

    chosen_dim = dim_result["dim"]

    max_point = max(datapoints, key=lambda x: x[chosen_dim])
    min_point = min(datapoints, key=lambda x: x[chosen_dim])

    return {"seeds": [max_point, min_point], "dim": chosen_dim}


def farthest_euc_distance_seeds(
    datapoints: list[list[float]],
    nbr_dims: int,
    **kwargs
):
    """
    Returns the seeds that are the farthest apart from each other on all dimensions.
    """

    max_dist = 0
    max_dist_points = []

    for point1 in datapoints:
        for point2 in datapoints:
            dist = sum([(point1[dim] - point2[dim]) ** 2 for dim in range(nbr_dims)]) ** 0.5
            if dist > max_dist:
                max_dist = dist
                max_dist_points = [point1, point2]

    return {"seeds": max_dist_points}


In [3]:
# group_choice.py

def closest_seed_group(seed:list[float], seed2:list[float], nbr_dims:int, datapoints:list[list[float]]):
    """
    Returns the group of points that are closest to either seed1 or seed2.
    """
    group1 = []
    group2 = []

    for point in datapoints:
        dist1 = sum([(point[dim] - seed[dim]) ** 2 for dim in range(nbr_dims)]) ** 0.5
        dist2 = sum([(point[dim] - seed2[dim]) ** 2 for dim in range(nbr_dims)]) ** 0.5

        if dist1 < dist2:
            group1.append(point)
        else:
            group2.append(point)

    return group1, group2


def sorting_distance_to_one_seed_group(seed:list[float], nbr_dims:int, datapoints:list[list[float]], **kwargs):
    """
    Sorts the points based on the distance from the seed. Then splits the sorted points into two groups.
    """
    sorted_points = sorted(datapoints, key=lambda x: sum([(x[dim] - seed[dim]) ** 2 for dim in range(nbr_dims)]))
    group1 = sorted_points[:len(sorted_points) // 2]
    group2 = sorted_points[len(sorted_points) // 2:]

    return group1, group2





In [4]:
# from seeds_choice import *
# from grouping_choice import *
from sklearn.metrics import silhouette_score
import numpy as np

class RTree:
    """
    RTree class

    Attributes:
    ----------
    k : int
        The number of dimensions of the datapoints.
    root : dict
        The root node of the RTree.
    grouping_choice : function
        The function to choose the grouping of points.
    seed_choice : function
        The function to choose the seed points.
    dimension_choice : str
        The function to choose the dimension to split on.

    Methods:
    -------
    build(datapoints: list[list[float]]) -> dict
        Builds the RTree from the given datapoints.
    recursive_build(datapoints: list[list[float]], depth: int, last_dim: int) -> dict
        Recursively builds the RTree from the given datapoints.
    compute_silhouette_score() -> float
        Computes the Silhouette Score for the RTree.
    _flatten_tree(node: dict, label: int) -> list[list[float]], list[int]
        Flattens the RTree to get all points and their cluster labels.
    """

    def __init__(
        self,
        k: int,
        datapoints: list[list[float]],
        grouping_choice: str = "closest_seed",
        seed_choice: str = "one_dim_farthest",
        dimension_choice: str = "random",
        leaf_size: int = 10,
        max_depth: int = None,
    ):
        """
        Initializes the RTree with the given datapoints and the grouping_choice and seed_choice functions.

        Parameters
        ----------
        k : int
            The number of dimensions of the datapoints.
        datapoints : list[list[float]]
            The list of datapoints to build the RTree from.
        grouping_choice : str, optional
            The function to choose the grouping of points, by default "closest_seed_group".
            Options: "closest_seed_group", "sorting_distance_to_one_seed_group".
        seed_choice : str, optional
            The function to choose the seed points, by default "one_dim_farthest".
            Options: "one_dim_farthest", "farthest_euc_distance".
        dimension_choice : str, optional
            Used by the seed_choice function when it's "one_dim_farthest", by default "random".
            Options: "alternate", "random", "max_variance", "widest_interval".
        leaf_size : int, optional
            The maximum number of points that can be stored in a leaf node, by default 10.
        max_depth : int, optional
            The maximum depth of the tree, by default None.
        """
        self.k = k
        self.leaf_size = leaf_size
        self.max_depth = max_depth

        switcher: dict[str, function] = {
            "closest_seed": closest_seed_group,
            "sorting_distance_to_one_seed": sorting_distance_to_one_seed_group,
        }
        self.grouping_choice = switcher[grouping_choice]

        switcher = {
            "one_dim_farthest": one_dim_farthest_seeds,
            "farthest_euc_distance": farthest_euc_distance_seeds,
        }
        self.seed_choice = switcher[seed_choice]

        self.dimension_choice = dimension_choice

        self.root = self.build(datapoints)

    def build(self, datapoints: list[list[float]]) -> dict:
        """
        Builds the RTree from the given datapoints.

        Parameters
        ----------
        datapoints : list[list[float]]
            The list of datapoints to build the RTree from.
        """
        Pmin = [min(point[dim] for point in datapoints) for dim in range(self.k)]
        Pmax = [max(point[dim] for point in datapoints) for dim in range(self.k)]

        return {
            "min": Pmin,
            "max": Pmax,
            "points": datapoints,
            "depth": 0,
            "children": self.recursive_build(datapoints, 1),
        }

    def recursive_build(
        self, datapoints: list[list[float]], depth: int, last_dim: int = 0
    ) -> dict:
        # Stop recursion if the number of points is <= leaf_size or max_depth is reached
        if len(datapoints) <= self.leaf_size or (self.max_depth is not None and depth >= self.max_depth):
            return None

        seeds_result = self.seed_choice(
            datapoints=datapoints,
            nbr_dims=self.k,
            dimension_choice_alg=self.dimension_choice,
            last_dim=last_dim,
        )

        seeds = seeds_result["seeds"]

        groups = self.grouping_choice(
            seed=seeds[0], seed2=seeds[1], nbr_dims=self.k, datapoints=datapoints
        )

        Pmin = [min(point[dim] for point in groups[0]) for dim in range(self.k)]
        Pmax = [max(point[dim] for point in groups[0]) for dim in range(self.k)]

        left = {
            "min": Pmin,
            "max": Pmax,
            "points": groups[0],
            "depth": depth,
            "children": self.recursive_build(
                groups[0],
                depth + 1,
                last_dim=seeds_result["dim"] if "dim" in seeds_result else None,
            ),
        }

        Pmin = [min(point[dim] for point in groups[1]) for dim in range(self.k)]
        Pmax = [max(point[dim] for point in groups[1]) for dim in range(self.k)]

        right = {
            "min": Pmin,
            "max": Pmax,
            "points": groups[1],
            "depth": depth,
            "children": self.recursive_build(
                groups[1],
                depth + 1,
                last_dim=seeds_result["dim"] if "dim" in seeds_result else None,
            ),
        }

        return {"left": left, "right": right}

    def compute_silhouette_score(self):
        """
        Computes the Silhouette Score for the RTree.

        Returns
        -------
        float
            The Silhouette Score of the RTree.
        """
        # Flatten the tree to get all points and their cluster labels
        points, labels = self._flatten_tree(self.root, 0)
        points = np.array(points)
        labels = np.array(labels)

        # Compute the Silhouette Score
        score = silhouette_score(points, labels)
        return score

    def _flatten_tree(self, node, label):
        """
        Flattens the RTree to get all points and their cluster labels.

        Parameters
        ----------
        node : dict
            The current node of the RTree.
        label : int
            The current cluster label.

        Returns
        -------
        list[list[float]], list[int]
            The list of points and their cluster labels.
        """
        if node is None:
            return [], []

        if "children" not in node or node["children"] is None:
            points = node["points"]
            labels = [label] * len(points)
            return points, labels

        left_points, left_labels = self._flatten_tree(node["children"]["left"], label)
        right_points, right_labels = self._flatten_tree(node["children"]["right"], label + 1)

        return left_points + right_points, left_labels + right_labels

In [5]:
# import sys

# sys.path.append('.')
# sys.path.append('./r_tree/')

# from r_tree import RTree

import time
import sys
import tracemalloc
import h5py

In [6]:
train = []

with h5py.File('/content/drive/MyDrive/db2/gist-960-euclidean.hdf5', 'r') as f:
    print("Keys: %s" % f.keys())
    # Get the data
    train = f['train'][:]

    test = f['test'][:]
    distances = f['distances'][:]
    neighbors = f['neighbors'][:]


Keys: <KeysViewHDF5 ['distances', 'neighbors', 'test', 'train']>


In [7]:
variants = [
    {"grouping_choice": "closest_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "random", "leaf_size": 10, "max_depth": None},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "max_variance", "leaf_size": 20, "max_depth": 20},
    {"grouping_choice": "closest_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "widest_interval", "leaf_size": 30, "max_depth": 15},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "alternate", "leaf_size": 40, "max_depth": 10},
    {"grouping_choice": "closest_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "random", "leaf_size": 50, "max_depth": 5},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "max_variance", "leaf_size": 60, "max_depth": None},
    {"grouping_choice": "closest_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "widest_interval", "leaf_size": 70, "max_depth": 20},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "alternate", "leaf_size": 80, "max_depth": 15},
    {"grouping_choice": "closest_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "random", "leaf_size": 90, "max_depth": 10},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "max_variance", "leaf_size": 100, "max_depth": None},
    {"grouping_choice": "closest_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "widest_interval", "leaf_size": 110, "max_depth": 20},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "alternate", "leaf_size": 120, "max_depth": 15},
    {"grouping_choice": "closest_seed", "seed_choice": "one_dim_farthest", "dimension_choice": "random", "leaf_size": 130, "max_depth": 10},
    {"grouping_choice": "sorting_distance_to_one_seed", "seed_choice": "farthest_euc_distance", "dimension_choice": "max_variance", "leaf_size": 140, "max_depth": None},
]

In [None]:
results = []

for variant in variants:
    print(f"Running experiment with variant: {variant}")

    # Start tracking memory usage
    tracemalloc.start()

    start_time = time.time()

    # Build the R-Tree
    tree = RTree(
        k=960,
        datapoints=train[:10000],
        grouping_choice=variant["grouping_choice"],
        seed_choice=variant["seed_choice"],
        dimension_choice=variant["dimension_choice"],
        leaf_size=variant["leaf_size"],
        max_depth=variant["max_depth"]
    )

    build_time = time.time() - start_time

    current_memory, peak_memory = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    score = tree.compute_silhouette_score()

    results.append({
        "variant": variant,
        "build_time": build_time,
        "peak_memory": peak_memory,
        "silhouette_score": score
    })

    print(f"Build Time: {build_time:.2f} seconds")
    print(f"Peak Memory: {peak_memory / 1024 / 1024:.2f} MB")
    print(f"Silhouette Score: {score:.4f}")
    print("-" * 40)


Running experiment with variant: {'grouping_choice': 'closest_seed', 'seed_choice': 'one_dim_farthest', 'dimension_choice': 'random', 'leaf_size': 10, 'max_depth': None}
Build Time: 5700.55 seconds
Peak Memory: 246.92 MB
Silhouette Score: -0.1304
----------------------------------------
Running experiment with variant: {'grouping_choice': 'sorting_distance_to_one_seed', 'seed_choice': 'farthest_euc_distance', 'dimension_choice': 'max_variance', 'leaf_size': 20, 'max_depth': 20}
