In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

from tdamapper.cover import CubicalCover
from tdamapper.learn import MapperAlgorithm
from tdamapper.core import mapper_connected_components

# Load the scikit-learn digits dataset as a feature matrix plus label vector.
X, labels = load_digits(return_X_y=True)

# Lens: project the features onto their first two principal components.
y = PCA(n_components=2, random_state=42).fit_transform(X)
print(f"X: {X.shape}, y: {y.shape}, labels: {labels.shape}")

# Mapper estimator used to build the graph (cover with n_intervals=10).
mapper = MapperAlgorithm(
    cover=CubicalCover(n_intervals=10, overlap_frac=0.5),
    clustering=AgglomerativeClustering(n_clusters=10),
    verbose=False,
)

# Connected-component labels. Note that this call uses its own cover
# (n_intervals=5), not the one configured on `mapper` above.
clusters = mapper_connected_components(
    X,
    y,
    cover=CubicalCover(n_intervals=5, overlap_frac=0.5),
    clustering=AgglomerativeClustering(n_clusters=10),
)

graph = mapper.fit_transform(X, y)
print(f"nodes: {len(graph.nodes())}, edges: {len(graph.edges())}")
print(clusters)
print(len(clusters))

# Tally how many entries carry each distinct label.
result = np.array(clusters)
for i, size in zip(*np.unique(result, return_counts=True)):
    print(f"Cluster {i}: {size}")

In [None]:
from pathlib import Path
from pandas import read_parquet, concat, DataFrame
from sklearn.decomposition import PCA
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from tdamapper.cover import CubicalCover
from tdamapper.learn import MapperAlgorithm
from tdamapper.core import mapper_connected_components

def load_embeddings(dataset, numfiles=None,
                    data_dir="/home/benjamin/dev/applying-tda/data"):
    """
    Load the embeddings of one dataset and return them as a single DataFrame.

    Parameters:
    - dataset: Name of the dataset to load. "joint" reads the single
      pre-combined file `embeddings_combined_{numfiles*4}.parquet`; any other
      name (e.g. 'aol', 'aql', 'ms-marco', 'orcas') reads the parquet files
      found in the directory `{dataset}-get-embeddings-{suffix}`, where the
      suffix is "special" for 'aql'/'aol' and "all" otherwise.
    - numfiles: Number of files to load (if None, load all). Required for
      "joint", where it selects which combined file is read.
    - data_dir: Directory that contains the embedding data. Defaults to the
      location that used to be hard-coded, so existing calls are unaffected.

    Returns:
    - DataFrame containing embeddings. For per-dataset loads a 'dataset'
      column holding the upper-cased dataset name is added. An empty
      DataFrame is returned when no parquet files are found.

    Raises:
    - ValueError: if dataset is "joint" and numfiles is None.
    """
    data_dir = Path(data_dir)

    if dataset == "joint":
        # The combined file name is derived from numfiles, so it cannot be
        # omitted here (previously this failed with an opaque TypeError).
        if numfiles is None:
            raise ValueError(
                'numfiles must be given when dataset == "joint": it selects '
                "the embeddings_combined_<numfiles*4>.parquet file to read")
        # NOTE(review): the factor 4 presumably is the number of source
        # datasets merged into the combined file -- confirm.
        return read_parquet(
            data_dir / f"embeddings_combined_{numfiles * 4}.parquet")

    # Determine the directory suffix based on the dataset
    suffix = "special" if dataset in ("aql", "aol") else "all"
    path = data_dir / f"{dataset}-get-embeddings-{suffix}"

    # Sort the matches: glob order is filesystem-dependent, and without a
    # stable order `files[:numfiles]` would pick a different subset per run.
    files = sorted(path.glob("*.parquet"))
    print(f"Number of files ({dataset}): {len(files)}")

    # Limit to specified number of files if numfiles is set
    if numfiles is not None:
        files = files[:numfiles]

    # Collect the per-file frames and concatenate once at the end instead of
    # re-copying the growing result on every iteration.
    frames = []
    for cnt, file_path in enumerate(files, start=1):
        print(f"Loading {cnt}/{len(files)} {dataset} {file_path.name}")
        df = read_parquet(file_path)
        # tag every row with an identifier for the dataset
        df['dataset'] = dataset.upper()
        frames.append(df)

    # concat() rejects an empty list; keep returning an empty frame instead.
    if not frames:
        return DataFrame()
    return concat(frames, ignore_index=True)

def pre_process_dataset(dataset: DataFrame) -> np.ndarray:
    """
    Preprocess a single dataset: turn into numpy array, standardize data.

    Parameters:
    - dataset: pandas dataframe with an "embeddings" column whose cells are
      equal-length numeric sequences. The dataframe is not modified.

    Returns:
    - 2D float32 array with one row per dataframe row, where every feature
      (column) has been shifted to zero mean and scaled to unit standard
      deviation. Features with zero variance come out as all zeros.
    """

    # Read the embeddings by column name (not by position, which silently
    # picks the wrong column if the frame layout changes), converting each
    # cell to float32 without writing back into the caller's dataframe.
    emb_array = np.stack(
        [np.asarray(vec, dtype=np.float32) for vec in dataset["embeddings"]])

    # Standardize each feature (column-wise)
    mean = np.mean(emb_array, axis=0)
    std = np.std(emb_array, axis=0)
    # A constant feature has std == 0; divide by 1 there so the result is 0
    # rather than NaN/inf, which would poison downstream steps such as PCA.
    safe_std = np.where(std == 0, np.ones_like(std), std)

    return (emb_array - mean) / safe_std


# Run configuration.
mode = "joint"  # "aol", "aql", "ms-marco", "orcas", "combined", "joint"
numfiles = 3  # Number of files to load from each dataset

# Load the raw embeddings and turn them into a standardized feature matrix.
embeddings = load_embeddings(mode, numfiles)
X = pre_process_dataset(embeddings)
print(f"X: {X.shape}")

# 2-D PCA projection of the features; passed to Mapper below as the lens.
y = PCA(n_components=2, random_state=42).fit_transform(X)

# Mapper estimator (cover with n_intervals=10); it is only constructed here
# and not used further in this cell.
mapper = MapperAlgorithm(
    cover=CubicalCover(n_intervals=10, overlap_frac=0.5),
    clustering=AgglomerativeClustering(n_clusters=10),
    verbose=False,
)

# Connected-component labels, computed with a separate cover (n_intervals=5).
clusters = mapper_connected_components(
    X,
    y,
    cover=CubicalCover(n_intervals=5, overlap_frac=0.5),
    clustering=AgglomerativeClustering(n_clusters=10),
)
print(clusters)
print(len(clusters))

# Tally how many entries carry each distinct label.
result = np.array(clusters)
for i, size in zip(*np.unique(result, return_counts=True)):
    print(f"Cluster {i}: {size}")