Install Libraries

In [None]:
# install UMAP
!pip install umap-learn

# install HDBSCAN
!pip install hdbscan

Import Libraries

In [1]:
# Basic Operations
import pandas as pd

# UMAP & HDBSCAN
import umap
import hdbscan

# Optimization
import itertools

# Evaluation Methods
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

  from .autonotebook import tqdm as notebook_tqdm


Import and Extract Dataset

In [2]:
# Read the precomputed interest vectors and give the unnamed leading CSV
# column a meaningful name.
data = pd.read_csv(
    'https://raw.githubusercontent.com/annikazwei/VI-Structure-with-NLP/refs/heads/main/Data_Output/NLP_Vectors_BI.csv'
).rename(columns={"Unnamed: 0": "Interests"})

# Split the frame: the label column goes to `interests`, every column after
# the first becomes the `embeddings` array.
interests = data.loc[:, 'Interests']
embeddings = data.iloc[:, 1:].to_numpy()

Optimization Function

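The optimization functions below run a grid search over UMAP and HDBSCAN parameters and score every combination (noise points, silhouette, Calinski-Harabasz and Davies-Bouldin) to find suitable values.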

In [3]:
# Function to Evaluate Clustering
def evaluate_clustering(embeddings, umap_params, hdbscan_params):
    """Reduce `embeddings` with UMAP, cluster them with HDBSCAN, and score the result.

    Parameters
    ----------
    embeddings : array-like
        Input vectors handed to ``umap.UMAP(...).fit_transform``.
    umap_params : dict
        Keyword arguments forwarded to ``umap.UMAP``.
    hdbscan_params : dict
        Keyword arguments forwarded to ``hdbscan.HDBSCAN``.

    Returns
    -------
    dict
        ``noise_points`` (number of points labelled ``-1``), ``n_clusters``
        (distinct labels excluding ``-1``) and the three scores
        ``silhouette``, ``calinski_harabasz`` and ``davies_bouldin``.
        The scores are ``None`` unless more than one cluster was found.
    """
    # Fit and embed in one step, as `reduce_dimensions` does.
    reduced_embeddings = umap.UMAP(**umap_params).fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(**hdbscan_params).fit(reduced_embeddings)
    labels = clusterer.labels_

    # Materialise the labels once instead of rebuilding the list per count.
    label_list = list(labels)
    noise_points = label_list.count(-1)
    n_clusters = len(set(label_list) - {-1})

    # Calculate evaluation metrics (only defined for more than one cluster).
    silhouette = ch_score = db_score = None
    if n_clusters > 1:
        # NOTE(review): labels are passed unfiltered, so points labelled -1
        # are scored as one additional group - confirm this is intended.
        silhouette = silhouette_score(reduced_embeddings, labels)
        ch_score = calinski_harabasz_score(reduced_embeddings, labels)
        db_score = davies_bouldin_score(reduced_embeddings, labels)

    return {
        "noise_points": noise_points,
        "n_clusters": n_clusters,
        "silhouette": silhouette,
        "calinski_harabasz": ch_score,
        "davies_bouldin": db_score,
    }

In [4]:
# Function for Parameter Search

# Metrics where a smaller value is better; every other tracked metric is maximized.
_MINIMIZED_METRICS = frozenset({"davies_bouldin", "noise_points"})


def _overall_rank(results):
    """Return a comparable ranking key for `results`, or None when the run has no scores."""
    if results["silhouette"] is None:
        return None
    # Higher silhouette wins; ties go to lower Davies-Bouldin, then higher
    # Calinski-Harabasz.
    return (
        results["silhouette"],
        -results["davies_bouldin"],
        results["calinski_harabasz"],
    )


def parameter_search(embeddings, umap_grid, hdbscan_grid):
    """Grid-search UMAP and HDBSCAN parameters and report the best configurations.

    Parameters
    ----------
    embeddings : array-like
        Passed unchanged to ``evaluate_clustering`` for every combination.
    umap_grid : dict
        Lists of candidate values under ``"n_components"``, ``"n_neighbors"``
        and ``"min_dist"``. ``random_state`` is fixed to 42 for every run.
    hdbscan_grid : dict
        Lists of candidate values under ``"min_cluster_size"`` and
        ``"min_samples"``. An optional ``"cluster_selection_method"`` list
        overrides the default ``['eom', 'leaf']``.

    Returns
    -------
    tuple
        ``(best_config_overall, best_results_overall, best_by_metric)``.
        A config is a ``(umap_params, hdbscan_params)`` pair of dicts.
        The overall best is the run with the highest silhouette score, ties
        broken by lower Davies-Bouldin and then higher Calinski-Harabasz.
        If no run produced scores, the first evaluated configuration is
        returned; with an empty grid both overall values are ``None``.
        ``best_by_metric`` maps each metric name to
        ``{"config": ..., "results": ...}``, which stay ``None`` when no run
        produced a value for that metric.
    """
    best_config_overall = None
    best_results_overall = None
    best_rank_overall = None

    best_by_metric = {
        metric: {"config": None, "results": None}
        for metric in ("silhouette", "calinski_harabasz", "davies_bouldin", "noise_points")
    }

    # Generate all combinations of UMAP and HDBSCAN parameters
    umap_combinations = list(itertools.product(
        umap_grid["n_components"],
        umap_grid["n_neighbors"],
        umap_grid["min_dist"]
    ))
    hdbscan_combinations = list(itertools.product(
        hdbscan_grid["min_cluster_size"],
        hdbscan_grid["min_samples"],
        # Both cluster selection methods are tried unless the grid says otherwise
        hdbscan_grid.get("cluster_selection_method", ['eom', 'leaf'])
    ))

    # Iterate through all combinations
    for umap_params, hdbscan_params in itertools.product(umap_combinations, hdbscan_combinations):
        current_umap_params = {
            "n_components": umap_params[0],
            "n_neighbors": umap_params[1],
            "min_dist": umap_params[2],
            "random_state": 42,
        }
        current_hdbscan_params = {
            "min_cluster_size": hdbscan_params[0],
            "min_samples": hdbscan_params[1],
            "cluster_selection_method": hdbscan_params[2]
        }
        print(f"Testing UMAP params: {current_umap_params}, HDBSCAN params: {current_hdbscan_params}")

        # Evaluate clustering
        results = evaluate_clustering(embeddings, current_umap_params, current_hdbscan_params)
        current_config = (current_umap_params, current_hdbscan_params)

        # Track best configurations for individual metrics
        for metric, incumbent in best_by_metric.items():
            value = results[metric]
            if value is None:
                continue
            if incumbent["results"] is None:
                improved = True
            elif metric in _MINIMIZED_METRICS:
                improved = value < incumbent["results"][metric]
            else:
                improved = value > incumbent["results"][metric]
            if improved:
                best_by_metric[metric] = {"config": current_config, "results": results}

        # Determine overall best configuration. A single ranking key makes
        # the winner independent of evaluation order: a run replaces the
        # incumbent only when it ranks strictly higher, not when it merely
        # improves one of several metrics.
        rank = _overall_rank(results)
        if best_results_overall is None or (
            rank is not None and (best_rank_overall is None or rank > best_rank_overall)
        ):
            best_config_overall = current_config
            best_results_overall = results
            best_rank_overall = rank

    return best_config_overall, best_results_overall, best_by_metric

In [5]:
# Candidate values explored by the grid search, one list per parameter.
umap_grid = dict(
    n_components=[15, 20, 25],
    n_neighbors=[2, 3, 5],
    min_dist=[0.01, 0.05, 0.1],
)

hdbscan_grid = dict(
    min_cluster_size=[2, 3, 4],
    min_samples=[1, 2, 3],
)

In [None]:
# Run Parameter Search
best_config_overall, best_results_overall, best_by_metric = parameter_search(embeddings, umap_grid, hdbscan_grid)

# Output the best configuration and results
print("\nBest Overall Configuration:")
if best_config_overall is None:
    # Nothing to unpack when the search returned no configuration.
    print("No configuration was evaluated.")
else:
    print("UMAP Parameters:", best_config_overall[0])
    print("HDBSCAN Parameters:", best_config_overall[1])
    print("Results:", best_results_overall)

print("\nBest Configurations by Metric:")
for metric, details in best_by_metric.items():
    print(f"\n{metric.capitalize()}:")
    if details["config"] is None:
        # A metric keeps config=None when no run produced a value for it;
        # indexing it would raise TypeError.
        print("  No configuration produced a value for this metric.")
        continue
    print("  UMAP Parameters:", details["config"][0])
    print("  HDBSCAN Parameters:", details["config"][1])
    print("  Results:", details["results"])

Function for UMAP Dimension Reduction

In [7]:
def reduce_dimensions(embeddings, n_components=25, n_neighbors=2, min_dist=0.01, random_state=42):
    """Project `embeddings` into a lower-dimensional space with UMAP.

    The keyword defaults reproduce the configuration that was previously
    hard-coded, so ``reduce_dimensions(embeddings)`` behaves as before.

    Parameters
    ----------
    embeddings : array-like
        Input vectors handed to ``umap.UMAP(...).fit_transform``.
    n_components, n_neighbors, min_dist, random_state
        Forwarded to ``umap.UMAP`` under the same names.

    Returns
    -------
    The value returned by ``fit_transform`` (the reduced embeddings).
    """
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
    )
    return reducer.fit_transform(embeddings)

Function for HDBSCAN Clustering

In [8]:
def perform_clustering(
    embeddings,
    min_cluster_size=2,
    min_samples=2,
    allow_single_cluster=True,
    cluster_selection_method='eom',
):
    """Fit an HDBSCAN clusterer on `embeddings` and return it.

    The keyword defaults reproduce the configuration that was previously
    hard-coded, so ``perform_clustering(embeddings)`` behaves as before.

    Parameters
    ----------
    embeddings : array-like
        Vectors to cluster; passed to ``HDBSCAN.fit``.
    min_cluster_size, min_samples, allow_single_cluster, cluster_selection_method
        Forwarded to ``hdbscan.HDBSCAN`` under the same names.

    Returns
    -------
    The fitted clusterer returned by ``HDBSCAN(...).fit(embeddings)``.
    """
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_method=cluster_selection_method,
    )
    return clusterer.fit(embeddings)

Perform Dimension Reduction & Clustering

In [9]:
# Reduce the embeddings with UMAP, then cluster the reduced vectors with HDBSCAN.
# Both results are reused by the evaluation and results cells below.
reduced_embeddings = reduce_dimensions(embeddings)
clusterer = perform_clustering(reduced_embeddings)

  warn(


Parameter Evaluation

In [10]:
# Calculate Evaluation Scores
labels = clusterer.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # -1 marks noise, not a cluster
n_noise = list(labels).count(-1)

# Apply the same "more than one cluster" guard to all three scores, matching
# `evaluate_clustering`. Previously only the silhouette score was guarded, so
# the other two could raise when a single label was present.
if n_clusters > 1:
    silhouette = silhouette_score(reduced_embeddings, labels)
    calinski_harabasz = calinski_harabasz_score(reduced_embeddings, labels)
    davies_bouldin = davies_bouldin_score(reduced_embeddings, labels)
else:
    silhouette = calinski_harabasz = davies_bouldin = None

In [11]:
# Report the cluster counts and the three quality scores, one per line.
print(
    f"Number of clusters: {n_clusters}",
    f"Number of noise points: {n_noise}",
    f"Silhouette Score: {silhouette}",
    f"Calinski-Harabasz Score: {calinski_harabasz}",
    f"Davies-Bouldin Score: {davies_bouldin}",
    sep="\n",
)

Number of clusters: 8
Number of noise points: 2
Silhouette Score: 0.8294634819030762
Calinski-Harabasz Score: 1618.648681640625
Davies-Bouldin Score: 0.24103453299463715


Create Dataframe for Results

In [12]:
# One UMAP_<k> column per reduced dimension, numbered from 1.
umap_columns = [f"UMAP_{k}" for k in range(1, reduced_embeddings.shape[1] + 1)]
hdbscan_output = pd.DataFrame(reduced_embeddings, columns=umap_columns)

# Put the interest label, cluster id and membership probability in front of
# the UMAP coordinates, in that order.
leading_columns = (
    ('Interest', interests),
    ('Cluster', clusterer.labels_),
    ('Cluster_Probability', clusterer.probabilities_),
)
for position, (column_name, column_values) in enumerate(leading_columns):
    hdbscan_output.insert(position, column_name, column_values)

hdbscan_output

Unnamed: 0,Interest,Cluster,Cluster_Probability,UMAP_1,UMAP_2,UMAP_3,UMAP_4,UMAP_5,UMAP_6,UMAP_7,...,UMAP_16,UMAP_17,UMAP_18,UMAP_19,UMAP_20,UMAP_21,UMAP_22,UMAP_23,UMAP_24,UMAP_25
0,life science,6,1.0,10.984294,5.299196,6.251344,5.184079,2.295752,5.225235,4.844056,...,6.311605,3.737576,4.113806,6.305713,4.519217,3.292734,3.871205,2.958922,2.135118,2.092906
1,medical science,6,1.0,11.017192,5.218471,6.271049,5.210646,2.26237,5.213142,4.849294,...,6.400005,3.71534,4.157379,6.41432,4.600076,3.180313,3.838516,2.9401,2.140189,2.059076
2,health care service,3,1.0,10.762259,5.121116,6.192292,4.904592,2.470652,5.179655,5.215617,...,6.416853,3.81757,4.09009,6.090676,4.737141,3.170732,3.844402,3.063529,2.036081,1.915243
3,media,0,1.0,4.41473,7.828119,5.654346,6.803453,4.4847,5.718266,2.757999,...,6.739776,4.341225,5.753776,1.237189,6.29053,6.744328,6.309406,3.212255,4.219425,5.20158
4,applied arts and design,0,1.0,4.392937,7.7125,5.715338,6.900097,4.274,5.836255,2.794879,...,6.746998,4.305086,5.787941,1.256681,6.225467,5.969346,6.299294,3.274997,4.196745,5.274898
5,music,0,1.0,4.535243,7.705097,5.699535,6.897958,4.147241,5.820273,2.889572,...,6.810085,4.368908,5.735102,1.347471,6.417342,5.926995,6.420116,3.135025,4.126384,5.261236
6,visual arts,0,1.0,4.40927,7.665399,5.728226,6.921822,4.251095,5.82971,2.818198,...,6.834178,4.315996,5.776184,1.313457,6.297843,6.0205,6.37435,3.246887,4.157575,5.302672
7,performing arts,0,1.0,4.470724,7.67047,5.73226,6.942646,4.157066,5.834614,2.920476,...,6.904308,4.350232,5.747534,1.417654,6.402036,6.043293,6.508552,3.171262,4.067503,5.307277
8,creative writing,0,1.0,4.437359,7.763938,5.668536,6.843376,4.357711,5.758852,2.795884,...,6.806169,4.354958,5.744977,1.277128,6.320478,6.445686,6.356286,3.202248,4.190815,5.22812
9,culinary art,6,0.648797,11.01617,5.163006,6.265523,5.211328,2.247859,5.208032,4.869684,...,6.451028,3.730155,4.169146,6.44429,4.656437,3.106171,3.830346,2.955532,2.156576,2.056687
