In [None]:
import pickle
import numpy as np
import time

import hdbscan

In [None]:
def log_to_file(file_name: str, message: str):
    """Append one line of text to a log file.

    Args:
        file_name: Path of the log file; it is opened in append mode, so it is
            created if missing and existing content is preserved.
        message: Text to record. A newline is added after it.
    """
    log_line = "{}\n".format(message)
    with open(file_name, mode="a") as log_handle:
        log_handle.write(log_line)

def print_size(file_name: str, obj, obj_name="N/A"):
    """Measure an object's memory footprint and append it to a log file.

    The size is obtained from ``pympler.asizeof.asizeof`` and reported in the
    largest binary unit (bytes, KB, MB, GB) that the value reaches.

    Args:
        file_name: Log file that receives the report line.
        obj: Object whose size is measured.
        obj_name: Label used for the object in the report line.
    """
    # Imported lazily so pympler is only required when sizes are actually logged.
    from pympler import asizeof

    size_in_bytes = asizeof.asizeof(obj)

    # Units from largest to smallest; the first one the size reaches is used.
    scaled_units = ((1024 ** 3, "GB"), (1024 ** 2, "MB"), (1024, "KB"))

    # Below 1 KB the raw byte count is reported unscaled.
    readable_size = f"{size_in_bytes} bytes"
    for divisor, unit in scaled_units:
        if size_in_bytes >= divisor:
            readable_size = f"{size_in_bytes / divisor} {unit}"
            break

    log_to_file(file_name, f"Memory usage of {obj_name}: {readable_size}")

In [None]:
# Load the previously pickled point cloud from disk.
# NOTE: pickle.load can execute arbitrary code, so this file must come from a trusted source.
with open("bak/snow-man/point-cloud.pkl", mode="rb") as cloud_file:
    points_cloud: np.ndarray = pickle.load(cloud_file)

In [None]:
log_to_file("clustring.logs", "started processing....")
# Not used by the cells visible here; kept in case later cells rely on the name.
from sklearn.cluster import DBSCAN

# Fit HDBSCAN on the point cloud and log how long the fit took.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments that happen while a long fit is running
# (time.time() follows the wall clock and can jump).
start_time = time.perf_counter()
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10).fit(points_cloud)
end_time = time.perf_counter()
log_to_file("clustring.logs", f"time taken: {end_time - start_time:,} seconds")

In [None]:
# Persist the fitted model so the clustering does not have to be recomputed.
with open("./hdbscan_model.pkl", mode="wb") as model_file:
    pickle.dump(hdbscan_model, model_file)

In [None]:
# Record the in-memory footprint of the fitted model in the run log.
print_size(file_name="clustring.logs", obj=hdbscan_model, obj_name="hdbscan_model")

In [None]:
# Cluster label assigned to each input point; -1 marks points left unclustered.
# The fitted estimator exposes these as `labels_` (trailing underscore);
# a plain `labels` attribute does not exist and raises AttributeError.
labels = hdbscan_model.labels_
log_to_file("clustring.logs", "Labels Done...")

# Indices of the points that were assigned to some cluster (label != -1).
# "Core" in the names below means "clustered / non-noise".
core_indices = np.where(labels != -1)[0]
log_to_file("clustring.logs", "Core Indicies Done...")

# Coordinates of the clustered points
core_points = points_cloud[core_indices, :]
log_to_file("clustring.logs", "Core Points Done...")

# Indices of the outlier points (label == -1, i.e. not part of any cluster)
outlier_indices = np.where(labels == -1)[0]
log_to_file("clustring.logs", "Outlier Indicies Done...")

# Coordinates of the outlier points
outlier_points = points_cloud[outlier_indices, :]
log_to_file("clustring.logs", "Outlier Points Done...")

# Log the number of clusters and the number of outlier points.
# Clusters are counted over the non-noise labels only, so the figure is correct
# whether or not any point was labelled -1 (subtracting 1 from the number of
# unique labels under-counts by one when there is no noise).
log_to_file("clustring.logs", f"Number of clusters: {len(np.unique(labels[core_indices])):,}")
log_to_file("clustring.logs", f"Number of core points: {len(core_indices)}")
log_to_file("clustring.logs", f"Number of outlier points: {len(outlier_indices)}")
log_to_file("clustring.logs", f"Number of total points: {len(core_indices) + len(outlier_indices):,}")

# Further analytics on the output

In [None]:
from collections import Counter

log_to_file("clustring.logs", "Analysis of X, Y, Z of Core Points")

# Log the same statistics for each coordinate column (0 -> X, 1 -> Y, 2 -> Z):
# point count and value range, number of distinct values, and the most / two
# least frequent values. A single loop keeps the axis label in every message
# in sync with the column being analysed.
core_axis_counters = {}
for axis_index, axis_name in enumerate("XYZ"):
    axis_values = core_points[:, axis_index]

    if len(axis_values):
        value_range = f"{axis_values.min():,} to {axis_values.max():,}"
    else:
        # min()/max() raise on an empty column, so report the absence instead.
        value_range = "n/a (no points)"
    log_to_file("clustring.logs", f"{axis_name}<{len(axis_values):,}>: {value_range}")

    axis_counter = Counter(axis_values)
    core_axis_counters[axis_name] = axis_counter
    log_to_file("clustring.logs", f"We have {len(axis_counter):,} unique {axis_name} values")
    # most_common()[:-3:-1] takes the last two entries of the frequency ranking,
    # least frequent first.
    log_to_file("clustring.logs", f"Most Common {axis_name}: {axis_counter.most_common(1)}, Least Two Common {axis_name}: {axis_counter.most_common()[:-3:-1]}")
    log_to_file("clustring.logs", "-----------------------------------------------------")

# Keep the per-axis counters available under their original names.
x_counter, y_counter, z_counter = (core_axis_counters[name] for name in "XYZ")

In [None]:
from collections import Counter

log_to_file("clustring.logs", "Analysis of X, Y, Z of Outliers Points")

# Log the same statistics for each coordinate column (0 -> X, 1 -> Y, 2 -> Z):
# point count and value range, number of distinct values, and the most / two
# least frequent values. A single loop keeps the axis label in every message
# in sync with the column being analysed.
outlier_axis_counters = {}
for axis_index, axis_name in enumerate("XYZ"):
    axis_values = outlier_points[:, axis_index]

    if len(axis_values):
        value_range = f"{axis_values.min():,} to {axis_values.max():,}"
    else:
        # min()/max() raise on an empty column (e.g. no outliers were found),
        # so report the absence instead.
        value_range = "n/a (no points)"
    log_to_file("clustring.logs", f"{axis_name}<{len(axis_values):,}>: {value_range}")

    axis_counter = Counter(axis_values)
    outlier_axis_counters[axis_name] = axis_counter
    log_to_file("clustring.logs", f"We have {len(axis_counter):,} unique {axis_name} values")
    # most_common()[:-3:-1] takes the last two entries of the frequency ranking,
    # least frequent first.
    log_to_file("clustring.logs", f"Most Common {axis_name}: {axis_counter.most_common(1)}, Least Two Common {axis_name}: {axis_counter.most_common()[:-3:-1]}")
    log_to_file("clustring.logs", "-----------------------------------------------------")

# Keep the per-axis counters available under their original names.
x_counter, y_counter, z_counter = (outlier_axis_counters[name] for name in "XYZ")