# PBR geometry analysis

In [10]:
import laspy
import numpy as np
import os
import open3d as o3d
from joblib import Parallel, delayed
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

In [None]:
def read_las_file(file_path):
    
    # read las file
    las_file = laspy.read(file_path)

    # get the point data
    point_data = las_file.points
    # get the x, y, z coordinates
    x = point_data.x
    y = point_data.y
    z = point_data.z
    # get the intensity values
    semantics = point_data.intensity

    # stock the x, y, z, semantics in a numpy array
    x = np.array(x)
    y = np.array(y)
    z = np.array(z)
    semantics = np.array(semantics)

    # get color values from las file
    red = point_data.red
    green = point_data.green
    blue = point_data.blue

    colors = np.array([red, green, blue]).T

    # stack the arrays
    points_semantics_source = np.vstack((x, y, z, semantics)).T

    print(points_semantics_source.shape)

    # assert the length of the arrays
    assert len(x) == len(y) == len(z) == len(semantics), "Length of x, y, z, and intensity arrays do not match."

    # print the numbers of points and semantics
    print(f"Number of points: {len(x)}")
    print(f"Number of semantics: {np.unique(semantics).size}")

    return points_semantics_source, colors




(4648369, 4)
Number of points: 4648369
Number of semantics: 6658


In [None]:
def save_points_to_las(points, color, filename):
    # Create a new LAS header and file
    header = laspy.LasHeader(point_format=3, version="1.2")
    las_file = laspy.LasData(header)

    # Set coordinates
    las_file.x = points[:, 0]
    las_file.y = points[:, 1]
    las_file.z = points[:, 2]

    # Handle intensity (semantics)
    semantics = points[:, 3].astype(np.int32)  # Promote to signed int
    max_intensity = semantics[semantics != -1].max()
    semantics[semantics == -1] = max_intensity + 1  # Set outliers to new value
    las_file.intensity = semantics.astype(np.uint16)  # Cast back to uint16

    print(f"Max intensity: {max_intensity}")

    # Set RGB color
    las_file.red = color[:, 0].astype(np.uint16)
    las_file.green = color[:, 1].astype(np.uint16)
    las_file.blue = color[:, 2].astype(np.uint16)

    # Write the LAS file
    las_file.write(filename)



def sor_filter_parallel(points_semantics, nb_neighbors=10, std_ratio=0.6, n_jobs=8):
    semantics_ids = np.unique(points_semantics[:, 3])


    def process_semantic_group(points_semantics, semantics_id, nb_neighbors, std_ratio):
        # Get indices of points with this semantics
        indices = np.where(points_semantics[:, 3] == semantics_id)[0]
        local_points_semantics = points_semantics[indices]
        xyz = local_points_semantics[:, :3]

        # Create Open3D point cloud
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(xyz)

        # Apply SOR filter
        _, inliers = pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio)
        inliers = np.asarray(inliers)

        # Identify outliers (not in inliers)
        all_indices = np.arange(len(indices))
        outlier_indices = np.setdiff1d(all_indices, inliers)

        # Return global outlier indices to be set to -1
        return indices[outlier_indices]
    
    # Run SOR in parallel for each semantic group
    outlier_indices_all = Parallel(n_jobs=n_jobs)(
        delayed(process_semantic_group)(points_semantics, sid, nb_neighbors, std_ratio)
        for sid in semantics_ids
    )

    # Flatten and mark all outliers with semantic = -1
    all_outliers = np.concatenate(outlier_indices_all)
    points_semantics[all_outliers, 3] = -1

    # Print the number of outliers
    print(f"Number of outliers from SOR filter: {len(all_outliers)}")

    return points_semantics


def cluster_filter(points_semantics, eps=0.5, min_samples=10, n_jobs=8):
    semantics_ids = np.unique(points_semantics[:, 3])

    def process_semantic_group(sem_id):
        # Find global indices for this semantic group
        group_indices = np.where(points_semantics[:, 3] == sem_id)[0]
        group_points = points_semantics[group_indices, :3]

        if len(group_points) < min_samples:
            return group_indices  # All treated as outliers

        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(group_points)
        labels, counts = np.unique(clustering.labels_, return_counts=True)

        # Ignore noise-only cases (no valid clusters)
        if np.all(labels == -1):
            return group_indices

        largest_cluster_label = labels[np.argmax(counts)]
        inliers_local = np.where(clustering.labels_ == largest_cluster_label)[0]
        all_local = np.arange(len(group_indices))
        outliers_local = np.setdiff1d(all_local, inliers_local)

        # Return global indices of outliers
        return group_indices[outliers_local]

    # Parallel loop over semantic IDs
    outlier_indices_all = Parallel(n_jobs=n_jobs)(
        delayed(process_semantic_group)(sid) for sid in semantics_ids
    )

    all_outlier_indices = np.concatenate(outlier_indices_all)
    points_semantics[all_outlier_indices, 3] = -1

    # print the number of outliers
    print(f"Number of outliers from DBSCAN filter: {len(all_outlier_indices)}")

    return points_semantics



def size_filter(points_semantics, min_horizontal_length=0.5, max_horizontal_length=5.0, min_vertical_length=0.5, max_vertical_length=5.0):
    semantics_ids = np.unique(points_semantics[:, 3])

    def process_semantic_group(points_semantics, semantics_id, min_horizontal_length, max_horizontal_length, min_vertical_length, max_vertical_length):
        # Get indices of points with this semantics
        indices = np.where(points_semantics[:, 3] == semantics_id)[0]
        local_points_semantics = points_semantics[indices]
        xyz = local_points_semantics[:, :3]

        # Calculate the bounding box
        min_x, min_y, min_z = np.min(xyz, axis=0)
        max_x, max_y, max_z = np.max(xyz, axis=0)

        # Calculate lengths
        horizontal_length = np.sqrt((max_x - min_x) ** 2 + (max_y - min_y) ** 2)
        vertical_length = max_z - min_z

        # return True if the lengths are within the specified range; otherwise, return False
        if min_horizontal_length <= horizontal_length <= max_horizontal_length and min_vertical_length <= vertical_length <= max_vertical_length:
            return semantics_id, True
        else:
            return semantics_id, False
        
    # Parallel processing
    results = Parallel(n_jobs=8)(
        delayed(process_semantic_group)(points_semantics, sid, min_horizontal_length, max_horizontal_length, min_vertical_length, max_vertical_length)
        for sid in semantics_ids
    )

    # set the semantics to -1 for the points that do not pass the filter
    for semantics_id, keep in results:
        if not keep:
            indices = np.where(points_semantics[:, 3] == semantics_id)[0]
            points_semantics[indices, 3] = -1

    # print the number of semantics_ids that passed the filter
    print(f"Number of semantics_ids that passed the size filter: {len(np.unique(points_semantics[:, 3]))}")
    # print the number of semantics_ids that did not pass the filter
    print(f"Number of semantics_ids that did not pass the size filter: {len(semantics_ids) - len(np.unique(points_semantics[:, 3]))}")
    return points_semantics


def pca_curvature_filter(points_semantics, curvature_threshold=0.5):
    """
    Filter points based on PCA planarity.
    :param points_semantics: numpy array of shape (N, 4) where N is the number of points
    :param planarity_threshold: threshold for planarity; larger values indicate flatter surfaces; which means only very flat surfaces are removed
    """
    semantics_ids = np.unique(points_semantics[:, 3])

    def process_semantic_group(points_semantics, semantics_id, ratio):
        # Get indices of points with this semantics
        indices = np.where(points_semantics[:, 3] == semantics_id)[0]
        local_points_semantics = points_semantics[indices]
        xyz = local_points_semantics[:, :3]

        # Perform PCA
        pca = PCA(n_components=3)
        pca.fit(xyz)

        eigenvalues = pca.explained_variance_  # λ₁, λ₂, λ₃

        # Sort eigenvalues from largest to smallest
        eigenvalues = np.sort(eigenvalues)[::-1]
        λ1, λ2, λ3 = eigenvalues


        # Avoid divide by zero
        if λ1 == 0:
            return 0.0  # when all points are the same

        curvature = λ3 / (λ1 + λ2 + λ3)  

        if semantics_id == 5166:
            print(f"semantics_id: {semantics_id}, λ1: {λ1}, λ2: {λ2}, λ3: {λ3}, curvature: {curvature}")
        
        if curvature > ratio:   # less flat
            return semantics_id, True
        else:  # more flat
            return semantics_id, False
        
    # Parallel processing
    results = Parallel(n_jobs=8)(
        delayed(process_semantic_group)(points_semantics, sid, curvature_threshold)
        for sid in semantics_ids
    )
    # set the semantics to -1 for the points that do not pass the filter
    for semantics_id, keep in results:
        if not keep:
            indices = np.where(points_semantics[:, 3] == semantics_id)[0]
            points_semantics[indices, 3] = -1
    # print the number of semantics_ids that passed the filter
    print(f"Number of semantics_ids that passed the PCA filter: {len(np.unique(points_semantics[:, 3]))}")
    # print the number of semantics_ids that did not pass the filter
    print(f"Number of semantics_ids that did not pass the PCA filter: {len(semantics_ids) - len(np.unique(points_semantics[:, 3]))}")
    return points_semantics

    




In [None]:
# points_semantics_source, colors = read_las_file("data/centennial_bluff_mission_a_0.las")
# points_semantics = points_semantics_source.copy()
# #points_semantics = sor_filter_parallel(points_semantics, nb_neighbors=6, std_ratio=1.0, n_jobs=8)
# points_semantics = cluster_filter(points_semantics, eps=0.5, min_samples=10, n_jobs=8)
# points_semantics = size_filter(points_semantics, min_horizontal_length=0.3, max_horizontal_length=4.0, min_vertical_length=0.3, max_vertical_length=4.0)
# #save_points_to_las(points_semantics, colors, "data/centennial_bluff_mission_a_0_size_filtered.las")

points_semantics_source, colors = read_las_file("data/centennial_bluff_mission_a_0_size_filtered.las")
points_semantics = points_semantics_source.copy()

points_semantics = pca_curvature_filter(points_semantics, planarity_threshold=0.15)
save_points_to_las(points_semantics, colors, "data/centennial_bluff_mission_a_0_filtered.las")


(4648369, 4)
Number of points: 4648369
Number of semantics: 5758
semantics_id: 5166.0, λ1: 0.8960413313045098, λ2: 0.17898183871062104, λ3: 0.0001824252738268768, curvature: 0.0001696654803752678
Number of semantics_ids that passed the PCA filter: 113
Number of semantics_ids that did not pass the PCA filter: 5645
Max intensity: 6518
