In [None]:
import gudhi as gd
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import time
import itertools

In [None]:
# NOTE: unpickling can execute arbitrary code -- only load pickles from a trusted source.
with open('semantic_point_cloud.pickle', 'rb') as pickle_file:
    full_data = pkl.load(pickle_file)

# Sanity check: report the container type and dimensions of the loaded object.
print(f"type {type(full_data)}, shape {full_data.shape}")

# Preview the top-left 5x5 corner (shown as the cell output).
full_data[:5, :5]

Separate names and inspect values

In [None]:
# Column 0 holds the names; every remaining column is part of the value matrix.
names, data = full_data[:, 0], full_data[:, 1:]

# Distribution of all entries of the value matrix, pooled over rows and columns.
fig, ax = plt.subplots()
ax.hist(data.ravel(), bins=100, edgecolor='black')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Values')
plt.show()

Inverting the data so that it represents proximity (a distance-like quantity) instead of relational strength. Explicitly, we consider `10 - data`; since max(data) is 9.3, every inverted value stays positive.

In [None]:
# Turn relational strength into a proximity-like quantity: strong relations
# (large values) become small numbers. The offset 10 is chosen just above the
# maximum of the data (9.3 according to the notebook text).
proximity_values = 10 - data

fig, ax = plt.subplots()
ax.hist(proximity_values.ravel(), bins=100, edgecolor='black')
ax.set_xlabel('Inverted value (10 - data)')
ax.set_ylabel('Frequency')
# Distinct title so this figure cannot be confused with the raw-value histogram.
ax.set_title('Histogram of Inverted Values (10 - data)')
plt.show()

Filter the data

In [None]:
# Fixed seed so that "Restart Kernel -> Run All" always selects the same words;
# change SEED to draw a different sample.
SEED = 0
N_WORDS = 300
rng = np.random.default_rng(SEED)

# Draw distinct row indices. The sample size is capped at the number of rows so
# that a data set with fewer than N_WORDS rows does not make the sampler raise.
n_sampled = min(N_WORDS, data.shape[0])
interesting_words = rng.choice(data.shape[0], size=n_sampled, replace=False)
sub_words_data = data[interesting_words, :]

# Column indices (into `data`) of the groundings to compare, and the size of
# the column subsets that are analysed together.
interesting_groundings = [1, 4, 7, 13, 19]
cardinality = 3

Compute the barcode for each subset of interesting groundings of the given cardinality

In [None]:
# One barcode (and one wall-clock measurement) per subset of grounding columns.
barcodes = []
times = []
combinations = list(itertools.combinations(interesting_groundings, cardinality))

# NOTE(review): the point cloud is built from the raw `data` columns, not from
# `10 - data`. Assuming RipsComplex(points=...) uses Euclidean distances between
# points, this is harmless: x -> 10 - x preserves all pairwise distances, so the
# filtration would be the same either way.
for combo in combinations:
    # perf_counter is monotonic and high-resolution, unlike time.time(), which
    # can jump when the system clock is adjusted.
    start_time = time.perf_counter()

    # Each selected word becomes a point whose coordinates are the chosen groundings.
    combo_data = sub_words_data[:, list(combo)]
    rips_complex = gd.RipsComplex(points=combo_data, max_edge_length=10)
    # max_dimension=2 keeps triangles, which are needed to compute 1-dimensional homology.
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=2)
    barcode = simplex_tree.persistence()
    barcodes.append(barcode)

    end_time = time.perf_counter()
    times.append(end_time - start_time)

# `default` keeps the summary from raising when there are no combinations
# (e.g. cardinality larger than the number of interesting groundings).
print(f"Max time of {len(combinations)} computations was {max(times, default=0.0)}")

We now compute all pairwise bottleneck distances between the computed barcodes, separately for dimension 0 and dimension 1. The hypothesis is that the closer two barcodes are, the more related the corresponding groundings are. More loosely, the shape of the data is correlated with the meaning of the coordinates used to represent it.

In [None]:
def _bars_in_dimension(barcode, dimension):
    """Return the intervals of `barcode` whose homological dimension equals `dimension`.

    `barcode` is an iterable of (dimension, interval) pairs.
    """
    return [interval for dim, interval in barcode if dim == dimension]


def _pairwise_bottleneck(diagrams):
    """Return the symmetric matrix of bottleneck distances between all pairs in `diagrams`.

    The diagonal is left at zero (distance of a diagram to itself).
    """
    count = len(diagrams)
    distances = np.zeros((count, count))
    for i, j in itertools.combinations(range(count), 2):
        # The distance is symmetric, so compute it once and mirror it.
        distances[i, j] = distances[j, i] = gd.bottleneck_distance(diagrams[i], diagrams[j])
    return distances


def _show_heatmap(distances, title, labels=None):
    """Draw `distances` as a heatmap; `labels`, if given, name the rows and columns."""
    fig, ax = plt.subplots()
    image = ax.imshow(distances, cmap='hot', interpolation='nearest')
    fig.colorbar(image, ax=ax)  # Colorbar shows the distance scale
    if labels is not None:
        positions = range(len(labels))
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=90)
        ax.set_yticks(positions)
        ax.set_yticklabels(labels)
        ax.set_xlabel('Grounding combination')
        ax.set_ylabel('Grounding combination')
    ax.set_title(title)
    plt.show()


# Split every barcode into its 0- and 1-dimensional parts.
zero_barcodes = [_bars_in_dimension(barcode, 0) for barcode in barcodes]
one_barcodes = [_bars_in_dimension(barcode, 1) for barcode in barcodes]

n = len(barcodes)

# Compute bottleneck distances
zero_bottleneck_distances = _pairwise_bottleneck(zero_barcodes)
one_bottleneck_distances = _pairwise_bottleneck(one_barcodes)

# Row/column k of each matrix corresponds to the k-th grounding combination, so
# use the combinations as tick labels when they line up with the barcodes.
combo_labels = [str(combo) for combo in combinations] if len(combinations) == n else None

_show_heatmap(zero_bottleneck_distances, 'Heatmap for 0-bottleneck', combo_labels)
_show_heatmap(one_bottleneck_distances, 'Heatmap for 1-bottleneck', combo_labels)