In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import KDTree


In [None]:
# Load the word/cluster table; later cells read the columns
# `type`, `orig`, `group_dim_1` and `group_dim_2` from it.
df_words = pd.read_csv(
    "../../behavioral data/analysis/1_data/WordsAndClusters_PapersAndExp2.csv"
)
df_words

In [None]:
import matplotlib.pyplot as plt

# Histogram of the raw `group_dim_1` coordinate of every row in df_words.
# Note: these are coordinate values, not nearest-neighbor distances, so the
# axis label and title name the column that is actually plotted.
fig, ax = plt.subplots()
ax.hist(df_words.loc[:, 'group_dim_1'], bins=50, edgecolor='black')
ax.set_xlabel('group_dim_1')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of group_dim_1')
plt.show()

In [None]:
# Build (n_rows, 2) coordinate arrays [group_dim_1, group_dim_2] for each
# subset of df_words and for the full table.
def _xy(rows):
    """Stack the two group_dim columns of `rows` side by side."""
    return np.column_stack((rows['group_dim_1'].values,
                            rows['group_dim_2'].values))


associations_arr = _xy(df_words[df_words.type == "association"])
situations_arr = _xy(df_words[df_words.type == "situation"])
exp_arr = _xy(df_words[df_words.orig == "Experiment"])
literature_arr = _xy(df_words[df_words.orig == "Papers"])
complete_arr = _xy(df_words)

tree = KDTree(complete_arr)

# The tree is queried with its own points, so every point has a zero-distance
# hit in column 0; k=2 makes column 1 hold the second-closest entry, which is
# taken as the nearest-neighbor distance.
distances, _ = tree.query(complete_arr, k=2)
mean_radius = distances[:, 1].mean()
print(mean_radius)


def get_percentage_overlap(arr1, arr2, mean_radius):
    """Return the percentage of points in ``arr2`` that lie close to ``arr1``.

    A point of ``arr2`` counts as matched when its nearest neighbor in
    ``arr1`` is strictly closer than ``mean_radius``.

    Parameters
    ----------
    arr1 : array-like, shape (n, d)
        Reference points; the KD-tree is built from these.
    arr2 : array-like, shape (m, d)
        Query points; the percentage is relative to ``len(arr2)``.
    mean_radius : float
        Distance threshold below which a query point counts as matched.

    Returns
    -------
    float
        Percentage in [0, 100]. ``0.0`` if either array is empty (nothing
        can match), instead of raising ``ZeroDivisionError``.
    """
    reference_points = np.asarray(arr1, dtype=float)
    query_points = np.asarray(arr2, dtype=float)

    # Without reference points or query points no match is possible.
    if len(reference_points) == 0 or len(query_points) == 0:
        return 0.0

    tree = KDTree(reference_points)

    # One vectorized query for all points. Points with no neighbor inside the
    # bound come back with distance `inf`, so the strict comparison below
    # rejects them.
    nearest_dist, _ = tree.query(query_points, k=1,
                                 distance_upper_bound=mean_radius)
    overlapping_count = np.count_nonzero(nearest_dist < mean_radius)

    return float((overlapping_count / len(query_points)) * 100)

# Report, for each (source, reference) pair, the percentage of source points
# that have a close match (nearest neighbor closer than mean_radius) among the
# reference points. get_percentage_overlap takes the reference array first and
# the source array second.
# The label and the arrays are both derived from the same pair of names, so
# the printed description always matches the arrays that were compared.
overlap_point_sets = {
    "literature": literature_arr,
    "exp": exp_arr,
    "associations": associations_arr,
    "situations": situations_arr,
}

overlap_comparisons = [
    ("literature", "exp"),
    ("literature", "associations"),
    ("literature", "situations"),
    ("associations", "literature"),
    ("associations", "situations"),
    ("situations", "literature"),
    ("situations", "associations"),
    ("exp", "literature"),
]

for source_name, reference_name in overlap_comparisons:
    overlap_pct = get_percentage_overlap(overlap_point_sets[reference_name],
                                         overlap_point_sets[source_name],
                                         mean_radius)
    print(f"Percentage of points from {source_name} that have a close match in the {reference_name}: {overlap_pct:.2f}%")

In [None]:
# Grid resolution: edge length of the square cells, expressed in the same
# units as the group_dim_1 / group_dim_2 coordinates. It is passed as the
# `grid_size` argument to the grid-based overlap computation in this cell.
# NOTE(review): 0.25 appears to be hand-picked — confirm it suits the data scale.
grid_size = 0.25  # Adjust based on data scale



def get_grid_overlap_jaccard(arr1, arr2, grid_size):
    """Jaccard index of the grid cells occupied by two point sets.

    Every point is assigned to the cell obtained by floor-dividing its
    coordinates by ``grid_size``. The result is
    ``|cells(arr1) & cells(arr2)| / |cells(arr1) | cells(arr2)|``,
    a fraction between 0 and 1, or ``0.0`` when neither set occupies a cell.
    """
    def occupied_cells(coords):
        # One integer index tuple per point; the set drops duplicates.
        cells = set()
        for row in coords:
            cells.add(tuple((row // grid_size).astype(int)))
        return cells

    cells_first = occupied_cells(arr1)
    cells_second = occupied_cells(arr2)

    n_union = len(cells_first | cells_second)
    if n_union == 0:
        return 0.0

    n_shared = len(cells_first & cells_second)
    return n_shared / n_union

# Grid-based overlap between pairs of point sets. get_grid_overlap_jaccard
# returns a Jaccard index (a fraction in [0, 1]) that is symmetric in its two
# arrays; the `%` format spec multiplies it by 100 so the printed number
# really is a percentage.
print(f"Percentage from literature to exp: {get_grid_overlap_jaccard(exp_arr, literature_arr, grid_size):.2%}")
print(f"Percentage from literature to associations: {get_grid_overlap_jaccard(associations_arr, literature_arr, grid_size):.2%}")
print(f"Percentage from literature to situations: {get_grid_overlap_jaccard(situations_arr, literature_arr, grid_size):.2%}")

print(f"Percentage from associations to situations: {get_grid_overlap_jaccard(situations_arr, associations_arr, grid_size):.2%}")