#### Imports

In [2]:
import pandas as pd
import numpy as np

#### Utility Functions

In [3]:
import time

def get_bruteforce_loop_distances(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Compute the Euclidean distance between each pair of points in X and Y.

    Reference implementation built on two explicit Python loops. It prints the
    elapsed time of the computation as a side effect.

    Parameters:
    X (np.ndarray): An array of shape (m, n) representing m points in n-dimensional space.
        A 1-D array of shape (n,) is treated as a single point.
    Y (np.ndarray): An array of shape (p, n) representing p points in n-dimensional space.
        A 1-D array of shape (n,) is treated as a single query point.

    Returns:
    np.ndarray: A distance matrix of shape (m, p) where the entry at (i, j) is the distance between X[i] and Y[j].

    Raises:
    ValueError: If X and Y do not have the same number of features.
    """
    # Promote a lone 1-D vector to a single row. Without this, a 1-D Y makes
    # Y.shape[0] the feature count and Y[j] a scalar, which broadcasts against
    # X[i] and silently yields a wrong (m, n) matrix.
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    if X.shape[1] != Y.shape[1]:
        # A width-1 operand would otherwise broadcast instead of failing.
        raise ValueError(
            f"X and Y must have the same number of features, "
            f"got {X.shape[1]} and {Y.shape[1]}."
        )

    # perf_counter is the clock intended for measuring short durations;
    # time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()
    m = X.shape[0]
    p = Y.shape[0]
    distances = np.zeros((m, p))

    for i in range(m):
        for j in range(p):
            distances[i, j] = np.linalg.norm(X[i] - Y[j])

    elapsed = time.perf_counter() - start_time
    print(f"Brute-force loop distance computation took {elapsed:.4f} seconds.")
    return distances

def bruteforce_distances_vectorized(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Compute the Euclidean distance between each pair of points in X and Y using NumPy broadcasting.

    Prints the elapsed time of the computation as a side effect.

    Parameters:
    X (np.ndarray): An array of shape (m, d) representing m points in d-dimensional space.
        A 1-D array of shape (d,) is treated as a single point.
    Y (np.ndarray): An array of shape (p, d) representing p points in d-dimensional space.
        A 1-D array of shape (d,) is treated as a single query point.

    Returns:
    np.ndarray: A distance matrix of shape (m, p) where the entry at (i, j) is the distance between X[i] and Y[j].

    Raises:
    ValueError: If X and Y do not have the same number of features.
    """
    # Promote a lone 1-D vector to a single row; the 3-axis indexing below
    # would otherwise raise IndexError for a 1-D query.
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    if X.shape[1] != Y.shape[1]:
        # A width-1 operand would otherwise broadcast instead of failing.
        raise ValueError(
            f"X and Y must have the same number of features, "
            f"got {X.shape[1]} and {Y.shape[1]}."
        )

    # perf_counter is the clock intended for measuring short durations.
    start_time = time.perf_counter()

    # Broadcasting (m, 1, d) against (1, p, d) yields every pairwise difference.
    # NOTE: this materializes a full (m, p, d) temporary before the reduction,
    # so peak memory grows with m * p * d.
    diff = X[:, None, :] - Y[None, :, :]     # (m, p, d)
    distances = np.linalg.norm(diff, axis=2) # (m, p)

    print(f"Vectorized brute-force took {time.perf_counter() - start_time:.4f}s")
    return distances


def create_dataframe(X: np.ndarray, distances: np.ndarray) -> pd.DataFrame:
    """
    Combine data points and their distances to each query point into one table.

    Parameters:
    X (np.ndarray): An array of shape (m, n); column k becomes "Feature_{k+1}".
    distances (np.ndarray): A distance matrix of shape (m, p); column j becomes "Distance_to_Y{j+1}".

    Returns:
    pd.DataFrame: One row per data point, with the feature columns followed by the distance columns.
    """
    feature_names = [f"Feature_{k}" for k in range(1, X.shape[1] + 1)]
    features = pd.DataFrame(X, columns=feature_names)

    # One distance column per query point, appended in query order.
    distance_columns = {
        f"Distance_to_Y{q + 1}": distances[:, q]
        for q in range(distances.shape[1])
    }
    return features.assign(**distance_columns)

### Test Utilities
# Sanity check on a tiny dataset whose distances are easy to verify by hand.
# Both implementations run on the same inputs so their output shapes and
# timings can be compared side by side.
X = np.array(
    [[1, 1], [2, 2], [8, 8], [9, 9]]
) # 4 Data Points in 2D

Y = np.array([[3, 3], [7, 7]]) # 2 Query Points in 2D

print(f"Shape of X: {X.shape}")
print(f"Shape of Y: {Y.shape}")

# Loop-based reference implementation (prints its own timing).
distances_loop = get_bruteforce_loop_distances(X, Y)
print(f"Shape of distance matrix: {distances_loop.shape}\n\n")

# Broadcasting-based implementation (prints its own timing).
distances_vectorised = bruteforce_distances_vectorized(X, Y)
print(f"Shape of distance matrix: {distances_vectorised.shape}\n\n")


print(f"Query Vector:\n{Y}")
# Tabulate each data point next to its distance to every query point.
df = create_dataframe(X, distances_vectorised)
# Bare expression: in a notebook cell this renders the DataFrame as the cell output.
df

Shape of X: (4, 2)
Shape of Y: (2, 2)
Brute-force loop distance computation took 0.0003 seconds.
Shape of distance matrix: (4, 2)


Vectorized brute-force took 0.0000s
Shape of distance matrix: (4, 2)


Query Vector:
[[3 3]
 [7 7]]


Unnamed: 0,Feature_1,Feature_2,Distance_to_Y1,Distance_to_Y2
0,1,1,2.828427,8.485281
1,2,2,1.414214,7.071068
2,8,8,7.071068,1.414214
3,9,9,8.485281,2.828427


#### Raw Implementation (1 Million Vectors)

In [4]:
# Benchmark of both brute-force implementations on random data.
# No seed is set in this cell, so the generated values differ from run to run.
X = np.random.rand(1_000_000, 50)  # 1_000_000 Data Points in 50D

Y = np.random.rand(10, 50)    # 10 Query Points in 50D

print(f"Shape of X: {X.shape}")
print(f"Shape of Y: {Y.shape}")

# Loop-based reference: m * p = 10_000_000 Python-level iterations.
distances_loop = get_bruteforce_loop_distances(X, Y)
print(f"Shape of distance matrix: {distances_loop.shape}\n\n")

# NOTE: the vectorized helper builds an (m, p, d) difference array before
# reducing it; at these sizes that is 1_000_000 * 10 * 50 values, roughly
# 4 GB assuming float64.
distances_vectorised = bruteforce_distances_vectorized(X, Y)
print(f"Shape of distance matrix: {distances_vectorised.shape}\n\n")

Shape of X: (1000000, 50)
Shape of Y: (10, 50)
Brute-force loop distance computation took 18.2083 seconds.
Shape of distance matrix: (1000000, 10)


Vectorized brute-force took 6.4751s
Shape of distance matrix: (1000000, 10)




#### Parallel Raw Implementation

### Implementation using "Library Name"