In [None]:
import numpy as np
from tqdm import tqdm

# Cosine Distances

Here, you insert the cosine distances you have previously calculated. If you only have the cosine similarity, you can convert it into a cosine distance like this:
```python 
cosine_distance = 1 - cosine_similarity
```


In [None]:
# distances are the calculated cosine distances of the data points
# Placeholder: replace the empty list with your own pre-computed distances
# before running the cells below (they raise on an empty input).
# - Method 1 iterates over `distances` row by row, so a list of lists or a
#   2-D array works there.
# - Method 2 indexes it as `distances[i, idx]`, which needs a 2-D NumPy array.
# presumably row i holds the distances from data point i to every data point
# (including itself) - TODO confirm against how the distances were computed.
distances = []

# Method 1: Mean-N Rarity

This method takes a list of distance lists (one inner list per data point), sorts each inner list in ascending order and keeps its n_neighbours smallest distances, calculates the average of the selected distances, and then normalizes these averages between 0 and 1. The resulting rarity_score contains the normalized values that represent the rarity of the data point behind each inner list of distances.

In [None]:
# Mean-N rarity: average distance to the n_neighbours closest entries of each
# row of `distances`, min-max normalized to [0, 1].

# here you can set the threshold value n_neighbours
n_neighbours = 100

# sort every row in ascending order and keep its n_neighbours smallest distances
# NOTE(review): if a row contains the point's own distance (0), it is part of
# the average here, whereas Method 2 skips the first sorted entry - confirm
# that this difference is intended.
sorted_distances = [sorted(distance_array)[:n_neighbours] for distance_array in distances]

# calculate the sum of values and the number of instances in each array
value_sums = [sum(distance_array) for distance_array in sorted_distances]
instance_counts = [len(distance_array) for distance_array in sorted_distances]

# calculate the average for each array
averages = [value_sum / instance_count for value_sum, instance_count in zip(value_sums, instance_counts)]

# normalization and transformation
# Convert to a NumPy array first: a plain Python list does not support the
# element-wise `list - scalar` arithmetic used for the min-max scaling.
rarity_score = np.asarray(averages, dtype=float)
if rarity_score.size == 0:
    raise ValueError("distances is empty - insert your cosine distances first")
min_score = rarity_score.min()
max_score = rarity_score.max()
score_range = max_score - min_score
if score_range == 0:
    # every point has the same average distance: no point is rarer than another
    rarity_score = np.zeros_like(rarity_score)
else:
    rarity_score = (rarity_score - min_score) / score_range

# Method 2: Flow Rarity

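This method computes the inward flow of each data point: the distances to its n_next_hubs nearest neighbours are converted into flows with an exponential decay (`exp(-decay * distance)`) and summed up. The flow values are then reversed (`1 - flow`), so that data points with little inward flow receive a high score, and normalized between 0 and 1. The result is stored in the rarity_score_flow variable and gives you the rarity score of each data point.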

In [None]:
# compute flows based on distance with a decay parameter
def compute_flows(distance, decay=10):
    """Convert distances into flow weights with an exponential decay.

    Parameters
    ----------
    distance : float or array-like
        Distance value(s). Lists and tuples are converted to a NumPy array
        first, because multiplying a Python list by a number repeats the list
        (a negative factor yields an empty list) instead of scaling its
        elements.
    decay : float, default 10
        Decay rate; larger values make the flow fall off faster with distance.

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``exp(-decay * distance)`` computed element-wise, with the same shape
        as ``distance``.
    """
    return np.exp(-decay * np.asarray(distance))

# Flow rarity: sum the decayed flows from each point's nearest neighbours,
# reverse them and min-max normalize the result to [0, 1].

# Work on a 2-D NumPy array so that the fancy indexing below also works when
# `distances` was supplied as a list of lists.
distance_matrix = np.asarray(distances)
if distance_matrix.ndim != 2 or distance_matrix.shape[0] == 0:
    raise ValueError("distances must be a non-empty 2-D array of cosine distances")

# sort the scores to get their sorted indices (ascending within every row)
sorted_ids = np.argsort(distance_matrix, axis=1)

# here you can set the threshold value n_next_hubs
n_next_hubs = 100

# flow search, vectorized over all rows:
# skip the first sorted entry of every row and keep the next n_next_hubs
# presumably the skipped entry is the point's own zero distance - TODO confirm
neighbour_ids = sorted_ids[:, 1:(n_next_hubs + 1)]
neighbour_distances = np.take_along_axis(distance_matrix, neighbour_ids, axis=1)
inward_flow_results = compute_flows(neighbour_distances).sum(axis=1)

# normalization and transformation
# reverse the flows so that points with little inward flow get a high score
rarity_score_flow = 1 - inward_flow_results
min_score = rarity_score_flow.min()
max_score = rarity_score_flow.max()
score_range = max_score - min_score
if score_range == 0:
    # every point receives the same flow: no point is rarer than another
    rarity_score_flow = np.zeros_like(rarity_score_flow)
else:
    rarity_score_flow = (rarity_score_flow - min_score) / score_range

# To be added

Currently, there is a potential issue where the resulting rarity values are distributed too widely in the higher range, leading to a distortion in the distribution of values. As a result, data points that should be considered "rare" may appear in the lower range.