In [1]:
import numpy as np
import warnings
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
warnings.filterwarnings("ignore")

# Fixed seed so the synthetic dataset is identical on every
# Restart Kernel -> Run All; change SEED to draw a different sample.
SEED = 42
rng = np.random.default_rng(SEED)

# Sample data: 100 vectors with 70 features each, drawn uniformly from [-1, 1)
data = rng.uniform(-1, 1, size=(100, 70))

# Select the target vector you want to find the k most similar to
# target_vector = np.random.uniform(-1, 1, size=(70,))

In [3]:
# Take the number of features from the data itself instead of hard-coding it,
# so this cell keeps working if the dataset's width changes.
n_features = data.shape[1]

# For every feature (column), record the value that occurs most often.
# Counter.most_common(1) breaks ties by first occurrence, so when no value in
# a column repeats, the value from the first row is the one selected.
# NOTE(review): with continuous random floats, repeats are very unlikely, so
# each "most common" value is probably just row 0's value -- confirm this is
# the intended clustering criterion.
most_common_feature_values = [
    Counter(data[:, feature_idx]).most_common(1)[0][0]
    for feature_idx in range(n_features)
]

# One cluster per distinct most-common value. Clusters are keyed by the value
# itself, so two features that share the same most-common value share one list.
feature_clusters = {value: [] for value in most_common_feature_values}

# A vector joins a feature's cluster when its entry for that feature is
# exactly equal (float ==) to that feature's most-common value.
for vector in data:
    for feature_idx, common_value in enumerate(most_common_feature_values):
        if vector[feature_idx] == common_value:
            feature_clusters[common_value].append(vector)

In [4]:
# Report how many clusters exist, then dump each cluster: a header line with
# the cluster's key followed by one "Vector: ..." entry per member.
print(len(feature_clusters))
for common_value, vectors in feature_clusters.items():
    report_lines = [f"Cluster for Common Feature Value: {common_value}"]
    report_lines.extend(f"Vector: {vector}" for vector in vectors)
    print("\n".join(report_lines))


70
Cluster for Common Feature Value: 0.5491950690422478
Vector: [ 0.54919507  0.09630691 -0.43763665  0.71281883 -0.79353502 -0.82699671
 -0.77461041  0.87497063  0.22979524 -0.98356834 -0.05042138  0.62681028
  0.06641232 -0.29980786  0.70349056 -0.13519167 -0.68437638  0.02655229
  0.75535927 -0.59627935  0.86786403  0.02871121  0.88934765  0.67035305
  0.55769719 -0.45104451  0.11437003  0.1959039   0.69562248  0.04251817
  0.92573608 -0.22465913 -0.18217542  0.46881398 -0.62798227 -0.72893536
  0.67819122 -0.13585103  0.01425857  0.23452135 -0.55977125  0.00502219
 -0.69017485 -0.82929216  0.94139457 -0.53198583 -0.03764385  0.16156437
 -0.84330665  0.02011632 -0.34910054  0.16863983 -0.84285625 -0.54568215
 -0.89625875  0.64424908 -0.40253426 -0.06478903  0.29524941 -0.85904598
  0.61416751  0.76061998 -0.15612995 -0.84117394 -0.46248333  0.17266082
  0.16473978  0.16592197  0.42690045  0.84495425]
Cluster for Common Feature Value: 0.09630690622511517
Vector: [ 0.54919507  0.09630

In [5]:
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity between two 1-D vectors.

    Each vector is wrapped in a list so that scikit-learn's pairwise
    ``cosine_similarity`` receives two single-row inputs; the only entry of
    the resulting 1x1 matrix is returned.
    """
    similarity_matrix = cosine_similarity([vector1], [vector2])
    return similarity_matrix[0, 0]


# Hard-coded query vector used for the similarity searches below
# (70 components, laid out five per row for readability).
query_vector = np.array([
    0.56706617, 0.34455938, 0.44616175, -0.0650799, -0.47266785,
    0.59080561, -0.43080394, -0.5135719, 0.22320162, -0.87283061,
    -0.23668043, 0.75665041, 0.339533, 0.65053203, -0.66506984,
    0.28330003, 0.91276114, 0.92018113, -0.54658019, -0.62682039,
    0.36603691, -0.58707456, -0.05218846, 0.18525563, -0.05814506,
    -0.44615855, 0.00208571, 0.89983658, -0.63459025, -0.01489715,
    0.76628297, 0.82888021, 0.53721707, 0.54458186, 0.04724125,
    -0.4342464, 0.65653435, -0.48079959, 0.2170091, -0.8127404,
    0.1231197, -0.99787317, 0.29134905, 0.15293282, -0.74184803,
    -0.63027566, 0.73296933, -0.7761865, -0.23274278, 0.19896239,
    0.78437348, 0.86621835, -0.26795431, -0.42089293, -0.43584217,
    0.5946235, 0.28696005, -0.87394404, 0.30378758, -0.11370392,
    0.78394917, 0.69688519, 0.60582702, 0.29769932, -0.70871125,
    -0.95531047, 0.46796252, -0.48035647, 0.72460336, 0.50286528,
])

# Show the number of components in the query vector.
print(query_vector.shape[0])

# Track the best match seen so far. The running maximum starts at -inf rather
# than -1: cosine similarity can itself be -1 (or marginally below it through
# floating-point rounding), and with the strict ">" comparison below a -1
# sentinel would leave such a candidate unselected and the result as None.
most_similar_vector = None
max_similarity_score = float("-inf")

# Linear scan over the whole dataset, keeping the vector with the highest
# cosine similarity to the query (the first one encountered wins ties).
for vector in data:
    similarity = calculate_cosine_similarity(query_vector, vector)
    if similarity > max_similarity_score:
        max_similarity_score = similarity
        most_similar_vector = vector

print("Most Similar Vector:", most_similar_vector)
print("Max Similarity Score:", max_similarity_score)

70
Most Similar Vector: [ 0.48793718 -0.87190838  0.27675764  0.1282966  -0.141799   -0.25567124
  0.09120451 -0.8986759  -0.89053367 -0.53555723  0.06645169 -0.28945393
 -0.96753095  0.88543028 -0.75057958 -0.24442833  0.52173034  0.93583977
  0.64148901  0.54684453 -0.04092287  0.15247113 -0.06565006 -0.34840902
 -0.84684377  0.22888387  0.23140155  0.85615572  0.36350183 -0.1943283
  0.65599604  0.34065339  0.29456264  0.37785914  0.145725    0.91934577
 -0.19564166  0.9999238  -0.0582359   0.2873591   0.28339551  0.3680339
 -0.34063086  0.30071855 -0.2836955   0.07964208 -0.00592159 -0.53360583
 -0.72773126 -0.92386367  0.66894935  0.70833276  0.6948805  -0.41484269
  0.78145226  0.28691428  0.7578807   0.51564758 -0.58588275 -0.09552321
  0.55123319 -0.41427607  0.93594862  0.49757092  0.43143392 -0.82240885
  0.94735766 -0.39955881  0.45778284  0.32679111]
Max Similarity Score: 0.23672331095905738


In [6]:
# For every cluster, find the member vector most similar to the query vector
# and print it together with its cosine-similarity score.
for common_value, vectors in feature_clusters.items():
    # Start from -inf rather than -1: cosine similarity can itself be -1 (or
    # marginally below it through floating-point rounding), and with the
    # strict ">" comparison a -1 sentinel would never select such a vector.
    max_similarity_score = float("-inf")
    most_similar_vector = None
    for vector in vectors:
        similarity = calculate_cosine_similarity(query_vector, vector)
        if similarity > max_similarity_score:
            max_similarity_score = similarity
            most_similar_vector = vector

    print(f"Most Similar Vector in Cluster for Common Feature Value {common_value}: {most_similar_vector}")
    print(f"Max Similarity Score in Cluster: {max_similarity_score}")

Most Similar Vector in Cluster for Common Feature Value 0.5491950690422478: [ 0.54919507  0.09630691 -0.43763665  0.71281883 -0.79353502 -0.82699671
 -0.77461041  0.87497063  0.22979524 -0.98356834 -0.05042138  0.62681028
  0.06641232 -0.29980786  0.70349056 -0.13519167 -0.68437638  0.02655229
  0.75535927 -0.59627935  0.86786403  0.02871121  0.88934765  0.67035305
  0.55769719 -0.45104451  0.11437003  0.1959039   0.69562248  0.04251817
  0.92573608 -0.22465913 -0.18217542  0.46881398 -0.62798227 -0.72893536
  0.67819122 -0.13585103  0.01425857  0.23452135 -0.55977125  0.00502219
 -0.69017485 -0.82929216  0.94139457 -0.53198583 -0.03764385  0.16156437
 -0.84330665  0.02011632 -0.34910054  0.16863983 -0.84285625 -0.54568215
 -0.89625875  0.64424908 -0.40253426 -0.06478903  0.29524941 -0.85904598
  0.61416751  0.76061998 -0.15612995 -0.84117394 -0.46248333  0.17266082
  0.16473978  0.16592197  0.42690045  0.84495425]
Max Similarity Score in Cluster: 0.16021973209539572
Most Similar Vecto