In [1]:
import pandas as pd
import torch

data = pd.read_csv('embedding.csv')
print(data)
series_data = data.embeddings[0]
number_list = [float(num_str) for num_str in series_data.strip('[]').split(',')]

# 리스트를 PyTorch 텐서로 변환
tensor_data = torch.tensor(number_list)

print("변환된 텐서:")
print(tensor_data)

              id                                         embeddings
0     1609.04741  [-0.6698794364929199, -0.4727935194969177, 0.2...
1     1703.07738  [-0.2581411302089691, -0.012778624892234802, -...
2     1706.03762  [-0.5425103902816772, 0.18396970629692078, -0....
3     1710.08969  [-0.41298601031303406, 0.5831630229949951, -0....
4     1711.06149  [-0.7896791696548462, 0.07237350940704346, -0....
...          ...                                                ...
4064  2402.09329  [-0.4379268288612366, -0.23558062314987183, -0...
4065  2402.09812  [-0.6690561771392822, -0.07181265950202942, -0...
4066  2402.11639  [-0.7059850692749023, -0.26324883103370667, -0...
4067  2402.13512  [-0.5639562606811523, -0.09524745494127274, -0...
4068  2402.17417  [-0.6053261756896973, -0.2860853970050812, -0....

[4069 rows x 2 columns]
변환된 텐서:
tensor([-0.6699, -0.4728,  0.2280,  ..., -0.2286, -0.6570,  0.6681])


In [30]:
import csv
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 전역 변수로 데이터를 저장할 딕셔너리
vectors = {}

def parse_csv(file_path):
    global vectors
    with open(file_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            id = row['id']
            vector_str = row['embeddings']
            vector = ast.literal_eval(vector_str)
            vectors[id] = vector

def compute_similarity(base_ids):
    similarity_results = []
    for base_id in base_ids:
        base_vector = np.array([vectors[base_id]])
        for id, vector in vectors.items():
            if id != base_id:
                similarity = cosine_similarity(base_vector, [np.array(vector)])[0][0]
                similarity_results.append({'source': id, 'target': base_id, 'distance': similarity})
    return similarity_results

def get_top_similarity_results(similarity_results, top_n=5):
    similarity_results = sorted(similarity_results, key=lambda x: x['similarity'], reverse=True)
    return similarity_results[:top_n]

# CSV 파일에서 데이터 파싱
file_path = 'embedding.csv'
parse_csv(file_path)


In [31]:
# 기준이 되는 여러 ID
base_ids = ['1609.04741', '1703.07738', '1806.05382']

top_5_results = {}
for base_id in base_ids:
    similarity_results = compute_similarity([base_id])
    top_5_results[base_id] = get_top_similarity_results(similarity_results)

# 결과 출력
results = []
for base_id, top_5 in top_5_results.items():
    print(f"Top 5 similarities for base ID {base_id}:")
    for result in top_5:
        print(result)
        results.append(result)

Top 5 similarities for base ID 1609.04741:
{'from': '2105.03891', 'to': '1609.04741', 'similarity': 0.7967764055095873}
{'from': '2102.06361', 'to': '1609.04741', 'similarity': 0.7796217906946868}
{'from': '2005.08665', 'to': '1609.04741', 'similarity': 0.7540322177135702}
{'from': '2301.03634', 'to': '1609.04741', 'similarity': 0.7500096755111593}
{'from': '2103.16273', 'to': '1609.04741', 'similarity': 0.7476865946495533}
Top 5 similarities for base ID 1703.07738:
{'from': '2008.04378', 'to': '1703.07738', 'similarity': 0.6856879431382494}
{'from': '2101.08967', 'to': '1703.07738', 'similarity': 0.669291005697058}
{'from': '2010.00516', 'to': '1703.07738', 'similarity': 0.6687917998791476}
{'from': '2004.13621', 'to': '1703.07738', 'similarity': 0.668282168170491}
{'from': '2005.0665', 'to': '1703.07738', 'similarity': 0.6627738646226708}
Top 5 similarities for base ID 1806.05382:
{'from': '2306.01526', 'to': '1806.05382', 'similarity': 0.8868579787302365}
{'from': '2011.03891', 'to'

In [29]:
results

[{'from': '1609.04741', 'to': '2105.03891', 'similarity': 0.7967764055095873},
 {'from': '1609.04741', 'to': '2102.06361', 'similarity': 0.7796217906946868},
 {'from': '1609.04741', 'to': '2005.08665', 'similarity': 0.7540322177135702},
 {'from': '1609.04741', 'to': '2301.03634', 'similarity': 0.7500096755111593},
 {'from': '1609.04741', 'to': '2103.16273', 'similarity': 0.7476865946495533},
 {'from': '1703.07738', 'to': '2008.04378', 'similarity': 0.6856879431382494},
 {'from': '1703.07738', 'to': '2101.08967', 'similarity': 0.669291005697058},
 {'from': '1703.07738', 'to': '2010.00516', 'similarity': 0.6687917998791476},
 {'from': '1703.07738', 'to': '2004.13621', 'similarity': 0.668282168170491},
 {'from': '1703.07738', 'to': '2005.0665', 'similarity': 0.6627738646226708},
 {'from': '1806.05382', 'to': '2306.01526', 'similarity': 0.8868579787302365},
 {'from': '1806.05382', 'to': '2011.03891', 'similarity': 0.8562002713592036},
 {'from': '1806.05382', 'to': '2112.10481', 'similarity