*Copyright 2025 Jaeyoung Chun / Winning Twelve*

You may not make copies of this and use or distribute it for any purpose.

# Similarity

In [1]:
# Short Korean greetings and farewells used as the comparison set.
# The English glosses below are approximate.
sentences = [
    "안녕하세요",  # "Hello" (polite)
    "안녕",  # "Hi" / "Bye" (casual)
    "잘가",  # "Goodbye" (casual, said to someone leaving)
    "잘자",  # "Good night" / "Sleep well" (casual)
    "반갑다"  # "Nice to meet you" / "Glad to see you"
]

## Helper Functions

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity(a, b):
    """Return the cosine similarity between two embedding vectors.

    Each vector is wrapped in a one-element list so that
    ``cosine_similarity`` receives two single-row batches; the single
    entry at position ``[0][0]`` of its result is returned.
    """
    score_matrix = cosine_similarity([a], [b])
    return score_matrix[0][0]

In [3]:
import pandas as pd

def compare(embeddings, texts=None):
    """Build a table of similarity scores for every unordered pair of embeddings.

    Args:
        embeddings: Iterable of embedding vectors, one per text, in the same
            order as ``texts``.
        texts: Labels for the entries of ``embeddings``, matched by position.
            Defaults to the notebook-level ``sentences`` list, so existing
            ``compare(embeddings)`` calls behave exactly as before.

    Returns:
        pandas.DataFrame with columns ``sentence1``, ``sentence2`` and
        ``similarity_score``; one row per index pair ``(i, j)`` with
        ``i < j``, ordered by ``i`` and then ``j``.

    Raises:
        IndexError: May be raised when ``texts`` has fewer entries than
            ``embeddings`` (a pair refers to a label that does not exist).
    """
    if texts is None:
        # Fall back to the notebook-global list only when no labels are given.
        texts = sentences
    # Materialize so one-shot iterables can be indexed more than once.
    vectors = list(embeddings)

    rows = []
    for i in range(len(vectors)):
        # Start at i + 1: each unordered pair appears once, no self-comparison.
        for j in range(i + 1, len(vectors)):
            score = get_similarity(vectors[i], vectors[j])
            rows.append((texts[i], texts[j], score))

    return pd.DataFrame(rows, columns=["sentence1", "sentence2", "similarity_score"])

## OpenAI

### text-embedding-3-small

In [4]:
from langchain_openai import OpenAIEmbeddings

In [5]:
# Embedding client for the "text-embedding-3-small" model.
# NOTE(review): no API key is passed here, so credentials presumably come
# from the environment — confirm before running on a fresh kernel.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [6]:
# Embed every entry of `sentences`. compare() labels vectors by position, so
# this assumes one vector per sentence, in input order — TODO confirm.
embeddings = embedding_model.embed_documents(sentences)

In [7]:
# Display the pairwise similarity table for the vectors currently bound to `embeddings`.
compare(embeddings)

Unnamed: 0,sentence1,sentence2,similarity_score
0,안녕하세요,안녕,0.826183
1,안녕하세요,잘가,0.333874
2,안녕하세요,잘자,0.378787
3,안녕하세요,반갑다,0.247146
4,안녕,잘가,0.372109
5,안녕,잘자,0.419632
6,안녕,반갑다,0.265561
7,잘가,잘자,0.83246
8,잘가,반갑다,0.24847
9,잘자,반갑다,0.241822


### text-embedding-3-large

In [8]:
# Switch to the "text-embedding-3-large" model.
# NOTE: this rebinds `embedding_model`; the client created in the earlier
# cell is no longer reachable under this name.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

In [9]:
# NOTE: rebinds `embeddings`, overwriting the vectors from the earlier cell.
# Re-running an earlier `compare(embeddings)` cell after this one would
# therefore use these vectors instead.
embeddings = embedding_model.embed_documents(sentences)

In [10]:
# Same pairwise table as before, now for the vectors produced in the preceding cell.
compare(embeddings)

Unnamed: 0,sentence1,sentence2,similarity_score
0,안녕하세요,안녕,0.819306
1,안녕하세요,잘가,0.456149
2,안녕하세요,잘자,0.445797
3,안녕하세요,반갑다,0.418926
4,안녕,잘가,0.513085
5,안녕,잘자,0.501129
6,안녕,반갑다,0.443209
7,잘가,잘자,0.670334
8,잘가,반갑다,0.382664
9,잘자,반갑다,0.363693
