## OpenAIEmbeddings

In [1]:
from langchain_openai import OpenAIEmbeddings

# Create the OpenAI embedding client using only the constructor defaults.
# NOTE(review): presumably the API credentials are picked up from the
# environment, since none are passed here — confirm before running.
embeddings_model = OpenAIEmbeddings()

In [2]:
# Embed five short sample sentences (four Korean, one English) in one call.
embeddings = embeddings_model.embed_documents(
    [
        '안녕하세요!',
        '어! 오랜만이에요',
        '이름이 어떻게 되세요?',
        '날씨가 추워요',
        'Hello LLM!'
    ]
)
# Cell value: (number of embeddings returned, length of the first embedding).
len(embeddings), len(embeddings[0])

(5, 1536)

In [3]:
# Print only the first 20 values of the first embedding to keep the output short.
print(embeddings[0][:20])

[-0.01041258405894041, -0.01355851348489523, -0.006538722664117813, -0.018673023208975792, -0.018280573189258575, 0.016685454174876213, -0.009216244332492352, 0.003937159199267626, -0.0074185701087117195, 0.0100644426420331, 0.011760839261114597, -0.006709628272801638, -0.02540796995162964, -0.02252156473696232, -0.004892964847385883, -0.021761983633041382, 0.025281373411417007, -0.01764758862555027, 0.00793128740042448, -0.017837483435869217]


In [4]:
# Embed a single query sentence with the same model used for the documents.
embedded_query = embeddings_model.embed_query('첫인사를 하고 이름을 물어봤나요?')
# Cell value: the first five values of the query embedding.
embedded_query[:5]

[0.003640108974650502,
 -0.024275783449411392,
 0.010910888202488422,
 -0.04110145568847656,
 -0.004543057177215815]

In [5]:
# Cosine similarity
import numpy as np
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
    """Return the cosine similarity between vectors A and B.

    Computed as dot(A, B) / (norm(A) * norm(B)).

    Args:
        A: First vector (a sequence of numbers or a NumPy array).
        B: Second vector, with the same length as A.

    Returns:
        The cosine similarity of A and B. When either vector has zero
        norm the similarity is mathematically undefined; 0.0 is returned
        in that case instead of dividing by zero.
    """
    denom = norm(A) * norm(B)
    if denom == 0:
        # A zero vector would give 0/0 -> nan plus a RuntimeWarning.
        return 0.0
    return dot(A, B) / denom

# Print the cosine similarity between each document embedding and the query
# embedding, in the same order as the input sentences.
for embedding in embeddings:
    print(cos_sim(embedding, embedded_query))

# We can see that the third sentence has the highest similarity

0.8347793912001633
0.8154197762848944
0.8844172747319565
0.7898703827307415
0.7467077657972325


## HuggingFaceEmbeddings

In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Rebind embeddings_model to a HuggingFace model ('jhgan/ko-sroberta-nli'),
# run on CPU, asking the encoder to normalize the embeddings it produces.
# NOTE(review): the langchain_community import path for HuggingFaceEmbeddings
# may be deprecated in newer LangChain releases — confirm for the installed version.
embeddings_model = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-nli',
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings':True},
)

# Cell value: the repr of the configured embedding object.
embeddings_model

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='jhgan/ko-sroberta-nli', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [8]:
# Embed the five sample sentences with the current embeddings_model;
# this overwrites any earlier value of `embeddings`.
embeddings = embeddings_model.embed_documents(
    [
        '안녕하세요!',
        '어! 오랜만이에요',
        '이름이 어떻게 되세요?',
        '날씨가 추워요',
        'Hello LLM!'
    ]
)
# Cell value: (number of embeddings returned, length of the first embedding).
len(embeddings), len(embeddings[0])

(5, 768)

In [9]:
# Embed the query sentence with the current embeddings_model.
embedded_query = embeddings_model.embed_query('첫인사를 하고 이름을 물어봤나요?')

# Print the query's cosine similarity to each document embedding.
# cos_sim is not defined in this cell, so it must already exist in the
# kernel namespace from an earlier cell.
for embedding in embeddings:
    print(cos_sim(embedding, embedded_query))

0.5899016586852425
0.41826323335587334
0.724060503562248
0.057026584369319275
0.43164185906115604


## GoogleGenerativeAIEmbeddings

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Rebind embeddings_model to Google's 'models/embedding-001' embedding model.
# NOTE(review): presumably the API credentials are picked up from the
# environment, since none are passed here — confirm before running.
embeddings_model = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

# Embed the five sample sentences; this overwrites any earlier `embeddings`.
embeddings = embeddings_model.embed_documents(
    [
        '안녕하세요!',
        '어! 오랜만이에요',
        '이름이 어떻게 되세요?',
        '날씨가 추워요',
        'Hello LLM!'
    ]
)
# Cell value: (number of embeddings returned, length of the first embedding).
len(embeddings), len(embeddings[0])

In [None]:
# Embed the query sentence with the current embeddings_model.
embedded_query = embeddings_model.embed_query('첫인사를 하고 이름을 물어봤나요?')

# Print the query's cosine similarity to each document embedding.
# cos_sim is not defined in this cell, so it must already exist in the
# kernel namespace from an earlier cell.
for embedding in embeddings:
    print(cos_sim(embedding, embedded_query))