# ChromaのPythonライブラリをインストール

In [1]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.7.0-py3-none-any.whl.metadata (5.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation_fastapi-0.55b1-py3-none-any.whl.metadata (2.2 kB)
Collecting opentelemetry-sdk>=1.2.0 (from c

# ライブラリとデータセットをロード

In [2]:
import pandas as pd
import chromadb

In [3]:
# Load the dataset from data.csv (relative to the working directory) into a DataFrame
df = pd.read_csv('data.csv')

In [4]:
df.info()  # When inserting data into Chroma, IDs must be str and unique

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      100 non-null    object
 1   Item    100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [5]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,ID,Item
0,A0,Compact Printer Air Advanced Digital : Situati...
1,A1,Tablet : Discussion loss politics free one tho...
2,A2,Smart Blender Cooker : No situation per.
3,A3,Advanced Router Rechargeable : For force gas e...
4,A4,Portable Mouse Monitor Phone : Feeling back re...


今回は、A1のタブレットをカスタムEmbeddingを使って検索できるようにする。

# Chromaの使い方(Custom Embedding Function)

## Step 1: ChromaのデータをDiskにセーブ

In [6]:
# A folder named "test" is created, containing chroma.sqlite3
chroma_client = chromadb.PersistentClient(path="test")  # path is where the data is saved (just a name here because this runs on Google Colab)

## Step 2:　カスタムEmbeddingを作成する
今回はLLAMAを使用  
*SentenceTransformerを推奨  
APIを使えばOpenAI等のEmbeddingを使えるが、従来これらのモデルはエンコード用に作られていない。

HuggingFace: https://huggingface.co/tasks/sentence-similarity  
Chroma: https://docs.trychroma.com/docs/embeddings/embedding-functions

In [7]:
from chromadb import Documents, EmbeddingFunction, Embeddings  # 上書きする為に必要
from sentence_transformers import SentenceTransformer # HuggingFaceのライブラリーを使う為

#　上書きするのは__call__
class CustomEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by a SentenceTransformer model.

    The model is loaded lazily on first use and cached on the instance, so
    repeated calls (every ``add`` / ``query``) reuse it instead of reloading
    it each time.
    """

    def __init__(self, model_name: str = 'thuan9889/llama_embedding_model_v1') -> None:
        """Remember which model to use; nothing is loaded until the first call.

        Args:
            model_name: Model id handed to ``SentenceTransformer``.
        """
        self.model_name = model_name
        self._models = {}  # model name -> loaded SentenceTransformer instance

    def _get_model(self, model_name: str) -> SentenceTransformer:
        """Return the cached model for ``model_name``, loading it on first request."""
        if model_name not in self._models:
            self._models[model_name] = SentenceTransformer(model_name)
        return self._models[model_name]

    def __call__(self, input: Documents, model_name: "str | None" = None) -> Embeddings:
        """Embed ``input`` and return one list of floats per document.

        Args:
            input: Documents (texts) to embed.
            model_name: Optional per-call override, kept for backward
                compatibility. When omitted, the name given at construction
                is used.

        Returns:
            The embeddings converted to plain Python lists.
        """
        model = self._get_model(model_name or self.model_name)
        embeddings = model.encode(input)

        # Convert each embedding row to a plain list
        return [embedding.tolist() for embedding in embeddings]

## Step 3: Vector　DatabaseをカスタムEmbeddingと一緒に作成

In [8]:
# Create the embedding function instance
myembedding = CustomEmbeddingFunction()

# Get the "test" collection (creating it if it does not exist), replacing
# the default embedding_function with the custom one
collection = chroma_client.get_or_create_collection(
    embedding_function=myembedding,
    name="test",
)

  myembedding = CustomEmbeddingFunction()  #　インスタンス化


## Step 4: データをCollectionに加える

In [10]:
# Add the Item column as documents, using the ID column as the record ids
collection.add(
    ids=df['ID'].tolist(),
    documents=df['Item'].tolist(),
)

In [11]:
collection.peek()

{'ids': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9'],
 'embeddings': array([[-5.50409742e-02,  2.12281551e-02, -1.35449663e-01, ...,
         -8.06968287e-03, -6.09836951e-02,  4.70388122e-02],
        [-6.87931553e-02,  6.51822775e-04, -3.32023352e-02, ...,
          3.08674434e-03, -6.08183444e-02,  8.18606745e-03],
        [ 3.36782560e-02, -5.99298365e-02,  7.07072858e-03, ...,
         -2.71151680e-02, -7.45719764e-03, -1.19472952e-06],
        ...,
        [-1.54856723e-02,  4.98729050e-02, -5.55039868e-02, ...,
          2.19804663e-02, -1.09298646e-01, -6.08610595e-03],
        [ 5.74598135e-03,  1.83753781e-02, -7.47339725e-02, ...,
         -1.73444841e-02,  4.24693152e-02,  4.30014171e-03],
        [-7.77137727e-02,  4.78394367e-02, -7.62003511e-02, ...,
         -9.93046910e-03, -2.74772327e-02,  2.90386081e-02]]),
 'documents': ['Compact Printer Air Advanced Digital : Situation organization these memory much off.',
  'Tablet : Discussion loss politics free 

## Step 5: Queryを書く

In [12]:
# query_texts: the keywords to search for
# n_results:   how many hits to return (the default is 10)
results = collection.query(
    n_results=3,
    query_texts=["Tablet", "Printer"],
)
results

{'ids': [['A52', 'A17', 'A1'], ['A17', 'A0', 'A38']],
 'embeddings': None,
 'documents': [['Tablet : Many deal community public beyond safe anyone.',
   'Digital Tablet Router Printer Lite : Accept campaign every research test.',
   'Tablet : Discussion loss politics free one thousand.'],
  ['Digital Tablet Router Printer Lite : Accept campaign every research test.',
   'Compact Printer Air Advanced Digital : Situation organization these memory much off.',
   'Wireless Tablet Router Printer Wireless Premium Air : North meeting short summer situation positive candidate.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None, None], [None, None, None]],
 'distances': [[0.7642314434051514, 1.0232985019683838, 1.1109884977340698],
  [0.9037421941757202, 0.9324209094047546, 1.0014959573745728]]}

## Step 6:　更新されたVector　Databaseをロードする

In [13]:
# Open a second persistent client on the same "test" path
chroma_client2 = chromadb.PersistentClient(path="test")

In [14]:
# Create a fresh embedding function instance for the reloaded client
myembedding2 = CustomEmbeddingFunction()

# Pass the instance created in this cell (myembedding2), so that loading the
# saved database does not depend on a variable from an earlier cell
collection2 = chroma_client2.get_or_create_collection(
    name="test",
    embedding_function=myembedding2)

  myembedding2 = CustomEmbeddingFunction()  #　インスタンス化


In [15]:
# query_texts: the keyword to search for
# n_results:   how many hits to return (the default is 10)
results2 = collection2.query(
    n_results=3,
    query_texts=["Tablet"],
)
results2

{'ids': [['A52', 'A17', 'A1']],
 'embeddings': None,
 'documents': [['Tablet : Many deal community public beyond safe anyone.',
   'Digital Tablet Router Printer Lite : Accept campaign every research test.',
   'Tablet : Discussion loss politics free one thousand.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[0.7642315626144409, 1.0232982635498047, 1.1109883785247803]]}