In [1]:
import glob
import os

import chromadb
import numpy as np
import pandas as pd
import weaviate
from sentence_transformers import SentenceTransformer

In [67]:
# Chroma client built with default settings (no arguments are passed);
# the collection cell further down creates its collection through it.
chroma_client = chromadb.Client()

In [None]:
# Open a client against the Weaviate endpoint on localhost:8080.
weaviate_client = weaviate.Client(url="http://localhost:8080/")

# Last expression of the cell, so the value returned by is_ready() is displayed.
weaviate_client.is_ready()

In [4]:
def load_parquet(parquet_path):
    """Read a parquet file and hand back what pandas loaded.

    Args:
        parquet_path: Location of the parquet file; forwarded unchanged
            to ``pd.read_parquet``.

    Returns:
        The object returned by ``pd.read_parquet(parquet_path)``.
    """
    frame = pd.read_parquet(parquet_path)
    return frame

In [5]:
def print_parquet_data(data):
    """Print the abstract, embedding and DOI of every row, one field per line.

    Each row is followed by an empty line so consecutive records are
    visually separated.

    Args:
        data: DataFrame providing the 'abstract', 'embeddings' and 'doi'
            columns.

    Returns:
        None. The rows are written to standard output.
    """
    # (label, column) pairs, in the order they are printed for each row.
    labelled_columns = (
        ("Abstract:", 'abstract'),
        ("Embedding:", 'embeddings'),
        ("DOI:", 'doi'),
    )
    for _index, record in data.iterrows():
        for label, column in labelled_columns:
            print(label, record[column])
        print()

In [6]:
# First abstracts shard; the path is relative to the notebook's working directory.
abstracts1_path = 'arxiv-paper-abstracts/arxiv_abstracts/arxiv_abstracts/arxiv_abstracts/abstracts_1.parquet'
abstracts1 = load_parquet(abstracts1_path)

In [10]:
# Number of entries in the 'abstract' column.
abstracts1['abstract'].size

100000

In [68]:
# get_or_create_collection keeps this cell re-runnable: a second execution
# returns the existing "abstracts1" collection instead of failing because the
# name is already taken (which is what create_collection does).
# NOTE(review): no distance metric is configured, so Chroma's default applies;
# confirm it suits the model that produced the stored embeddings.
collection1 = chroma_client.get_or_create_collection(name="abstracts1")

In [50]:
# Work on a small leading slice of the shard while trying things out.
test_batch_size = 1000
test_batch = abstracts1[:test_batch_size]

In [69]:
# Build the four parallel payload lists first, then hand them to Chroma in one call.
batch_embeddings = [vector.tolist() for vector in test_batch['embeddings']]
batch_documents = test_batch['abstract'].tolist()
batch_metadatas = [{'doi': identifier} for identifier in test_batch['doi']]
# Ids are positional: 'id0', 'id1', ... one per row of the batch.
batch_ids = [f'id{i}' for i, _ in enumerate(test_batch['doi'])]

collection1.add(
    embeddings=batch_embeddings,
    documents=batch_documents,
    metadatas=batch_metadatas,
    ids=batch_ids,
)

In [64]:
# SentenceTransformer is imported in the notebook's top import cell so that
# every dependency is visible in one place.
# Instantiating the model fetches the checkpoint files when they are not yet
# available locally (see the download log below), so the first run is slow.
model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')

Downloading (…)8df09/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 2.38MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 299kB/s]
Downloading (…)50dc78df09/README.md: 100%|██████████| 6.14k/6.14k [00:00<00:00, 7.27MB/s]
Downloading (…)dc78df09/config.json: 100%|██████████| 636/636 [00:00<00:00, 516kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 927kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [02:52<00:00, 2.53MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 54.0/54.0 [00:00<00:00, 131kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 420kB/s]
Downloading (…)8df09/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 796kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 461/461 [00:00<00:00, 978kB/s]
Downloading (…)df09/train_script.py: 100%|██████████| 10.4k/10.4k [00:00<00:00, 12.8MB/s]
Downloading (…)50dc78df09/vocab.txt: 100%|██████

In [65]:
# Encode the free-text search phrase with the sentence-transformer model.
query_text = 'applications of temperature gradient'
query = model.encode(query_text)

In [71]:
# Convert the encoded query to a plain list and ask for the ten best matches.
query_vector = query.tolist()
result = collection1.query(query_embeddings=query_vector, n_results=10)

In [76]:
# Display the 'metadatas' entry of the query result: the {'doi': ...} dicts
# stored alongside each matching document.
result['metadatas']

[[{'doi': '0704.0684'},
  {'doi': '0704.0157'},
  {'doi': '0704.0701'},
  {'doi': '0704.0889'},
  {'doi': '0704.0534'},
  {'doi': '0704.0472'},
  {'doi': '0704.0993'},
  {'doi': '0704.0393'},
  {'doi': '0704.0434'},
  {'doi': '0704.0338'}]]