In [2]:
import chromadb
client = chromadb.PersistentClient(path="./chroma_db")

In [3]:
import polars as pl

df = pl.read_parquet('/home/lyk/Downloads/2025.parquet').drop('conan_v1')

In [4]:
database = client.get_or_create_collection(name="arxiv",    
    configuration={
        "hnsw": {
            "space": "cosine", # Cohere models often use cosine space
        },
})
database

Collection(name=arxiv)

In [5]:
safe_seperator = "\t"

index = df['id'].to_list()
embedding = df['jasper_v1'].to_numpy()
metadata = df.drop('id', 'jasper_v1','license').with_columns(
    pl.col('authors').list.join(safe_seperator), 
    pl.col('categories').list.join(safe_seperator)
).to_dicts()

In [15]:
# batch = 4096
# for i in range(0, len(index), batch):
#     print(f"upsert {i} to {i+batch}")
#     database.upsert(
#         ids = index[i:i+batch],
#         embeddings = embedding[i:i+batch],
#         metadatas = metadata[i:i+batch],
#     )

In [7]:
database.get('2504.18116', include = ["metadatas", "embeddings"])

{'ids': ['2504.18116'],
 'embeddings': array([[ 0.0195315 ,  0.00181597,  0.05034558, ...,  0.03292076,
         -0.03156114,  0.05090722]], shape=(1, 1024)),
 'documents': None,
 'uris': None,
 'included': ['metadatas', 'embeddings'],
 'data': None,
 'metadatas': [{'abstract': 'Large language models (LLMs) have demonstrated strong capabilities in programming and mathematical reasoning tasks, but are constrained by limited high-quality training data. Synthetic data can be leveraged to enhance fine-tuning outcomes, but several factors influence this process, including model size, synthetic data volume, pruning strategy, and number of fine-tuning rounds. We explore these axes and investigate which conditions enable model self-improvement. We introduce the Think, Prune, Train process, a scalable framework that iteratively fine-tunes models on their own reasoning traces, using ground-truth pruning to ensure high-quality training data. This approach yields improved performance: on GSM8K, Ge

In [10]:
# def query(
#         query_embeddings: Optional[OneOrMany[Embedding]] = None,
#         n_results: int = 10,
#         where: Optional[Where] = None,
#         where_document: Optional[WhereDocument] = None,
#         include: Include = ["metadatas", "documents",
#                             "distances"]) -> QueryResult

arxiv_id = '2502.18008'
data = database.get(arxiv_id, include = ["metadatas", "embeddings"])
emb = data['embeddings'][0]
meta = data['metadatas'][0]
print(f"ArXiv ID: {arxiv_id}")
print(f"Title: {meta['title']}")
print(f"Abstract: {meta['abstract']}")
results = database.query(
    query_embeddings=[emb],
    n_results=20,
    include=["metadatas", "distances"]   
)

docs = [
    f'Title: {results['metadatas'][0][i]["title"]}\n'
    f'Abstract: {results['metadatas'][0][i]["abstract"]}\n'
    for i in range(len(results["ids"][0]))
]

ArXiv ID: 2502.18008
Title: NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms
Abstract: We introduce NotaGen, a symbolic music generation model aiming to explore the potential of producing high-quality classical sheet music. Inspired by the success of Large Language Models (LLMs), NotaGen adopts pre-training, fine-tuning, and reinforcement learning paradigms (henceforth referred to as the LLM training paradigms). It is pre-trained on 1.6M pieces of music in ABC notation, and then fine-tuned on approximately 9K high-quality classical compositions conditioned on "period-composer-instrumentation" prompts. For reinforcement learning, we propose the CLaMP-DPO method, which further enhances generation quality and controllability without requiring human annotations or predefined rewards. Our experiments demonstrate the efficacy of CLaMP-DPO in symbolic music generation models with different architectures and encoding schemes. Furthermore, 

In [13]:
results

{'ids': [['2502.18008',
   '2502.10467',
   '2502.14893',
   '2412.16526',
   '2410.08435',
   '2501.08809',
   '2504.09219',
   '2501.17011',
   '2505.03314',
   '2503.19611',
   '2503.00084',
   '2504.13535',
   '2309.13259',
   '2502.13128',
   '2504.05690',
   '2502.04522',
   '2504.16839',
   '2503.08147',
   '2408.15176',
   '2503.17654']],
 'embeddings': None,
 'documents': None,
 'uris': None,
 'included': ['metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'updated': '2025-03-24',
    'authors': 'Yashan Wang\tShangda Wu\tJianhuai Hu\tXingjian Du\tYueqi Peng\tYongxin Huang\tShuai Fan\tXiaobing Li\tFeng Yu\tMaosong Sun',
    'created': '2025-03-21',
    'title': 'NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms',
    'categories': 'cs.SD\tcs.AI\teess.AS',
    'abstract': 'We introduce NotaGen, a symbolic music generation model aiming to explore the potential of producing high-quality classical sheet music. Inspired b

In [12]:
len(docs[1:])

19

In [None]:
# from FlagEmbedding import FlagReranker,FlagLLMReranker
from FlagEmbedding import LightWeightFlagLLMReranker
reranker = LightWeightFlagLLMReranker('BAAI/bge-reranker-v2.5-gemma2-lightweight', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

In [80]:
q = f"Which paper is most related to the this one?\n{docs[0]}"
keys = docs[1:]
scorse = reranker.compute_score([[q, k] for k in keys], normalize=True)

In [81]:
results = [{'score': score, 'doc': doc} for score, doc in zip(scorse, keys)]
results = sorted(results, key=lambda x: x['score'], reverse=True)
for result in results:
    print(f"Score: {result['score']}")
    print(f"Document: {result['doc']}")
    print()

Score: 0.1232107581442072
Document: Title: XMusic: Towards a Generalized and Controllable Symbolic Music Generation Framework
Abstract: In recent years, remarkable advancements in artificial intelligence-generated content (AIGC) have been achieved in the fields of image synthesis and text generation, generating content comparable to that produced by humans. However, the quality of AI-generated music has not yet reached this standard, primarily due to the challenge of effectively controlling musical emotions and ensuring high-quality outputs. This paper presents a generalized symbolic music generation framework, XMusic, which supports flexible prompts (i.e., images, videos, texts, tags, and humming) to generate emotionally controllable and high-quality symbolic music. XMusic consists of two core components, XProjector and XComposer. XProjector parses the prompts of various modalities into symbolic music elements (i.e., emotions, genres, rhythms and notes) within the projection space to 

In [79]:
score = reranker.compute_score(['query', 'passage'])
print(score) # -5.65234375

# You can map the scores into 0-1 by set "normalize=True", which will apply sigmoid function to the score
score = reranker.compute_score(['query', 'passage'], normalize=True)
print(score) # 0.003497010252573502

scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
print(scores) # [-8.1875, 5.26171875]

# You can map the scores into 0-1 by set "normalize=True", which will apply sigmoid function to the score
scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']], normalize=True)
print(scores) # [0.00027803096387751553, 0.9948403768236574]

[-5.650123596191406]
[0.0035047555373109926]
[-8.18381404876709, 5.265047073364258]
[0.00027905737696341354, 0.9948574330052771]
