# How to generate Sparse Vectors with SPLADE

https://qdrant.tech/documentation/fastembed/fastembed-splade/

In [1]:
from fastembed import SparseEmbedding, SparseTextEmbedding

In [4]:
# Ask fastembed for the sparse models it supports; the bare `models[0]`
# expression displays the first entry's description.
models = SparseTextEmbedding.list_supported_models()
models[0]

{'model': 'prithivida/Splade_PP_en_v1',
 'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1', 'url': None},
 'model_file': 'model.onnx',
 'description': 'Independent Implementation of SPLADE++ Model for English.',
 'license': 'apache-2.0',
 'size_in_GB': 0.532,
 'additional_files': [],
 'requires_idf': None,
 'vocab_size': 30522}

In [5]:
# Name of the SPLADE model to use (the "model" field of the entry shown above),
# and the sparse embedding model built from it.
model_name = "prithivida/Splade_PP_en_v1"
model = SparseTextEmbedding(model_name=model_name)

## Embed data

In [7]:
# Small example corpus: twelve short statements about the Chandrayaan-3 mission.
documents: list[str] = [
    "Chandrayaan-3 is India's third lunar mission",
    "It aimed to land a rover on the Moon's surface - joining the US, China and Russia",
    "The mission is a follow-up to Chandrayaan-2, which had partial success",
    "Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)",
    "The estimated cost of the mission is around $35 million",
    "It will carry instruments to study the lunar surface and atmosphere",
    "Chandrayaan-3 landed on the Moon's surface on 23rd August 2023",
    "It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.",
    "The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit",
    "The mission used GSLV Mk III rocket for its launch",
    "Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota",
    "Chandrayaan-3 was launched earlier in the year 2023",
]

In [16]:
# Embed every document (passing batch_size=6) and collect what embed() yields
# into a list, so there is one SparseEmbedding per entry of `documents`.
sparse_embeddings_list: list[SparseEmbedding] = list(
    model.embed(documents, batch_size=6)
)

## Retrieve Embedding

In [17]:
# Position of the document whose embedding is inspected; the bare expression
# below displays that embedding's `values` and `indices` arrays.
index = 0
sparse_embeddings_list[index]

SparseEmbedding(values=array([0.05297276, 0.01963477, 0.3645905 , 1.38508415, 0.7177667 ,
       0.12668137, 0.46230468, 0.44676718, 0.26896986, 1.01519763,
       1.56553161, 0.29411644, 1.53102267, 0.59785521, 1.10018086,
       0.02078829, 0.09955899, 0.44248503, 0.09748027, 1.53519893,
       1.36765647, 0.15741006, 0.49882478, 0.38628468, 0.76612252,
       1.2580502 , 0.39058524, 0.27236614, 0.45152271, 0.48261923,
       0.26085106, 1.35912812, 0.70710599, 1.71639597]), indices=array([ 1010,  1011,  1016,  1017,  2001,  2018,  2034,  2093,  2117,
        2319,  2353,  2509,  2634,  2686,  2796,  2817,  2922,  2959,
        3003,  3148,  3260,  3390,  3462,  3523,  3822,  4231,  4316,
        4774,  5590,  5871,  6416, 11926, 12076, 16469]))

## Examine weights

In [18]:
# Print the first five (token id, weight) pairs of the embedding selected by
# `index` (set in the "Retrieve Embedding" cell) instead of a hard-coded 0.
# Slicing both arrays rather than indexing with range(5) avoids an IndexError
# when an embedding has fewer than five entries.
for token_id, weight in zip(
    sparse_embeddings_list[index].indices[:5],
    sparse_embeddings_list[index].values[:5],
):
    print(f"Token at index {token_id} has weight {weight}")

Token at index 1010 has weight 0.05297275632619858
Token at index 1011 has weight 0.01963476650416851
Token at index 1016 has weight 0.36459049582481384
Token at index 1017 has weight 1.3850841522216797
Token at index 2001 has weight 0.7177667021751404


## Analyze results

In [21]:
import json

from tokenizers import Tokenizer

# Load the tokenizer that belongs to the model actually in use: find the
# supported-model entry whose "model" field equals `model_name` and take its
# Hugging Face repo id, rather than assuming it is the first entry in the list.
tokenizer = Tokenizer.from_pretrained(
    next(
        description["sources"]["hf"]
        for description in SparseTextEmbedding.list_supported_models()
        if description["model"] == model_name
    )
)

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

In [23]:
def get_tokens_and_weights(sparse_embedding, tokenizer):
    """Map each token of a sparse embedding to its weight, heaviest first.

    Args:
        sparse_embedding: Object exposing parallel ``indices`` (token ids) and
            ``values`` (weights) sequences.
        tokenizer: Object whose ``decode(list_of_ids)`` returns the token text
            for the given ids.

    Returns:
        dict: Token text -> weight (built-in ``float``), ordered by descending
        weight. If several ids decode to the same text, the weight of the last
        one encountered is kept.
    """
    token_weight_dict = {}
    for token_id, weight in zip(sparse_embedding.indices, sparse_embedding.values):
        token = tokenizer.decode([int(token_id)])
        # Cast NumPy scalars to built-in floats so the result can always be
        # passed to json.dumps (np.float32, for example, is not serializable).
        token_weight_dict[token] = float(weight)

    # Sort the dictionary by weights, largest first.
    return dict(
        sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)
    )


# Pretty-print, as indented JSON, the token -> weight mapping of the embedding
# selected by `index`.
print(
    json.dumps(
        get_tokens_and_weights(sparse_embeddings_list[index], tokenizer), indent=4
    )
)

{
    "chandra": 1.7163959741592407,
    "third": 1.565531611442566,
    "##ya": 1.5351989269256592,
    "india": 1.5310226678848267,
    "3": 1.3850841522216797,
    "mission": 1.3676564693450928,
    "lunar": 1.3591281175613403,
    "moon": 1.2580502033233643,
    "indian": 1.1001808643341064,
    "##an": 1.0151976346969604,
    "3rd": 0.7661225199699402,
    "was": 0.7177667021751404,
    "spacecraft": 0.7071059942245483,
    "space": 0.5978552103042603,
    "flight": 0.4988247752189636,
    "satellite": 0.4826192259788513,
    "first": 0.4623046815395355,
    "expedition": 0.45152270793914795,
    "three": 0.4467671811580658,
    "fourth": 0.4424850344657898,
    "vehicle": 0.3905852437019348,
    "iii": 0.3862846791744232,
    "2": 0.36459049582481384,
    "##3": 0.29411643743515015,
    "planet": 0.27236613631248474,
    "second": 0.268969863653183,
    "missions": 0.26085105538368225,
    "launched": 0.15741005539894104,
    "had": 0.1266813725233078,
    "largest": 0.0995589941