In [1]:
import json, re
from pathlib import Path
from typing import Dict
import math
import numpy as np

# Hyperspace config and setup

In [2]:
import json

config = {
    "configuration": {
        "id": {
            "type":"float"
        },
        "title":{
        "type":"keyword"
        },
        "bundle_id": {
            "type":"keyword"
        },
        "ios":{
        "type":"boolean"
        },
        "categories": {
            "type":"keyword",
            "struct_type":"list"
        },
        "content": {
            "type":"keyword"
        },
        "embedded_app": {
            "type": "dense_vector",
            "dim": 384,
            "index_type": "brute_force",
            "metric": "IP"
        }
    }
}

with open('advec_config.json', 'w') as f:
    f.write(json.dumps(config, indent=2))

In [3]:
import hyperspace

username = "----"
password = "----"

host = "https://beta.prod.hyper-space.xyz/"

hyperspace_client = hyperspace.HyperspaceClientApi(host=host,
                                                   username=username, password= password)

collection_name = 'advec'


## Load data

In [40]:

try:
    hyperspace_client.delete_collection(collection_name)
except:
    pass
hyperspace_client.create_collection('advec_config.json', collection_name)
hyperspace_client.collections_info()

{'collections': {'advec': {'creation_time': '2023-10-01T13:33:29Z',
   'last_query_time': '2023-10-02T08:45:41Z',
   'size': 89001},
  'amazon-images-norm': {'creation_time': '2023-10-02T13:55:17Z',
   'last_query_time': '2023-10-03T06:03:49Z',
   'size': 100000}}}

In [34]:
import numpy as np
vecs = np.load('data/vectors.npy')
metadata_file = open('data/context.jsonl',encoding="utf8")

In [85]:
BATCH_SIZE = 500

batch = []
for i, (metadata_row, vec) in enumerate(zip(metadata_file, vecs)):
    row = {key: value for key, value in json.loads(metadata_row).items() if key in config["configuration"].keys()}
    row['embedded_app'] = np.ndarray.tolist(vec)

    batch.append(hyperspace.Document(str(i), row))

    if i % BATCH_SIZE == 0:
        response = hyperspace_client.add_batch(batch, collection_name)
        batch.clear()
        print(i, response)
response = hyperspace_client.add_batch(batch, collection_name)
batch.clear()
print(i, response)
hyperspace_client.commit(collection_name)

0 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
1000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
1500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
2000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
2500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
3000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
3500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
4000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
4500 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
5000 {'code': 200,
 'message': 'Batch successfully added',
 'status': 'HTTP/1.1 200 OK'}
5500 {'code': 200,
 'mess

In [4]:
hyperspace_client.collections_info()

{'collections': {'advec': {'creation_time': '2023-10-01T13:33:29Z',
   'last_query_time': '2023-10-03T15:22:09Z',
   'size': 89001},
  'amazon-images-norm': {'creation_time': '2023-10-02T13:55:17Z',
   'last_query_time': '2023-10-03T10:44:25Z',
   'size': 100000}}}

# Vector search

In [5]:
input_document = hyperspace_client.get_document(collection_name, 42)
print(input_document['title'] + "\n" + str(input_document['categories']))

Sensors: Temp and Humidity
['WEATHER', 'APPLICATION']


In [6]:

query_with_knn = {
    'params': input_document,
    'knn' : {
        'embedded_app' : {"boost":1},
        'query' : {'boost':0}
    }
}

results = hyperspace_client.search(query_with_knn,
                                        size=5,
                                        collection_name=collection_name)

def print_res(result,keys=["title","bundle_id","categories"]):
    for i,result in enumerate(results['similarity']):
        vector_api_response = hyperspace_client.get_document(document_id=result['document_id'], collection_name=collection_name)
        response = f"{i+1} - {result['document_id']} : {result['score']} --- "
        keys_str = " - ".join([str(vector_api_response[k]) for k in keys])
        print(response+keys_str)

In [7]:
print_res(results)

1 - 12145 : 1.0000001192092896 --- Sensors: Temp and Humidity - com.ydvisual.s4envtrak - ['WEATHER', 'APPLICATION']
2 - 42 : 1.0000001192092896 --- Sensors: Temp and Humidity - com.ydvisual.s4envtrak - ['WEATHER', 'APPLICATION']
3 - 43618 : 0.9984776973724365 --- Temperature humidity barometeF - jp.metersfree - ['TOOLS', 'APPLICATION']
4 - 46921 : 0.9973560571670532 --- Real Mercury Thermometer - com.discipleskies.dsthermometer - ['WEATHER', 'APPLICATION']
5 - 9002 : 0.9973560571670532 --- Real Mercury Thermometer - com.discipleskies.dsthermometer - ['WEATHER', 'APPLICATION']


## Classic score

In [8]:
#classic score func
sf_file = 'classic_score.py'
hyperspace_client.set_function(sf_file, collection_name=collection_name, function_name='classic_score')

{'code': 200, 'message': 'Function was set successfully', 'status': 'OK'}

In [9]:
input_document = hyperspace_client.get_document(collection_name, 42)
input_document['title'] + "\n" + str(input_document['categories'])

"Sensors: Temp and Humidity\n['WEATHER', 'APPLICATION']"

In [10]:
query_with_knn = {
    'params': input_document,
    'knn' : {
        'embedded_app' : {"boost":1},
        'query' : {'boost':1}
    }
}

results = hyperspace_client.search(query_with_knn,
                                        size=10,
                                   function_name="classic_score",
                                        collection_name=collection_name)
print_res(results,["title","bundle_id","categories"])

1 - 43618 : 2.9984776973724365 --- Temperature humidity barometeF - jp.metersfree - ['TOOLS', 'APPLICATION']
2 - 46921 : 2.9973559379577637 --- Real Mercury Thermometer - com.discipleskies.dsthermometer - ['WEATHER', 'APPLICATION']
3 - 9002 : 2.9973559379577637 --- Real Mercury Thermometer - com.discipleskies.dsthermometer - ['WEATHER', 'APPLICATION']
4 - 12431 : 2.996110677719116 --- Living in the sun - Sun & Moon - com.zara.app.compassk - ['PHOTOGRAPHY', 'APPLICATION']
5 - 328 : 2.996110677719116 --- Living in the sun - Sun & Moon - com.zara.app.compassk - ['PHOTOGRAPHY', 'APPLICATION']
6 - 50635 : 2.9960594177246094 --- Smart thermometer - com.naavsystems.smartthermo - ['WEATHER', 'APPLICATION']
7 - 52176 : 2.995534896850586 --- Sun Alarm - com.vvse.sunalarm - ['WEATHER', 'APPLICATION']
8 - 11144 : 2.9947710037231445 --- Temperature: Phone, Room, City - com.ppn.temperature.checker - ['WEATHER', 'APPLICATION']
9 - 74829 : 2.9947710037231445 --- Temperature: Phone, Room, City - com.pp

## Generate new embeddings and search

In [13]:
from sentence_transformers import SentenceTransformer
emb_model = SentenceTransformer('BAAI/bge-small-en')

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
sim_sentence = """a great app for gaming with my friends """
# Embedding using sentence-transformers - irrelevant at advec-be deployment machine
sim_embedding = emb_model.encode([sim_sentence], normalize_embeddings=True)[0]

query_with_knn = {
    'params': {
        
        'embedded_app':sim_embedding.tolist()
    },
    'knn' : {
        'embedded_app' : {"boost":1},

    }
}


In [15]:
%%time
results = hyperspace_client.search(query_with_knn,
                                        size=20,
                                        collection_name=collection_name)

CPU times: total: 203 ms
Wall time: 177 ms


In [16]:
results
print_res(results,['title','bundle_id'])

1 - 20141 : 0.8744263648986816 --- Game Offline 3D no Wi Fi - com.chillapps.gameoffline3d
2 - 70640 : 0.8744263648986816 --- Game Offline 3D no Wi Fi - com.chillapps.gameoffline3d
3 - 76652 : 0.8744263648986816 --- Game Offline 3D no Wi Fi - com.chillapps.gameoffline3d
4 - 37344 : 0.8736395835876465 --- Vlinder avatar maker: Anime - com.dressup.doll.vlinder.avatar.maker.anime
5 - 50248 : 0.8733617663383484 --- Real Fireworks - com.mustafademir.realfireworks
6 - 40079 : 0.8724929690361023 --- Dragon Blast - com.dragonblast.free
7 - 42537 : 0.8724929690361023 --- Monster Blast - com.candy.cute.monster.blast.gp
8 - 9564 : 0.8724929690361023 --- Dragon Blast - com.dragonblast.free
9 - 74551 : 0.871880054473877 --- KurtMaster2D - com.plbm.plbm1
10 - 84062 : 0.871880054473877 --- KurtMaster2D - com.plbm.plbm1
11 - 39177 : 0.871452808380127 --- ReallyMake: Pottery Sculpting - 1191748553
12 - 50148 : 0.8714419603347778 --- KoGaMa Brazil - com.multiverse.brkogama
13 - 50151 : 0.8714419603347778

## Hybrid
We search for similar descriptions but filter out apps that are not from ios using score function

In [12]:
#classic score func
sf_file = 'complex_score_1.py'
hyperspace_client.set_function(sf_file, collection_name=collection_name, function_name='cat_ratio_score')

{'code': 200, 'message': 'Function was set successfully', 'status': 'OK'}

In [21]:
input_document = hyperspace_client.get_document(collection_name, 12345)
input_document['title'] + "\n" + str(input_document['categories'])

"Hockey Elite\n['GAME_SPORTS', 'GAME']"

In [23]:
query_with_knn = {
    'params': input_document,
    'knn' : {
        'embedded_app' : {"boost":0.3},
        'query' : {'boost':0.7}
    }
}

results = hyperspace_client.search(query_with_knn,
                                        size=10,
                                   function_name="cat_ratio_score",
                                        collection_name=collection_name)
print_res(results,["ios","title","bundle_id","categories"])

1 - 15274 : 0.6863179206848145 --- False - Bowling Tins Game - com.androtiyas.bowlingtin - ['GAME_SPORTS', 'GAME']
2 - 8200 : 0.6862764358520508 --- False - MamoBall 4v4 Online Soccer - com.alberun.mamoball - ['GAME_SPORTS', 'GAME']
3 - 40167 : 0.6862521767616272 --- False - Archery Tournament - com.dreamgame.archery.master - ['GAME_SPORTS', 'GAME']
4 - 21125 : 0.6498104333877563 --- False - Glow Hockey 2 - com.natenai.glowhockey2 - ['GAME_ARCADE', 'GAME']
5 - 50775 : 0.6498104333877563 --- False - Glow Hockey 2 - com.natenai.glowhockey2 - ['GAME_ARCADE', 'GAME']
6 - 21124 : 0.6497644186019897 --- False - Glow Hockey - com.natenai.glowhockey - ['GAME_ARCADE', 'GAME']
7 - 50774 : 0.6497644186019897 --- False - Glow Hockey - com.natenai.glowhockey - ['GAME_ARCADE', 'GAME']
8 - 81094 : 0.6496398448944092 --- False - Hit Boxing 3D - hit.boxing.d3 - ['GAME_ARCADE', 'GAME']
9 - 30578 : 0.6496272087097168 --- False - 8 Ball King - com.ballking.luckyfreegame - ['GAME_PUZZLE', 'GAME']
10 - 8247

In [18]:
results

{'candidates': 31395,
 'similarity': [{'document_id': '57889', 'score': 1.562448501586914},
  {'document_id': '80180', 'score': 1.562448501586914},
  {'document_id': '60181', 'score': 1.562364935874939},
  {'document_id': '80679', 'score': 1.562364935874939},
  {'document_id': '17422', 'score': 1.5622408390045166},
  {'document_id': '57997', 'score': 1.562232494354248},
  {'document_id': '80288', 'score': 1.562232494354248},
  {'document_id': '11849', 'score': 1.5622313022613525},
  {'document_id': '55542', 'score': 1.5621967315673828},
  {'document_id': '50529', 'score': 1.5621466636657715}],
 'took_ms': 11.5222}