In [1]:
!pip install redis numpy lmdb scipy

Collecting redis
  Downloading redis-4.5.3-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.6/238.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy
  Downloading numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lmdb
  Downloading lmdb-1.4.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.5/306.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting async-timeout>=4.0.2
  

In [2]:
import redis
import uuid
import pickle
import random
import time
import numpy as np
import lmdb

In [3]:
uuids = set(str(uuid.uuid4()) for _ in range(1000000))

In [4]:
uuids = list(uuids)

In [5]:
# with open("uuids.pickle", "wb") as f:
#     pickle.dump(uuids, f)

# Redis

In [6]:
# Connect to Redis
redis_client = redis.Redis(host='localhost', port=6379)

In [7]:
# Set the feature data in Redis
for entity_id in uuids:
    feature_value = round(random.random(), 4)
    redis_client.set(entity_id, feature_value)

In [8]:
redis_client.mget([uuids[0], uuids[1]])

[b'0.3669', b'0.3996']

## Lookup test for 1000 features

In [20]:
times = []
for _ in range(1000):
    
    ids_list = random.sample(uuids, 1000)
    
    start = time.time()
    features = redis_client.mget(ids_list)
    features = [float(feat) for feat in features]
    end = time.time()
    
    times.append((end-start)*1000)
    
print(np.mean(times), np.std(times), np.percentile(times, 0.99))

6.493175029754639 0.17160608565119145 6.296029424667358


## Pipeline

In [22]:
times = []
for _ in range(1000):
    
    ids_list = random.sample(uuids, 1000)
    
    start = time.time()
    pipe = redis_client.pipeline()
    for _id in ids_list:
        pipe.get(_id)
    
    features = pipe.execute()
    features = [float(feat) if feat else None for feat in features ]
    end = time.time()
    times.append((end-start)*1000)
    
print(np.mean(times), np.std(times), np.percentile(times, 0.99))

8.946181535720825 1.8580869229525314 8.66556167602539


In [17]:
features

[b'0.7777', b'0.1066', b'0.3673', b'0.6348', b'0.0437', None]

# LMDB

In [8]:
lmdb_env = lmdb.open("/tmp/lmdb", map_size=int(2e9))

In [9]:
with lmdb_env.begin(write=True) as txn:
    for entity_id in sorted(uuids):
        feature_value = round(random.random(), 4)
        txn.put(f"{entity_id}__feature_popularity".encode(), str(feature_value).encode(), append=True) 

In [10]:
with lmdb_env.begin(write=False) as txn:
    keys = [f"{uuids[0]}__feature_popularity".encode(), f"{uuids[1]}__feature_popularity".encode(), f"aa__feature_popularity".encode()]
    print(keys)
    with txn.cursor() as cursor:
        feat = dict(cursor.getmulti(keys))
        feat = [feat[key] if key in feat else None for key in keys]
        print(feat, len(feat))

[b'4cae0707-ee2b-4b63-b847-fc228253d6bd__feature_popularity', b'f3bfc444-741e-4dc4-8fd4-de28fc9c0136__feature_popularity', b'aa__feature_popularity']
[b'0.1699', b'0.926', None] 3


## Lookup test for 1000 features

In [46]:
times = []
for _ in range(3000):
    
    ids_list = list(sorted(random.sample(uuids, 1000), reverse=True))
    
    start = time.time()
    keys = [ f"{_uuid}__feature_popularity".encode() for _uuid in ids_list]
    with lmdb_env.begin(write=False) as txn:
        cursor = txn.cursor()
        features = dict(cursor.getmulti(keys))
    features = [features[key] if key in features else None for key in keys]
    end = time.time()
    
    times.append((end-start)*1000)
    
print(np.mean(times), np.std(times), np.percentile(times, 0.99))

1.2671126524607341 0.08550261755062631 1.1909008026123047


In [47]:
times2 = []
for _ in range(3000):
    
    ids_list = random.sample(uuids, 1000)
    
    start = time.time()
    keys = [ f"{_uuid}__feature_popularity".encode() for _uuid in ids_list]
    # keys = list(sorted(keys, reverse=True))
    
    with lmdb_env.begin(write=False) as txn:
        cursor = txn.cursor()
        features = {}
        for key in keys:
            features[key] = txn.get(key)
    features = [features[key] if key in features else None for key in keys]
    end = time.time()
    
    times2.append((end-start)*1000)
    
print(np.mean(times2), np.std(times2), np.percentile(times2, 0.99))

1.259629487991333 0.6777019895160604 1.169443130493164


In [48]:
from scipy.stats import ks_2samp, ttest_ind

In [49]:
ks_2samp(times, times2)

KstestResult(statistic=0.2826666666666667, pvalue=6.103183174756487e-106, statistic_location=1.2063980102539062, statistic_sign=-1)

In [50]:
ttest_ind(times, times2)

Ttest_indResult(statistic=0.5999367852441286, pvalue=0.5485710291143395)