## Demo of Converting MongoDB to a Vector Database using SuperDuperDB

In [1]:
import os
mongodb_uri = os.getenv("MONGODB_URI","mongomock://test")

In [2]:
! curl -O https://superduperdb-public.s3.eu-west-1.amazonaws.com/wikipedia-sample.json

import json
import random

with open(f'wikipedia-sample.json') as f:
    data = json.load(f)
data = random.sample(data, 1000)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6546k  100 6546k    0     0  1936k      0  0:00:03  0:00:03 --:--:-- 1937k


In [3]:
from superduperdb import superduper
from superduperdb.db.mongodb.query import Collection
from superduperdb.container.document import Document
import os
mongodb_uri = os.getenv("MONGODB_URI","mongomock://test")
db = superduper(mongodb_uri)
collection = Collection('wikipedia')
db.execute(collection.insert_many([Document(r) for r in data]))

([ObjectId('653a3bd3ed8f1698d383d4fe'),
  ObjectId('653a3bd3ed8f1698d383d4ff'),
  ObjectId('653a3bd3ed8f1698d383d500'),
  ObjectId('653a3bd3ed8f1698d383d501'),
  ObjectId('653a3bd3ed8f1698d383d502'),
  ObjectId('653a3bd3ed8f1698d383d503'),
  ObjectId('653a3bd3ed8f1698d383d504'),
  ObjectId('653a3bd3ed8f1698d383d505'),
  ObjectId('653a3bd3ed8f1698d383d506'),
  ObjectId('653a3bd3ed8f1698d383d507'),
  ObjectId('653a3bd3ed8f1698d383d508'),
  ObjectId('653a3bd3ed8f1698d383d509'),
  ObjectId('653a3bd3ed8f1698d383d50a'),
  ObjectId('653a3bd3ed8f1698d383d50b'),
  ObjectId('653a3bd3ed8f1698d383d50c'),
  ObjectId('653a3bd3ed8f1698d383d50d'),
  ObjectId('653a3bd3ed8f1698d383d50e'),
  ObjectId('653a3bd3ed8f1698d383d50f'),
  ObjectId('653a3bd3ed8f1698d383d510'),
  ObjectId('653a3bd3ed8f1698d383d511'),
  ObjectId('653a3bd3ed8f1698d383d512'),
  ObjectId('653a3bd3ed8f1698d383d513'),
  ObjectId('653a3bd3ed8f1698d383d514'),
  ObjectId('653a3bd3ed8f1698d383d515'),
  ObjectId('653a3bd3ed8f1698d383d516'),


In [4]:
import os
import sys
sys.path.append('../src/')
from open_api_key import OPENAI_API_KEY
os.environ['OPENAI_API_KEY'] =  OPENAI_API_KEY

from superduperdb.ext.openai.model import OpenAIEmbedding

model = OpenAIEmbedding(model='text-embedding-ada-002')

In [5]:
from superduperdb.container.vector_index import VectorIndex
from superduperdb.container.listener import Listener

db.add(
    VectorIndex(
        identifier=f'wiki-index-{model.identifier}',
        indexing_listener=Listener(
            model=model,
            key='abstract',
            select=collection.find(),
            predict_kwargs={'max_chunk_size': 1000},
        ),
        compatible_listener=Listener(
            model=model,
            key='title',
            select=collection.find(),
            active=False,
        ),
    )
)

INFO:root:Adding model text-embedding-ada-002 to db
INFO:root:Done.
1000it [00:00, 54552.25it/s]
INFO:root:Computing chunk 0/1
100%|██████████| 10/10 [00:08<00:00,  1.17it/s]
INFO:root:loading hashes: 'wiki-index-text-embedding-ada-002'
Loading vectors into vector-table...: 1000it [00:00, 1007.77it/s]


[]

In [6]:
cur = db.execute(
    collection
        .like({'title': 'articles about sport'}, n=3, vector_index=f'wiki-index-{model.identifier}')
        .find({}, {'title': 1})
)

for r in cur:
    print(r.unpack())

INFO:root:loading hashes: 'wiki-index-text-embedding-ada-002'
Loading vectors into vector-table...: 1000it [00:00, 1015.63it/s]


{'title': "1977 European Athletics Indoor Championships – Men's 800 metres", '_id': ObjectId('653a3bd3ed8f1698d383d58c'), '_score': 0.7967645313221112}
{'title': 'Vassilis Stravopodis', '_id': ObjectId('653a3bd3ed8f1698d383d5d3'), '_score': 0.7972161069719391}
{'title': 'Proceedings of the Institution of Mechanical Engineers, Part P', '_id': ObjectId('653a3bd3ed8f1698d383d5f8'), '_score': 0.8121169715604872}
