In [16]:
from typing import Dict, List, Annotated
import struct
import numpy as np

class VecDBWorst:
    """Brute-force vector store backed by a single flat file.

    Two on-disk layouts exist and must not be mixed inside one file:

    * text   - written by ``insert_records``: one ``id,e0,e1,...`` line per
      row. This is the only layout ``retrive`` can scan.
    * binary - written by ``insert_records_binary``: fixed-size ``I70f``
      structs (native byte order and alignment). This is the only layout
      ``read_record_by_id`` can read.
    """

    # Number of float components stored for every embedding.
    EMBED_DIM = 70
    # struct layout of one binary record: unsigned int id + EMBED_DIM floats.
    RECORD_FORMAT = f"I{EMBED_DIM}f"
    # Size in bytes of one packed binary record.
    RECORD_SIZE = struct.calcsize(RECORD_FORMAT)

    def __init__(self, file_path = "saved_db.bin", new_db = True) -> None:
        """Bind the store to ``file_path``.

        Args:
            file_path: Path of the backing file.
            new_db: When true, the file is truncated (or created empty) so any
                previous content is discarded.
        """
        self.file_path = file_path
        if new_db:
            # Opening for writing truncates the old file; nothing is written
            # because the format has no header.
            with open(self.file_path, "wb"):
                pass

    def insert_records(self, rows: List[Dict[str, object]]):
        """Append rows in the text layout (``id,e0,e1,...`` per line).

        Args:
            rows: Dicts with an ``"id"`` entry and an ``"embed"`` iterable.
        """
        with open(self.file_path, "a") as fout:
            for row in rows:
                record_id, embed = row["id"], row["embed"]
                row_str = f"{record_id}," + ",".join(str(e) for e in embed)
                fout.write(f"{row_str}\n")
        self._build_index()

    def insert_records_binary(self, rows: List[Dict[str, object]]):
        """Append rows in the fixed-size binary layout.

        Args:
            rows: Dicts with an ``"id"`` entry (unsigned int) and an ``"embed"``
                sequence of exactly ``EMBED_DIM`` floats.

        Raises:
            struct.error: If an id or embedding does not fit ``RECORD_FORMAT``
                (for example an embedding of the wrong length).
        """
        with open(self.file_path, "ab") as fout:
            for row in rows:
                fout.write(struct.pack(self.RECORD_FORMAT, row["id"], *row["embed"]))
        self._build_index()

    def calculate_offset(self, record_id: int) -> int:
        """Return the byte offset of the record at zero-based position ``record_id``.

        The value is a row position in the binary file, not the id stored
        inside the record: position 0 is the first record that was inserted.
        """
        return record_id * self.RECORD_SIZE

    def read_record_by_id(self, record_id: int) -> Dict[int, Dict[str, object]]:
        """Read the binary record at zero-based position ``record_id``.

        Returns:
            ``{record_id: {"id": stored_id, "embed": [floats]}}``, or ``{}``
            when the position is negative, past the end of the file, or only
            covered by a truncated trailing record.
        """
        if record_id < 0:
            return {}  # A negative position can never address a record.

        with open(self.file_path, "rb") as fin:
            fin.seek(self.calculate_offset(record_id))
            data = fin.read(self.RECORD_SIZE)

        # A short read means end-of-file or a truncated last record; unpacking
        # it would raise struct.error, so report "not found" instead.
        if len(data) < self.RECORD_SIZE:
            return {}

        stored_id, *floats = struct.unpack(self.RECORD_FORMAT, data)
        return {record_id: {"id": stored_id, "embed": floats}}

    def retrive(self, query: Annotated[List[float], 70], top_k = 5):
        """Return the ids of the ``top_k`` rows most similar to ``query``.

        Scans every row of a text-layout file and ranks by cosine similarity,
        highest first. Rows with equal scores are ordered by ascending id.
        """
        scores = []
        with open(self.file_path, "r") as fin:
            # Iterate lazily so the whole file is never held in memory at once.
            for line in fin:
                line = line.strip()
                if not line:
                    continue  # Tolerate blank lines (e.g. a trailing newline).
                id_field, *embed_fields = line.split(",")
                embed = [float(e) for e in embed_fields]
                scores.append((self._cal_score(query, embed), int(id_field)))
        # Negating the score sorts best-first while keeping ids ascending on ties.
        scores.sort(key=lambda pair: (-pair[0], pair[1]))
        return [record_id for _, record_id in scores[:top_k]]

    def _cal_score(self, vec1, vec2):
        """Cosine similarity of two vectors; 0.0 when either has zero norm."""
        dot_product = np.dot(vec1, vec2)
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            # Cosine similarity is undefined for a zero vector; returning 0.0
            # avoids NaN scores that would make the ranking order unreliable.
            return 0.0
        return dot_product / denominator

    def _build_index(self):
        """Placeholder hook called after every insert; currently does nothing."""
        pass




In [24]:
import numpy as np

def generate_embeddings(num_records, embedding_dim):
    """Build ``num_records`` random embeddings of length ``embedding_dim``.

    Each embedding is a plain Python list of floats drawn from NumPy's global
    random state via ``np.random.rand``.
    """
    embeddings = []
    for _ in range(num_records):
        vector = np.random.rand(embedding_dim)
        embeddings.append(vector.tolist())
    return embeddings

# Create an instance of VecDB (the default arguments start from an empty file)
db = VecDBWorst()

# Define parameters
total_records = 20000000  # 20 million records
chunk_size = 10000  # Insert records in chunks of 10,000
embedding_dim = 70  # Must match the 70-float record layout VecDBWorst packs

# Insert records in chunks; ids are 1-based and run from 1 to total_records
for i in range(0, total_records, chunk_size):
    # The final chunk is shorter when total_records is not a multiple of
    # chunk_size, so clamp the last id instead of breaking out of a loop.
    last_id = min(i + chunk_size, total_records)
    embeddings = generate_embeddings(last_id - i, embedding_dim)
    chunk_records = [
        {"id": record_id, "embed": embed}
        for record_id, embed in enumerate(embeddings, start=i + 1)
    ]

    db.insert_records_binary(chunk_records)
    print(f"Inserted {len(chunk_records)} records. Total records inserted: {last_id}")

print("Insertion complete.")


Inserted 10000 records. Total records inserted: 10000
Inserted 10000 records. Total records inserted: 20000
Inserted 10000 records. Total records inserted: 30000
Inserted 10000 records. Total records inserted: 40000
Inserted 10000 records. Total records inserted: 50000
Inserted 10000 records. Total records inserted: 60000
Inserted 10000 records. Total records inserted: 70000
Inserted 10000 records. Total records inserted: 80000
Inserted 10000 records. Total records inserted: 90000
Inserted 10000 records. Total records inserted: 100000
Inserted 10000 records. Total records inserted: 110000
Inserted 10000 records. Total records inserted: 120000
Inserted 10000 records. Total records inserted: 130000
Inserted 10000 records. Total records inserted: 140000
Inserted 10000 records. Total records inserted: 150000
Inserted 10000 records. Total records inserted: 160000
Inserted 10000 records. Total records inserted: 170000
Inserted 10000 records. Total records inserted: 180000
Inserted 10000 reco

In [27]:
# read_record_by_id takes a zero-based row position, not the stored id; the
# insert loop numbered ids from 1, so position 150000 holds id 150001.
row_position = 150000
record = db.read_record_by_id(row_position)
print(record)

{150000: {'id': 150001, 'embed': [0.3768047094345093, 0.5434426069259644, 0.6379473209381104, 0.09962593764066696, 0.0887569859623909, 0.7316314578056335, 0.5597277879714966, 0.5738756656646729, 0.8712009787559509, 0.4523925185203552, 0.9733819365501404, 0.5012685656547546, 0.6938456296920776, 0.4241641163825989, 0.6885622143745422, 0.1353963315486908, 0.8240202069282532, 0.9548629522323608, 0.9230455756187439, 0.24257425963878632, 0.5855097770690918, 0.767795979976654, 0.8740313649177551, 0.880972683429718, 0.44004979729652405, 0.8210075497627258, 0.32657867670059204, 0.1786700040102005, 0.3260217010974884, 0.5470225811004639, 0.5936652421951294, 0.583935022354126, 0.8334327340126038, 0.4611385464668274, 0.8981862664222717, 0.938395619392395, 0.4210079312324524, 0.1288173347711563, 0.03499150648713112, 0.9750182628631592, 0.19303089380264282, 0.9584629535675049, 0.32923057675361633, 0.3998655378818512, 0.405233234167099, 0.6883865594863892, 0.8930075764656067, 0.7664490938186646, 0.11

In [10]:
import numpy as np
# Generate an array holding the integers 0 .. 19,999,999
arr = np.arange(0, 20000000, 1)
# Print the size of arr
print(arr.size)

20000000


In [18]:
with open('saved_db.csv', 'r') as file:
    # Start reading from the beginning of the file
    file.seek(0)
    
    # Read and print the content from the current position
    # 1 MB => 1,000,000 bytes
    # 1,000,000 /1350 => 740
    # 20_000_000 / 740 => 27_027 blocks
    # content = file.read(1_000_000)
    content = file.readline()
    # Print the size of the line
    print(len(content))
    print(content)
    # readline() already left the position at the start of the next line.
    # The former seek(len(content)+1) landed one character into that line and
    # dropped the first character of the second row; text-mode files also only
    # support seeking to 0 or to a value previously returned by tell().
    content = file.readline()
    print(len(content))
    print(content)


1352
1,0.6606054344785669,0.44804264919028125,0.22881102975586143,0.05333096241749047,0.5084951022842068,0.24025468500905356,0.9004077931618367,0.5564744960563222,0.6457622553961205,0.4071379003772593,0.351361920884804,0.5246836622014442,0.5630821296710911,0.13841085114014862,0.912664767315102,0.5534151567197382,0.10338708210292058,0.3645633090119468,0.6404595199220359,0.5813346723340996,0.42199801042598284,0.10550608934237249,0.5526635969910976,0.43935910359837194,0.6732200053671904,0.7076701077422254,0.19089540924022919,0.9152535222406287,0.8863368914838918,0.9655744116760181,0.5840517776203855,0.7949553581999904,0.8518481739348143,0.6931905817377731,0.26893208995375584,0.5426920969546063,0.5658971300296081,0.5877407129287981,0.3977236524806921,0.6332123020053829,0.33207439505341196,0.8456863999150352,0.6296123820724633,0.22941054064612076,0.9236083251994747,0.08041007481992846,0.2645688410916386,0.5302248461883718,0.021245950392764446,0.40642113522284695,0.4509976122110796,0.4079103