In [1]:
from typing import List
import pandas as pd

In [2]:
df = pd.read_csv("./data/18789327023.csv")

In [3]:
EPSILON = 100 # at this moment random value

In [4]:
per_timestamp = df.groupby(["Seen Timestamp"])

In [5]:
new_frames = {}
for ts, subframe in per_timestamp:
    new_frames[ts] = subframe
    new_frames[ts]["dx"] = subframe["X 1"][:-1] - subframe["X 2"][1:]
    new_frames[ts]["dy"] = subframe["Y 1"][:-1] - subframe["Y 2"][1:]
    new_frames[ts]["w"] = subframe["X 2"] - subframe["X 1"]
    new_frames[ts]["h"] = subframe["Y 2"] - subframe["Y 1"]

The simplest approach, no epsilon, vectors have to match exactly

In [6]:
from docarray import DocList, BaseDoc
from docarray.typing import NdArray

from vectordb import InMemoryExactNNVectorDB

import numpy as np

In [7]:
class MyDoc(BaseDoc):
    vec_id: int = 0
    embedding: NdArray[6]

db = InMemoryExactNNVectorDB[MyDoc]()




In [8]:
def euclidean_dist(v1: np.ndarray, v2: np.ndarray) -> float:
    return np.sqrt(((v1 - v2) ** 2).sum())

def get_vectors(frame: pd.DataFrame) -> List[np.ndarray]:
    cpy = frame.copy(deep=True)
    cpy['prev_w'] = cpy['w'].shift(1)
    cpy['prev_h'] = cpy['h'].shift(1)
    return cpy[['dx', 'dy', 'w', 'h', 'prev_w', 'prev_h']].values.tolist()[1:]

In [9]:
frames_train = [sdf for i, sdf in enumerate(new_frames.values()) if i < len(new_frames) // 2]
frames_test = [sdf for i, sdf in enumerate(new_frames.values()) if i >= len(new_frames) // 2]

In [10]:
vectors = []
frequency = {}
for sdf in frames_train:
    vectors += get_vectors(sdf)

added_ctr = 0
db.index(inputs=DocList[MyDoc]([MyDoc(vec_id=added_ctr, embedding=vectors[0])]))
frequency[added_ctr] = 1
for v in vectors[1:]:
    results = db.search(inputs=DocList[MyDoc]([MyDoc(vec_id=-1, embedding=v)]), limit=10)
    d = euclidean_dist(results[0].matches[0].embedding, v)
    if d > EPSILON:
        added_ctr += 1
        db.index(inputs=DocList[MyDoc]([MyDoc(vec_id=added_ctr, embedding=v)]))
        frequency[added_ctr] = 1
    else:
        frequency[results[0].matches[0].vec_id] += 1

print("fraction of vectors added:", added_ctr / len(vectors))

fraction of vectors added: 0.16923076923076924


## Grouping

In [13]:
for sdf in frames_train:
    vs = get_vectors(sdf)
    print(sdf["Text"].iloc[0])
    for v, row in zip(vs, sdf["Text"]):
        res = db.search(inputs=DocList[MyDoc]([MyDoc(vec_id=-1, embedding=v)]))[0]
        ix = 0
        while ix < 9 and frequency[res.matches[ix].vec_id] == 3:
            ix += 1
        d = euclidean_dist(v, res.matches[ix].embedding)
        if euclidean_dist(v, res.matches[ix].embedding) > EPSILON or frequency[res.matches[ix].vec_id] <= 1:
            print("="*30)
        print(row)

nan
nan
nan
UVP 1.79
1.29
UVP
RAUCH Eistee
je 1,5 I
UVP 0.99
0.69
UVP
LIPTON Ice Tea
je 0,33 I
UVP 1.49
1.19
UVP
HOHES C Water
je 0,75 I
UVP 0.99
0.79
UVP
GEROLSTEINER Mineralwasser
je 1,5 I
Sparen auf Top-Marken
ab 05.09. bis 07.09.
Angebote
Vorteile
Einkaufsliste
Vorteilscode
UVP 1.79
UVP 1.79
1.29
UVP
RAUCH Eistee
je 1,5 I
UVP 0.99
0.69
UVP
LIPTON Ice Tea
je 0,33 I
UVP 1.49
1.19
UVP
HOHES C Water
je 0,75 I
UVP 0.99
0.79
UVP
GEROLSTEINER Mineralwasser
UVP 1.79
UVP 1.79
1.29
UVP
RAUCH Eistee
je 1,5 I
UVP 0.99
0.69
UVP
LIPTON Ice Tea
je 0,33 I
UVP 1.49
1.19
UVP
HOHES C Water
je 0,75 I
UVP 0.99
0.79
UVP
GEROLSTEINER Mineralwasser
