Severity
P2 - Not urgent, nice to have
Current Behavior
Deeplake raises a "Dtype is unknown" error for filter = """(-21877) >= (-1093 * f6['e3'])""".
Steps to Reproduce
import deeplake as deeplake_lib
from deeplake import types as dl_types
import numpy as np
# Reproduction script: runs two vector-similarity queries that differ only in
# their WHERE filter. Per the report, the first (JSON * JSON) succeeds and the
# second (int * JSON) raises "Dtype is unknown".

# create dataset
path = "mem://test_collection"
# Best-effort delete of any dataset already at `path`; errors are ignored on
# purpose, but KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    deeplake_lib.delete(path)
except Exception:
    pass
ds = deeplake_lib.create(path)

# create schema
ds.add_column("f0", dl_types.Int64())
ds.add_column("f1", dl_types.Array("float32", 1))
ds.add_column("f2", dl_types.Float32())
ds.add_column("f3", dl_types.Float32())
ds.add_column("f4", dl_types.Dict())
ds.add_column("f5", dl_types.Embedding(48, dtype='float32'))
ds.add_column("f6", dl_types.Dict())
ds.add_column("f7", dl_types.Text())
ds.add_column("f8", dl_types.Embedding(46, dtype='float32'))

# define data
# NOTE(review): the rows are elided in the report; the attached full script
# carries the real list. Each entry is used below as a dict of column -> value.
data_list = [...]

# insert data
# Upsert keyed on `pkey_name`: a row whose key is already in `pk_to_idx` is
# updated in place, any other row is appended and its index recorded.
pkey_name = 'f0'
pk_to_idx = {}
if pkey_name is not None and len(ds) > 0:
    col = ds[pkey_name]
    for i, v in enumerate(col):
        pk_to_idx[v.item()] = i
for data in data_list:
    pk = data.get(pkey_name) if pkey_name is not None else None
    idx = pk_to_idx.get(pk) if pk is not None else None
    if idx is None:
        # Each value is wrapped in a one-element list before appending.
        row = {k: [v] for k, v in data.items()}
        ds.append(row)
        if pk is not None:
            pk_to_idx[pk] = len(ds) - 1
    else:
        for field, v in data.items():
            if field in ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8']:
                ds[field][idx] = v
ds.commit()

# create scalar index
ds["f0"].create_index(dl_types.Inverted)
ds["f1"].create_index(dl_types.Inverted)

# search
target_vector = [0.61611, -0.62616, 0.0, -0.80932, 0.80428, 0.0, 0.73469, 0.34276, 0.17649, 0.0, 0.93962, 0.00258, 0.0, 0.64142, 0.0, 0.37028, 0.41766, 0.0, 0.0, -0.42989, -0.77392, -0.12478, -0.70494, -0.95591, 0.0, -0.33446, 0.0, -0.3029, -0.75286, 0.94385, 0.0543, 0.10103, 0.30253, 0.21931, -0.25426, -0.30362, 0.14265, -0.60801, 0.34153, 0.16569, -0.24234, -0.23225, 0.48922, -0.39635, 0.24076, -0.63206, -0.14048, -0.45444]
anns_field = 'f5'
limit = 2309
# Built once so both queries share the exact same similarity expression; the
# resulting query text is identical to the hand-written form in the report.
similarity = f"COSINE_SIMILARITY({anns_field}, ARRAY{target_vector!r})"

# Filter 1: JSON value * JSON value -- reported to work.
filter = """(-21877) >= (f4['e3'] * f6['e3'])"""
q_correct = f"SELECT *, {similarity} AS distance_score WHERE {filter} ORDER BY {similarity} DESC LIMIT {limit}"
view = ds.query(q_correct)
print("first query executed successfully")

# Filter 2: integer literal * JSON value -- reported to raise "Dtype is unknown".
filter = """(-21877) >= (-1093 * f6['e3'])"""
q_error = f"SELECT *, {similarity} AS distance_score WHERE {filter} ORDER BY {similarity} DESC LIMIT {limit}"
view = ds.query(q_error) # this raises the error
print("second query executed successfully") # should not be printed if error triggered.
Here is the full script:
deeplake_bug_trigger.py.txt
Expected/Desired Behavior
Deeplake raises a "Dtype is unknown" error for filter = """(-21877) >= (-1093 * f6['e3'])""", but not for filter = """(-21877) >= (f4['e3'] * f6['e3'])""". f4 is also a JSON field, and some f4 values don't have the e3 field, so f4['e3'] is harder to handle than the integer literal -1093. If Deeplake can handle the second filter (JSON * JSON), it should also be able to handle the first one (int * JSON).
Python Version
3.10
OS
22.04
IDE
No response
Packages
No response
Additional Context
No response
Possible Solution
No response
Are you willing to submit a PR?
Severity
P2 - Not urgent, nice to have
Current Behavior
Deeplake raises a "Dtype is unknown" error for filter = """(-21877) >= (-1093 * f6['e3'])""".
Steps to Reproduce
Here is the full script:
deeplake_bug_trigger.py.txt
Expected/Desired Behavior
Deeplake raises a "Dtype is unknown" error for filter = """(-21877) >= (-1093 * f6['e3'])""", but not for filter = """(-21877) >= (f4['e3'] * f6['e3'])""". f4 is also a JSON field, and some f4 values don't have the e3 field, so f4['e3'] is harder to handle than the integer literal -1093. If Deeplake can handle the second filter (JSON * JSON), it should also be able to handle the first one (int * JSON).
Python Version
3.10
OS
22.04
IDE
No response
Packages
No response
Additional Context
No response
Possible Solution
No response
Are you willing to submit a PR?