In [1]:
from dataclasses import asdict, dataclass
from database.database import VectorSearchResult
import pandas as pd

# Load the dataset file into a DataFrame. lines=True tells pandas to parse the
# file as JSON Lines, i.e. one JSON object per line.
df = pd.read_json(
    path_or_buf='data/dataset/split/small_train.jsonl',
    lines=True,
)

@dataclass
class QueryResult:
    """Pair a single query with the search results returned for it.

    Attributes:
        query: The query, annotated as a pandas Series. In this notebook it is
            one row taken from the dataset frame.
        results: The hits for that query, one dict per hit. In this notebook
            each dict is a VectorSearchResult converted with ``asdict``.

    The annotations are not enforced at runtime; the dataclass stores whatever
    objects it is given.
    """

    # The query record the results belong to.
    query: pd.Series
    # One dict per search hit.
    results: list[dict]



In [3]:
# Take the first row of the frame by position; it is used below as the example query.
record = df.iloc[0]
# Bare trailing expression so the notebook displays the row's type
# (a single row selected with .iloc comes back as a pandas Series).
type(record)

pandas.core.series.Series

In [5]:
# create some dummy VectorSearchResult objects
results = [
    VectorSearchResult(text=text, doi=doi, pubdate=pubdate, distance=distance)
    for text, doi, pubdate, distance in (
        ("foo", "abc", "2023-01-01", 0.15),
        ("bar", "def", "2023-01-02", 0.25),
        ("baz", "ghi", "2023-01-03", 0.35),
    )
]

# Plain-dict versions of the same objects, in the same order
# (dataclasses.asdict only accepts dataclass instances).
results_as_dicts = list(map(asdict, results))

In [6]:
# Bundle the example query row with its dict-form search hits,
# then print the generated dataclass repr.
qr = QueryResult(
    query=record,
    results=results_as_dicts,
)
print(qr)

QueryResult(query=source_doi                      10.1146/annurev.astro.46.060407.145222
sent_original        It is unclear whether the solution can be foun...
sent_no_cit          It is unclear whether the solution can be foun...
sent_idx                                                           541
citation_dois                       [10.1111/j.1365-2966.2009.14750.x]
pubdate                                                     2009-09-01
resolved_bibcodes                                [2009MNRAS.396..203S]
Name: 0, dtype: object, results=[{'text': 'foo', 'doi': 'abc', 'pubdate': '2023-01-01', 'distance': 0.15}, {'text': 'bar', 'doi': 'def', 'pubdate': '2023-01-02', 'distance': 0.25}, {'text': 'baz', 'doi': 'ghi', 'pubdate': '2023-01-03', 'distance': 0.35}])


In [9]:
# Convert the QueryResult to a dict keyed by field name, then print each
# field name followed by its value on a new, tab-indented line.
qr_fields = asdict(qr)
for key, value in qr_fields.items():
    print(f"{key}:\n\t{value}")


query:
	source_doi                      10.1146/annurev.astro.46.060407.145222
sent_original        It is unclear whether the solution can be foun...
sent_no_cit          It is unclear whether the solution can be foun...
sent_idx                                                           541
citation_dois                       [10.1111/j.1365-2966.2009.14750.x]
pubdate                                                     2009-09-01
resolved_bibcodes                                [2009MNRAS.396..203S]
Name: 0, dtype: object
results:
	[{'text': 'foo', 'doi': 'abc', 'pubdate': '2023-01-01', 'distance': 0.15}, {'text': 'bar', 'doi': 'def', 'pubdate': '2023-01-02', 'distance': 0.25}, {'text': 'baz', 'doi': 'ghi', 'pubdate': '2023-01-03', 'distance': 0.35}]


In [13]:
# Create a DataFrame equivalent to the QueryResult: a single row holding the
# query sentence plus one list-valued column per result field.
qr_df = pd.DataFrame({
    'query': [record.sent_original],
    # Each column's single cell is the list of that field across all results,
    # in result order.
    **{
        column: [[hit[field] for hit in qr.results]]
        for column, field in (
            ('text_results', 'text'),
            ('doi_results', 'doi'),
            ('pubdate_results', 'pubdate'),
            ('distance_results', 'distance'),
        )
    },
})
print(qr_df)

                                               query     text_results  \
0  It is unclear whether the solution can be foun...  [foo, bar, baz]   

       doi_results                       pubdate_results    distance_results  
0  [abc, def, ghi]  [2023-01-01, 2023-01-02, 2023-01-03]  [0.15, 0.25, 0.35]  
