In [30]:
import os
import sys
import json

# Make the project's local ``src`` directory importable from this notebook.
# The path is resolved against the current working directory, so the notebook
# has to be started from the directory that contains ``./src``.
src_dir = os.path.abspath('./src')
# Guard the append: without it, every re-run of this cell would add another
# duplicate of the same entry to ``sys.path``.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [31]:
from utils import init
from model.lm_encoders.setup import ModelSetup
from embed import Chroma_EmbeddingStore
from preprocess.text_operations import ConvertJsonToString

# Create the client object shared by the cells below (collection lookups and
# the embedding store both use it).
# NOTE(review): `init` comes from the project's `utils` module; judging by the
# later `get_collection` calls it presumably returns a Chroma client — confirm.
client = init()

In [32]:
# Open the "datasets" collection, which holds the stored dataset records.
ds = client.get_collection("datasets")
# Peek at a small sample (at most 5 entries) and keep only the "metadatas"
# part of the returned dict.
metas = ds.peek(5)["metadatas"]
# Bare expression so the notebook renders the metadata list as the cell output.
metas

[{'json_string': '{"platform": "huggingface", "platform_resource_identifier": "acronym_identification", "name": "acronym_identification", "date_published": "2022-03-02T23:29:22", "same_as": "https://huggingface.co/datasets/acronym_identification", "is_accessible_for_free": true, "ai_asset_identifier": 2, "ai_resource_identifier": 2, "aiod_entry": {"editor": [], "status": "published", "date_modified": "2023-12-19T10:25:29", "date_created": "2023-12-19T10:25:29"}, "alternate_name": [], "application_area": [], "citation": [], "contact": [], "creator": [], "description": {"plain": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21."}, "distribution": [{"content_url": "https://huggingface.co/datasets/acronym_identification/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet", "content_size_kb": 206100, "description": "acronym_identification. Config: default. Split: test", "name": "0000.parquet"}, {"content_url": "https://huggingfa

In [33]:
# The first metadata entry stores its record as a serialized JSON string under
# the "json_string" key; decode it to inspect the record as a Python dict.
json.loads(metas[0]["json_string"])

{'platform': 'huggingface',
 'platform_resource_identifier': 'acronym_identification',
 'name': 'acronym_identification',
 'date_published': '2022-03-02T23:29:22',
 'same_as': 'https://huggingface.co/datasets/acronym_identification',
 'is_accessible_for_free': True,
 'ai_asset_identifier': 2,
 'ai_resource_identifier': 2,
 'aiod_entry': {'editor': [],
  'status': 'published',
  'date_modified': '2023-12-19T10:25:29',
  'date_created': '2023-12-19T10:25:29'},
 'alternate_name': [],
 'application_area': [],
 'citation': [],
 'contact': [],
 'creator': [],
 'description': {'plain': 'Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.'},
 'distribution': [{'content_url': 'https://huggingface.co/datasets/acronym_identification/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet',
   'content_size_kb': 206100,
   'description': 'acronym_identification. Config: default. Split: test',
   'name': '0000.parquet'},
  {'content_url': 'htt

In [34]:
# Name of the collection that holds the document embeddings; used both for the
# sanity-check peek and for the semantic search further down.
EMBEDDING_COLLECTION_NAME = "embeddings-BAAI-simple"

In [35]:
# Sanity check: open the embedding collection and peek at a handful of records.
# NOTE(review): the rendered result contains the complete embedding vectors,
# which makes this cell's output very long — consider showing only the IDs.
emb_col = client.get_collection(EMBEDDING_COLLECTION_NAME)
emb_col.peek(5)

{'ids': ['00000359-df79-4207-9ad3-4f06d5df88de',
  '00002f68-5cee-46eb-8067-37b8356e1a8f',
  '00003bf6-edb9-4991-9bb3-990728d99a22',
  '000077c6-2fa4-4c26-abbe-ecb6be196eb2',
  '00008068-af38-4988-ab86-88e26f8b54bc'],
 'embeddings': [[-0.025739647448062897,
   0.047699980437755585,
   -0.060836825519800186,
   0.043795522302389145,
   0.07579464465379715,
   0.017618408426642418,
   0.01625131070613861,
   0.02256404608488083,
   -0.013682409189641476,
   -0.027031244710087776,
   -0.020877551287412643,
   0.005215992219746113,
   -0.01594863273203373,
   0.020906509831547737,
   0.04006724804639816,
   0.05584186688065529,
   0.017972098663449287,
   -0.04160038381814957,
   0.0762479230761528,
   0.013433114625513554,
   -0.013929387554526329,
   -0.003133220598101616,
   0.03142445534467697,
   -0.008497669361531734,
   -0.004202519077807665,
   -0.006336900405585766,
   -0.025866596028208733,
   0.007361957803368568,
   -0.05794481188058853,
   -0.030476519837975502,
   -0.02450582

In [36]:
# Build the embedding model that is passed to the semantic search below.
# NOTE(review): this calls an underscore-prefixed (private by convention)
# factory on ModelSetup — check whether a public entry point exists.
# NOTE(review): the exact meaning of max_num_chunks / use_chunk_transformer /
# pooling / parallel_chunk_processing is defined in the project code, not
# here; presumably these must match how the stored embeddings were produced —
# confirm.
model = ModelSetup._setup_sentence_transformer_hierarchical(
    model_path="BAAI/bge-base-en-v1.5",
    max_num_chunks=5,
    use_chunk_transformer=False,
    pooling="mean", 
    parallel_chunk_processing=True
)



In [37]:
# Free-text user query used for the semantic search demonstration below.
QUERY = "I want a dataset about movies reviews"

In [38]:
# Wrap the client in the project's embedding-store helper; it provides the
# semantic_search and retrieve_documents_from_result_set calls used below.
store = Chroma_EmbeddingStore(client)

In [39]:
# semantic_search takes a list of queries; here it holds a single
# (query text, 0) tuple.
# NOTE(review): the meaning of the second tuple element (0) is not visible
# here — presumably a query identifier; confirm against semantic_search.
query_list = [(QUERY, 0)]
# The result is indexed per query: result_set["doc_ids"][0] and
# result_set["distances"][0] are read for the first query in later cells.
result_set = store.semantic_search(model, query_list, emb_collection_name=EMBEDDING_COLLECTION_NAME)

In [40]:
# Show the matched document IDs and their distances for the first (and only)
# query in the batch, one labelled line per result field.
for label, field in (("Doc IDs:", "doc_ids"), ("Distances:", "distances")):
    print(label, result_set[field][0])

Doc IDs: ['355360', '355355', '155754', '350995', '288', '204067', '250466', '520', '25581', '25582']
Distances: [0.19924968481063843, 0.20402848720550537, 0.22366082668304443, 0.22886121273040771, 0.23224103450775146, 0.23358553647994995, 0.23623722791671753, 0.24049288034439087, 0.2477220892906189, 0.24823153018951416]


In [41]:
# Collection from which the full documents for the search hits are fetched.
document_collection_name = "datasets"
# Resolve the hits in the result set to their stored documents.
# NOTE(review): [0] presumably selects the documents of the first (only)
# query, mirroring the per-query indexing of result_set — confirm.
json_docs = store.retrieve_documents_from_result_set(result_set, document_collection_name)[0]

In [42]:
# Print a short summary (rank, document ID, plain-text description) for every
# document retrieved for the query.
# The loop variables avoid the name `id`: at notebook top level it would shadow
# the built-in `id()` for the rest of the kernel session.
for rank, (doc, doc_id, _distance) in enumerate(
    zip(json_docs, result_set["doc_ids"][0], result_set["distances"][0])
):
    # "description" may be missing, null, or not a dict; only look up the
    # plain-text variant when it really is a dict, otherwise print None.
    description = doc.get("description")
    plain_text = description.get("plain") if isinstance(description, dict) else None

    print("----------------------------")
    print("DOCUMENT", rank)
    print("Doc ID:", doc_id)
    print("Description:", plain_text)
    print("----------------------------\n\n")

----------------------------
DOCUMENT 0
Doc ID: 355360
Description: IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
For more dataset information, please go through the following link,
http://ai.stanford.edu/~amaas/data/sentiment/.
----------------------------


----------------------------
DOCUMENT 1
Doc ID: 355355
Description: IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing.

### JSON -> TEXT conversion
- Mostly concatenates the seemingly relevant fields of the JSON document
- Omits empty fields

In [43]:
# Take the first retrieved document as the example for the JSON -> text
# conversion below.
json_doc = json_docs[0]
# Bare expression so the notebook renders the raw JSON document as output.
json_doc

{'platform': 'zenodo',
 'platform_resource_identifier': 'zenodo.org:7928582',
 'name': 'Large Movie Review Dataset',
 'date_published': '2011-06-01T00:00:00',
 'same_as': 'https://zenodo.org/api/records/7928582',
 'ai_asset_identifier': 371516,
 'ai_resource_identifier': 371546,
 'aiod_entry': {'editor': [],
  'status': 'published',
  'date_modified': '2024-04-22T16:39:07',
  'date_created': '2024-04-22T16:39:07'},
 'alternate_name': [],
 'application_area': [],
 'citation': [],
 'contact': [],
 'creator': [],
 'description': {'plain': 'IMDB dataset having 50K movie reviews for natural language processing or Text analytics.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.\nFor more dataset information, please go

In [44]:
# Convert the JSON document into its plain-text representation using the
# project's ConvertJsonToString helper.
text_doc = ConvertJsonToString.extract_relevant_info(json_doc)
# print() (rather than a bare expression) so that newlines and tabs in the
# text are rendered instead of being shown as escape sequences.
print(text_doc)

platform: zenodo
name: Large Movie Review Dataset
date_published: 2011-06-01T00:00:00
year_published: 2011
month_published: 6
day_published: 1
description: IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
For more dataset information, please go through the following link,
http://ai.stanford.edu/~amaas/data/sentiment/.
keyword: sentiment classification
DISTRIBUTION:
	name:IMDB Dataset.csv, encoding_format:text/csv

