In [None]:
import json
import os
from tqdm import tqdm

from vertexai.preview import tokenization

# Count Gemini tokens for every record of ONE PubMed chunk file.
#
# Globals left behind for the inspection cells further down:
#   tokenizer        - local tokenizer for `model_name`
#   data             - the last JSON record that was parsed
#   num_tokens_list  - one token count per record of the sampled file
model_name = "gemini-1.5-flash-001"
tokenizer = tokenization.get_tokenizer_for_model(model_name)

pubmed_dataset_dir = 'datasets/pubmed/chunk'

# List the directory once and sort it: os.listdir() returns entries in
# arbitrary order, so without sorting the "first" file could differ between
# runs and machines.
filenames = sorted(os.listdir(pubmed_dataset_dir))

# Defined up front so it exists (empty) even when no .jsonl file is found.
num_tokens_list = []

# The context manager closes the progress bar even though the loop exits early.
with tqdm(total=len(filenames)) as progress:
    for filename in filenames:
        progress.update(1)

        if not filename.endswith('.jsonl'):
            continue
        path = os.path.join(pubmed_dataset_dir, filename)

        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            # Stream the file line by line instead of loading it whole.
            for line in f:
                # A blank (e.g. trailing) line would make json.loads raise.
                if not line.strip():
                    continue
                data = json.loads(line)
                num_tokens = tokenizer.count_tokens(data['contents']).total_tokens
                num_tokens_list.append(num_tokens)

        # Only the first .jsonl file is sampled; the statistics below
        # therefore describe a single chunk file, not the whole dataset.
        break

In [None]:
# Show the 'content' field of `data`, i.e. the last record parsed by the
# token-counting loop (relies on that cell having been run in this kernel).
print(data['content'])

In [None]:
# Check whether the record's 'content' value is contained in its 'contents'
# value (a substring test if both are strings - TODO confirm the field types).
data['content'] in data['contents']

In [None]:
# Show the 'contents' field, which is the value passed to count_tokens()
# in the token-counting loop.
print(data['contents'])

In [None]:
from vertexai.preview import tokenization

# Minimal tokenizer check on a fixed string. The tokenizer is built again
# for the same model name used by the token-counting cell, presumably so this
# cell can be run on its own.
model_name = "gemini-1.5-flash-001"
tokenizer = tokenization.get_tokenizer_for_model(model_name)

# `contents` and `tokenizer` are reused by the exploration cells that follow.
contents = "Hello World!"
result = tokenizer.count_tokens(contents)
print(result.total_tokens)

In [None]:
# Exploration aid: list the attribute names available on the tokenizer object.
dir(tokenizer)

In [None]:
# Call the tokenizer's `_compute_tokens` on the sample string and display the result.
# NOTE(review): the leading underscore suggests a private method - check whether
# the installed SDK exposes a public equivalent before relying on this.
tokenizer._compute_tokens(contents)

In [None]:
# Report the mean token count per record for the sampled chunk file.
# `num_tokens_list` comes from the token-counting cell; if that cell found no
# records the list is empty and the division would raise ZeroDivisionError.
if num_tokens_list:
    print(f"Average number of tokens: {sum(num_tokens_list) / len(num_tokens_list)}")
else:
    print("Average number of tokens: n/a (no records were tokenized)")

In [5]:
from src.encoder.factory import encoder_factory, EncoderType

# Smoke test for the sentence-transformer encoder: build it through the
# project factory, encode a tiny batch, and print the shape of the result.
model_name = "all-MiniLM-L6-v2"
encoder = encoder_factory(
    EncoderType.SENTENCE_TRANSFORMER,
    model_name=model_name,
)

# Two short sentences are enough to see one row per input in the output.
contents = ["Hello World!", "This is a test."]
result = encoder.encode_batch(contents)
print(result.shape)

(2, 384)


In [3]:
# Display the embedding size reported by the encoder; the recorded output (384)
# matches the second axis of the (2, 384) batch result printed earlier.
encoder.dimension()

384

In [1]:
from src.dataloaders.factory import dataloader_factory, SupportedDatasets
# Build the PubMed dataloader over the demo chunk directory and pull a single
# batch out of it for inspection.
pubmed_dir = 'datasets/pubmed_demo/chunk'

dataloader = dataloader_factory(
    dataset=SupportedDatasets.PUBMED,
    data_dir=pubmed_dir,
    batch_size=10,
)

# Take only the first batch. Unlike a `for ...: break` loop, this cannot
# silently leave `batch` undefined (or stale from another cell) when the
# loader yields nothing - it fails loudly instead.
batch = next(iter(dataloader), None)
if batch is None:
    raise RuntimeError(f"dataloader produced no batches from {pubmed_dir!r}")

In [1]:
from src.vector_store.factory import vector_store_factory, VectorStoreType
import numpy as np

# Demo: create a Qdrant-backed vector store and upsert a batch of random vectors.
#
# The index width and the number of demo vectors are named once here so the
# store configuration, the ids, the vectors, and the payloads cannot drift
# apart when one of them is changed.
EMBEDDING_DIM = 10
NUM_VECTORS = 10

vector_store = vector_store_factory(
    store_type=VectorStoreType.QDRANT,
    index_name='test1',
    embedding_dim=EMBEDDING_DIM,
    storage_path='temp',
)

# Seeded generator so a re-run upserts the same vectors.
rng = np.random.default_rng(0)

idx_list = list(range(NUM_VECTORS))
dummy_vectors = rng.random((NUM_VECTORS, EMBEDDING_DIM))  # floats in [0, 1)
payloads = [{"title": "Hello World!"} for _ in range(NUM_VECTORS)]

# Batch layout handed to upsert_batch: parallel ids, vectors, and metadata.
batch = {
    "idx": idx_list,
    "vectors": dummy_vectors,
    "metadata": payloads,
}

vector_store.upsert_batch(batch)

In [3]:
# Query the demo index with a random vector and print the top-3 results.
# The query length is taken from the stored vectors instead of being
# hard-coded, so it always matches the dimensionality that was upserted.
query_vector = np.random.rand(dummy_vectors.shape[1])
results = vector_store.query(query_vector, topk=3)
print(results)

[{'title': 'Hello World!'}, {'title': 'Hello World!'}, {'title': 'Hello World!'}]


In [3]:
# Reset the scratch directory: 'temp' is the storage_path given to the demo
# vector store, so this deletes everything under it and recreates it empty.
!rm -rf temp
!mkdir temp

In [1]:
from src.llms.factory import llm_factory, LLMTypes

# Smoke test for the Gemini wrapper: build it through the project factory,
# send one prompt, and print whatever comes back.
model = llm_factory(llm_type=LLMTypes.GEMINI, model_name="gemini-1.5-flash-001")

# The wrapper is invoked like a function with the prompt text.
response = model("What is the meaning of life?")
print(response)

  from .autonotebook import tqdm as notebook_tqdm


["As a large language model, I don't have personal opinions or beliefs, including on the meaning of life. The meaning of life is a question that has been pondered by philosophers and theologians for centuries, and there is no one definitive answer.  \n\nHere's a breakdown of how people have approached the question:\n\n* **Existentialism:**  Focuses on the individual's freedom and responsibility to create their own meaning in a meaningless universe.\n* **Nihilism:**  Believes that life is inherently meaningless and that there is no purpose or objective truth.\n* **Religion:**  Often defines the meaning of life through the context of a higher power, divine purpose, and/or an afterlife. \n* **Humanism:**  Emphasizes human values, reason, and ethics as the foundation for meaning and purpose. \n* **Hedonism:**  Focuses on maximizing pleasure and minimizing pain as the ultimate goal in life. \n\nUltimately, the meaning of life is a personal question.  It's something each individual must disc

In [2]:
# Remove all __pycache__ directories recursively in the current directory
!find . | grep -E "(__pycache__)" | xargs rm -rf
