In [1]:
# Kernel smoke test: prints a fixed string and touches no project state.
print("Hello")

Hello


In [2]:
from helpers.common import (
    client as qdrant_client,
    vector_store,
    dense_embeddings,
    sparse_embeddings,
    MARKDOWN_DIR,
    TABLES_DIR,
    IMAGES_DESC_DIR,
    COLLECTION_NAME_TOGETHER,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import hashlib
from pathlib import Path

from langchain_core.documents import Document

In [4]:
# Paths and COLLECTION_NAME_TOGETHER from helpers.common (MARKDOWN_DIR, TABLES_DIR, IMAGES_DESC_DIR)

In [5]:
# dense_embeddings from helpers.common


No sentence-transformers model found with name togethercomputer/m2-bert-80M-8k-retrieval. Creating a new one with mean pooling.
You are using a model of type m2_bert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
  @torch.cuda.amp.custom_fwd(cast_inputs=torch.bfloat16)
  @torch.cuda.amp.custom_bwd


-- Bidirectional: True
-- Using Long Conv Residual: True
-- Hyena w: 10
-- Hyena w mod: 1
-- Hyena filter order: 128
-- Hyena filter dropout: 0.2
-- Hyena filter wd: 0.1
-- Hyena filter emb dim: 5
-- Hyena filter lr: 0.001
-- Hyena filter lr pos emb: 1e-05


In [6]:
# sparse_embeddings from helpers.common


In [21]:
# List the collections visible to the vector store's underlying client.
vector_store.client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='test_only_dense'), CollectionDescription(name='blueocean_new'), CollectionDescription(name='blueocean'), CollectionDescription(name='pavestone_old_v2'), CollectionDescription(name='financial_docs_together'), CollectionDescription(name='prod_old'), CollectionDescription(name='pavestone_old'), CollectionDescription(name='pavestone_final'), CollectionDescription(name='prod_new'), CollectionDescription(name='e45f6e7f-c0d5-486f-8a70-a62e1ba8e925'), CollectionDescription(name='test_with_sparse_and_dense')])

In [8]:
def extract_metadata_from_filename(filename: str):
    """
    Extract filing metadata from a document file name (or directory name).

    Expected format: CompanyName DocType [Quarter] Year[.pdf|.md]
    Examples:
        - Amazon 10-Q Q1 2024.pdf
        - Microsoft 10-K 2023.pdf
        - Bank of America 10-K 2023.md   (multi-word company names are kept whole)

    The name is parsed from the right: the last token is the fiscal year, an
    optional ``Q1``..``Q4`` token (any case) before it is the fiscal quarter,
    the token before that is the document type, and everything remaining on
    the left is the company name.

    Args:
        filename: File or directory name, with or without a ``.pdf``/``.md``
            extension (extension matching is case-insensitive).

    Returns:
        dict with keys ``company_name``, ``doc_type``, ``fiscal_quarter``
        (``None`` when the name has no quarter token) and ``fiscal_year``.
        All values are returned as strings exactly as they appear in the name.

    Raises:
        ValueError: if the name has fewer than three whitespace-separated
            tokens and therefore cannot contain company, doc type and year.
    """
    name = filename.strip()

    # Strip known extensions only when they are a real suffix. A plain
    # str.replace would also remove ".md"/".pdf" from the middle of the name
    # and would miss upper-case extensions such as ".PDF".
    while name.lower().endswith(('.pdf', '.md')):
        name = name[:name.rfind('.')]

    parts = name.split()
    if len(parts) < 3:
        raise ValueError(
            f"Cannot extract metadata from {filename!r}: "
            "expected 'CompanyName DocType [Quarter] Year'"
        )

    # A quarter token is only recognised when enough tokens remain for the
    # company name and doc type, so 3-token names keep their original meaning.
    candidate = parts[-2]
    has_quarter = (
        len(parts) >= 4
        and len(candidate) == 2
        and candidate[0] in 'qQ'
        and candidate[1] in '1234'
    )
    doc_type_index = -3 if has_quarter else -2

    return {
        'company_name': ' '.join(parts[:doc_type_index]),
        'doc_type': parts[doc_type_index],
        'fiscal_quarter': candidate if has_quarter else None,
        'fiscal_year': parts[-1]
    }

# Spot-check on a three-token name (no quarter token): fiscal_quarter comes back as None.
extract_metadata_from_filename('apple 10-k 2023.md')

{'company_name': 'apple',
 'doc_type': '10-k',
 'fiscal_quarter': None,
 'fiscal_year': '2023'}

In [9]:
def compute_file_hash(file_path: Path):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte chunks, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()

    with open(file_path, 'rb') as stream:
        # read() returns b"" at end of file, which ends the loop.
        while chunk := stream.read(4096):
            digest.update(chunk)

    return digest.hexdigest()


In [10]:
# Spot-check the hasher on a single markdown file.
compute_file_hash(Path(r'data/rag-data/markdown/apple/apple 8-k q4 2023.md'))

'88778f866cb1f5bdbffe59ee8eb14258df308a3bfdb30b1e72a36cf88d0db400'

In [12]:
# Fetch the first page of already-ingested points (at most 1000: `10_00` == 1000).
# NOTE: `scroll` returns a (points, next_offset) pair (see how get_processed_hashes
# unpacks it), so `all_points` holds that tuple rather than a bare list of points.
all_points = vector_store.client.scroll(
    collection_name=COLLECTION_NAME_TOGETHER,
    limit=10_00,
    with_payload=True,
    offset=None
)

In [22]:
def get_processed_hashes():
    """Collect the file hashes of every point stored in the collection.

    Pages through ``COLLECTION_NAME_TOGETHER`` with ``client.scroll`` (up to
    10,000 points per request) and gathers ``payload['metadata']['file_hash']``
    from each point.

    Points that have no payload, no ``metadata`` entry, or no ``file_hash``
    are skipped rather than raising, so a few foreign or partially written
    points cannot break the lookup.

    Returns:
        set: the distinct file hashes found in the collection.
    """
    processed_hashes = set()
    offset = None

    while True:
        points, offset = vector_store.client.scroll(
            collection_name=COLLECTION_NAME_TOGETHER,
            limit=10_000,
            with_payload=True,
            offset=offset
        )

        for point in points:
            # payload / metadata may be None or incomplete; treat as "no hash".
            metadata = (point.payload or {}).get('metadata') or {}
            file_hash = metadata.get('file_hash')
            if file_hash:
                processed_hashes.add(file_hash)

        # Stop on an empty page or when scroll reports no further offset.
        if not points or offset is None:
            break

    return processed_hashes

In [23]:
# Hashes already stored in the collection; ingest_file_in_db consults this set and adds to it.
processed_hashes = get_processed_hashes()


In [24]:
# extract the page number from the file path
import re

def extract_page_number(file_path: Path):
    """Return the page number encoded in the file's stem, or ``None``.

    Searches the stem (file name without its final extension) for the first
    ``page_<digits>`` occurrence and returns the digits as an ``int``.
    """
    found = re.search(r'page_(\d+)', file_path.stem)
    if found is None:
        return None
    return int(found.group(1))

In [18]:
# Spot-check the page-number parser on a table file whose stem ends in `page_1`.
file_path = Path(r'data/rag-data/tables/apple/apple 8-k q4 2023/table_1_page_1.md')
extract_page_number(file_path)

1

## Ingestion Function

In [25]:
def ingest_file_in_db(file_path, processed_hashes):
    """Embed one markdown file and add it to the vector store, once.

    The file's SHA-256 hash is compared against ``processed_hashes``; a file
    whose hash is already present is reported and skipped. Otherwise the file
    is turned into one or more ``Document`` objects and passed to
    ``vector_store.add_documents``, and its hash is added to the set.

    The content type is chosen from substrings of the path:

    * ``markdown``    -> ``'text'``; the file is split on ``<!-- page break -->``
      and each page becomes its own document with a 1-based ``page`` number.
    * ``tables``      -> ``'tables'``; a single document, page taken from the file name.
    * ``images_desc`` -> ``'image'``; a single document, page taken from the file name.
    * anything else   -> ``'unknown'``; handled like tables/images.

    For tables and image descriptions the source document name is the parent
    directory name; otherwise it is the file name itself.

    Args:
        file_path: Path (or path string) of the ``.md`` file to ingest.
        processed_hashes: Mutable set of file hashes already ingested. It is
            updated in place when the file is processed.

    Returns:
        None.
    """
    file_path = Path(file_path)

    file_hash = compute_file_hash(file_path)
    if file_hash in processed_hashes:
        print(f"Following file has been already uploaded: {file_path}")
        # Without this early return the duplicate would be embedded and
        # inserted a second time.
        return

    path_str = str(file_path)
    if 'markdown' in path_str:
        content_type = 'text'
        doc_name = file_path.name
    elif 'tables' in path_str:
        content_type = 'tables'
        doc_name = file_path.parent.name
    elif 'images_desc' in path_str:
        content_type = 'image'
        doc_name = file_path.parent.name
    else:
        content_type = 'unknown'
        doc_name = file_path.name

    content = file_path.read_text(encoding='utf-8')

    base_metadata = extract_metadata_from_filename(doc_name)
    base_metadata.update({
        'content_type': content_type,
        'file_hash': file_hash,
        'source_file': doc_name
    })

    if content_type == 'text':
        # One (page number, text) pair per page of the markdown export.
        numbered_pages = list(enumerate(content.split('<!-- page break -->'), start=1))
    else:
        # Tables and image descriptions are single-page files whose page
        # number is encoded in the file name.
        numbered_pages = [(extract_page_number(file_path), content)]

    # Blank pages carry nothing to retrieve, so they are skipped. Page numbers
    # come from the original position and are therefore not shifted.
    documents = [
        Document(page_content=text, metadata={**base_metadata, 'page': page_num})
        for page_num, text in numbered_pages
        if text.strip()
    ]

    if documents:
        vector_store.add_documents(documents)

    processed_hashes.add(file_hash)


In [20]:
# Ingest a single file first as a smoke test before the bulk run.
# The path is written relative to the working directory, like the other cells
# in this notebook, instead of an absolute path that exists on one machine only.
# NOTE(review): assumes the notebook runs from the project root; confirm.
apple_path_q4 = Path('data/rag-data/markdown/apple/apple 8-k q4 2023.md')
ingest_file_in_db(apple_path_q4, processed_hashes)

In [None]:
from tqdm import tqdm

base_path = Path('data/rag-data')
# rglob yields files in arbitrary, filesystem-dependent order; sorting makes
# the ingestion order (and the progress bar) reproducible between runs.
all_md_files = sorted(base_path.rglob("*.md"))

# A full pass takes the better part of an hour, so a single bad file should
# not abort it: failures are recorded and reported, and the loop carries on.
failed_files = []
for md_file in tqdm(all_md_files, desc="Ingesting"):
    try:
        ingest_file_in_db(md_file, processed_hashes)
    except Exception as exc:
        failed_files.append((md_file, exc))
        print(f"Failed to ingest {md_file}: {exc!r}")

if failed_files:
    print(f"{len(failed_files)} of {len(all_md_files)} files failed to ingest")

 48%|████▊     | 496/1039 [45:55<05:42,  1.58it/s]   

Following file has been already uploaded: data/rag-data/tables/amazon/amazon 10-q q2 2024/table_33_page_48.md


100%|██████████| 1039/1039 [52:10<00:00,  3.01s/it]


ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/home/yash/Desktop/Code/KGP/rag_venv/lib/python3.13/site-packages/zmq/eventloop/zmqstream.py", line 565, in _log_error
    f.result()
    ~~~~~~~~^^
  File "/home/yash/Desktop/Code/KGP/rag_venv/lib/python3.13/site-packages/ipykernel/kernelbase.py", line 584, in shell_channel_thread_main
    _, msg2 = self.session.feed_identities(msg, copy=False)
              ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/home/yash/Desktop/Code/KGP/rag_venv/lib/python3.13/site-packages/jupyter_client/session.py", line 998, in feed_identities
    raise ValueError(msg)
ValueError: DELIM not in msg_list
ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/home/yash/Desktop/Code/KGP/rag_venv/lib/python3.13/site-packages/zmq/eventloop/zmqstream.py", line 565, in _log_error
    f.result()
    ~~~~~~~~^^
  File "/home/yash/Desktop/Code/KGP/rag_

In [20]:
# Root directory that is searched recursively for markdown files.
base_path = Path('data/rag-data')


In [22]:
# Count every .md file found anywhere under base_path.
all_md_files = list(base_path.rglob("*.md"))
len(all_md_files)

1039

In [45]:
# Embed a sample query and print the vector's length to check the dense model's output size.
test = dense_embeddings.embed_query("Hello world")
print(len(test)) 

768
