In [2]:
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class DocSection:
    """One section of a document, linked to its parent section by id.

    Sections form a tree through ``parent_id``. The breadcrumb walk in this
    notebook stops at the first section whose ``parent_id`` is falsy, so an
    empty string marks a root section.
    """

    # Identifier of this section; used as the dictionary key when sections are indexed.
    id: str
    # Identifier of the parent section; "" (falsy) for a root section.
    parent_id: str
    # Heading text of the section; this is what breadcrumbs are built from.
    title: str
    # Body text of the section.
    content: str

def create_doc_sections_map(doc_sections: List[DocSection]) -> Dict[str, DocSection]:
    """Index sections by their ``id`` so they can be looked up directly.

    Args:
        doc_sections: Sections to index.

    Returns:
        A dict mapping each section's ``id`` to the section object. When
        several sections share an id, the one that appears last wins.
    """
    sections_by_id = {}
    for doc_section in doc_sections:
        sections_by_id[doc_section.id] = doc_section
    return sections_by_id

def get_title_breadcrumbs(doc_section: DocSection, doc_sections_map: Dict[str, DocSection]) -> List[str]:
    """Return the section titles from the root ancestor down to ``doc_section``.

    Args:
        doc_section: Section whose title trail is wanted.
        doc_sections_map: Sections keyed by id, used to resolve every
            ``parent_id`` met on the way up.

    Returns:
        Titles ordered root first and ending with ``doc_section.title``.
        A section with a falsy ``parent_id`` yields a one-element list.

    Raises:
        KeyError: If a ``parent_id`` on the chain is not a key of
            ``doc_sections_map``.
        ValueError: If the parent chain loops back on itself (including a
            section that names itself as parent); without this check the
            walk would never terminate.
    """
    # Collect leaf -> root with cheap appends and reverse once at the end,
    # instead of shifting the whole list on every insert(0, ...).
    titles = [doc_section.title]
    # After the first step the walk is fully determined by the parent id it
    # is standing on, so seeing the same parent id twice proves a cycle.
    seen_parent_ids = set()
    current_section = doc_section
    while current_section.parent_id:
        parent_id = current_section.parent_id
        if parent_id in seen_parent_ids:
            raise ValueError(
                f"Cyclic parent chain detected at section id {parent_id!r}"
            )
        seen_parent_ids.add(parent_id)
        parent_section = doc_sections_map[parent_id]
        titles.append(parent_section.title)
        current_section = parent_section
    titles.reverse()
    return titles

# Sample markdown document. Its headings (#, ##, ###, ####) describe the same
# hierarchy that the hand-built DocSection samples below encode.
# NOTE(review): no code visible in this part of the notebook reads this
# variable - confirm whether it is still needed.
markdown_content = """
# Introduction to Machine Learning

Machine Learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models.

## Supervised Learning

Supervised learning is a type of machine learning where the algorithm learns from labeled training data.

### Classification

Classification is a supervised learning task where the output is a categorical variable.

#### Binary Classification

Binary classification involves predicting one of two possible outcomes.

### Regression

Regression is a supervised learning task where the output is a continuous variable.

## Unsupervised Learning

Unsupervised learning is a type of machine learning where the algorithm learns patterns from unlabeled data.

### Clustering

Clustering is an unsupervised learning task that involves grouping similar data points together.
"""

# Hand-built sample sections, one per heading of the sample document.
# An empty parent_id marks the root; every other section names its parent's id.
doc_sections = [
    DocSection(
        id="1",
        parent_id="",
        title="Introduction to Machine Learning",
        content="Machine Learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models.",
    ),
    DocSection(
        id="2",
        parent_id="1",
        title="Supervised Learning",
        content="Supervised learning is a type of machine learning where the algorithm learns from labeled training data.",
    ),
    DocSection(
        id="3",
        parent_id="2",
        title="Classification",
        content="Classification is a supervised learning task where the output is a categorical variable.",
    ),
    DocSection(
        id="4",
        parent_id="3",
        title="Binary Classification",
        content="Binary classification involves predicting one of two possible outcomes.",
    ),
    DocSection(
        id="5",
        parent_id="2",
        title="Regression",
        content="Regression is a supervised learning task where the output is a continuous variable.",
    ),
    DocSection(
        id="6",
        parent_id="1",
        title="Unsupervised Learning",
        content="Unsupervised learning is a type of machine learning where the algorithm learns patterns from unlabeled data.",
    ),
    DocSection(
        id="7",
        parent_id="6",
        title="Clustering",
        content="Clustering is an unsupervised learning task that involves grouping similar data points together.",
    ),
]

# Index the sections by id so each parent_id can be resolved directly.
doc_sections_map = create_doc_sections_map(doc_sections)

# Print the root-to-section title trail for every sample section.
for section in doc_sections:
    breadcrumbs = get_title_breadcrumbs(section, doc_sections_map)
    print(f"Breadcrumbs for '{section.title}': {' > '.join(breadcrumbs)}")


Breadcrumbs for 'Introduction to Machine Learning': Introduction to Machine Learning
Breadcrumbs for 'Supervised Learning': Introduction to Machine Learning > Supervised Learning
Breadcrumbs for 'Classification': Introduction to Machine Learning > Supervised Learning > Classification
Breadcrumbs for 'Binary Classification': Introduction to Machine Learning > Supervised Learning > Classification > Binary Classification
Breadcrumbs for 'Regression': Introduction to Machine Learning > Supervised Learning > Regression
Breadcrumbs for 'Unsupervised Learning': Introduction to Machine Learning > Unsupervised Learning
Breadcrumbs for 'Clustering': Introduction to Machine Learning > Unsupervised Learning > Clustering


In [7]:
from typing import List
import random

# Simulated functions for testing
def chunk_text(text: str, chunk_size: int = 10) -> List[str]:
    """Simulates chunking a text into smaller parts.

    The text is cut into consecutive, non-overlapping slices of
    ``chunk_size`` characters; the final slice may be shorter.

    Args:
        text: Text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk. Defaults to 10.

    Returns:
        The chunks in their original order; joining them gives ``text`` back.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently yields no chunks at all, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # Slicing counts Unicode code points, not bytes, so non-ASCII text is
    # never cut in the middle of an encoded character.
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def compute_embeddings(chunks: List[str]) -> List[List[float]]:
    """Simulates computing embeddings for text chunks.

    The chunk text itself is ignored: every chunk simply receives a vector of
    three floats drawn from the module-level ``random`` generator.

    Args:
        chunks: Text chunks to "embed".

    Returns:
        One 3-element list of floats per chunk, in the same order as ``chunks``.
    """
    vectors = []
    for _ in chunks:
        # List displays evaluate left to right, so the three draws happen in
        # component order.
        vectors.append([random.random(), random.random(), random.random()])
    return vectors

def test_populate_doc_section_summary_chunks_at_level():
    """Test the workflow logic with simplified inputs.

    Runs the filter -> chunk -> embed -> build-points pipeline on a handful
    of in-memory sections:

    1. keep only the sections whose ``level`` equals the requested level;
    2. split each kept summary with ``chunk_text``;
    3. embed every chunk in a single ``compute_embeddings`` call;
    4. build one point dict per chunk carrying the chunk text, its vector,
       and the id of the section it came from.

    The invariants that tie sections, chunks, vectors and points together are
    checked with assertions, then the points are printed for inspection.

    Raises:
        AssertionError: If filtering picks unexpected sections, the number of
            vectors differs from the number of chunks, or point ids collide.
    """
    # Simulated input data; only ids 1, 4 and 5 sit at the level filtered on below.
    doc_sections = [
        {"id": 1, "summary": "12312с фыва фыва 123 123 12 1ыв сыфсф ывс ывс", "level": 3},
        {"id": 2, "summary": "Another test summary for section two.", "level": 1},
        {"id": 3, "summary": "Final test summary for section three.", "level": 2},
        {"id": 4, "summary": "йцуу йцу йцу йц уйцуйц уйцу йцуйцу йцу", "level": 3},
        {"id": 5, "summary": "Хухухуху ухухх ху хух ух ухху ух уху ух хххуху думуд 3ю", "level": 3},
    ]
    model_doc = {"model_id": "1234", "doc_sections": doc_sections}

    # Specify the level to filter
    level = 3

    # Filter doc sections by the specified level
    filtered_doc_sections = [
        doc_section
        for doc_section in model_doc["doc_sections"]
        if doc_section["level"] == level
    ]
    assert [doc_section["id"] for doc_section in filtered_doc_sections] == [1, 4, 5]

    # Chunk summaries for each filtered doc section
    chunked_summaries = [chunk_text(doc_section["summary"]) for doc_section in filtered_doc_sections]

    # Flatten to one record per chunk *before* embedding, so a record's
    # position in this list is also the position of its vector. This replaces
    # a hand-maintained running index into the embeddings list.
    chunk_records = [
        (doc_section, chunk_index, chunk)
        for doc_section, summary_chunks in zip(filtered_doc_sections, chunked_summaries)
        for chunk_index, chunk in enumerate(summary_chunks)
    ]

    # Compute embeddings for all chunks in one call
    all_chunks = [chunk for _, _, chunk in chunk_records]
    embeddings = compute_embeddings(all_chunks)

    # zip() below stops at the shorter input, so a short embeddings list
    # would silently drop points; fail loudly instead.
    assert len(embeddings) == len(all_chunks), (
        f"expected {len(all_chunks)} embeddings, got {len(embeddings)}"
    )

    # Create points, preserving the link between chunks and doc_section.id
    points = [
        {
            "id": f"doc_section_summary_level{level}_{doc_section['id']}_{chunk_index}",
            "payload": {
                "summary": chunk,
                "level": str(level),
                "model_id": model_doc["model_id"],
                "doc_section_id": doc_section["id"]
            },
            "vector": vector
        }
        for (doc_section, chunk_index, chunk), vector in zip(chunk_records, embeddings)
    ]

    # Ids combine level, section id and chunk index and must never collide.
    point_ids = [point["id"] for point in points]
    assert len(set(point_ids)) == len(point_ids), "point ids must be unique"

    # Print the results for inspection
    print("Points:", points)

# Run the test. It prints the generated points; the vectors differ from run to
# run because compute_embeddings draws from `random` and no seed is set in the
# visible code.
test_populate_doc_section_summary_chunks_at_level()


Points: [{'id': 'doc_section_summary_level3_1_0', 'payload': {'summary': '12312с фыв', 'level': '3', 'model_id': '1234', 'doc_section_id': 1}, 'vector': [0.057234936310643025, 0.20998684916229038, 0.5690365095119472]}, {'id': 'doc_section_summary_level3_1_1', 'payload': {'summary': 'а фыва 123', 'level': '3', 'model_id': '1234', 'doc_section_id': 1}, 'vector': [0.3058681451762305, 0.7128078643256403, 0.01051870856249737]}, {'id': 'doc_section_summary_level3_1_2', 'payload': {'summary': ' 123 12 1ы', 'level': '3', 'model_id': '1234', 'doc_section_id': 1}, 'vector': [0.8762685193024727, 0.9564951172941304, 0.4525803796520377]}, {'id': 'doc_section_summary_level3_1_3', 'payload': {'summary': 'в сыфсф ыв', 'level': '3', 'model_id': '1234', 'doc_section_id': 1}, 'vector': [0.06512071531181485, 0.37245972165429275, 0.1974923887993565]}, {'id': 'doc_section_summary_level3_1_4', 'payload': {'summary': 'с ывс', 'level': '3', 'model_id': '1234', 'doc_section_id': 1}, 'vector': [0.787551550590798