# Generate structured dataset

## `query_feature_store`

In [1]:
from llm_engineering.domain.cleaned_documents import CleanedArticleDocument

[32m2025-07-14 15:42:30.731[0m | [1mINFO    [0m | [36mllm_engineering.settings[0m:[36mload_settings[0m:[36m94[0m - [1mLoading settings from the ZenML secret store.[0m
[32m2025-07-14 15:42:30.832[0m | [1mINFO    [0m | [36mllm_engineering.infrastructure.db.mongo[0m:[36m__new__[0m:[36m20[0m - [1mConnection to MongoDB with URI successful: mongodb://llm_engineering:llm_engineering@127.0.0.1:27017[0m


[1;35mPyTorch version 2.2.2 available.[0m


[32m2025-07-14 15:42:32.860[0m | [1mINFO    [0m | [36mllm_engineering.infrastructure.db.qdrant[0m:[36m__new__[0m:[36m29[0m - [1mConnection to Qdrant DB with URI successful: localhost:6333[0m


In [2]:
from qdrant_client.http import exceptions
try:
    cleaned_documents, next_offset = CleanedArticleDocument.bulk_find(limit=1)
except exceptions.UnexpectedResponse:
    print(exceptions)

count = 0
while next_offset:
    documents, next_offset = CleanedArticleDocument.bulk_find(limit=1, offset=next_offset)
    cleaned_documents.extend(documents)
    count +=1
print(f"{count=}")

[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scroll[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/cleaned_articles/points/scro

In [3]:
len(cleaned_documents)

71

In [8]:
import pprint

pprint.pp(cleaned_documents[0].content)

('What is a Tensor in Machine Learning? Maxime Labonne The difference between '
 'tensors, arrays, and matrices Maxime Labonne SubscribeSign in Share this '
 'post Maxime Labonne What is a Tensor in Machine Learning? Copy link Facebook '
 'Email Notes More What is a Tensor in Machine Learning? The difference '
 'between tensors, arrays, and matrices Maxime Labonne Mar 29, 2022 Share this '
 'post Maxime Labonne What is a Tensor in Machine Learning? Copy link Facebook '
 'Email Notes More Share The difference between tensors, arrays, and matrices '
 'Image by author What is a tensor, exactly? Most deep learning practitioners '
 'know about them but can t pinpoint an exact definition . TensorFlow, PyTorch '
 'every deep learning framework relies on the same basic object tensors . They '
 're used to store almost everything in deep learning input data, weights, '
 'biases, predictions, etc. And yet, their definition is incredibly fuzzy the '
 'Wikipedia category alone has over 100 pages r

## `prompt`

In [9]:
from llm_engineering.application.dataset.generation import InstructionDatasetGenerator
from llm_engineering.domain.cleaned_documents import CleanedDocument
from llm_engineering.application.preprocessing.operations.chunking import chunk_document


def extract_substrings(
    documents: list[CleanedDocument], min_length: int = 1000, max_length: int = 2000
) -> list[CleanedDocument]:
    extracts = []
    for document in documents:
        document_extracts = chunk_document(document.content, min_length, max_length)
        for extract in document_extracts:
            subdocument = document.model_copy()
            subdocument.content = extract

            extracts.append(subdocument)

    return extracts

In [10]:
chunked_documents = extract_substrings(cleaned_documents)
len(chunked_documents)

430

In [12]:
pprint.pp(chunked_documents[0])

CleanedArticleDocument(id=UUID('004f70e9-fe53-4c5b-b5d2-c3ab4935ac23'), content='What is a Tensor in Machine Learning? Maxime Labonne The difference between tensors, arrays, and matrices Maxime Labonne SubscribeSign in Share this post Maxime Labonne What is a Tensor in Machine Learning? Copy link Facebook Email Notes More What is a Tensor in Machine Learning? The difference between tensors, arrays, and matrices Maxime Labonne Mar 29, 2022 Share this post Maxime Labonne What is a Tensor in Machine Learning? Copy link Facebook Email Notes More Share The difference between tensors, arrays, and matrices Image by author What is a tensor, exactly? Most deep learning practitioners know about them but can t pinpoint an exact definition . TensorFlow, PyTorch every deep learning framework relies on the same basic object tensors . They re used to store almost everything in deep learning input data, weights, biases, predictions, etc. And yet, their definition is incredibly fuzzy the Wikipedia cate

In [13]:
grouped_cleaned_documents = CleanedDocument.group_by_category(documents)
grouped_cleaned_documents

{<DataCategory.ARTICLES: 'articles'>: [CleanedArticleDocument(id=UUID('fcf51cb1-c553-48aa-abeb-93a6b7b59d71'), content='Constraint Programming in Python Maxime Labonne The Programming Paradigm to Find One Solution Among 8,080,104 Candidates Maxime Labonne SubscribeSign in Share this post Maxime Labonne Constraint Programming in Python Copy link Facebook Email Notes More Constraint Programming in Python The Programming Paradigm to Find One Solution Among 8,080,104 Candidates Maxime Labonne May 02, 2022 Share this post Maxime Labonne Constraint Programming in Python Copy link Facebook Email Notes More Share The Programming Paradigm to Find One Solution Among 8,080,104 Candidates Image by author, emojis by OpenMoji CC BY SA 4.0 Constraint Programming is a technique to find every solution that respects a set of predefined constraints. It is an invaluable tool for data scientists to solve a huge variety of problems, such as scheduling, timetabling, sequencing, etc. In this article, we ll se

In [22]:
# grouped_prompts = {}
# grouped_cleaned_documents = CleanedDocument.group_by_category(documents)
# for category, category_documents in grouped_cleaned_documents.items():
#     category_prompts = [InstructionDatasetGenerator.get_prompt(document) for document in category_documents]
#     grouped_prompts[category] = category_prompts

# grouped_prompts

In [24]:
for category, category_documents in grouped_cleaned_documents.items():
    print(f"{category=}")
    print("PROMPT: ")
    print(InstructionDatasetGenerator.get_prompt(category_documents[0]))
    break

category=<DataCategory.ARTICLES: 'articles'>
PROMPT: 
id=UUID('c23be0a9-3fab-42c0-b5ea-888c6007b357') template='Based on the following extract, generate five instruction-answer pairs. Each instruction must ask to write about a specific topic contained in the context. Each answer must provide a relevant paragraph based on the information found in the context. Only use concepts from the context to generate the instructions. Instructions must never explicitly mention a context, a system, a course, or an extract. Instructions must be self-contained and general. Answers must imitate the writing style of the context.     \nExample instruction: Explain the concept of an LLM Twin. Example answer: An LLM Twin is essentially an AI character that mimics your writing style, personality, and voice. It\'s designed to write just like you by incorporating these elements into a language model. The idea is to create a digital replica of your writing habits using advanced AI techniques. \nStructure the a