## Get raw data

In [1]:
# Author whose documents are looked up in the cells below.
full_name = "Paul Iusztin"

In [3]:
from llm_engineering.application import utils
from llm_engineering.domain.documents import UserDocument

In [4]:
# Split the full name into first/last parts with the project helper.
first_name, last_name = utils.split_user_full_name(full_name)
first_name, last_name

('Paul', 'Iusztin')

In [6]:
# Resolve the `UserDocument` for this name via `get_or_create`.
user = UserDocument.get_or_create(first_name=first_name, last_name=last_name)
user

UserDocument(id=UUID('b5fa1f08-75f0-402d-8e88-d1357e346d9e'), first_name='Paul', last_name='Iusztin')

In [8]:
# Keep the user's id as a string; it is passed as `author_id` by the fetch helpers below.
user_id = str(user.id)
user_id

'b5fa1f08-75f0-402d-8e88-d1357e346d9e'

In [10]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from llm_engineering.domain.base.nosql import NoSQLBaseDocument
from llm_engineering.domain.documents import ArticleDocument, Document, PostDocument, RepositoryDocument

In [11]:
def __fetch_articles(user_id) -> list[NoSQLBaseDocument]:
    """Look up articles through ``ArticleDocument.bulk_find`` with ``author_id=user_id``."""
    articles = ArticleDocument.bulk_find(author_id=user_id)
    return articles


def __fetch_posts(user_id) -> list[NoSQLBaseDocument]:
    """Look up posts through ``PostDocument.bulk_find`` with ``author_id=user_id``."""
    posts = PostDocument.bulk_find(author_id=user_id)
    return posts


def __fetch_repositories(user_id) -> list[NoSQLBaseDocument]:
    """Look up repositories through ``RepositoryDocument.bulk_find`` with ``author_id=user_id``."""
    repositories = RepositoryDocument.bulk_find(author_id=user_id)
    return repositories

In [12]:
# Run the three per-collection lookups concurrently and gather the results
# under the query name each future was submitted for.
with ThreadPoolExecutor() as executor:
    future_to_query = {
        executor.submit(__fetch_articles, user_id): "articles",
        executor.submit(__fetch_posts, user_id): "posts",
        executor.submit(__fetch_repositories, user_id): "repositories",
    }

    results = {}
    # as_completed yields futures as they finish, so the key order of
    # `results` follows completion order, not submission order.
    for future in as_completed(future_to_query):
        query_name = future_to_query[future]
        print(f"Query: '{query_name}'")
        try:
            results[query_name] = future.result()
        except Exception as exc:
            # Best effort: a failed query still yields an empty list, but the
            # cause is reported so a failure is not mistaken for "no documents".
            print(f"Exception: '{query_name}' request failed: {exc!r}")
            results[query_name] = []

results

Query: 'posts'
Query: 'repositories'
Query: 'articles'


{'posts': [],
 'repositories': [],
 'articles': [ArticleDocument(id=UUID('34978aea-e179-44b5-975c-7deb64456380'), content={'Title': 'An End-to-End Framework for Production-Ready LLM Systems by Building Your LLM Twin', 'Subtitle': 'From data gathering to productionizing LLMs using LLMOps good practices.', 'Content': "End-to-End Framework for Production-Ready LLMs | Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course: Building Your Production-Ready AI ReplicaAn End-to-End Framework for Production-Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin·FollowPublished inDecoding ML·16 min read·Mar 16, 20242.1K13ListenShare→ the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL-EWhy is this course different?By finishing the “LLM Twin: Building

In [13]:
# Show the raw per-query result lists. NOTE(review): the repr is very long;
# a per-key `len()` would be a lighter sanity check.
results.values()

dict_values([[], [], [ArticleDocument(id=UUID('34978aea-e179-44b5-975c-7deb64456380'), content={'Title': 'An End-to-End Framework for Production-Ready LLM Systems by Building Your LLM Twin', 'Subtitle': 'From data gathering to productionizing LLMs using LLMOps good practices.', 'Content': "End-to-End Framework for Production-Ready LLMs | Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course: Building Your Production-Ready AI ReplicaAn End-to-End Framework for Production-Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin·FollowPublished inDecoding ML·16 min read·Mar 16, 20242.1K13ListenShare→ the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL-EWhy is this course different?By finishing the “LLM Twin: Building Your Production-Ready AI R

In [14]:
# Flatten the per-query result lists into one list of documents.
documents = [item for batch in results.values() for item in batch]
len(documents)

50

In [15]:
# Peek at the first fetched document.
documents[0]

ArticleDocument(id=UUID('34978aea-e179-44b5-975c-7deb64456380'), content={'Title': 'An End-to-End Framework for Production-Ready LLM Systems by Building Your LLM Twin', 'Subtitle': 'From data gathering to productionizing LLMs using LLMOps good practices.', 'Content': "End-to-End Framework for Production-Ready LLMs | Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course: Building Your Production-Ready AI ReplicaAn End-to-End Framework for Production-Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin·FollowPublished inDecoding ML·16 min read·Mar 16, 20242.1K13ListenShare→ the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL-EWhy is this course different?By finishing the “LLM Twin: Building Your Production-Ready AI Replica” free course, y

In [17]:
# The collection name is what `_get_metadata` below groups documents by.
documents[0].get_collection_name()

<DataCategory.ARTICLES: 'articles'>

In [16]:
def _get_metadata(documents: list[Document]) -> dict:
    metadata = {
        "num_documents": len(documents),
    }
    for document in documents:
        collection = document.get_collection_name()
        if collection not in metadata:
            metadata[collection] = {}
        if "authors" not in metadata[collection]:
            metadata[collection]["authors"] = list()

        metadata[collection]["num_documents"] = metadata[collection].get("num_documents", 0) + 1
        metadata[collection]["authors"].append(document.author_full_name)

    for value in metadata.values():
        if isinstance(value, dict) and "authors" in value:
            value["authors"] = list(set(value["authors"]))

    return metadata

# Summarise the raw documents per collection.
_get_metadata(documents)

{'num_documents': 50,
 <DataCategory.ARTICLES: 'articles'>: {'authors': ['Paul Iusztin'],
  'num_documents': 50}}

## Cleaning

In [18]:
from llm_engineering.application.preprocessing import CleaningDispatcher
from llm_engineering.domain.cleaned_documents import CleanedDocument

[1;35mLoad pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2[0m




In [26]:
# Inspect the content dict of the first raw document.
documents[0].content

{'Title': 'An End-to-End Framework for Production-Ready LLM Systems by Building Your LLM Twin',
 'Subtitle': 'From data gathering to productionizing LLMs using LLMOps good practices.',
 'Content': "End-to-End Framework for Production-Ready LLMs | Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course: Building Your Production-Ready AI ReplicaAn End-to-End Framework for Production-Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin·FollowPublished inDecoding ML·16 min read·Mar 16, 20242.1K13ListenShare→ the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL-EWhy is this course different?By finishing the “LLM Twin: Building Your Production-Ready AI Replica” free course, you will learn how to design, train, and deploy a production-ready LLM t

In [23]:
# Keep only the values of the first document's content dict (the keys are dropped).
valid_content = documents[0].content.values()
valid_content

dict_values(['An End-to-End Framework for Production-Ready LLM Systems by Building Your LLM Twin', 'From data gathering to productionizing LLMs using LLMOps good practices.', "End-to-End Framework for Production-Ready LLMs | Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course: Building Your Production-Ready AI ReplicaAn End-to-End Framework for Production-Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin·FollowPublished inDecoding ML·16 min read·Mar 16, 20242.1K13ListenShare→ the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL-EWhy is this course different?By finishing the “LLM Twin: Building Your Production-Ready AI Replica” free course, you will learn how to design, train, and deploy a production-ready LLM twin of yourself powere

In [24]:
" #### ".join(valid_content)

"An End-to-End Framework for Production-Ready LLM Systems by Building Your LLM Twin #### From data gathering to productionizing LLMs using LLMOps good practices. #### End-to-End Framework for Production-Ready LLMs | Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course: Building Your Production-Ready AI ReplicaAn End-to-End Framework for Production-Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin·FollowPublished inDecoding ML·16 min read·Mar 16, 20242.1K13ListenShare→ the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL-EWhy is this course different?By finishing the “LLM Twin: Building Your Production-Ready AI Replica” free course, you will learn how to design, train, and deploy a production-ready LLM twin of yourself powered by LLMs

In [27]:
from llm_engineering.application.preprocessing.operations import clean_text
# Join the content fields with " #### " and run the result through `clean_text`.
clean_content = clean_text(" #### ".join(valid_content))
clean_content

'An End to End Framework for Production Ready LLM Systems by Building Your LLM Twin From data gathering to productionizing LLMs using LLMOps good practices. End to End Framework for Production Ready LLMs Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course Building Your Production Ready AI ReplicaAn End to End Framework for Production Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin FollowPublished inDecoding ML 16 min read Mar 16, 20242.1K13ListenShare the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL EWhy is this course different?By finishing the LLM Twin Building Your Production Ready AI Replica free course, you will learn how to design, train, and deploy a production ready LLM twin of yourself powered by LLMs, vector DBs, and

In [28]:
# Source platform recorded on the first document.
documents[0].platform

'medium'

In [29]:
# Accumulator for the cleaned documents, filled by the dispatch loop in the next cell.
cleaned_documents = []

In [30]:
# Rebuild the list inside this cell so that re-running it does not append a
# second copy of every cleaned document to an accumulator from another cell.
cleaned_documents = []
for document in documents:
    cleaned_document = CleaningDispatcher.dispatch(document)
    cleaned_documents.append(cleaned_document)
# Display the last cleaned document as a spot check.
cleaned_document

[32m2025-07-14 05:35:29.469[0m | [1mINFO    [0m | [36mllm_engineering.application.preprocessing.dispatchers[0m:[36mdispatch[0m:[36m49[0m - [1mDocument cleaned successfully.[0m
[32m2025-07-14 05:35:29.471[0m | [1mINFO    [0m | [36mllm_engineering.application.preprocessing.dispatchers[0m:[36mdispatch[0m:[36m49[0m - [1mDocument cleaned successfully.[0m
[32m2025-07-14 05:35:29.473[0m | [1mINFO    [0m | [36mllm_engineering.application.preprocessing.dispatchers[0m:[36mdispatch[0m:[36m49[0m - [1mDocument cleaned successfully.[0m
[32m2025-07-14 05:35:29.476[0m | [1mINFO    [0m | [36mllm_engineering.application.preprocessing.dispatchers[0m:[36mdispatch[0m:[36m49[0m - [1mDocument cleaned successfully.[0m
[32m2025-07-14 05:35:29.478[0m | [1mINFO    [0m | [36mllm_engineering.application.preprocessing.dispatchers[0m:[36mdispatch[0m:[36m49[0m - [1mDocument cleaned successfully.[0m
[32m2025-07-14 05:35:29.479[0m | [1mINFO    [0m | [36ml

CleanedArticleDocument(id=UUID('a520fdac-65b4-4340-9ee2-d16a1390b838'), content='DML Top 6 ML Platform Features You Must Know to Build an ML System Why serving an ML model using a batch architecture is so powerful? Top 6 ML platform features you must know. SubscribeSign in Share this post DML Top 6 ML Platform Features You Must Know to Build an ML System decodingml.substack.com Copy link Facebook Email Note Other DML Top 6 ML Platform Features You Must Know to Build an ML System Why serving an ML model using a batch architecture is so powerful? Top 6 ML platform features you must know. Paul Iusztin Aug 31, 2023 3 Share this post DML Top 6 ML Platform Features You Must Know to Build an ML System decodingml.substack.com Copy link Facebook Email Note Other 2 Share _Hello there, I am Paul Iusztin _ _Within this newsletter, I will help you decode complex topics about ML MLOps one week at a time _ This week we will cover 1. Top 6 ML platform features you must know to build an ML system 2. Wh

In [31]:
def _get_metadata(cleaned_documents: list[CleanedDocument]) -> dict:
    metadata = {"num_documents": len(cleaned_documents)}
    for document in cleaned_documents:
        category = document.get_category()
        if category not in metadata:
            metadata[category] = {}
        if "authors" not in metadata[category]:
            metadata[category]["authors"] = list()

        metadata[category]["num_documents"] = metadata[category].get("num_documents", 0) + 1
        metadata[category]["authors"].append(document.author_full_name)

    for value in metadata.values():
        if isinstance(value, dict) and "authors" in value:
            value["authors"] = list(set(value["authors"]))

    return metadata

In [32]:
# Summarise the cleaned documents per category.
_get_metadata(cleaned_documents)

{'num_documents': 50,
 <DataCategory.ARTICLES: 'articles'>: {'authors': ['Paul Iusztin'],
  'num_documents': 50}}

## Chunk and embed cleaned documents

### Chunk articles

In [43]:
# Chunk length bounds used by the chunking loop below; they are compared
# against `len()` of strings, i.e. measured in characters.
min_length = 1000
max_length = 2000

In [40]:
# Work on the first cleaned document's text and preview its first 1000 characters.
cleaned_content = cleaned_documents[0].content
cleaned_content[:1000]

'An End to End Framework for Production Ready LLM Systems by Building Your LLM Twin From data gathering to productionizing LLMs using LLMOps good practices. End to End Framework for Production Ready LLMs Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course Building Your Production Ready AI ReplicaAn End to End Framework for Production Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin FollowPublished inDecoding ML 16 min read Mar 16, 20242.1K13ListenShare the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL EWhy is this course different?By finishing the LLM Twin Building Your Production Ready AI Replica free course, you will learn how to design, train, and deploy a production ready LLM twin of yourself powered by LLMs, vector DBs, and

In [41]:
import re
# Split into sentences: break on a whitespace character that directly follows
# ".", "?" or "!", except where the preceding text matches `\w\.\w.`
# (e.g. "e.g.") or `[A-Z][a-z]\.` (e.g. "Mr."), so such abbreviations do not
# end a sentence.
sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", cleaned_content)
sentences[:5]

['An End to End Framework for Production Ready LLM Systems by Building Your LLM Twin From data gathering to productionizing LLMs using LLMOps good practices.',
 'End to End Framework for Production Ready LLMs Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course Building Your Production Ready AI ReplicaAn End to End Framework for Production Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin FollowPublished inDecoding ML 16 min read Mar 16, 20242.1K13ListenShare the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin?',
 'It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL EWhy is this course different?By finishing the LLM Twin Building Your Production Ready AI Replica free course, you will learn how to design, train, and deploy a production ready LLM twin of yourself powered by LLMs, vector

In [44]:
# Greedily pack consecutive sentences into chunks. A sentence joins the running
# chunk while the combined length stays within `max_length`; otherwise the
# running chunk is closed and the sentence starts a new one. A closed chunk is
# kept only if it is at least `min_length` long; shorter ones are dropped.
extracts = []
current_chunk = ""
for sentence in (raw.strip() for raw in sentences):
    if not sentence:
        continue

    if len(current_chunk) + len(sentence) > max_length:
        # Sentence does not fit: close the running chunk, then start afresh.
        if len(current_chunk) >= min_length:
            extracts.append(current_chunk.strip())
        current_chunk = ""

    current_chunk += sentence + " "

# Flush whatever is left after the last sentence, under the same length rule.
if len(current_chunk) >= min_length:
    extracts.append(current_chunk.strip())

extracts

['An End to End Framework for Production Ready LLM Systems by Building Your LLM Twin From data gathering to productionizing LLMs using LLMOps good practices. End to End Framework for Production Ready LLMs Decoding MLOpen in appSign upSign inWriteSign upSign inTop highlightLLM Twin Course Building Your Production Ready AI ReplicaAn End to End Framework for Production Ready LLM Systems by Building Your LLM TwinFrom data gathering to productionizing LLMs using LLMOps good practices.Paul Iusztin FollowPublished inDecoding ML 16 min read Mar 16, 20242.1K13ListenShare the 1st out of 12 lessons of the LLM Twin free courseWhat is your LLM Twin? It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.Image by DALL EWhy is this course different?By finishing the LLM Twin Building Your Production Ready AI Replica free course, you will learn how to design, train, and deploy a production ready LLM twin of yourself powered by LLMs, vector DBs, an