In [1]:
from pymongo import MongoClient

# CLIENT: MongoClient = MongoClient("localhost", 27017)
# DB = CLIENT.antonio
# PEOPLE = DB.people
# DOCUMENTS = DB.documents
# for person in PEOPLE.find():
#     print(person)

# import numpy as np
# from qdrant_client import QdrantClient
# import os
#
# qdrant_client = QdrantClient(
#     url="https://b7fce096-1c85-492d-b757-1724657c30f2.eu-west-2-0.aws.cloud.qdrant."
#     "io:6333",
#     api_key=os.getenv("QDRANT_API_KEY"),
# )
#
# query_embedding = np.random.rand(384).tolist()  # Simulates a query embedding
#
# search_results = qdrant_client.search(
#     collection_name="llms",
#     query_vector=query_embedding,
#     limit=3,  # Return the 3 most similar results
# )
#
# for result in search_results:
#     print(f"ID: {result.id}, Score: {result.score}, Data: {result.payload}")

In [1]:
"""
Script to generate train and test datasets in the format of instruction-answer pairs.
"""

import re
import os
import json
import pandas as pd
from pymongo import MongoClient
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted


def _get_documents() -> list[str]:
    """
    Obtains all documents from MongoDB.

    Connects to the MongoDB server on ``localhost:27017``, iterates over every entry
    of the ``documents`` collection in the ``antonio`` database and collects the value
    stored under ``content -> Content``. The client is closed before returning.

    Returns
    -------
    List with all stored documents.

    Raises
    ------
    KeyError : If a stored entry lacks the ``content`` or ``Content`` key.
    """

    # MongoClient is a context manager: leaving the block closes the client so the
    # connection is not leaked on every call.
    with MongoClient("localhost", 27017) as client:
        collection = client.antonio.documents
        # The comprehension fully drains the cursor before the client is closed.
        return [entry["content"]["Content"] for entry in collection.find()]


def _clean_text(text: str) -> str:
    """
    Removes non-alphanumeric characters except for apostrophes, periods, commas,
    exclamation marks, and question marks. replace multiple consecutive whitespace
    characters with a single space.

    Parameters
    ----------
    text : Text to clean.

    Returns
    -------
    Cleaned text.
    """

    text = re.sub(r"[^\w\s.,!?']", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def _extract_chunks(
    documents: list[str], min_length: int = 1000, max_length: int = 2000
) -> list[str]:
    """
    Splits every document into sentence-aligned chunks.

    Each document is cleaned with ``_clean_text`` and split into sentences. Sentences
    are accumulated (separated by a single space) while the accumulated text plus the
    next sentence does not exceed ``max_length`` characters. When the next sentence
    would overflow, the accumulated text is emitted and a new chunk starts with that
    sentence. Accumulated text shorter than ``min_length`` is discarded instead of
    being emitted. Chunks never span two documents.

    Parameters
    ----------
    documents  : List with all the available documents.
    min_length : Minimum length of the chunk.
    max_length : Maximum length of the chunk.

    Returns
    -------
    Chunks obtained.
    """

    # Splits on whitespace that follows ".", "?" or "!", skipping positions ruled
    # out by the two negative look-behinds.
    splitter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s")
    chunks: list[str] = []

    def _flush(buffer: str) -> None:
        # Length is measured before stripping, i.e. including the trailing space.
        if len(buffer) >= min_length:
            chunks.append(buffer.strip())

    for raw_document in documents:
        buffer = ""

        for piece in splitter.split(_clean_text(raw_document)):
            piece = piece.strip()
            if not piece:
                continue

            if len(buffer) + len(piece) > max_length:
                # The sentence does not fit: close the current chunk and start over.
                _flush(buffer)
                buffer = ""

            buffer += piece + " "

        # Whatever is left at the end of the document is a candidate chunk too.
        _flush(buffer)

    return chunks


def _generate_dpo_dataset(
    extract: str,
    temperature: float = 0.7,
    model_name: str = "gemini-2.5-pro-exp-03-25",
) -> "genai.types.GenerateContentResponse | list":
    """
    Asks a Gemini model for five instruction / generated answer / extracted answer
    triples based on the extract. Higher temperatures will give more diverse outputs.

    The function does NOT parse the model output: it returns the raw response, whose
    ``text`` attribute is expected to hold the JSON document described in the prompt
    (top-level key ``preference_triples``).

    Parameters
    ----------
    extract     : Chunk of the document.
    temperature : Temperature.
    model_name  : Name of the Gemini model to query.

    Returns
    -------
    Raw response returned by ``generate_content``, or an empty list when the request
    is rejected with ``ResourceExhausted`` (callers must check before reading
    ``.text``).
    """

    # The example object lists exactly the three expected keys (no dangling entry or
    # trailing comma) so the model is not nudged towards malformed JSON.
    prompt = f"""Based on the following extract, generate five instruction-answer \
    triples. Each triple should consist of:
    1. An instruction asking about a specific topic in the context.
    2. A generated answer that attempts to answer the instruction based on the context.
    3. An extracted answer that is a relevant excerpt directly from the given context.
    Instructions must be self-contained and general, without explicitly mentioning a \
    context, system, course, or extract.

    Important:
    - Ensure that the extracted answer is a verbatim copy from the context, including \
    all punctuation and apostrophes.
    - Do not add any ellipsis (...) or [...] to indicate skipped text in the extracted \
    answer.
    - If the relevant text is not continuous, use two separate sentences from the \
    context instead of skipping text.
    
    Provide your response in JSON format with the following structure:
    {{
        "preference_triples": [
            {{
                "instruction": "...",
                "generated_answer": "...",
                "extracted_answer":"..."
            }},
            ...
        ]
    }}
    
    Extract:
    {extract}
    """

    # The API key is read from the environment on every call.
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    model = genai.GenerativeModel(model_name)

    try:
        response = model.generate_content(
            prompt, generation_config={"temperature": temperature}
        )
    except ResourceExhausted:
        # Best-effort behaviour: report the quota error and hand back an empty
        # result instead of aborting the whole run.
        print("WARNING: Requests per minute exceeded!")
        return []

    return response

In [2]:
# Load every stored document from MongoDB, then split them into text chunks using
# the default minimum/maximum chunk lengths of `_extract_chunks`.
documents = _get_documents()
chunks = _extract_chunks(documents)

In [5]:
# Query the model for the first chunk only (raises IndexError if no chunk was
# produced). NOTE: `_generate_dpo_dataset` returns an empty list instead of a
# response object when the API raises ResourceExhausted.
response = _generate_dpo_dataset(chunks[0])

In [8]:
# The response text may be wrapped in a Markdown code fence (```json ... ```).
# Remove the fence as an exact prefix/suffix: `str.strip(chars)` treats its argument
# as a *set of characters*, so it only works by accident and fails as soon as the
# fence is preceded or followed by whitespace.
json_str = (
    response.text.strip()
    .removeprefix("```json")
    .removeprefix("```")
    .removesuffix("```")
    .strip()
)
json_data = json.loads(json_str)

# Flatten each entry into (instruction, generated_answer, extracted_answer).
triplets = [
    (pair["instruction"], pair["generated_answer"], pair["extracted_answer"])
    for pair in json_data["preference_triples"]
]

{'preference_triples': [{'instruction': "What defines an AI character designed to replicate an individual's writing style?",
   'generated_answer': "An AI character designed to replicate someone's writing is defined as one that incorporates the individual's specific style, personality, and voice into a large language model.",
   'extracted_answer': 'It is an AI character that writes like yourself by incorporating your style, personality and voice into an LLM.'},
  {'instruction': 'What skills are acquired upon completing a program focused on building a production-ready AI replica?',
   'generated_answer': 'By completing the program, you will gain the skills to design, train, and deploy a production-ready AI replica, utilizing large language models, vector databases, and established LLMOps practices.',
   'extracted_answer': 'By finishing the LLM Twin Building Your Production Ready AI Replica free course, you will learn how to design, train, and deploy a production ready LLM twin of you