In [1]:
from pymongo import MongoClient

# CLIENT: MongoClient = MongoClient("localhost", 27017)
# DB = CLIENT.antonio
# PEOPLE = DB.people
# DOCUMENTS = DB.documents
# for person in PEOPLE.find():
#     print(person)

# import numpy as np
# from qdrant_client import QdrantClient
# import os
#
# qdrant_client = QdrantClient(
#     url="https://b7fce096-1c85-492d-b757-1724657c30f2.eu-west-2-0.aws.cloud.qdrant."
#     "io:6333",
#     api_key=os.getenv("QDRANT_API_KEY"),
# )
#
# query_embedding = np.random.rand(384).tolist()  # Simulates a query embedding
#
# search_results = qdrant_client.search(
#     collection_name="llms",
#     query_vector=query_embedding,
#     limit=3,  # Returns the 3 most similar results
# )
#
# for result in search_results:
#     print(f"ID: {result.id}, Score: {result.score}, Data: {result.payload}")

In [2]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split


def get_documents() -> list[str]:
    """Collect the content of every record in the local MongoDB collection.

    Connects to the MongoDB server at ``localhost:27017``, walks the
    ``documents`` collection of the ``antonio`` database and gathers
    ``doc["content"]["Content"]`` from each record.

    Returns:
        The gathered values, in the order the cursor yields them.
        (Presumably plain strings, as the annotation says -- verify against
        the code that populates the collection.)

    Raises:
        KeyError: If a record has no ``content`` or ``Content`` key.
    """
    # Using the client as a context manager closes it (and its connections)
    # when the block exits, even if reading a record raises; the original
    # left the client open on every call.
    with MongoClient("localhost", 27017) as client:
        collection = client.antonio.documents
        return [doc["content"]["Content"] for doc in collection.find()]


def clean_text(text):
    """Return ``text`` with unwanted symbols removed and whitespace normalised.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ? '`` is replaced by a space; runs of whitespace are then
    collapsed to a single space and the ends are trimmed.
    """
    without_symbols = re.sub(r"[^\w\s.,!?']", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_symbols)
    return single_spaced.strip()


def extract_chunks(
    documents: list[str], min_length: int = 1000, max_length: int = 2000
) -> list[str]:
    """Group the sentences of each document into length-bounded chunks.

    Each document is cleaned with ``clean_text`` and split on whitespace that
    follows ``.``, ``?`` or ``!`` (the lookbehinds skip a couple of
    abbreviation-like patterns). Sentences are appended to a running buffer,
    each followed by a space, while the buffer length plus the next sentence
    length stays within ``max_length``. When the next sentence would not fit,
    the buffer is emitted only if it has reached ``min_length`` characters
    (otherwise it is dropped) and a new buffer starts with that sentence.
    Buffers never span two documents.

    Args:
        documents: Raw document texts.
        min_length: Minimum buffer length (including trailing space) for a
            chunk to be kept.
        max_length: Limit used when deciding whether the next sentence fits.

    Returns:
        The kept chunks, stripped of surrounding whitespace, in document order.
    """
    boundary = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s")
    chunks: list[str] = []

    def flush(buffer: str) -> None:
        # Buffers shorter than min_length are discarded rather than emitted.
        if len(buffer) >= min_length:
            chunks.append(buffer.strip())

    for document in documents:
        pieces = (piece.strip() for piece in boundary.split(clean_text(document)))
        buffer = ""
        for sentence in filter(None, pieces):
            if len(buffer) + len(sentence) > max_length:
                # The sentence does not fit: close the buffer and start anew.
                flush(buffer)
                buffer = ""
            buffer += sentence + " "
        # Whatever is left at the end of the document gets the same check.
        flush(buffer)
    return chunks

In [3]:
import os
import json
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted


def _parse_instruction_answer_pairs(raw_text):
    """Convert the model's raw reply into a list of (instruction, answer) tuples.

    An optional Markdown code fence around the JSON payload (an opening
    ``` or ```json and a closing ```) is removed first.

    Raises:
        ValueError: If the payload is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
        KeyError / TypeError: If the decoded JSON lacks the expected structure.
    """
    cleaned = raw_text.strip()
    # Remove the fence as a prefix/suffix. ``str.strip("```json\n")`` would
    # treat its argument as a *set of characters* and could also eat leading
    # or trailing 'j', 's', 'o', 'n' characters belonging to the payload.
    for fence in ("```json", "```"):
        if cleaned.startswith(fence):
            cleaned = cleaned[len(fence):]
            break
    cleaned = cleaned.strip().removesuffix("```").strip()

    json_data = json.loads(cleaned)
    return [
        (pair["instruction"], pair["answer"])
        for pair in json_data["instruction_answer_pairs"]
    ]


def generate_instruction_answer_pairs(answer):
    """Ask the Gemini model for instruction-answer pairs about one text extract.

    Builds a prompt around ``answer``, sends it to the
    ``gemini-2.5-pro-exp-03-25`` model using the API key from the
    ``GEMINI_API_KEY`` environment variable, and parses the JSON reply.

    Args:
        answer: The text extract the pairs should be based on.

    Returns:
        A list of ``(instruction, answer)`` tuples. An empty list is returned,
        after printing a warning, when the request is rejected with
        ``ResourceExhausted`` or when the reply cannot be parsed.
    """
    prompt = f"""Based on the following extract, generate five instruction-answer \
    pairs. Each instruction must ask to write about a specific topic contained in the \
    context. Each answer must provide a relevant paragraph based on the information \
    found in the context. Only use concepts from the context to generate the \
    instructions. Instructions must never explicitly mention a context, a system, a \
    course, or an extract. Instructions must be self-contained and general. Answers \
    must imitate the writing style of the context.
    
    Example instruction: Explain the concept of an LLM Twin.
    Example answer: An LLM Twin is essentially an AI character that mimics your \
    writing style, personality, and voice. It's designed to write just like you by \
    incorporating these elements into a language model. The idea is to create a \
    digital replica of your writing habits using advanced AI techniques.
    
    Provide your response in JSON format with the following structure:
    {{
        "instruction_answer_pairs": [
            {{"instruction": "...", "answer": "..."}},
            ...
        ]
    }}
    
    Extract:
    {answer}
    """

    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    model = genai.GenerativeModel("gemini-2.5-pro-exp-03-25")
    try:
        response = model.generate_content(prompt)
    except ResourceExhausted:
        print("WARNING: Requests per minute exceeded!")
        return []

    # One malformed reply should not abort a whole dataset run, so parse
    # failures are reported and skipped the same way rate-limit errors are.
    # NOTE(review): ``response.text`` is read inside the ``try`` on the
    # assumption that it may itself raise for replies without text -- confirm.
    try:
        return _parse_instruction_answer_pairs(response.text)
    except (ValueError, KeyError, TypeError) as error:
        print(f"WARNING: Could not parse the model response: {error!r}")
        return []

In [4]:
def generate_dataset(out_path: str = "", random_state: "int | None" = None) -> None:
    """Build the instruction-answer dataset and write train/test CSV files.

    Loads the documents, splits them into chunks, requests instruction-answer
    pairs for every chunk, and writes an 80/20 train/test split to
    ``{out_path}df_train.csv`` and ``{out_path}df_test.csv``.

    Args:
        out_path: Prefix prepended verbatim to both file names; include a
            trailing path separator when it is meant to be a directory.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split unseeded, as before.

    Raises:
        ValueError: If no instruction-answer pair could be generated.
    """
    # Extract documents, generate chunks and obtain pairs
    documents = get_documents()
    chunks = extract_chunks(documents)

    # The free plan allows only 5 requests per minute, so some chunks may come
    # back empty when the rate limit is hit.
    instruction_answer_pairs = []
    for chunk in chunks:
        instruction_answer_pairs.extend(generate_instruction_answer_pairs(chunk))

    # Fail with a clear message instead of the cryptic "not enough values to
    # unpack" that unzipping an empty list used to produce.
    if not instruction_answer_pairs:
        raise ValueError(
            "No instruction-answer pairs were generated; nothing to save."
        )

    # Save dfs
    df = pd.DataFrame(instruction_answer_pairs, columns=["instructions", "answers"])
    df_train, df_test = train_test_split(
        df, test_size=0.2, random_state=random_state
    )
    df_train.to_csv(f"{out_path}df_train.csv")
    df_test.to_csv(f"{out_path}df_test.csv")

In [5]:
# Run the full pipeline with the default (empty) out_path prefix, which writes
# df_train.csv and df_test.csv using bare file names.
generate_dataset()

[("Describe the concept of an AI character designed to replicate a person's writing.", "This type of AI character is designed to write like a specific individual by incorporating their unique style, personality, and voice directly into a large language model (LLM). The goal is to create an AI that effectively mirrors the person's writing patterns and characteristics."), ('What are the core components involved in creating a production-ready AI replica?', 'Building a production-ready AI replica involves integrating several key technologies and practices. You would typically utilize Large Language Models (LLMs), vector databases (vector DBs), and adhere to LLMOps good practices throughout the design, training, and deployment phases.'), ('What are the benefits of building and deploying an end-to-end production-grade LLM system?', 'Building and deploying a complete, production-grade LLM system allows you to move beyond isolated scripts or notebooks. It provides practical experience in produ

AttributeError: module 'pandas' has no attribute 'Dataframe'