<a href="https://colab.research.google.com/github/bharathbolla/The-LLM-Cookbook-Practical-Recipes-for-Fine-Tuning-Optimization-and-Deployment/blob/main/Chapter_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from huggingface_hub import HfApi
from huggingface_hub import login

import os
from getpass import getpass

# Never hard-code an access token in a notebook that is shared or committed:
# read it from the HF_TOKEN environment variable and, if it is not set, fall
# back to an interactive prompt that does not echo the value.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

api = HfApi()
# Use the same token for the identity check and for the login so that the
# account printed below is the one the rest of the notebook runs as.
whoami = api.whoami(token=hf_token)
print(whoami)
login(hf_token)

## Recipe-1: Generating QA Pairs with a Judge Model


In [None]:
# --- Procedure 1: Generating Question-Answer Pairs with a Judge Model ---
# Goal: Use a multi-model, generate-and-rank pipeline to create high-quality synthetic data.
# Prerequisites: pip install transformers torch accelerate bitsandbytes
# Note: This procedure loads TWO models. While smaller models are used here to reduce VRAM,
#       it can still be resource-intensive (>16GB recommended).

import torch
from transformers import pipeline, AutoTokenizer
import json
import re

# 1. --- Configuration ---
# A small, fast model to generate multiple candidate answers
GENERATOR_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# A separate, larger model with stronger reasoning to evaluate the candidates
JUDGE_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"


# 2. --- Setup the Generator and Judge Model Pipelines ---
def load_text_generation_pipeline(model_id, role):
    """Load a tokenizer and a bfloat16 text-generation pipeline for ``model_id``.

    Args:
        model_id: Hugging Face Hub repository id of the model to load.
        role: Short label ("generator" or "judge") used only in the
            progress and error messages.

    Returns:
        A ``(tokenizer, pipeline)`` tuple.

    Raises:
        Exception: Whatever the tokenizer/pipeline loading raised. The error
            is printed and re-raised instead of calling ``exit()``, which
            would shut down a notebook kernel (or end a script with a
            success status) and hide the traceback.
    """
    try:
        # ``trust_remote_code`` is deliberately left at its default (False):
        # both models use architectures that ship with transformers, so there
        # is no need to execute code downloaded from the model repository.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        pipe = pipeline(
            "text-generation",
            model=model_id,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
    except Exception as e:
        print(f"Error loading {role} model pipeline: {e}")
        raise
    print(f"{role.capitalize()} model pipeline loaded successfully.")
    return tokenizer, pipe


print(f"Loading GENERATOR model: {GENERATOR_MODEL_ID}")
generator_tokenizer, generator_pipe = load_text_generation_pipeline(
    GENERATOR_MODEL_ID, "generator"
)

print(f"\nLoading JUDGE model: {JUDGE_MODEL_ID}")
judge_tokenizer, judge_pipe = load_text_generation_pipeline(
    JUDGE_MODEL_ID, "judge"
)

# 3. --- Define the Document and Prompt Templates ---
# Source passage that both the generator and the judge are grounded on.
document = """
Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy, through a process that converts carbon dioxide and water into sugars (glucose) and oxygen. This process is crucial for life on Earth as it produces most of the oxygen in the atmosphere. The chemical equation for photosynthesis is 6CO2 + 6H2O → C6H12O6 + 6O2. Chlorophyll is the primary pigment used in photosynthesis; it absorbs blue and red light and reflects green light, which is why plants appear green.
"""

# Prompt for the generator model; ``{document}`` is filled in with str.format.
generator_prompt_template = """
Given the following document, generate three distinct question-answer pairs based on its content. The questions should be insightful and the answers should be accurate and concise. Format the output as a list of JSON objects, where each object has a "question" and an "answer" key.

Document:
\"\"\"
{document}
\"\"\"

JSON Output:
"""

# Prompt for the judge model; ``{document}`` and ``{candidate_sets}`` are
# filled in with str.format.
# The template ends at "Reasoning:" so that the model writes its reasoning and
# then emits the "Final Choice:" marker itself. Pre-filling both headers (as
# an earlier version did) makes the model continue *after* "Final Choice:",
# which skips the reasoning and removes the marker from the generated text.
judge_prompt_template = """
You are an expert evaluator. Your task is to analyze multiple sets of synthetically generated question-answer pairs based on a source document. Evaluate them based on the following criteria:
1.  **Accuracy**: Is the answer factually correct according to the document?
2.  **Relevance**: Does the question directly relate to a key concept in the document?
3.  **Clarity**: Are the question and answer easy to understand?

Below are the source document and the candidate QA sets.

Source Document:
\"\"\"
{document}
\"\"\"

---
Candidate QA Sets:
{candidate_sets}
---

First, provide a brief step-by-step reasoning for your choice, explaining which set is superior and why.
Finally, on a new line that starts with "Final Choice:", state your final choice by reprinting the single best JSON list.

Reasoning:
"""

# 4. --- Generate Multiple Candidate QA Sets ---
def extract_json_list(text):
    """Return the first substring of ``text`` that parses as a JSON list.

    Every ``[`` in ``text`` is tried in order as a possible start of a JSON
    document. Unlike a greedy ``\\[.*\\]`` regex this stops at the end of the
    list itself, ignores trailing chatter that happens to contain ``]``, and
    rejects truncated or malformed output.

    Args:
        text: Raw model output.

    Returns:
        The matching JSON text, or ``None`` when no valid list is present.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            parsed, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed, end = None, start
        if isinstance(parsed, list):
            return text[start:end]
        start = text.find("[", start + 1)
    return None


prompt = generator_prompt_template.format(document=document)
print("\nGenerating multiple candidate synthetic data sets...")
candidate_responses = []
try:
    # Generate two different sets of responses
    outputs = generator_pipe(
        prompt,
        max_new_tokens=300,
        num_return_sequences=2,  # Generate two candidates
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        eos_token_id=generator_tokenizer.eos_token_id,
        # Ask the pipeline for the continuation only, rather than slicing the
        # prompt off by its character length afterwards.
        return_full_text=False,
    )

    for i, out in enumerate(outputs):
        candidate_json = extract_json_list(out['generated_text'])
        if candidate_json is None:
            # Truncated or malformed output would only confuse the judge.
            print(f"\n--- Candidate {i+1} skipped: no valid JSON list found ---")
            continue
        candidate_responses.append(candidate_json)
        print(f"\n--- Candidate {i+1} ---")
        print(candidate_json)

    if not candidate_responses:
        raise ValueError("No valid JSON lists were generated by the generator model.")

except Exception as e:
    print(f"\nAn error occurred during candidate generation: {e}")
    # Re-raise instead of exit(): later steps cannot run without candidates,
    # and exit() would shut down a notebook kernel and hide the traceback.
    raise

# 5. --- Use the Judge Model to Rank and Select the Best Set ---
def extract_final_choice(response):
    """Parse the QA list selected by the judge out of its free-text answer.

    The text after the last "Final Choice:" marker is searched first. If the
    marker is missing (for example because the prompt already ended with it)
    or nothing parseable follows it, the whole response is searched.

    Args:
        response: Text generated by the judge model, without the prompt.

    Returns:
        The first non-empty JSON list of objects found, as Python data, or
        ``None`` when the response contains no such list.
    """
    decoder = json.JSONDecoder()

    def first_object_list(text):
        start = text.find("[")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                parsed = None
            # Require a list of objects so that bracketed fragments in the
            # reasoning (e.g. "[1]") are not mistaken for the answer.
            if (
                isinstance(parsed, list)
                and parsed
                and all(isinstance(item, dict) for item in parsed)
            ):
                return parsed
            start = text.find("[", start + 1)
        return None

    after_marker = response.rpartition("Final Choice:")[2]
    chosen = first_object_list(after_marker)
    if chosen is None and after_marker != response:
        chosen = first_object_list(response)
    return chosen


print("\n--- Submitting candidates to the Judge Model for evaluation ---")

# Format the candidates for the judge prompt
formatted_candidates = "".join(
    f"Candidate {i+1}:\n{candidate_json}\n\n"
    for i, candidate_json in enumerate(candidate_responses)
)

judge_prompt = judge_prompt_template.format(
    document=document,
    candidate_sets=formatted_candidates
)

try:
    judge_outputs = judge_pipe(
        judge_prompt,
        max_new_tokens=512,
        do_sample=False,  # We want a deterministic judgment
        eos_token_id=judge_tokenizer.eos_token_id,
        # Return only the continuation instead of slicing the prompt off by
        # its character length.
        return_full_text=False,
    )

    judge_response = judge_outputs[0]['generated_text']

    print("\n--- Judge's Full Response ---")
    print(judge_response)

    # --- Extract the Final Curated Dataset ---
    parsed_json = extract_final_choice(judge_response)

    if parsed_json is not None:
        pretty_json = json.dumps(parsed_json, indent=2)

        print("\n--- Final Curated QA Pairs (Selected by Judge) ---")
        print(pretty_json)
    else:
        print("\n--- Could not parse the final choice from the judge's output ---")

except Exception as e:
    # Best-effort step: report the problem but keep the session alive.
    print(f"\nAn error occurred during judgment: {e}")


2025-08-31 07:28:07.922171: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756625288.250429      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756625288.344962      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading GENERATOR model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


Generator model pipeline loaded successfully.

Loading JUDGE model: HuggingFaceH4/zephyr-7b-beta


tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


Judge model pipeline loaded successfully.

Generating multiple candidate synthetic data sets...

--- Candidate 1 ---
[
    {"question": "What is photosynthesis and how does it produce oxygen?", "answer": "Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy through a process that converts carbon dioxide and water into sugars (glucose) and oxygen. This process is crucial for life on Earth as it produces most of the oxygen in the atmosphere."},
    {"question": "What is chlorophyll and how does it contribute to photosynthesis?", "answer": "Chlorophyll is the primary pigment used in photosynthesis. It absorbs blue and red light and reflects green light, which is why plants appear green. Chlorophyll is responsible for converting light energy into chemical energy."},
    {"question": "What is the chemical equation for photosynthesis and how does it work?", "answer": "The chemical equation for photosynthesis is 6CO2 + 6H2O → C6H

## Recipe-2: Generating Instructions (Self-Instruct Style)

In [None]:
# --- Recipe: Generating Instructions (Self-Instruct Style) ---
# Goal: Use an LLM to generate new, diverse instructions based on seed examples.
# Method: Prompts an instruction-tuned LLM with examples to generate more instructions.
# Note: This only generates the *instructions*, not the corresponding outputs.
#       Quality depends heavily on the LLM and the seed examples.

import torch
from transformers import pipeline

# --- Configuration ---
# Use a strong instruction-tuned model if possible
MODEL_ID = "google/gemma-2b-it" # Or Mistral-Instruct, etc.
MAX_NEW_TOKENS_INSTR = 300 # Max tokens for the list of new instructions

# Use GPU if available (pipeline convention: 0 = first GPU, -1 = CPU)
device_index = 0 if torch.cuda.is_available() else -1
# bfloat16 only where the GPU supports it; otherwise fall back to float32
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32

# --- 1. Load Model and Tokenizer (via Pipeline) ---
print(f"Loading pipeline for model: {MODEL_ID}")
try:
    generator = pipeline(
        "text-generation",
        model=MODEL_ID,
        tokenizer=MODEL_ID,
        torch_dtype=dtype,
        device=device_index,
    )
except Exception as e:
    print(f"Error loading pipeline: {e}")
    # Re-raise instead of exit(): nothing below can run without the model,
    # and exit() would shut down a notebook kernel and hide the traceback.
    raise

# Make sure a pad token is defined on both the tokenizer and the model
# config, reusing EOS when the checkpoint does not declare one.
if generator.tokenizer.pad_token is None:
    generator.tokenizer.pad_token = generator.tokenizer.eos_token
if generator.model.config.pad_token_id is None:
    generator.model.config.pad_token_id = generator.tokenizer.pad_token_id
print("Pipeline loaded.")

# --- 2. Define Seed Instructions ---
# Provide a diverse set of examples to guide the generation
seed_instructions = [
    "Explain the difference between supervised and unsupervised learning.",
    "Write a short email inviting a colleague to a project meeting.",
    "Generate a list of 5 creative names for a new coffee shop.",
    "What are the main benefits of using renewable energy sources?",
    "Provide Python code to read a CSV file into a pandas DataFrame.",
]

# --- 3. Craft the Prompt for Instruction Generation ---
# The prompt asks the model to act as an instruction generator
# and create new instructions similar to, but distinct from, the seeds.
# The bullet list is built from the whole seed list, so adding or removing a
# seed above is reflected in the prompt without editing the template.
seed_bullets = "\n".join(f"- {instruction}" for instruction in seed_instructions)

prompt = f"""You are an expert instruction generator. Your task is to create a list of 5 new, diverse instructions that are different from the examples provided below, but cover a similar range of topics and task types (e.g., explanation, writing, brainstorming, coding, Q&A).

Do not repeat the examples. Ensure the new instructions are clear and actionable.

Examples of existing instructions:
\"\"\"
{seed_bullets}
\"\"\"

Generate 5 new instructions below, formatted as a numbered list:
1. """ # Start the list for the model

import re

print("--- Generating New Instructions ---")
print(f"Prompt being sent to model:\n{prompt}")

# --- 4. Generate New Instructions ---
try:
    outputs = generator(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS_INSTR,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.8, # Higher temperature for more diversity
        top_p=0.95,
        pad_token_id=generator.tokenizer.eos_token_id
    )

    generated_text = outputs[0]['generated_text']
    # Extract the generated list part: everything after the prompt's final
    # line, with the "1. " that the prompt already supplied put back in front.
    instruction_list_str = "1. " + generated_text.split("Generate 5 new instructions below, formatted as a numbered list:\n1. ")[-1].strip()

    print("\n--- Generated Instructions ---")
    print(instruction_list_str)

    # --- 5. Post-processing (Optional) ---
    # Split the string into a list of instructions. A line counts as an item
    # when it starts (after optional indentation) with a number followed by
    # "." or ")" and some text; indented items are therefore kept and empty
    # items such as a bare "1." are dropped.
    numbered_item = re.compile(r"^\s*\d+[.)]\s+(.+?)\s*$")
    item_matches = (numbered_item.match(line) for line in instruction_list_str.splitlines())
    generated_instructions = [match.group(1) for match in item_matches if match]
    print("\n--- Parsed Instructions ---")
    print(generated_instructions)
    # Next step would be to filter these and then generate outputs for them (using another LLM call).

except Exception as e:
    # Best-effort step: report the problem but keep the session alive.
    print(f"Error during instruction generation: {e}")

# --- End of Recipe ---


Loading pipeline for model: google/gemma-2b-it


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda:0


Pipeline loaded.
--- Generating New Instructions ---
Prompt being sent to model:
You are an expert instruction generator. Your task is to create a list of 5 new, diverse instructions that are different from the examples provided below, but cover a similar range of topics and task types (e.g., explanation, writing, brainstorming, coding, Q&A).

Do not repeat the examples. Ensure the new instructions are clear and actionable.

Examples of existing instructions:
"""
- Explain the difference between supervised and unsupervised learning.
- Write a short email inviting a colleague to a project meeting.
- Generate a list of 5 creative names for a new coffee shop.
- What are the main benefits of using renewable energy sources?
- Provide Python code to read a CSV file into a pandas DataFrame.
"""

Generate 5 new instructions below, formatted as a numbered list:
1. 

--- Generated Instructions ---
1. 🤖 Design a 3D printed object that can float in the air.
2. Create a persuasive argument for why 

## Recipe-3: Data Augmentation via Transformation

In [None]:
# --- Procedure 3: Data Augmentation via Transformation ---
# Goal: Augment a small dataset using a variety of LLM-based and rule-based transformation techniques.
# Prerequisites: pip install transformers[sentencepiece] torch accelerate
# Note: This procedure uses multiple models and can be resource-intensive.

import torch
from transformers import pipeline
import random

# 1. --- Setup Augmentation Models ---
print("Setting up augmentation models...")

# Device index in the pipeline convention (-1 = CPU, k = k-th GPU).
# The second GPU is used when there is one, which leaves the first GPU free;
# with a single GPU this falls back to GPU 0, and with no GPU to the CPU.
# A hard-coded ``device=1`` makes every load below fail on single-GPU
# machines, and because the failures are caught, the recipe would then
# silently skip all model-based augmentations.
augmentation_device = min(1, torch.cuda.device_count() - 1)

# Use translation pipelines for multilingual augmentation (back-translation)
try:
    translator_en_de = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", device=augmentation_device)
    translator_de_en = pipeline("translation_de_to_en", model="Helsinki-NLP/opus-mt-de-en", device=augmentation_device)
    print("Translation models loaded.")
except Exception as e:
    # Best-effort: back-translation is skipped later when these are None.
    print(f"Could not load translation models: {e}")
    translator_en_de, translator_de_en = None, None

# Use a text generation model for more sophisticated transformations
try:
    augmenter_pipe = pipeline(
        "text-generation",
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        torch_dtype=torch.bfloat16,
        device=augmentation_device
    )
    print("Augmenter LLM loaded.")
except Exception as e:
    # Best-effort: the LLM-based transformations are skipped when this is None.
    print(f"Could not load augmenter LLM: {e}")
    augmenter_pipe = None

# 2. --- Original Data Point ---
original_text = "The financial report indicates a significant increase in quarterly revenue due to strong sales in the technology sector."
print(f"\nOriginal Text:\n\"{original_text}\"")

# Every variation produced below is collected here, keyed by technique name.
augmented_samples = {"Original": original_text}


def _assistant_reply(chat_prompt, **generation_kwargs):
    """Run the augmenter LLM on ``chat_prompt`` and return only its reply.

    The pipeline output is split on the "<|assistant|>" tag and the last
    piece is returned with surrounding whitespace stripped.
    """
    full_text = augmenter_pipe(chat_prompt, **generation_kwargs)[0]['generated_text']
    return full_text.split("<|assistant|>")[-1].strip()


# 3. --- Apply Transformations ---

# a) Paraphrase with the LLM
print("\n--- Applying LLM-Based Paraphrasing ---")
if augmenter_pipe:
    paraphrase_prompt = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\nParaphrase the following sentence while preserving its core meaning: \"{original_text}\"</s>\n<|assistant|>\n"
    paraphrased_text = _assistant_reply(
        paraphrase_prompt, max_new_tokens=70, do_sample=True, temperature=0.7
    )
    augmented_samples["Paraphrased"] = paraphrased_text

# b) Round-trip through German (back-translation)
print("\n--- Applying Back-Translation (English -> German -> English) ---")
if translator_en_de and translator_de_en:
    german_result = translator_en_de(original_text)
    translated_to_de = german_result[0]['translation_text']
    english_result = translator_de_en(translated_to_de)
    back_translated_text = english_result[0]['translation_text']
    augmented_samples["Back-Translated"] = back_translated_text

# c) Shorter and longer variants
print("\n--- Applying Length Variations ---")
if augmenter_pipe:
    summarize_prompt = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\nSummarize this sentence in fewer words: \"{original_text}\"</s>\n<|assistant|>\n"
    summarized_text = _assistant_reply(
        summarize_prompt, max_new_tokens=25, do_sample=False
    )
    augmented_samples["Summarized"] = summarized_text

    elaborate_prompt = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\nElaborate on this sentence, adding more specific but plausible details: \"{original_text}\"</s>\n<|assistant|>\n"
    elaborated_text = _assistant_reply(
        elaborate_prompt, max_new_tokens=80, do_sample=True, temperature=0.8
    )
    augmented_samples["Elaborated"] = elaborated_text

# d) Opposite-outcome (counterfactual) variant
print("\n--- Applying Counterfactual Generation ---")
if augmenter_pipe:
    counterfactual_prompt = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\nRewrite the following sentence to describe the opposite scenario (a negative outcome): \"{original_text}\"</s>\n<|assistant|>\n"
    counterfactual_text = _assistant_reply(
        counterfactual_prompt, max_new_tokens=70, do_sample=True, temperature=0.7
    )
    augmented_samples["Counterfactual"] = counterfactual_text

# e) Style transfer into a legal register
print("\n--- Applying Domain-Specific Style Transfer (to Legal) ---")
if augmenter_pipe:
    style_transfer_prompt = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\nRewrite the following sentence in a formal, legal style: \"{original_text}\"</s>\n<|assistant|>\n"
    legal_style_text = _assistant_reply(
        style_transfer_prompt, max_new_tokens=80, do_sample=True, temperature=0.5
    )
    augmented_samples["Legal Style"] = legal_style_text

# f) Rule-based synonym replacement (function defined below)
print("\n--- Applying Basic Transformation (Synonym Replacement) ---")
def synonym_replacement(sentence, synonyms, n=1, strip_chars=".,"):
    """Replace up to ``n`` distinct words of ``sentence`` with random synonyms.

    A word is replaceable when, after removing any leading/trailing
    ``strip_chars`` and lower-casing it, it is a key of ``synonyms`` with a
    non-empty synonym list. Replaceable positions are sampled without
    replacement, so exactly ``min(n, number of replaceable words)`` words are
    changed. The surrounding punctuation and an initial capital letter of the
    replaced word are preserved.

    Args:
        sentence: Text to augment; it is tokenised on whitespace.
        synonyms: Mapping of lower-case word -> list of replacement words.
        n: Maximum number of words to replace. Values <= 0 replace nothing.
        strip_chars: Punctuation characters ignored at either end of a word.

    Returns:
        The augmented sentence with words joined by single spaces, or
        ``sentence`` itself (unchanged) when no word is replaceable.
    """
    words = sentence.split()

    # Work on positions rather than word values, so repeated words are
    # handled independently and no position is picked twice.
    replaceable_positions = []
    for position, word in enumerate(words):
        core = word.strip(strip_chars)
        if core and synonyms.get(core.lower()):
            replaceable_positions.append(position)
    if not replaceable_positions:
        return sentence

    new_words = words.copy()
    replacements = max(0, min(n, len(replaceable_positions)))
    for position in random.sample(replaceable_positions, replacements):
        word = words[position]
        core = word.strip(strip_chars)
        # Split off punctuation on both sides so it can be re-attached.
        prefix = word[:len(word) - len(word.lstrip(strip_chars))]
        suffix = word[len(word.rstrip(strip_chars)):]

        synonym = random.choice(synonyms[core.lower()])
        if core[0].isupper():
            # Keep sentence-initial / proper capitalisation.
            synonym = synonym[:1].upper() + synonym[1:]

        new_words[position] = prefix + synonym + suffix
    return " ".join(new_words)

# Small hand-written thesaurus used by the rule-based replacement step.
synonym_dict = dict(
    report=["statement", "summary", "document"],
    significant=["notable", "substantial", "major"],
    increase=["growth", "rise", "expansion"],
    strong=["robust", "powerful", "vigorous"],
)
synonym_replaced_text = synonym_replacement(original_text, synonym_dict, n=2)
augmented_samples.update({"Synonym Replaced": synonym_replaced_text})

# 4. --- Display All Augmented Samples ---
# Banner, then one titled and quoted entry per augmentation technique.
print(
    "\n\n==============================================",
    "      Data Augmentation Results             ",
    "==============================================",
    sep="\n",
)
for label, sample in augmented_samples.items():
    print(f"\n--- {label} ---\n\"{sample}\"")
print("\n==============================================")


Setting up augmentation models...


Device set to use cuda:1
Device set to use cuda:1


Translation models loaded.


Device set to use cuda:1


Augmenter LLM loaded.

Original Text:
"The financial report indicates a significant increase in quarterly revenue due to strong sales in the technology sector."

--- Applying LLM-Based Paraphrasing ---

--- Applying Back-Translation (English -> German -> English) ---

--- Applying Length Variations ---

--- Applying Counterfactual Generation ---

--- Applying Domain-Specific Style Transfer (to Legal) ---

--- Applying Basic Transformation (Synonym Replacement) ---


      Data Augmentation Results             

--- Original ---
"The financial report indicates a significant increase in quarterly revenue due to strong sales in the technology sector."

--- Paraphrased ---
"Le rapport financier indique une augmentation significative de la semaine passée en termes de revenu, grâce à une augmentation significative du marché des produits techniques."

--- Back-Translated ---
"The financial report indicates a significant increase in quarterly sales due to strong sales in the technology sector.