In [4]:
pip install openai

Collecting openai
  Using cached openai-1.57.4-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.2 kB)
Using cached openai-1.57.4-py3-none-any.whl (390 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.8.2-cp39-cp39-macosx_11_0_arm64.whl (300 kB)
Installing collected packages: jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.8.2 openai-1.57.4
Note: you may need to restart the kernel to use updated packages.


In [None]:
import re
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key = 'lms'
)

class PreprocessingAgent:
    def __init__(self):
        self.units_regex = r"\d+\s*(miles|kilometers|pounds|kilograms)"

    def identify_critical_segments(self, dialogue):
        # Rule-based unit detection
        critical_entities = []
        if re.search(self.units_regex, dialogue):
            critical_entities.append("Unit Conversion")

        # OpenAI-based detection of expressions
        expressions = self.detect_expressions(dialogue)
        if expressions:
            critical_entities.extend([f"Expression: {expr}" for expr in expressions])

        # OpenAI-based cultural reference detection
        cultural_reference = self.detect_cultural_reference(dialogue)
        if cultural_reference:
            critical_entities.append(f"Cultural Reference: {cultural_reference}")

        if critical_entities:
            return f"Detected Critical Entities: {', '.join(critical_entities)}"

        return "Standard"

    def detect_cultural_reference(self, dialogue):
        # LLM-based detection with OpenAI GPT model
        prompt = f"Analyze the following text and identify if it contains any cultural references or expressions. If so, explain:\n\n\"{dialogue}\""
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert in detecting cultural references in text."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content.strip()

    def detect_expressions(self, dialogue):
        # Use OpenAI GPT model to detect expressions dynamically
        prompt = f"Analyze the following text and list all idiomatic expressions or common phrases:\n\n\"{dialogue}\""
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert in identifying idiomatic expressions."},
                {"role": "user", "content": prompt}
            ]
        )
        expressions = response.choices[0].message.content.strip()
        return [expr.strip() for expr in expressions.split(',') if expr.strip()]

class TranslationAgent:
    def __init__(self):
        pass

    def translate(self, dialogue):
        # Use OpenAI GPT model for translation
        prompt = f"Translate the following English text into French:\n\n\"{dialogue}\""
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert translator from English to French."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content.strip()

class ContextualAgent:
    def __init__(self):
        pass

    def adapt_context(self, dialogue):
        # Use OpenAI GPT model for contextual adaptation
        prompt = f"Refine the following French translation to better fit the context:\n\n\"{dialogue}\""
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert in refining translations for better contextual accuracy."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content.strip()

class ManualValidationAgent:
    def __init__(self):
        pass

    def validate(self, original, contextual):
        print("Original:", original)
        print("Contextual Translation:", contextual)
        user_input = input("Enter your validation or press Enter to accept: ")
        return user_input if user_input.strip() else contextual

class OptimizationAgent:
    def __init__(self, max_chars=42):
        self.max_chars = max_chars

    def optimize(self, dialogue):
        if len(dialogue) > self.max_chars:
            return dialogue[:self.max_chars-3] + "..."
        return dialogue

class DialogueTranslationPipeline:
    def __init__(self):
        self.preprocessing_agent = PreprocessingAgent()
        self.translation_agent = TranslationAgent()
        self.contextual_agent = ContextualAgent()
        self.manual_validation_agent = ManualValidationAgent()
        self.optimization_agent = OptimizationAgent()

    def process_dialogue(self, dialogue):
        # Step 1: Preprocessing
        critical_type = self.preprocessing_agent.identify_critical_segments(dialogue)
        print(f"Critical Segment Type: {critical_type}")

        # Step 2: Translation
        translated = self.translation_agent.translate(dialogue)
        print(f"Translated: {translated}")

        # Step 3: Contextual Adaptation
        contextual = self.contextual_agent.adapt_context(translated)
        print(f"Contextual Translation: {contextual}")

        # Step 4: Manual Validation
        validated = self.manual_validation_agent.validate(dialogue, contextual)

        # Step 5: Optimization
        optimized = self.optimization_agent.optimize(validated)
        print(f"Final Optimized Translation: {optimized}")

        return optimized




In [8]:
# Example usage
pipeline = DialogueTranslationPipeline()
dialogue = "The meeting was a piece of cake, and he's gonna break the ice."
result = pipeline.process_dialogue(dialogue)
print("Result:", result)

Critical Segment Type: Detected Critical Entities: Expression: In the provided text, Expression: there are two idiomatic expressions:

1. **"A piece of cake"**: This idiom means that something is very easy to accomplish.

2. **"Break the ice"**: This expression refers to doing or saying something to relieve tension or get a conversation started in a social setting.

These phrases are commonly used to convey ease and the initiation of interaction, Expression: respectively., Cultural Reference: The text you provided does contain cultural references and expressions that are commonly used in English-speaking contexts.

1. **"The meeting was a piece of cake":** This is an idiomatic expression meaning that something was very easy to accomplish or required little effort. The phrase "piece of cake" originates from the notion that eating cake is enjoyable and simple, thus likening an easy task to this experience. It reflects a cultural tendency in English to use food-related metaphors for ease 

In [12]:
import json
from typing import Dict, Any, List
from openai import OpenAI

# ======================
# Agents & Functions
# ======================

class TerminologyAgent:
    def __init__(self, initial_glossary: Dict[str, str] = None):
        self.glossary = initial_glossary if initial_glossary else {}

    def apply_glossary(self, text: str) -> str:
        for term, trans in self.glossary.items():
            text = text.replace(term, trans)
        return text


class DomainExpertAgent:
    def __init__(self, domain_knowledge: Dict[str, str] = None):
        self.domain_knowledge = domain_knowledge if domain_knowledge else {}

    def refine_translation(self, text: str) -> str:
        for term, trans in self.domain_knowledge.items():
            text = text.replace(term, trans)
        return text


class TranslatorAgent:
    def __init__(self, target_lang: str = "en", terminology_agent: TerminologyAgent = None, domain_agent: DomainExpertAgent = None, client=None):
        self.target_lang = target_lang
        self.terminology_agent = terminology_agent
        self.domain_agent = domain_agent
        self.client = client

    def translate_text(self, source_text: str) -> str:
        prompt = f"Translate this text into {self.target_lang}: {source_text}"
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"You are an expert translator into {self.target_lang}."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        translated = response.choices[0].message.content.strip()

        if self.terminology_agent:
            translated = self.terminology_agent.apply_glossary(translated)
        if self.domain_agent:
            translated = self.domain_agent.refine_translation(translated)

        return translated


class StylisticEditorAgent:
    def __init__(self, client):
        self.client = client

    def improve_style(self, text_segments: List[str]) -> List[str]:
        improved_segments = []
        for segment in text_segments:
            prompt = (f"Improve the style and fluency of the following translated segment without changing its meaning:\n\nSegment: {segment}")
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a professional stylistic editor."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0
            )
            improved = response.choices[0].message.content.strip()
            improved_segments.append(improved)
        return improved_segments


class QualityCheckerAgent:
    def __init__(self, client):
        self.client = client

    def check_quality(self, text_segments: List[str]) -> bool:
        segment_list_str = "\n".join([f"Segment {i+1}: {seg}" for i, seg in enumerate(text_segments)])
        prompt = (
            "You are an expert in translation quality assessment. The following are translated segments. "
            "Evaluate each segment from 1 to 10 for its overall quality (accuracy, style, fluency). "
            "Then respond in strict JSON with the schema:\n"
            "{\n"
            "  \"ratings\": [\n"
            "    {\n"
            "      \"segment\": <segment_number>,\n"
            "      \"rating\": <integer_rating>,\n"
            "      \"comment\": \"justification\"\n"
            "    }\n"
            "  ],\n"
            "  \"all_above_seven\": <true_or_false>\n"
            "}\n\n"
            f"{segment_list_str}"
        )

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a professional bilingual translator and reviewer."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        result_str = response.choices[0].message.content.strip()
        try:
            result_json = json.loads(result_str)
            return bool(result_json.get("all_above_seven", True))
        except json.JSONDecodeError:
            return True

    def suggest_fixes(self, text_segments: List[str]) -> List[str]:
        segment_list_str = "\n".join([f"Segment {i+1}: {seg}" for i, seg in enumerate(text_segments)])
        prompt = (
            "These segments need improvement. Please propose improved versions that maintain meaning but enhance accuracy and fluency.\n\n"
            f"{segment_list_str}\n\n"
            "Reply in JSON as:\n"
            "{\n"
            "  \"improved_segments\": [\n"
            "    {\"segment\": <segment_number>, \"improved_text\": \"...\"}\n"
            "  ]\n"
            "}"
        )

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a professional translator and editor."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        result_str = response.choices[0].message.content.strip()
        try:
            result_json = json.loads(result_str)
            improved_segments = []
            for entry in result_json.get("improved_segments", []):
                improved_segments.append(entry["improved_text"])
            return improved_segments
        except json.JSONDecodeError:
            return text_segments


class GlobalCoordinatorAgent:
    def __init__(self, translators: List[TranslatorAgent], 
                 editor: StylisticEditorAgent, 
                 quality_checker: QualityCheckerAgent):
        self.translators = translators
        self.editor = editor
        self.quality_checker = quality_checker

    def process_segments(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Traduction
        results = []
        for i, seg in enumerate(segments):
            translator = self.translators[i % len(self.translators)]
            translated_text = translator.translate_text(seg["text"])
            results.append({
                "speaker_id": seg["speaker_id"],
                "start": seg["start"],
                "end": seg["end"],
                "translated_text": translated_text
            })

        # Edition stylistique
        all_translations = [r["translated_text"] for r in results]
        improved = self.editor.improve_style(all_translations)
        for i, r in enumerate(results):
            r["translated_text"] = improved[i]

        # Vérification Qualité
        if not self.quality_checker.check_quality([r["translated_text"] for r in results]):
            fixed = self.quality_checker.suggest_fixes([r["translated_text"] for r in results])
            for i, r in enumerate(results):
                r["translated_text"] = fixed[i]

        return results


def extract_terminology_and_domain_knowledge(client: OpenAI, source_text: str, target_language: str = "en") -> (Dict[str, str], Dict[str, str]):
    function_definition = {
        "name": "extract_domain_and_glossary",
        "description": "Extract domain-specific terms and initial glossary terms from text, returning structured dictionaries.",
        "parameters": {
            "type": "object",
            "properties": {
                "domain_knowledge": {
                    "type": "object",
                    "description": "Key-value pairs of domain-specific terms mapped to their explanations or translations",
                    "additionalProperties": {"type": "string"}
                },
                "initial_glossary": {
                    "type": "object",
                    "description": "Key-value pairs of terms mapped to their target equivalents",
                    "additionalProperties": {"type": "string"}
                }
            },
            "required": ["domain_knowledge", "initial_glossary"]
        }
    }

    messages = [
        {"role": "system", "content": "You are a terminology extraction assistant."},
        {"role": "user", "content": f"""
Analyze the following text and identify:
1. Domain-specific terms (technical jargon, specialized components) relevant to the domain. For each domain-specific term, provide a short explanation or a translation into {target_language}.
2. An initial glossary of key terms (proper nouns, repetitive keywords) and their target equivalents in {target_language}.

Return your answer by calling the function `extract_domain_and_glossary` with a JSON structure:
{{
  "domain_knowledge": {{ "term_in_source": "explanation_or_translation_in_{target_language}" }},
  "initial_glossary": {{ "term_in_source": "equivalent_in_{target_language}" }}
}}
"""},
        {"role": "user", "content": "Voici un extrait technique d'aéronautique:\n\nDans ce manuel technique d'aéronautique, nous allons étudier la maintenance du turbopropulseur XJ-200. Le XJ-200 est un moteur à hélice couplé à une turbine à gaz spécialement conçu pour les avions de ligne régionaux. Il nécessite un ajustement précis du pas d'hélice, un contrôle régulier des injecteurs de carburant, et une calibration du compresseur. L'entreprise ACME AeroParts fournit également des pièces détachées spécifiques, comme les filtres à particules fines et les senseurs de température PT100."}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        functions=[function_definition],
        function_call={"name": "extract_domain_and_glossary"},
        temperature=0.0
    )
    print(response)
    arguments_str = response.choices[0].message.function_call.arguments
    parsed = json.loads(arguments_str)

    domain_knowledge = parsed["domain_knowledge"]
    initial_glossary = parsed["initial_glossary"]

    return domain_knowledge, initial_glossary




In [14]:
# ======================
# Utilisation de la pipeline
# ======================



# Initialisation du client OpenAI
client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key='lms'
)



# Exemple d'input JSON de segments
input_json = """
[
    {"speaker_id": "S1", "start":0.0, "end":2.5, "text":"Bonjour, comment l'examen s'est derouler ?"},
    {"speaker_id": "S2", "start":2.5, "end":5.0, "text":"l'examen etait un jeu d'enfant"},
    {"speaker_id": "S1", "start":5.0, "end":7.0, "text":"super, content de l'entendre"}
]
"""

segments = json.loads(input_json)


# Extraction automatique du glossaire et de la terminologie (par exemple en anglais)
target_language = "en"
domain_knowledge, initial_glossary = extract_terminology_and_domain_knowledge(client, "Texte source ci-dessus", target_language)

domain_agent = DomainExpertAgent(domain_knowledge=domain_knowledge)
terminology_agent = TerminologyAgent(initial_glossary=initial_glossary)

translator_1 = TranslatorAgent(target_lang=target_language, terminology_agent=terminology_agent, domain_agent=domain_agent, client=client)
translator_2 = TranslatorAgent(target_lang=target_language, terminology_agent=terminology_agent, domain_agent=domain_agent, client=client)

stylistic_editor = StylisticEditorAgent(client=client)
quality_checker = QualityCheckerAgent(client=client)

coordinator = GlobalCoordinatorAgent(translators=[translator_1, translator_2],
                                     editor=stylistic_editor, 
                                     quality_checker=quality_checker)

final_results = coordinator.process_segments(segments)

# Définition de la fonction pour la sortie structurée
function_definition_output = {
    "name": "generate_output",
    "description": "Generate structured output with cultural references, wordplay, idiomatic expressions, acronyms, measurement units, and final translated segments.",
    "parameters": {
        "type": "object",
        "properties": {
            "cultural_references": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of detected cultural references."
            },
            "wordplay": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of instances of wordplay."
            },
            "idiomatic_expressions": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of idiomatic expressions."
            },
            "acronyms": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of acronyms."
            },
            "measurement_units": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of measurement units."
            },
            "translated_segments": {
                "type": "array",
                "description": "The translated segments.",
                "items": {
                    "type": "object",
                    "properties": {
                        "speaker_id": {"type": "string"},
                        "start": {"type": "number"},
                        "end": {"type": "number"},
                        "translated_text": {"type": "string"}
                    },
                    "required": ["speaker_id", "start", "end", "translated_text"]
                }
            }
        },
        "required": ["translated_segments"]
    }
}

final_prompt = (
    "Analyze the following translated segments and extract:\n"
    "- Cultural references\n"
    "- Wordplay\n"
    "- Idiomatic expressions\n"
    "- Acronyms\n"
    "- Measurement units\n\n"
    "Then return a JSON structure with these fields plus the translated segments."
)

messages = [
    {"role": "system", "content": "You are a helpful assistant that returns data in a structured format."},
    {"role": "user", "content": final_prompt},
    {"role": "user", "content": json.dumps(final_results, ensure_ascii=False)}
]

response = client.chat.completions.create(
    model="gpt-4",
    messages=messages,
    functions=[function_definition_output],
    function_call={"name": "generate_output"},
    temperature=0.0
)

structured_output = response.choices[0].message.function_call.arguments
parsed_output = json.loads(structured_output)

# Affichage final
print(json.dumps(parsed_output, indent=4, ensure_ascii=False))


ChatCompletion(id='chatcmpl-ybugmkiivwnq8hsijfryno', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='To analyze the given text and extract domain-specific terms and an initial glossary, I\'ll first read through the text to understand its context and identify key terms.\n\nThe text is about a technical manual for maintaining the XJ-200 turboprop engine, which is used in regional airliners. It mentions specific maintenance tasks and components related to this type of aircraft engine.\n\n### Domain-Specific Terms\n\n1. **Turbopropulseur (Turboprop Engine)**: A turboprop engine is a type of aircraft engine that combines a turbine engine with a propeller. It\'s efficient for medium-speed, medium-range flights.\n\n2. **Hélice (Propeller)**: A propeller is a device with blades that rotate to generate thrust, pulling or pushing an aircraft through the air.\n\n3. **Turbine à gaz (Gas Turbine)**: A gas turbine is a type of engine that operates

AttributeError: 'NoneType' object has no attribute 'arguments'

In [18]:
from typing import Dict, Any, List
from pydantic import BaseModel, Field


class ExtractionResult(BaseModel):
    """Model for holding extraction results from domain and glossary extraction."""
    domain_knowledge: Dict[str, str] = Field(...)
    initial_glossary: Dict[str, str] = Field(...)


class SegmentInput(BaseModel):
    """Model representing input segments to be translated."""
    speaker_id: str
    start: float
    end: float
    text: str


class TranslatedSegment(BaseModel):
    """Model representing a translated segment."""
    speaker_id: str
    start: float
    end: float
    translated_text: str


class FinalOutput(BaseModel):
    """Final output model after analyzing translated segments."""
    cultural_references: List[str] = Field(default=[])
    wordplay: List[str] = Field(default=[])
    idiomatic_expressions: List[str] = Field(default=[])
    acronyms: List[str] = Field(default=[])
    measurement_units: List[str] = Field(default=[])
    translated_segments: List[TranslatedSegment] = Field(...)


In [19]:
from typing import Dict, Any, List
import json

class TerminologyAgent:
    """Applies a given glossary to translated text."""
    def __init__(self, initial_glossary: Dict[str, str] = None):
        self.glossary = initial_glossary if initial_glossary else {}

    def apply_glossary(self, text: str) -> str:
        for term, trans in self.glossary.items():
            text = text.replace(term, trans)
        return text


class DomainExpertAgent:
    """Refines translation based on domain-specific knowledge."""
    def __init__(self, domain_knowledge: Dict[str, str] = None):
        self.domain_knowledge = domain_knowledge if domain_knowledge else {}

    def refine_translation(self, text: str) -> str:
        for term, trans in self.domain_knowledge.items():
            text = text.replace(term, trans)
        return text


class TranslatorAgent:
    """Translates text using the given client (OpenAI) and applies terminology/domain refinements."""
    def __init__(self, target_lang: str, terminology_agent: TerminologyAgent, domain_agent: DomainExpertAgent, client):
        self.target_lang = target_lang
        self.terminology_agent = terminology_agent
        self.domain_agent = domain_agent
        self.client = client

    def translate_text(self, source_text: str) -> str:
        prompt = f"Translate this text into {self.target_lang}: {source_text}"
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"You are an expert translator into {self.target_lang}."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        translated = response.choices[0].message.content.strip()
        translated = self.terminology_agent.apply_glossary(translated)
        translated = self.domain_agent.refine_translation(translated)
        return translated


class StylisticEditorAgent:
    """Improves style and fluency of translated segments."""
    def __init__(self, client):
        self.client = client

    def improve_style(self, text_segments: List[str]) -> List[str]:
        improved_segments = []
        for segment in text_segments:
            prompt = (f"Improve the style and fluency of the following translated segment "
                      f"without changing its meaning:\n\nSegment: {segment}")
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a professional stylistic editor."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0
            )
            improved = response.choices[0].message.content.strip()
            improved_segments.append(improved)
        return improved_segments


class QualityCheckerAgent:
    """Checks the quality of the translated segments and suggests improvements if needed."""
    def __init__(self, client):
        self.client = client

    def check_quality(self, text_segments: List[str]) -> bool:
        segment_list_str = "\n".join([f"Segment {i+1}: {seg}" for i, seg in enumerate(text_segments)])
        prompt = (
            "You are an expert in translation quality assessment. The following are translated segments. "
            "Evaluate each segment from 1 to 10 for its overall quality (accuracy, style, fluency). "
            "Then respond in strict JSON with the schema:\n"
            "{\n"
            "  \"ratings\": [\n"
            "    {\n"
            "      \"segment\": <segment_number>,\n"
            "      \"rating\": <integer_rating>,\n"
            "      \"comment\": \"justification\"\n"
            "    }\n"
            "  ],\n"
            "  \"all_above_seven\": <true_or_false>\n"
            "}\n\n"
            f"{segment_list_str}"
        )

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a professional bilingual translator and reviewer."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        result_str = response.choices[0].message.content.strip()
        try:
            result_json = json.loads(result_str)
            return bool(result_json.get("all_above_seven", True))
        except json.JSONDecodeError:
            # If parsing fails, assume quality is acceptable
            return True

    def suggest_fixes(self, text_segments: List[str]) -> List[str]:
        segment_list_str = "\n".join([f"Segment {i+1}: {seg}" for i, seg in enumerate(text_segments)])
        prompt = (
            "These segments need improvement. Please propose improved versions that maintain meaning "
            "but enhance accuracy and fluency.\n\n"
            f"{segment_list_str}\n\n"
            "Reply in JSON as:\n"
            "{\n"
            "  \"improved_segments\": [\n"
            "    {\"segment\": <segment_number>, \"improved_text\": \"...\"}\n"
            "  ]\n"
            "}"
        )

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a professional translator and editor."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        result_str = response.choices[0].message.content.strip()
        try:
            result_json = json.loads(result_str)
            improved_segments = []
            for entry in result_json.get("improved_segments", []):
                improved_segments.append(entry["improved_text"])
            return improved_segments
        except json.JSONDecodeError:
            # If parsing fails, return original segments
            return text_segments


class GlobalCoordinatorAgent:
    """Coordinates translation, editing, and quality checking of segments."""
    def __init__(self, translators: List[TranslatorAgent], 
                 editor: StylisticEditorAgent, 
                 quality_checker: QualityCheckerAgent):
        self.translators = translators
        self.editor = editor
        self.quality_checker = quality_checker

    def process_segments(self, segments: List[dict]) -> List[dict]:
        # Translation
        results = []
        for i, seg in enumerate(segments):
            translator = self.translators[i % len(self.translators)]
            translated_text = translator.translate_text(seg["text"])
            results.append({
                "speaker_id": seg["speaker_id"],
                "start": seg["start"],
                "end": seg["end"],
                "translated_text": translated_text
            })

        # Stylistic Editing
        all_translations = [r["translated_text"] for r in results]
        improved = self.editor.improve_style(all_translations)
        for i, r in enumerate(results):
            r["translated_text"] = improved[i]

        # Quality Check
        final_texts = [r["translated_text"] for r in results]
        if not self.quality_checker.check_quality(final_texts):
            fixed = self.quality_checker.suggest_fixes(final_texts)
            for i, r in enumerate(results):
                r["translated_text"] = fixed[i]

        return results


In [21]:
import json
import re
from openai import OpenAI

def extract_terminology_and_domain_knowledge(client: OpenAI, source_text: str, target_language: str = "en"):
    """
    Extract domain-specific terms and glossary entries from a given source text.

    Attempts to force the model to respond with a function call.
    If not successful, attempts to parse JSON from the assistant's textual output.
    """
    function_definition = {
        "name": "extract_domain_and_glossary",
        "description": "Extract domain-specific terms and initial glossary terms from text, returning structured dictionaries.",
        "parameters": {
            "type": "object",
            "properties": {
                "domain_knowledge": {
                    "type": "object",
                    "description": "Key-value pairs of domain-specific terms mapped to their explanations or translations",
                    "additionalProperties": {"type": "string"}
                },
                "initial_glossary": {
                    "type": "object",
                    "description": "Key-value pairs of terms mapped to their target equivalents",
                    "additionalProperties": {"type": "string"}
                }
            },
            "required": ["domain_knowledge", "initial_glossary"]
        }
    }

    messages = [
        {"role": "system", "content": "You are a terminology extraction assistant. Only respond by calling the function. Do not provide any other text."},
        {"role": "user", "content": f"""
Analyze the following text and identify:
1. Domain-specific terms (technical jargon, specialized components) relevant to the domain. For each domain-specific term, provide a short explanation or a translation into {target_language}.
2. An initial glossary of key terms (proper nouns, repetitive keywords) and their target equivalents in {target_language}.

Return your answer by calling the function `extract_domain_and_glossary` only, with a JSON structure:
{{
  "domain_knowledge": {{ "term_in_source": "explanation_or_translation_in_{target_language}" }},
  "initial_glossary": {{ "term_in_source": "equivalent_in_{target_language}" }}
}}
"""},
        {"role": "user", "content": source_text}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        functions=[function_definition],
        function_call={"name": "extract_domain_and_glossary"},
        temperature=0.0
    )

    function_call_data = response.choices[0].message.function_call

    if function_call_data is not None and function_call_data.arguments:
        arguments_str = function_call_data.arguments
        parsed = json.loads(arguments_str)
    else:
        # Fallback: Attempt to parse from assistant's textual output
        assistant_content = response.choices[0].message.content
        match = re.search(r'```json\n(.*?)\n```', assistant_content, re.DOTALL)
        if match:
            json_str = match.group(1)
            parsed = json.loads(json_str)
        else:
            # If we cannot parse anything, return empty dicts
            parsed = {"domain_knowledge": {}, "initial_glossary": {}}

    extraction_result = ExtractionResult(**parsed)
    return extraction_result.domain_knowledge, extraction_result.initial_glossary


In [23]:
import json
import re
from openai import OpenAI

# from models import SegmentInput, TranslatedSegment, FinalOutput
# from agents import (DomainExpertAgent, TerminologyAgent, TranslatorAgent, 
#                     StylisticEditorAgent, QualityCheckerAgent, GlobalCoordinatorAgent)
# from functions import extract_terminology_and_domain_knowledge


def main():
    # Initialize the OpenAI client
    client = OpenAI(
        base_url="http://localhost:1234/v1",
        api_key='lms'
    )

    # Example input
    input_json = """
    [
        {"speaker_id": "S1", "start":0.0, "end":2.5, "text":"Bonjour, comment l'examen s'est derouler ?"},
        {"speaker_id": "S2", "start":2.5, "end":5.0, "text":"l'examen etait un jeu d'enfant"},
        {"speaker_id": "S1", "start":5.0, "end":7.0, "text":"super, content de l'entendre"}
    ]
    """

    segments = [SegmentInput(**item) for item in json.loads(input_json)]

    # Extract domain knowledge and initial glossary
    target_language = "en"
    domain_knowledge, initial_glossary = extract_terminology_and_domain_knowledge(client, "Texte source ci-dessus", target_language)

    domain_agent = DomainExpertAgent(domain_knowledge=domain_knowledge)
    terminology_agent = TerminologyAgent(initial_glossary=initial_glossary)

    translator_1 = TranslatorAgent(target_lang=target_language, terminology_agent=terminology_agent, domain_agent=domain_agent, client=client)
    translator_2 = TranslatorAgent(target_lang=target_language, terminology_agent=terminology_agent, domain_agent=domain_agent, client=client)

    stylistic_editor = StylisticEditorAgent(client=client)
    quality_checker = QualityCheckerAgent(client=client)

    coordinator = GlobalCoordinatorAgent(translators=[translator_1, translator_2],
                                         editor=stylistic_editor, 
                                         quality_checker=quality_checker)

    # Process segments
    final_results_raw = coordinator.process_segments([s.model_dump() for s in segments])
    final_results = [TranslatedSegment(**r) for r in final_results_raw]

    # Define function schema for the final output
    function_definition_output = {
        "name": "generate_output",
        "description": "Generate structured output with cultural references, wordplay, idiomatic expressions, acronyms, measurement units, and final translated segments.",
        "parameters": {
            "type": "object",
            "properties": {
                "cultural_references": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of detected cultural references."
                },
                "wordplay": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of instances of wordplay."
                },
                "idiomatic_expressions": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of idiomatic expressions."
                },
                "acronyms": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of acronyms."
                },
                "measurement_units": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of measurement units."
                },
                "translated_segments": {
                    "type": "array",
                    "description": "The translated segments.",
                    "items": {
                        "type": "object",
                        "properties": {
                            "speaker_id": {"type": "string"},
                            "start": {"type": "number"},
                            "end": {"type": "number"},
                            "translated_text": {"type": "string"}
                        },
                        "required": ["speaker_id", "start", "end", "translated_text"]
                    }
                }
            },
            "required": ["translated_segments"]
        }
    }

    final_prompt = (
        "Analyze the following translated segments and extract:\n"
        "- Cultural references\n"
        "- Wordplay\n"
        "- Idiomatic expressions\n"
        "- Acronyms\n"
        "- Measurement units\n\n"
        "Then return a JSON structure with these fields plus the translated segments."
    )

    messages = [
        {"role": "system", "content": "You are a helpful assistant that returns data in a structured format."},
        {"role": "user", "content": final_prompt},
        {"role": "user", "content": json.dumps([r.model_dump() for r in final_results], ensure_ascii=False)}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        functions=[function_definition_output],
        function_call={"name": "generate_output"},
        temperature=0.0
    )

    # Attempt to retrieve function call arguments
    func_call_data = response.choices[0].message.function_call
    if func_call_data is not None and func_call_data.arguments:
        structured_output_str = func_call_data.arguments
    else:
        # Fallback: Attempt to parse JSON from message content
        assistant_content = response.choices[0].message.content
        match = re.search(r'```json\n(.*?)\n```', assistant_content, re.DOTALL)
        if match:
            structured_output_str = match.group(1)
        else:
            # Ensure a dictionary structure to avoid TypeError
            structured_output_str = json.dumps({"translated_segments": [r.model_dump() for r in final_results]})

    # Load structured output as JSON
    structured_output = json.loads(structured_output_str)

    # Ensure structured_output is a dict (mapping)
    # If it's not, wrap it or handle it. Typically, the prompt should produce a dict.
    if not isinstance(structured_output, dict):
        # If the model incorrectly returned a list, wrap it.
        # For example, if structured_output is a list of segments:
        structured_output = {"translated_segments": structured_output}

    # Now parse into FinalOutput
    parsed_output = FinalOutput(**structured_output)

    # Print final output
    print(json.dumps(parsed_output.model_dump(), indent=4, ensure_ascii=False))





In [24]:
main()


{
    "cultural_references": [],
    "wordplay": [],
    "idiomatic_expressions": [],
    "acronyms": [],
    "measurement_units": [],
    "translated_segments": [
        {
            "speaker_id": "S1",
            "start": 0.0,
            "end": 2.5,
            "translated_text": "Here's a revised version of the translated segment with improved style and fluency:\n\n\"Hello, how did your exam go?\"\n\nChanges made:\n- Added \"your\" to make the sentence more polite and considerate of the person being addressed.\n- Changed \"derouler\" to \"go\", which is a more natural and idiomatic way to express the idea of an exam passing or completing.\n- Added \"your\" before \"exam\" to make the sentence more specific and considerate of the person being addressed.\n\nThe revised segment maintains the same meaning as the original, but with a more polished and natural-sounding tone."
        },
        {
            "speaker_id": "S2",
            "start": 2.5,
            "end": 5.0,
       