In [5]:
import os, sys
# import openai
from openai import AzureOpenAI
# !pip install python-dotenv
from dotenv import load_dotenv
import json
from openai import OpenAI

In [6]:
def APIKeyManager(model_type, key_path):
    
    load_dotenv(dotenv_path=key_path, override=True)
    if model_type=='azure':
        client = AzureOpenAI(
            api_version=os.environ["AZURE_API_VERSION"],
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_key=os.environ["AZURE_API_KEY"],
        )
        return client
    elif model_type=='fanar':
        client = OpenAI(
            base_url = "https://api.fanar.qa/v1",
            api_key  = os.environ["FANAR_API_KEY"],
        )
        client.default_params = {"model": "Fanar-C-1-8.7B"}
        return client    
    elif model_type=='gemini':
        pass
    return client

# Load environment variables
model_type="fanar"
deployment = APIKeyManager(model_type, "./azure.env")
model = "Fanar-C-1-8.7B"


In [7]:
import json
import re

class TopicClassifier:
    def __init__(self, deployment, model="gpt-4o"):
        self.model = model
        self.deployment = deployment

    def classify_topic(self, topic, information):
        """
        Classify podcast topic and determine optimal approach
        
        Args:
            topic: Main topic of the podcast episode
            information: Background information about the topic
            
        Returns:
            JSON with classification results
        """
        
        prompt = f"""
You are an expert in analyzing and classifying topics for Arabic podcast production.

Task: Analyze the following topic and determine the best approach for an Arabic podcast.

Topic: {topic}
Background Information: {information}

Analyze the topic and return the result in JSON format with these exact keys:

{{
    "primary_category": "Main category from the available options",
    "category_justification": "Reason for choosing this category",
    "optimal_style": "Best discussion style from available options",
    "discourse_pattern": "Appropriate discourse pattern",
    "audience_engagement_goal": "Audience engagement objective",
    "cultural_sensitivity_level": "Cultural sensitivity level",
    "controversy_potential": "Controversy potential level",
    "key_discussion_angles": [
        "First main discussion angle",
        "Second point of interest for Arabic audiences"
    ],
    "natural_tension_points": [
        "First natural tension point in the topic",
        "Second aspect that might generate healthy debate"
    ],
    "cultural_connection_opportunities": [
        "First opportunity to connect with Arabic culture",
        "Second relevant local or regional reference"
    ]
}}

Available Categories (choose one):
1. "العلوم والتكنولوجيا" - For technical, scientific topics and innovations
2. "السياسة والشؤون العامة" - For political topics, current events, public affairs
3. "القضايا الاجتماعية" - For social topics, relationships, values, social challenges
4. "الرياضة والترفيه" - For sports, arts, entertainment topics
5. "التاريخ والثقافة" - For historical, heritage, cultural topics

Available Styles (choose one):
- "حواري" - Natural friendly dialogue between host and guest
- "تعليمي" - Focus on explanation and education in entertaining way
- "ترفيهي" - Fun and light with humorous touches
- "تحليلي" - Deep, specialized analytical discussion

Discourse Patterns (choose one):
- "رسمي" - Formal and respectful language
- "ودي" - Warm and familiar language
- "جدلي" - Lively discussion with multiple viewpoints
- "سردي" - Storytelling and narrative style

Cultural Sensitivity Levels (choose one):
- "عالي" - Requires extreme caution in handling
- "متوسط" - Needs moderate cultural consideration
- "منخفض" - Generally acceptable topic

Controversy Potential (choose one):
- "عالية" - Inherently controversial topic
- "متوسطة" - May generate some disagreements
- "منخفضة" - Generally acceptable topic

CRITICAL REQUIREMENTS:
- All JSON values must be in Modern Standard Arabic (MSA)
- JSON keys must be in English
- Use ONLY English commas (,) - NEVER Arabic commas (،)
- Use ONLY standard double quotes (") - NEVER Arabic quotes
- Do NOT include any explanatory text before or after JSON
- Do NOT include confidence scores like "الثقة: 95%"
- Do NOT include ```json markers
- Return ONLY valid JSON that can be parsed by json.loads()
- Analyze the topic deeply considering Arabic cultural context
- Focus on what makes the topic appealing to Arabic audiences
- Optimal episode duration is 10 minutes
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an expert in analyzing topics for Arabic podcasts. Return ONLY valid JSON with English punctuation. No explanatory text. No confidence scores. No Arabic commas."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3  # Lower temperature for more consistent classification
        )
        
        return self._clean_json_response(response.choices[0].message.content)

    def _clean_json_response(self, response):
        """Clean the JSON response to ensure it's parseable"""
        if not response:
            return "{}"
        
        # Remove any text before the first { and after the last }
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            clean_json = response[start_idx:end_idx+1]
        else:
            clean_json = response
        
        # Replace Arabic punctuation with English equivalents
        clean_json = clean_json.replace('،', ',')  # Arabic comma to English comma
        clean_json = clean_json.replace('"', '"')  # Arabic quote to English quote
        clean_json = clean_json.replace('"', '"')  # Arabic quote to English quote
        clean_json = clean_json.replace(''', "'")  # Arabic apostrophe
        clean_json = clean_json.replace(''', "'")  # Arabic apostrophe
        
        # Remove confidence scores and meta text
        meta_patterns = [
            r'الثقة:\s*\d+%',
            r'الدقة:\s*\d+%',
            r'معدل الثقة:\s*\d+%',
            r'\n.*الثقة.*',
            r'\n.*confidence.*',
            r'\n.*accuracy.*'
        ]
        
        for pattern in meta_patterns:
            clean_json = re.sub(pattern, '', clean_json, flags=re.IGNORECASE)
        
        # Fix common JSON issues
        # Remove trailing commas before closing braces/brackets
        clean_json = re.sub(r',(\s*[}\]])', r'\1', clean_json)
        
        # Ensure proper quote escaping
        clean_json = clean_json.replace('\\"', '"')
        
        return clean_json.strip()

    def classify_with_validation(self, topic, information):
        """
        Classify topic with automatic validation and retry
        """
        max_attempts = 3
        
        for attempt in range(max_attempts):
            try:
                # Get classification
                classification_result = self.classify_topic(topic, information)
                
                # Try to parse JSON
                parsed_result = json.loads(classification_result)
                
                # Validate required fields
                required_fields = [
                    "primary_category", "category_justification", "optimal_style",
                    "discourse_pattern", "audience_engagement_goal", 
                    "cultural_sensitivity_level", "controversy_potential",
                    "key_discussion_angles", "natural_tension_points",
                    "cultural_connection_opportunities"
                ]
                
                missing_fields = [field for field in required_fields if field not in parsed_result]
                
                if not missing_fields:
                    print(f"✅ Classification successful on attempt {attempt + 1}")
                    return classification_result, parsed_result
                else:
                    print(f"⚠️ Attempt {attempt + 1}: Missing fields: {missing_fields}")
                    
            except json.JSONDecodeError as e:
                print(f"⚠️ Attempt {attempt + 1}: JSON parsing error: {e}")
                if attempt == max_attempts - 1:
                    print("Raw response for debugging:")
                    print(classification_result[:500])
            except Exception as e:
                print(f"⚠️ Attempt {attempt + 1}: General error: {e}")
        
        # If all attempts fail, return fallback
        print("📝 Using fallback classification...")
        fallback_result = self._get_fallback_classification(topic)
        return json.dumps(fallback_result, ensure_ascii=False, indent=2), fallback_result

    def _get_fallback_classification(self, topic):
        """Provide fallback classification if all attempts fail"""
        return {
            "primary_category": "القضايا الاجتماعية",
            "category_justification": "تصنيف افتراضي للموضوع المطروح",
            "optimal_style": "حواري",
            "discourse_pattern": "ودي",
            "audience_engagement_goal": "زيادة الوعي والفهم حول الموضوع",
            "cultural_sensitivity_level": "متوسط",
            "controversy_potential": "متوسطة",
            "key_discussion_angles": [
                "الجوانب الأساسية للموضوع",
                "التأثيرات على المجتمع العربي"
            ],
            "natural_tension_points": [
                "وجهات النظر المختلفة حول الموضوع",
                "التحديات والفرص المرتبطة"
            ],
            "cultural_connection_opportunities": [
                "الربط بالقيم العربية التقليدية",
                "التجارب المحلية ذات الصلة"
            ]
        }

    def analyze_classification_quality(self, parsed_result):
        """Analyze the quality of the classification result"""
        analysis = {
            "category_appropriateness": self._assess_category_fit(parsed_result.get("primary_category", "")),
            "style_consistency": self._assess_style_choice(parsed_result.get("optimal_style", "")),
            "cultural_awareness": len(parsed_result.get("cultural_connection_opportunities", [])),
            "discussion_depth": len(parsed_result.get("key_discussion_angles", [])),
            "sensitivity_awareness": parsed_result.get("cultural_sensitivity_level", "") != "",
            "engagement_focus": parsed_result.get("audience_engagement_goal", "") != ""
        }
        
        # Calculate overall score
        score_factors = [
            analysis["category_appropriateness"] > 0,
            analysis["style_consistency"],
            analysis["cultural_awareness"] >= 2,
            analysis["discussion_depth"] >= 2,
            analysis["sensitivity_awareness"],
            analysis["engagement_focus"]
        ]
        
        analysis["overall_score"] = sum(score_factors) * 100 // len(score_factors)
        analysis["quality_grade"] = (
            "ممتاز" if analysis["overall_score"] >= 85 else
            "جيد" if analysis["overall_score"] >= 70 else
            "مقبول" if analysis["overall_score"] >= 55 else
            "يحتاج تحسين"
        )
        
        return analysis

    def _assess_category_fit(self, category):
        """Assess if the category choice seems appropriate"""
        valid_categories = [
            "العلوم والتكنولوجيا", "السياسة والشؤون العامة", 
            "القضايا الاجتماعية", "الرياضة والترفيه", "التاريخ والثقافة"
        ]
        return 1 if category in valid_categories else 0

    def _assess_style_choice(self, style):
        """Assess if the style choice is valid"""
        valid_styles = ["حواري", "تعليمي", "ترفيهي", "تحليلي"]
        return style in valid_styles

# Enhanced Testing Function
def test_topic_classifier(deployment, model_name="Fanar-C-1-8.7B"):
    """
    Test the topic classifier with comprehensive validation
    """
    print("🧪 Testing Topic Classifier with Enhanced Validation...")
    print("=" * 60)
    
    classifier = TopicClassifier(deployment, model_name)
    
    # Test topic
    topic = "الذكاء الاصطناعي والهوية العربية: كيف نحافظ على ثقافتنا في العصر الرقمي"
    
    information = '''
مع انتشار تقنيات الذكاء الاصطناعي بسرعة في العالم العربي، تزداد المخاوف حول تأثيرها على الهوية الثقافية واللغة العربية. 
تشير الدراسات إلى أن 78% من المحتوى الرقمي باللغة الإنجليزية، بينما المحتوى العربي لا يتجاوز 3%. 
معظم نماذج الذكاء الاصطناعي الحالية مدربة على بيانات غربية، مما يثير تساؤلات حول قدرتها على فهم السياق الثقافي العربي.
في المقابل، تسعى دول مثل الإمارات والسعودية لتطوير نماذج ذكاء اصطناعي عربية مثل "جايس" و"الحوراء" لمواجهة هذا التحدي.
التحدي الأكبر يكمن في كيفية الاستفادة من هذه التقنيات لتعزيز الثقافة العربية بدلاً من تهميشها، وضمان أن تخدم الذكاء الاصطناعي قيمنا ومبادئنا.
'''
    
    # Run classification with validation
    classification_result, parsed_result = classifier.classify_with_validation(topic, information)
    
    print("📊 Classification Results:")
    print(f"Primary Category: {parsed_result.get('primary_category', 'N/A')}")
    print(f"Optimal Style: {parsed_result.get('optimal_style', 'N/A')}")
    print(f"Discourse Pattern: {parsed_result.get('discourse_pattern', 'N/A')}")
    print(f"Cultural Sensitivity: {parsed_result.get('cultural_sensitivity_level', 'N/A')}")
    print(f"Controversy Potential: {parsed_result.get('controversy_potential', 'N/A')}")
    
    # Analyze quality
    quality_analysis = classifier.analyze_classification_quality(parsed_result)
    print(f"\n📈 Quality Analysis:")
    print(f"Overall Score: {quality_analysis['overall_score']}/100")
    print(f"Quality Grade: {quality_analysis['quality_grade']}")
    print(f"Cultural Awareness: {quality_analysis['cultural_awareness']} connections")
    print(f"Discussion Angles: {quality_analysis['discussion_depth']} angles")
    
    # Show key discussion points
    print(f"\n🎯 Key Discussion Angles:")
    for i, angle in enumerate(parsed_result.get('key_discussion_angles', []), 1):
        print(f"  {i}. {angle}")
    
    print(f"\n🌍 Cultural Connections:")
    for i, connection in enumerate(parsed_result.get('cultural_connection_opportunities', []), 1):
        print(f"  {i}. {connection}")
    
    return classification_result, parsed_result

# Usage:
# classifier = TopicClassifier(deployment, "Fanar-C-1-8.7B")
# classification_result, parsed_result = classifier.classify_with_validation(topic, information)

In [8]:
# Testing Instructions:

# To test Step 1, add this to a new cell in your notebook:

# Test Step 1: Topic Classification

# Test with the singlehood topic
topic = "الذكاء الاصطناعي والهوية العربية: كيف نحافظ على ثقافتنا في العصر الرقمي"

information = '''
مع انتشار تقنيات الذكاء الاصطناعي بسرعة في العالم العربي، تزداد المخاوف حول تأثيرها على الهوية الثقافية واللغة العربية. 
تشير الدراسات إلى أن 78% من المحتوى الرقمي باللغة الإنجليزية، بينما المحتوى العربي لا يتجاوز 3%. 
معظم نماذج الذكاء الاصطناعي الحالية مدربة على بيانات غربية، مما يثير تساؤلات حول قدرتها على فهم السياق الثقافي العربي.
في المقابل، تسعى دول مثل الإمارات والسعودية لتطوير نماذج ذكاء اصطناعي عربية مثل "جايس" و"الحوراء" لمواجهة هذا التحدي.
التحدي الأكبر يكمن في كيفية الاستفادة من هذه التقنيات لتعزيز الثقافة العربية بدلاً من تهميشها، وضمان أن تخدم الذكاء الاصطناعي قيمنا ومبادئنا.
'''

classifier = TopicClassifier(deployment, model)
classification_result = classifier.classify_topic(topic, information)
print("Classification Result:")
print(classification_result)

try:
    parsed_result = json.loads(classification_result)
    print(f"✅ Category: {parsed_result['primary_category']}")
    print(f"✅ Style: {parsed_result['optimal_style']}")
except json.JSONDecodeError:
    print("❌ JSON parsing failed")




Classification Result:
{
  "primary_category": "العلوم_والتكنولوجيا",
  "category_justification": "يتناول الموضوع تحديات وتطبيقات الذكاء الاصطناعي مع التركيز على دورها في المجتمع العربي.",
  "optimal_style": "تعليمي",
  "discourse_pattern": "جدلي",
  "audience_engagement_goal": "توعية المستمعين بأهمية واستخدامات الذكاء الاصطناعي بشكل مسؤول.",
  "cultural_sensitivity_level": "متوسط",
  "controversy_potential": "متوسطة",
  "key_discussion_angles": ["تأثيرات الذكاء الاصطناعي على اللغة والثقافة العربية", "استراتيجيات تطوير نماذج ذكاء اصطناعي محلية"],
  "natural_tension_points": ["مقارنة بين قوة البيانات الغربية ونقص الموارد العربية", "دور الحكومات والمطورين العرب في تعزيز الابتكار المحلي"],
  "cultural_connection_opportunities": ["مناقشة الأمثلة التاريخية للتفوق الفكري العربي", "إبراز قصص النجاح الحديثة للمبتكرين العرب"]
}
✅ Category: العلوم_والتكنولوجيا
✅ Style: تعليمي


In [9]:
import json
import re

class SimplePersonaGenerator:
    def __init__(self, deployment, model="gpt-4o"):
        self.model = model
        self.deployment = deployment

    def generate_personas(self, topic, information, classification_result):
        """
        Generate simple but effective host and guest personas
        
        Args:
            topic: Main topic of the podcast episode
            information: Background information about the topic
            classification_result: JSON string from Step 1 classification
            
        Returns:
            JSON with simple host and guest personas
        """
        
        # Parse classification to understand the requirements
        try:
            classification = json.loads(classification_result)
        except:
            raise ValueError("Invalid classification JSON provided")
        
        primary_category = classification.get("primary_category", "")
        optimal_style = classification.get("optimal_style", "")
        discourse_pattern = classification.get("discourse_pattern", "")
        cultural_sensitivity = classification.get("cultural_sensitivity_level", "")
        
        prompt = f"""
You are an expert in designing Arabic podcast personas.

Task: Create simple and suitable host and guest personas for this topic.

Topic: {topic}
Information: {information}
Category: {primary_category}
Required Style: {optimal_style}
Discourse Pattern: {discourse_pattern}
Cultural Sensitivity: {cultural_sensitivity}

Return the result in this exact JSON format:

{{
    "host": {{
        "name": "Host's Arabic name",
        "age": numeric_age,
        "background": "Brief background in one sentence",
        "personality": "Personality description in one sentence",
        "speaking_style": "Speaking style in one sentence"
    }},
    "guest": {{
        "name": "Guest's Arabic name", 
        "age": numeric_age,
        "background": "Brief background in one sentence",
        "expertise": "Area of expertise in one sentence",
        "personality": "Personality description in one sentence",
        "speaking_style": "Speaking style in one sentence"
    }},
    "why_good_match": "Why this host and guest are suitable for this topic - one sentence"
}}

Requirements:
- Use familiar Arabic names (like أحمد، محمد، فاطمة، نور، علي، لمى، سارة، خالد)
- Simple and believable personas
- Suitable for the topic and required style: {optimal_style}
- Host should be curious and guest should be expert or have experience
- All JSON values must be in Modern Standard Arabic (MSA)
- JSON keys should be in English
- Use ONLY English commas (,) - NEVER Arabic commas (،)
- Use ONLY standard double quotes (") - NEVER Arabic quotes
- Age should be realistic numbers (25-55 range)
- Do NOT include ```json markers
- Do NOT include confidence scores or extra text
- Return only valid JSON

CRITICAL REQUIREMENTS:
- Host personality should match {optimal_style} style
- Guest expertise should be relevant to: {topic}
- Consider cultural sensitivity level: {cultural_sensitivity}
- Make personas realistic and relatable to Arabic audiences
"""
        
        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You are an expert in designing simple and effective podcast personas. Style: {optimal_style}. Always provide JSON values in Modern Standard Arabic while keeping keys in English. No extra text."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.6
        )
        
        return self._clean_json_response(response.choices[0].message.content)

    def _clean_json_response(self, response):
        """Clean the JSON response to ensure it's parseable"""
        if not response:
            return "{}"
        
        # Remove any text before the first { and after the last }
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            clean_json = response[start_idx:end_idx+1]
        else:
            clean_json = response
        
        # Replace Arabic punctuation with English equivalents
        clean_json = clean_json.replace('،', ',')  # Arabic comma to English comma
        clean_json = clean_json.replace('"', '"')  # Arabic quote to English quote
        clean_json = clean_json.replace('"', '"')  # Arabic quote to English quote
        clean_json = clean_json.replace(''', "'")  # Arabic apostrophe
        clean_json = clean_json.replace(''', "'")  # Arabic apostrophe
        
        # Remove confidence scores and meta text
        meta_patterns = [
            r'الثقة:\s*\d+%',
            r'الدقة:\s*\d+%',
            r'معدل الثقة:\s*\d+%',
            r'\n.*الثقة.*',
            r'\n.*confidence.*',
            r'\n.*accuracy.*',
            r'ملاحظة:.*',
            r'تعليق:.*'
        ]
        
        for pattern in meta_patterns:
            clean_json = re.sub(pattern, '', clean_json, flags=re.IGNORECASE)
        
        # Fix common JSON issues
        # Remove trailing commas before closing braces/brackets
        clean_json = re.sub(r',(\s*[}\]])', r'\1', clean_json)
        
        # Ensure proper quote escaping
        clean_json = clean_json.replace('\\"', '"')
        
        return clean_json.strip()

    def generate_personas_with_validation(self, topic, information, classification_result):
        """
        Generate personas with automatic validation and retry
        """
        max_attempts = 3
        
        for attempt in range(max_attempts):
            try:
                # Get personas
                personas_result = self.generate_personas(topic, information, classification_result)
                
                # Try to parse JSON
                parsed_result = json.loads(personas_result)
                
                # Validate required fields
                validation_result = self._validate_personas(parsed_result)
                
                if validation_result["is_valid"]:
                    print(f"✅ Persona generation successful on attempt {attempt + 1}")
                    return personas_result, parsed_result
                else:
                    print(f"⚠️ Attempt {attempt + 1}: Validation issues: {validation_result['issues']}")
                    
            except json.JSONDecodeError as e:
                print(f"⚠️ Attempt {attempt + 1}: JSON parsing error: {e}")
                if attempt == max_attempts - 1:
                    print("Raw response for debugging:")
                    print(personas_result[:500])
            except Exception as e:
                print(f"⚠️ Attempt {attempt + 1}: General error: {e}")
        
        # If all attempts fail, return fallback
        print("📝 Using fallback personas...")
        fallback_result = self._get_fallback_personas(topic, classification_result)
        return json.dumps(fallback_result, ensure_ascii=False, indent=2), fallback_result

    def _validate_personas(self, parsed_result):
        """Validate the generated personas"""
        issues = []
        
        # Check main structure
        if "host" not in parsed_result:
            issues.append("Missing host section")
        if "guest" not in parsed_result:
            issues.append("Missing guest section")
        if "why_good_match" not in parsed_result:
            issues.append("Missing why_good_match section")
        
        # Check host fields
        host = parsed_result.get("host", {})
        required_host_fields = ["name", "age", "background", "personality", "speaking_style"]
        for field in required_host_fields:
            if field not in host or not host[field]:
                issues.append(f"Missing or empty host.{field}")
        
        # Check guest fields
        guest = parsed_result.get("guest", {})
        required_guest_fields = ["name", "age", "background", "expertise", "personality", "speaking_style"]
        for field in required_guest_fields:
            if field not in guest or not guest[field]:
                issues.append(f"Missing or empty guest.{field}")
        
        # Check age validity
        if "age" in host:
            try:
                age = int(host["age"])
                if age < 20 or age > 70:
                    issues.append(f"Host age {age} unrealistic (should be 20-70)")
            except (ValueError, TypeError):
                issues.append("Host age should be a number")
        
        if "age" in guest:
            try:
                age = int(guest["age"])
                if age < 20 or age > 70:
                    issues.append(f"Guest age {age} unrealistic (should be 20-70)")
            except (ValueError, TypeError):
                issues.append("Guest age should be a number")
        
        # Check for Arabic content
        all_text = " ".join([
            str(host.get("name", "")), str(host.get("background", "")),
            str(guest.get("name", "")), str(guest.get("background", "")),
            str(parsed_result.get("why_good_match", ""))
        ])
        
        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', all_text))
        if arabic_chars < 10:
            issues.append("Insufficient Arabic content")
        
        return {
            "is_valid": len(issues) == 0,
            "issues": issues,
            "score": max(0, 100 - len(issues) * 15)
        }

    def _get_fallback_personas(self, topic, classification_result):
        """Provide fallback personas if generation fails"""
        try:
            classification = json.loads(classification_result)
            optimal_style = classification.get("optimal_style", "حواري")
        except:
            optimal_style = "حواري"
        
        # Determine appropriate personas based on style
        if optimal_style == "تعليمي":
            host_personality = "مقدم متحمس للتعلم ويطرح أسئلة واضحة"
            guest_personality = "خبير صبور يشرح المعلومات بطريقة مبسطة"
        elif optimal_style == "تحليلي":
            host_personality = "مقدم مفكر يطرح أسئلة عميقة ومدروسة"
            guest_personality = "محلل خبير يقدم رؤى متخصصة ومعمقة"
        elif optimal_style == "ترفيهي":
            host_personality = "مقدم مرح ومتفاعل يضيف روح الدعابة"
            guest_personality = "ضيف ودود وطريف يشارك تجاربه بمرح"
        else:  # حواري
            host_personality = "مقدم ودود وفضولي يحب الحوار الطبيعي"
            guest_personality = "ضيف متفتح ومتعاون يشارك خبراته بصراحة"
        
        return {
            "host": {
                "name": "أحمد السالم",
                "age": 35,
                "background": "مقدم برامج إذاعية مع خبرة في المواضيع المتنوعة",
                "personality": host_personality,
                "speaking_style": "يتحدث بوضوح ويطرح أسئلة مفتوحة لإثراء الحوار"
            },
            "guest": {
                "name": "نور العلي",
                "age": 40,
                "background": "خبير ومختص في مجال الموضوع المطروح",
                "expertise": "لديه معرفة عميقة وتجربة عملية في مجال النقاش",
                "personality": guest_personality,
                "speaking_style": "يعبر عن أفكاره بوضوح ويستخدم أمثلة من الواقع"
            },
            "why_good_match": "المقدم يجيد طرح الأسئلة والضيف لديه الخبرة للإجابة بفعالية"
        }

    def analyze_persona_quality(self, parsed_result):
        """Analyze the quality of generated personas"""
        analysis = {
            "name_authenticity": self._assess_arabic_names(parsed_result),
            "age_realism": self._assess_age_realism(parsed_result),
            "background_relevance": self._assess_background_relevance(parsed_result),
            "personality_distinctiveness": self._assess_personality_distinctiveness(parsed_result),
            "style_alignment": self._assess_style_alignment(parsed_result),
            "cultural_appropriateness": self._assess_cultural_appropriateness(parsed_result)
        }
        
        # Calculate overall score
        score_factors = [
            analysis["name_authenticity"],
            analysis["age_realism"],
            analysis["background_relevance"] > 0,
            analysis["personality_distinctiveness"],
            analysis["style_alignment"] > 0,
            analysis["cultural_appropriateness"]
        ]
        
        analysis["overall_score"] = sum(score_factors) * 100 // len(score_factors)
        analysis["quality_grade"] = (
            "ممتاز" if analysis["overall_score"] >= 85 else
            "جيد" if analysis["overall_score"] >= 70 else
            "مقبول" if analysis["overall_score"] >= 55 else
            "يحتاج تحسين"
        )
        
        return analysis

    def _assess_arabic_names(self, parsed_result):
        """Check if names are authentic Arabic names"""
        host_name = parsed_result.get("host", {}).get("name", "")
        guest_name = parsed_result.get("guest", {}).get("name", "")
        
        common_names = [
            "أحمد", "محمد", "علي", "خالد", "عمر", "يوسف", "حسن", "كريم",
            "فاطمة", "عائشة", "نور", "لمى", "سارة", "مريم", "زينب", "رقية"
        ]
        
        host_authentic = any(name in host_name for name in common_names)
        guest_authentic = any(name in guest_name for name in common_names)
        
        return host_authentic and guest_authentic

    def _assess_age_realism(self, parsed_result):
        """Check if ages are realistic"""
        try:
            host_age = int(parsed_result.get("host", {}).get("age", 0))
            guest_age = int(parsed_result.get("guest", {}).get("age", 0))
            return 25 <= host_age <= 55 and 25 <= guest_age <= 65
        except:
            return False

    def _assess_background_relevance(self, parsed_result):
        """Assess if backgrounds are relevant and detailed"""
        host_bg = parsed_result.get("host", {}).get("background", "")
        guest_bg = parsed_result.get("guest", {}).get("background", "")
        guest_exp = parsed_result.get("guest", {}).get("expertise", "")
        
        relevance_score = 0
        if len(host_bg) > 20:
            relevance_score += 1
        if len(guest_bg) > 20:
            relevance_score += 1
        if len(guest_exp) > 20:
            relevance_score += 1
        
        return relevance_score

    def _assess_personality_distinctiveness(self, parsed_result):
        """Check if host and guest have distinct personalities"""
        host_personality = parsed_result.get("host", {}).get("personality", "")
        guest_personality = parsed_result.get("guest", {}).get("personality", "")
        
        # Simple check: personalities should be different
        similarity = len(set(host_personality.split()) & set(guest_personality.split()))
        total_words = len(set(host_personality.split()) | set(guest_personality.split()))
        
        return similarity / total_words < 0.5 if total_words > 0 else False

    def _assess_style_alignment(self, parsed_result):
        """Assess if personas align with the intended style"""
        host_style = parsed_result.get("host", {}).get("speaking_style", "")
        guest_style = parsed_result.get("guest", {}).get("speaking_style", "")
        
        style_indicators = {
            "حواري": ["ودود", "طبيعي", "تفاعل", "حوار"],
            "تعليمي": ["تعليم", "شرح", "توضيح", "تبسيط"],
            "تحليلي": ["تحليل", "عمق", "تخصص", "دقة"],
            "ترفيهي": ["مرح", "دعابة", "ترفيه", "خفة"]
        }
        
        # This is a simplified assessment
        return len(host_style) > 15 and len(guest_style) > 15

    def _assess_cultural_appropriateness(self, parsed_result):
        """Check if personas are culturally appropriate"""
        all_content = " ".join([
            str(parsed_result.get("host", {}).get("name", "")),
            str(parsed_result.get("host", {}).get("background", "")),
            str(parsed_result.get("guest", {}).get("name", "")),
            str(parsed_result.get("guest", {}).get("background", "")),
            str(parsed_result.get("why_good_match", ""))
        ])
        
        # Check for Arabic content
        arabic_ratio = len(re.findall(r'[\u0600-\u06FF]', all_content)) / len(all_content) if all_content else 0
        return arabic_ratio > 0.3

# Enhanced Testing Function
def test_persona_generator(deployment, topic, information, classification_result, model_name="Fanar-C-1-8.7B"):
    """
    Test the persona generator with comprehensive validation
    """
    print("🧪 Testing Persona Generator with Enhanced Validation...")
    print("=" * 60)
    
    generator = SimplePersonaGenerator(deployment, model_name)
    
    # Run persona generation with validation
    personas_result, parsed_result = generator.generate_personas_with_validation(topic, information, classification_result)
    
    print("👥 Generated Personas:")
    host = parsed_result.get("host", {})
    guest = parsed_result.get("guest", {})
    
    print(f"\n🎤 Host: {host.get('name', 'N/A')} (عمر {host.get('age', 'N/A')})")
    print(f"   الخلفية: {host.get('background', 'N/A')}")
    print(f"   الشخصية: {host.get('personality', 'N/A')}")
    print(f"   أسلوب الحديث: {host.get('speaking_style', 'N/A')}")
    
    print(f"\n🎯 Guest: {guest.get('name', 'N/A')} (عمر {guest.get('age', 'N/A')})")
    print(f"   الخلفية: {guest.get('background', 'N/A')}")
    print(f"   الخبرة: {guest.get('expertise', 'N/A')}")
    print(f"   الشخصية: {guest.get('personality', 'N/A')}")
    print(f"   أسلوب الحديث: {guest.get('speaking_style', 'N/A')}")
    
    print(f"\n🤝 Why Good Match: {parsed_result.get('why_good_match', 'N/A')}")
    
    # Analyze quality
    quality_analysis = generator.analyze_persona_quality(parsed_result)
    print(f"\n📈 Quality Analysis:")
    print(f"Overall Score: {quality_analysis['overall_score']}/100")
    print(f"Quality Grade: {quality_analysis['quality_grade']}")
    print(f"Name Authenticity: {'✅' if quality_analysis['name_authenticity'] else '❌'}")
    print(f"Age Realism: {'✅' if quality_analysis['age_realism'] else '❌'}")
    print(f"Background Relevance: {quality_analysis['background_relevance']}/3")
    print(f"Personality Distinctiveness: {'✅' if quality_analysis['personality_distinctiveness'] else '❌'}")
    
    return personas_result, parsed_result

# Usage:
# generator = SimplePersonaGenerator(deployment, "Fanar-C-1-8.7B")
# personas_result, parsed_result = generator.generate_personas_with_validation(topic, information, classification_result)

In [10]:
generator = SimplePersonaGenerator(deployment, model)
personas_result, parsed_result = generator.generate_personas_with_validation(topic, information, classification_result)
print("Personas Result:")
print(personas_result)
    # Results are guaranteed to be valid JSON with all required fields
host = parsed_result['host']
guest = parsed_result['guest']
print(f"Host: {host['name']} ({host['age']} years)")
print(f"Guest: {guest['name']} ({guest['age']} years)")

✅ Persona generation successful on attempt 1
Personas Result:
{
  "host": {
    "name": "أحمد بن علي",
    "age": 40,
    "background": "مقدم برامج إذاعية معروف بشغفه بالتكنولوجيا والقضايا الاجتماعية",
    "personality": "شخص مرح ومتحمس يسعى لفهم التحديات المعاصرة",
    "speaking_style": "متفاعل مع الجمهور ويستخدم الأمثلة اليومية لشرح المفاهيم التقنية"
  },
  "guest": {
    "name": "د. فاتن راشد",
    "age": 38,
    "background": "باحثة متخصصة في الذكاء الاصطناعي وتطبيقاته اللغوية والثقافية",
    "expertise": "تبحث عن طرق لحماية اللغة العربية وسياقها الثقافي باستخدام الذكاء الاصطناعي",
    "personality": "عالمة دقيقة ومنطقية لكن لها حس إنساني قوي",
    "speaking_style": "توضيح علمي دقيق ممزوج بتوضيحات عملية وشرح للمخاطر والمنافع"
  },
  "why_good_match": "أحمد وفاطن مناسبان للنقاش لأن الأول لديه فضول اجتماعي وثقافي ويفهمهما الجمهور أما الثانية فهي خبيرة تكنولوجية ومعرفية قادرة على تقديم وجهة نظر علمية عميقة."
}
Host: أحمد بن علي (40 years)
Guest: د. فاتن راشد (38 years)


In [11]:
import json
import re

class FixedConversationStructureGenerator:
    def __init__(self, deployment, model="gpt-4o"):
        self.model = model
        self.deployment = deployment

    def generate_conversation_structure(self, topic, information, classification_result, personas_result):
        """
        Step 3: Generate core conversation structure with Arabic-only content
        """
        
        # Parse inputs
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided for classification or personas")
        
        # Extract key info
        primary_category = classification.get("primary_category", "")
        optimal_style = classification.get("optimal_style", "")
        discourse_pattern = classification.get("discourse_pattern", "")
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')
        host_background = host.get('background', '')
        guest_background = guest.get('background', '')
        
        prompt = f"""
You are an expert in designing conversation structures for Arabic podcasts.

CRITICAL LANGUAGE REQUIREMENTS:
- Use ONLY Arabic language (Modern Standard Arabic)
- NO English words, phrases, or sentences
- NO Chinese, Japanese, or any other foreign languages
- NO foreign characters, symbols, or punctuation
- Arabic text ONLY with standard JSON punctuation (, : " {{ }})

Task: Create a conversation structure for this Arabic podcast episode.

Topic: {topic}
Category: {primary_category}
Style: {optimal_style}
Host: {host_name}
Guest: {guest_name}

Generate ONLY this JSON structure with Arabic content:

{{
    "episode_topic": "Arabic episode topic here",
    "personas": {{
        "host": {{
            "name": "{host_name}",
            "background": "Arabic background description",
            "speaking_style": "Arabic speaking style description"
        }},
        "guest": {{
            "name": "{guest_name}",
            "background": "Arabic background description",
            "speaking_style": "Arabic speaking style description"
        }}
    }},
    "conversation_flow": {{
        "intro1": {{
            "opening_line": "Arabic opening line for host",
            "podcast_introduction": "Arabic podcast introduction",
            "episode_hook": "Arabic engaging hook about topic"
        }},
        "intro2": {{
            "topic_introduction": "Arabic topic introduction",
            "guest_welcome": "Arabic welcome message for guest",
            "guest_bio_highlight": "Arabic guest background highlight"
        }},
        "main_discussion": [
            {{
                "point_title": "Arabic first discussion point",
                "personal_angle": "Arabic personal connection"
            }},
            {{
                "point_title": "Arabic second discussion point", 
                "personal_angle": "Arabic personal angle"
            }},
            {{
                "point_title": "Arabic third discussion point",
                "personal_angle": "Arabic concluding angle"
            }}
        ],
        "closing": {{
            "conclusion": {{
                "main_takeaways": "Arabic main takeaways",
                "guest_final_message": "Arabic guest final message",
                "host_closing_thoughts": "Arabic host closing thoughts"
            }},
            "outro": {{
                "guest_appreciation": "Arabic thank guest message",
                "audience_thanks": "Arabic thank audience message",
                "call_to_action": "Arabic call for engagement",
                "final_goodbye": "Arabic final goodbye"
            }}
        }}
    }},
    "cultural_context": {{
        "proverbs_sayings": [
            "Arabic proverb related to topic",
            "Arabic wisdom saying"
        ],
        "regional_references": [
            "Arabic local reference related to topic",
            "Arabic regional experience"
        ]
    }}
}}

CRITICAL FORMATTING REQUIREMENTS:
- Use ONLY English commas (,) not Arabic commas (،)
- Use ONLY standard double quotes (") not Arabic quotes
- Return ONLY the JSON structure above
- NO explanatory text before or after JSON
- NO confidence scores or meta-text
- NO ```json markers or code blocks
- All Arabic text must be grammatically correct MSA

Replace all placeholder text with actual Arabic content specific to the topic: {topic}
"""
        
        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "Generate Arabic podcast conversation structure. Return ONLY valid JSON with Arabic content. NO foreign languages. Use English JSON punctuation only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3  # Lower temperature for more predictable output
        )
        
        return self._clean_json_response(response.choices[0].message.content)

    def _clean_json_response(self, response):
        """Enhanced Arabic-only JSON cleaning method"""
        if not response:
            return "{}"
        
        # Remove any text before first { and after last }
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            clean_json = response[start_idx:end_idx+1]
        else:
            clean_json = response
        
        # Replace Arabic punctuation with English equivalents for JSON
        clean_json = clean_json.replace('،', ',')  # Arabic comma to English comma
        clean_json = clean_json.replace('"', '"')  # Arabic quote to English quote
        clean_json = clean_json.replace('"', '"')  # Arabic quote to English quote
        clean_json = clean_json.replace(''', "'")  # Arabic apostrophe
        clean_json = clean_json.replace(''', "'")  # Arabic apostrophe
        
        # Remove foreign language characters (Chinese, Japanese, etc.)
        # Keep only Arabic Unicode ranges + basic JSON syntax
        foreign_patterns = [
            r'[\u4e00-\u9fff]',  # Chinese characters
            r'[\u3040-\u309f]',  # Hiragana
            r'[\u30a0-\u30ff]',  # Katakana
            r'[\u3000-\u303f]',  # Japanese punctuation
            r'[\uff00-\uffef]',  # Fullwidth characters
            r'[\u2000-\u206f]',  # General punctuation (some problematic ones)
        ]
        
        for pattern in foreign_patterns:
            clean_json = re.sub(pattern, '', clean_json)
        
        # Remove specific problematic characters we've seen
        problematic_chars = ['千', '浮', '起', '提', '您', '足', '于', '、', '！', 'Indeed', 'how']
        for char in problematic_chars:
            clean_json = clean_json.replace(char, '')
        
        # Remove English words mixed in Arabic text (basic detection)
        # This is a simple approach - remove common English words found in previous outputs
        english_words = [
            'Translation:', 'Hello', 'everyone', 'welcome', 'back', 'Indeed',
            'how', 'preserving', 'culture', 'and', 'identity', 'in', 'digital', 'age',
            'NLP', 'để', 'tweaking', 'idees', 'AbdulRahman', 'Fatima'
        ]
        
        for word in english_words:
            clean_json = clean_json.replace(word, '')
        
        # Remove meta-text patterns
        meta_patterns = [
            r'الثقة:\s*\d+%',
            r'الدقة:\s*\d+%',
            r'معدل الثقة:\s*\d+%',
            r'\n.*الثقة.*',
            r'\n.*confidence.*',
            r'\n.*accuracy.*',
            r'ملاحظة:.*',
            r'تعليق:.*',
            r'\(Translation:.*?\)',
            r'\*\*.*?\*\*',  # Remove markdown bold
        ]
        
        for pattern in meta_patterns:
            clean_json = re.sub(pattern, '', clean_json, flags=re.DOTALL | re.IGNORECASE)
        
        # Fix common JSON issues
        # Remove trailing commas before closing braces/brackets
        clean_json = re.sub(r',(\s*[}\]])', r'\1', clean_json)
        
        # Fix missing commas between properties
        clean_json = re.sub(r'"\s*\n\s*"', '",\n"', clean_json)
        
        # Remove multiple spaces
        clean_json = re.sub(r'\s+', ' ', clean_json)
        
        # Ensure proper quote escaping
        clean_json = clean_json.replace('\\"', '"')
        
        return clean_json.strip()

    def _validate_arabic_only(self, text):
        """Validate that text contains only Arabic and basic punctuation"""
        if not text:
            return False, "Empty text"
        
        # Check for foreign language characters
        foreign_patterns = [
            (r'[\u4e00-\u9fff]', "Chinese characters detected"),
            (r'[\u3040-\u309f\u30a0-\u30ff]', "Japanese characters detected"),
            (r'\b[a-zA-Z]{2,}\b', "English words detected"),
            (r'[千浮起提您足于、！]', "Specific foreign characters detected")
        ]
        
        for pattern, message in foreign_patterns:
            if re.search(pattern, text):
                return False, message
        
        # Check for minimum Arabic content
        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        total_chars = len(re.sub(r'[\s\{\}",:\[\]]', '', text))  # Exclude JSON syntax
        
        if total_chars > 0:
            arabic_ratio = arabic_chars / total_chars
            if arabic_ratio < 0.8:  # At least 80% Arabic
                return False, f"Insufficient Arabic content: {arabic_ratio:.2%}"
        
        return True, "Arabic validation successful"

    def generate_conversation_structure_with_validation(self, topic, information, classification_result, personas_result):
        """
        Generate conversation structure with Arabic-only validation and retry
        """
        max_attempts = 3
        
        for attempt in range(max_attempts):
            try:
                # Get structure
                structure_result = self.generate_conversation_structure(topic, information, classification_result, personas_result)
                
                # Validate Arabic-only content
                is_arabic_valid, arabic_message = self._validate_arabic_only(structure_result)
                if not is_arabic_valid:
                    print(f"⚠️ Attempt {attempt + 1}: Arabic validation failed: {arabic_message}")
                    continue
                
                # Try to parse JSON
                parsed_result = json.loads(structure_result)
                
                # Validate structure completeness
                is_structure_valid, structure_message = self.validate_conversation_structure(structure_result)
                
                if is_structure_valid:
                    print(f"✅ Conversation structure generation successful on attempt {attempt + 1}")
                    return structure_result, parsed_result
                else:
                    print(f"⚠️ Attempt {attempt + 1}: Structure validation failed: {structure_message}")
                    
            except json.JSONDecodeError as e:
                print(f"⚠️ Attempt {attempt + 1}: JSON parsing error: {e}")
                if attempt == max_attempts - 1:
                    print("Raw response for debugging:")
                    print(structure_result[:300] + "..." if len(structure_result) > 300 else structure_result)
            except Exception as e:
                print(f"⚠️ Attempt {attempt + 1}: General error: {e}")
        
        # If all attempts fail, return fallback
        print("📝 Using fallback conversation structure...")
        fallback_result = self._get_fallback_structure(topic, classification_result, personas_result)
        return json.dumps(fallback_result, ensure_ascii=False, indent=2), fallback_result

    def _get_fallback_structure(self, topic, classification_result, personas_result):
        """Provide Arabic-only fallback conversation structure"""
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
            optimal_style = classification.get("optimal_style", "حواري")
        except:
            optimal_style = "حواري"
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')
        
        return {
            "episode_topic": f"نقاش حول {topic}",
            "personas": {
                "host": {
                    "name": host_name,
                    "background": host.get('background', 'مقدم برامج إذاعية متخصص'),
                    "speaking_style": host.get('speaking_style', 'يتحدث بوضوح ويطرح أسئلة مدروسة')
                },
                "guest": {
                    "name": guest_name,
                    "background": guest.get('background', 'خبير متخصص في الموضوع'),
                    "speaking_style": guest.get('speaking_style', 'يشرح بوضوح ويقدم أمثلة عملية')
                }
            },
            "conversation_flow": {
                "intro1": {
                    "opening_line": f"مرحباً بكم مستمعينا الكرام، معكم {host_name} في حلقة جديدة",
                    "podcast_introduction": "نناقش اليوم موضوعاً مهماً يهم الجميع ويستحق التأمل",
                    "episode_hook": f"موضوع حلقتنا اليوم هو {topic} وأثره على حياتنا"
                },
                "intro2": {
                    "topic_introduction": f"سنتحدث اليوم عن {topic} وجوانبه المختلفة والمهمة",
                    "guest_welcome": f"معي اليوم الضيف المتميز {guest_name}، أهلاً وسهلاً بك",
                    "guest_bio_highlight": f"{guest_name} خبير متخصص في هذا المجال ولديه خبرة واسعة"
                },
                "main_discussion": [
                    {
                        "point_title": "الجانب الأول والأساسي للموضوع",
                        "personal_angle": "كيف يؤثر هذا الموضوع على حياتنا اليومية وتجاربنا"
                    },
                    {
                        "point_title": "الجانب الثاني والتحديات المرتبطة",
                        "personal_angle": "التحديات والفرص المتاحة في هذا المجال"
                    },
                    {
                        "point_title": "الجانب الثالث والحلول المقترحة",
                        "personal_angle": "النصائح والتوجيهات العملية للمستقبل"
                    }
                ],
                "closing": {
                    "conclusion": {
                        "main_takeaways": "الخلاصات المهمة والنقاط الأساسية من نقاشنا اليوم",
                        "guest_final_message": "رسالة أخيرة ومهمة من الضيف لجمهور المستمعين",
                        "host_closing_thoughts": "أفكار ختامية وتأملات من المقدم حول الموضوع"
                    },
                    "outro": {
                        "guest_appreciation": f"شكراً جزيلاً {guest_name} على هذا النقاش المفيد والثري",
                        "audience_thanks": "شكراً لكم مستمعينا الكرام على متابعتكم واهتمامكم",
                        "call_to_action": "تفاعلوا معنا وشاركونا آراءكم عبر وسائل التواصل الاجتماعي",
                        "final_goodbye": "إلى اللقاء في حلقة قادمة بإذن الله"
                    }
                }
            },
            "cultural_context": {
                "proverbs_sayings": [
                    "العلم نور والجهل ظلام",
                    "في التأني السلامة وفي العجلة الندامة"
                ],
                "regional_references": [
                    "التجربة العربية الغنية في هذا المجال",
                    "الخبرات المحلية والإقليمية ذات الصلة بالموضوع"
                ]
            }
        }

    def validate_conversation_structure(self, structure_json):
        """Enhanced validation for conversation structure"""
        required_keys = ["episode_topic", "personas", "conversation_flow", "cultural_context"]
        
        conversation_flow_required = ["intro1", "intro2", "main_discussion", "closing"]
        intro1_required = ["opening_line", "podcast_introduction", "episode_hook"]
        intro2_required = ["topic_introduction", "guest_welcome", "guest_bio_highlight"]
        
        try:
            structure = json.loads(structure_json)
            missing_keys = []
            
            # Check main structure
            for key in required_keys:
                if key not in structure:
                    missing_keys.append(key)
            
            # Check conversation flow
            if "conversation_flow" in structure:
                conv_flow = structure["conversation_flow"]
                for key in conversation_flow_required:
                    if key not in conv_flow:
                        missing_keys.append(f"conversation_flow.{key}")
                
                # Check intro1
                if "intro1" in conv_flow:
                    intro1 = conv_flow["intro1"]
                    for key in intro1_required:
                        if key not in intro1:
                            missing_keys.append(f"intro1.{key}")
                
                # Check intro2
                if "intro2" in conv_flow:
                    intro2 = conv_flow["intro2"]
                    for key in intro2_required:
                        if key not in intro2:
                            missing_keys.append(f"intro2.{key}")
                
                # Check main discussion
                if "main_discussion" in conv_flow:
                    main_disc = conv_flow["main_discussion"]
                    if not isinstance(main_disc, list) or len(main_disc) < 3:
                        missing_keys.append("main_discussion (need at least 3 points)")
                    else:
                        for i, point in enumerate(main_disc):
                            if "point_title" not in point:
                                missing_keys.append(f"main_discussion[{i}].point_title")
                            if "personal_angle" not in point:
                                missing_keys.append(f"main_discussion[{i}].personal_angle")
            
            if missing_keys:
                return False, f"Missing required keys: {missing_keys}"
            
            return True, "Conversation structure validation successful"
            
        except json.JSONDecodeError as e:
            return False, f"Invalid JSON format: {e}"

    def analyze_structure_quality(self, structure_json):
        """Enhanced quality analysis with Arabic-only validation"""
        try:
            structure = json.loads(structure_json)
            
            analysis = {
                "structure_completeness": 0,
                "content_quality": 0,
                "cultural_integration": 0,
                "arabic_purity": 0
            }
            
            # Check completeness
            conv_flow = structure.get("conversation_flow", {})
            completeness_indicators = [
                bool(conv_flow.get("intro1")),
                bool(conv_flow.get("intro2")),
                bool(conv_flow.get("main_discussion")),
                bool(conv_flow.get("closing")),
                len(conv_flow.get("main_discussion", [])) >= 3
            ]
            analysis["structure_completeness"] = sum(completeness_indicators) * 20
            
            # Check content quality
            intro1 = conv_flow.get("intro1", {})
            intro2 = conv_flow.get("intro2", {})
            quality_indicators = [
                len(intro1.get("opening_line", "")) > 15,
                len(intro1.get("episode_hook", "")) > 15,
                len(intro2.get("topic_introduction", "")) > 15,
                len(intro2.get("guest_welcome", "")) > 10
            ]
            analysis["content_quality"] = sum(quality_indicators) * 25
            
            # Check cultural integration
            cultural = structure.get("cultural_context", {})
            cultural_indicators = [
                len(cultural.get("proverbs_sayings", [])) >= 1,
                len(cultural.get("regional_references", [])) >= 1
            ]
            analysis["cultural_integration"] = sum(cultural_indicators) * 50
            
            # Check Arabic purity
            full_text = json.dumps(structure, ensure_ascii=False)
            is_arabic_pure, _ = self._validate_arabic_only(full_text)
            analysis["arabic_purity"] = 100 if is_arabic_pure else 0
            
            # Calculate overall score
            analysis["overall_score"] = min(100, sum([
                analysis["structure_completeness"],
                analysis["content_quality"],
                analysis["cultural_integration"],
                analysis["arabic_purity"]
            ]) // 4)
            
            analysis["quality_grade"] = (
                "ممتاز" if analysis["overall_score"] >= 90 else
                "جيد جداً" if analysis["overall_score"] >= 80 else
                "جيد" if analysis["overall_score"] >= 70 else
                "مقبول" if analysis["overall_score"] >= 60 else
                "يحتاج تحسين"
            )
            
            analysis["ready_for_next_step"] = analysis["overall_score"] >= 75
            
            return analysis
            
        except:
            return {"error": "Could not analyze structure quality"}

# Enhanced Testing Function
def test_fixed_conversation_structure_generator(deployment, topic, information, classification_result, personas_result, model_name="Fanar-C-1-8.7B"):
    """
    Test the fixed conversation structure generator with Arabic-only validation
    """
    print("🧪 Testing Fixed Conversation Structure Generator...")
    print("=" * 60)
    
    generator = FixedConversationStructureGenerator(deployment, model_name)
    
    # Run generation with validation
    structure_result, parsed_result = generator.generate_conversation_structure_with_validation(
        topic, information, classification_result, personas_result
    )
    
    print("📋 Generated Structure:")
    print(f"Episode Topic: {parsed_result.get('episode_topic', 'N/A')}")
    
    # Show conversation flow
    conv_flow = parsed_result.get("conversation_flow", {})
    intro1 = conv_flow.get("intro1", {})
    main_discussion = conv_flow.get("main_discussion", [])
    
    print(f"\n🎬 Intro1 Opening: {intro1.get('opening_line', 'N/A')[:80]}...")
    print(f"📝 Discussion Points: {len(main_discussion)}")
    for i, point in enumerate(main_discussion, 1):
        print(f"  {i}. {point.get('point_title', 'N/A')[:60]}...")
    
    # Show cultural context
    cultural = parsed_result.get("cultural_context", {})
    proverbs = cultural.get("proverbs_sayings", [])
    print(f"\n🏛️ Cultural Proverbs: {len(proverbs)}")
    for proverb in proverbs:
        print(f"  • {proverb}")
    
    # Enhanced quality analysis
    quality_analysis = generator.analyze_structure_quality(structure_result)
    print(f"\n📈 Quality Analysis:")
    print(f"Overall Score: {quality_analysis['overall_score']}/100")
    print(f"Quality Grade: {quality_analysis['quality_grade']}")
    print(f"Structure Completeness: {quality_analysis['structure_completeness']}/100")
    print(f"Content Quality: {quality_analysis['content_quality']}/100")
    print(f"Cultural Integration: {quality_analysis['cultural_integration']}/100")
    print(f"Arabic Purity: {quality_analysis['arabic_purity']}/100")
    print(f"Ready for Next Step: {'✅' if quality_analysis['ready_for_next_step'] else '❌'}")
    
    # Arabic validation check
    is_arabic_valid, arabic_message = generator._validate_arabic_only(structure_result)
    print(f"\n🔍 Arabic Validation: {'✅' if is_arabic_valid else '❌'} {arabic_message}")
    
    return structure_result, parsed_result

# Usage:
# generator = FixedConversationStructureGenerator(deployment, "Fanar-C-1-8.7B")
# structure_result, parsed_result = generator.generate_conversation_structure_with_validation(topic, information, classification_result, personas_result)

# Test the fixed generator
# test_result = test_fixed_conversation_structure_generator(deployment, topic, information, classification_result, personas_result)

In [12]:
# Initialize with Arabic-only focus
generator = FixedConversationStructureGenerator(deployment, model)

# Generate with validation
structure_result, parsed_result = generator.generate_conversation_structure_with_validation(
    topic, information, classification_result, personas_result
)
print("Conversation Structure Result:")
print(structure_result)
# Test with comprehensive checks
test_result = test_fixed_conversation_structure_generator(
    deployment, topic, information, classification_result, personas_result
)

⚠️ Attempt 1: Arabic validation failed: English words detected
⚠️ Attempt 2: Arabic validation failed: English words detected
⚠️ Attempt 3: Arabic validation failed: English words detected
📝 Using fallback conversation structure...
Conversation Structure Result:
{
  "episode_topic": "نقاش حول الذكاء الاصطناعي والهوية العربية: كيف نحافظ على ثقافتنا في العصر الرقمي",
  "personas": {
    "host": {
      "name": "أحمد بن علي",
      "background": "مقدم برامج إذاعية معروف بشغفه بالتكنولوجيا والقضايا الاجتماعية",
      "speaking_style": "متفاعل مع الجمهور ويستخدم الأمثلة اليومية لشرح المفاهيم التقنية"
    },
    "guest": {
      "name": "د. فاتن راشد",
      "background": "باحثة متخصصة في الذكاء الاصطناعي وتطبيقاته اللغوية والثقافية",
      "speaking_style": "توضيح علمي دقيق ممزوج بتوضيحات عملية وشرح للمخاطر والمنافع"
    }
  },
  "conversation_flow": {
    "intro1": {
      "opening_line": "مرحباً بكم مستمعينا الكرام، معكم أحمد بن علي في حلقة جديدة",
      "podcast_introduction": "نناقش الي

In [13]:
import json
import time

class SectionalDialogueContentEnhancer:
    def __init__(self, deployment, model="gpt-4o", enhancement_level="minimal"):
        self.model = model
        self.deployment = deployment
        self.enhancement_level = enhancement_level  # "minimal", "standard", "full"

    def enhance_intro_sections(self, topic, classification_result, personas_result, intro1, intro2):
        """
        Chunk 1: Enhance intro1 and intro2 sections (REDUCED)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        primary_category = classification.get("primary_category", "")
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')
        
        # Determine enhancement scope based on level
        if self.enhancement_level == "minimal":
            intro1_fields = "spontaneity_elements"
            intro2_fields = "cultural_connections"
            item_count = "2"
        elif self.enhancement_level == "standard":
            intro1_fields = "spontaneity_elements"
            intro2_fields = "cultural_connections"
            item_count = "3"
        else:  # full
            intro1_fields = "spontaneity_elements"
            intro2_fields = "cultural_connections"
            item_count = "3-4"
        
        prompt = f"""
You are an expert in enhancing Arabic podcast introductions.

Task: Enhance ONLY the intro sections with natural dialogue elements.

Topic: {topic}
Category: {primary_category}
Style: {optimal_style}

Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Current intro1: {json.dumps(intro1, ensure_ascii=False)}
Current intro2: {json.dumps(intro2, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:

For intro1, ADD this field:
- "{intro1_fields}": [{item_count} natural spontaneous phrases that the host might use when opening, in MSA]

For intro2, ADD this field:  
- "{intro2_fields}": [{item_count} ways to connect this topic to Arab culture/values, in MSA]

Return the enhanced sections in this exact format:
{{
    "intro1": {{
        [keep all existing intro1 fields],
        "{intro1_fields}": [new content]
    }},
    "intro2": {{
        [keep all existing intro2 fields],
        "{intro2_fields}": [new content]
    }}
}}

CRITICAL REQUIREMENTS:
- Keep ALL existing content unchanged
- Add only the specified new fields
- All new values in Modern Standard Arabic (MSA)
- Use English punctuation only (no ،)
- Return only valid JSON, no extra text
- Make content specific to topic: {topic}
- Match the {optimal_style} style
- Keep arrays short and impactful
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance Arabic podcast intros. Style: {optimal_style}. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.6
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_main_discussion_point(self, topic, classification_result, personas_result, discussion_point, point_index):
        """
        Chunk 2: Enhance individual main discussion points (REDUCED)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        cultural_sensitivity = classification.get("cultural_sensitivity_level", "")
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        # Determine enhancement scope based on level
        if self.enhancement_level == "minimal":
            enhancement_fields = """
    "spontaneous_triggers": ["trigger 1 in MSA", "trigger 2 in MSA"],
    "cultural_references": ["reference 1 in MSA", "reference 2 in MSA"]"""
            field_count = "2"
        elif self.enhancement_level == "standard":
            enhancement_fields = """
    "spontaneous_triggers": ["trigger 1 in MSA", "trigger 2 in MSA"],
    "cultural_references": ["reference 1 in MSA", "reference 2 in MSA"],
    "natural_transitions": "transition phrase in MSA\""""
            field_count = "3"
        else:  # full
            enhancement_fields = """
    "spontaneous_triggers": ["trigger 1 in MSA", "trigger 2 in MSA"],
    "disagreement_points": "disagreement description in MSA",
    "cultural_references": ["reference 1 in MSA", "reference 2 in MSA"],
    "natural_transitions": "transition phrase in MSA",
    "emotional_triggers": "emotional description in MSA\""""
            field_count = "5"

        prompt = f"""
You are an expert in enhancing Arabic podcast discussion points.

Task: Enhance ONE discussion point with rich dialogue elements.

Topic: {topic}
Style: {optimal_style}
Point #{point_index + 1}

Current discussion point: {json.dumps(discussion_point, ensure_ascii=False)}

Add EXACTLY these {field_count} fields. Keep all existing fields unchanged:

{{
    [all existing fields from discussion_point],{enhancement_fields}
}}

CRITICAL REQUIREMENTS:
- Keep ALL existing fields exactly as they are
- Add only the {field_count} new fields shown above
- All new content in Modern Standard Arabic (MSA)
- Use English punctuation ONLY (no ،)
- Return only valid JSON, no extra text
- Make content relevant to topic: {topic}
- Keep responses concise and actionable
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance Arabic podcast discussion points. Style: {optimal_style}. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_closing_sections(self, topic, classification_result, personas_result, closing_section):
        """
        Chunk 3: Enhance closing (conclusion + outro) sections (REDUCED)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        # Determine enhancement scope based on level
        if self.enhancement_level == "minimal":
            conclusion_fields = '"emotional_closure": "how to create emotional satisfaction for listeners, in MSA"'
            outro_fields = '"memorable_ending": "a memorable way to end that listeners will remember, in MSA"'
        elif self.enhancement_level == "standard":
            conclusion_fields = '''
        "emotional_closure": "how to create emotional satisfaction for listeners, in MSA",
        "key_insights": ["insight 1 in MSA", "insight 2 in MSA"]'''
            outro_fields = '"memorable_ending": "a memorable way to end that listeners will remember, in MSA"'
        else:  # full
            conclusion_fields = '''
        "emotional_closure": "how to create emotional satisfaction for listeners, in MSA",
        "key_insights": [2-3 key insights that should be highlighted in the wrap-up, in MSA]'''
            outro_fields = '''
        "memorable_ending": "a memorable way to end that listeners will remember, in MSA",
        "connection_building": "ways to build ongoing connection with the audience, in MSA"'''

        prompt = f"""
You are an expert in enhancing Arabic podcast closings.

Task: Enhance the closing section with natural wrap-up elements.

Topic: {topic}
Style: {optimal_style}

Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Current closing: {json.dumps(closing_section, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:

For conclusion subsection, ADD:
{conclusion_fields}

For outro subsection, ADD:
{outro_fields}

Return enhanced closing in this exact format:
{{
    "conclusion": {{
        [keep all existing conclusion fields],
        {conclusion_fields}
    }},
    "outro": {{
        [keep all existing outro fields],
        {outro_fields}
    }}
}}

CRITICAL REQUIREMENTS:
- Keep ALL existing content unchanged
- Add only the specified new fields
- All new values in Modern Standard Arabic (MSA)
- Use English punctuation only (no ،)
- Return only valid JSON, no extra text
- Make content feel conclusive and satisfying
- Keep insights concise and actionable
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance Arabic podcast closings. Style: {optimal_style}. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def create_global_elements(self, topic, classification_result, personas_result):
        """
        Chunk 4: Create global elements (SIMPLIFIED AND REDUCED)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        cultural_sensitivity = classification.get("cultural_sensitivity_level", "")
        primary_category = classification.get("primary_category", "")
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        # Determine enhancement scope based on level
        if self.enhancement_level == "minimal":
            structure = '''
{
    "spontaneous_moments": {
        "natural_interruptions": [
            "first natural interruption in MSA",
            "second natural interruption in MSA"
        ],
        "emotional_reactions": [
            "first emotional reaction in MSA",
            "second emotional reaction in MSA"
        ]
    },
    "dialogue_techniques": {
        "questioning_styles": [
            "first questioning style in MSA",
            "second questioning style in MSA"
        ],
        "audience_engagement": [
            "first engagement technique in MSA",
            "second engagement technique in MSA"
        ]
    }
}'''
        elif self.enhancement_level == "standard":
            structure = '''
{
    "spontaneous_moments": {
        "natural_interruptions": [
            "first natural interruption in MSA",
            "second natural interruption in MSA"
        ],
        "emotional_reactions": [
            "first emotional reaction in MSA",
            "second emotional reaction in MSA"
        ],
        "personal_stories": [
            "first personal story in MSA",
            "second personal story in MSA"
        ]
    },
    "dialogue_techniques": {
        "questioning_styles": [
            "first questioning style in MSA",
            "second questioning style in MSA"
        ],
        "storytelling_moments": [
            "first storytelling moment in MSA",
            "second storytelling moment in MSA"
        ],
        "audience_engagement": [
            "first engagement technique in MSA",
            "second engagement technique in MSA"
        ]
    }
}'''
        else:  # full
            structure = '''
{
    "spontaneous_moments": {
        "natural_interruptions": [
            "first natural interruption in MSA",
            "second natural interruption in MSA",
            "third natural interruption in MSA"
        ],
        "emotional_reactions": [
            "first emotional reaction in MSA",
            "second emotional reaction in MSA", 
            "third emotional reaction in MSA"
        ],
        "personal_stories": [
            "first personal story in MSA",
            "second personal story in MSA"
        ],
        "humorous_moments": [
            "first humorous moment in MSA",
            "second humorous moment in MSA"
        ]
    },
    "personality_interactions": {
        "host_strengths": "host strengths description in MSA",
        "guest_expertise": "guest expertise description in MSA",
        "natural_chemistry": "chemistry description in MSA",
        "tension_points": "tension points description in MSA",
        "collaboration_moments": "collaboration description in MSA"
    },
    "dialogue_techniques": {
        "questioning_styles": [
            "first questioning style in MSA",
            "second questioning style in MSA",
            "third questioning style in MSA"
        ],
        "storytelling_moments": [
            "first storytelling moment in MSA",
            "second storytelling moment in MSA"
        ],
        "audience_engagement": [
            "first engagement technique in MSA",
            "second engagement technique in MSA",
            "third engagement technique in MSA"
        ],
        "emotional_peaks": [
            "first emotional peak in MSA",
            "second emotional peak in MSA"
        ]
    }
}'''

        prompt = f"""
You are an expert in creating global dialogue elements for Arabic podcasts.

Task: Create global sections that enhance the overall conversation flow.

Topic: {topic}
Category: {primary_category}
Style: {optimal_style}
Enhancement Level: {self.enhancement_level}

Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Create EXACTLY this JSON structure with proper English punctuation:

{structure}

CRITICAL REQUIREMENTS:
- Replace placeholder text with actual content in Modern Standard Arabic (MSA)
- Use ONLY English commas (,) and standard quotes (")
- NO Arabic commas (،) or special punctuation
- NO extra text before or after JSON
- NO explanatory text
- Make content specific to {host_name}, {guest_name}, and topic: {topic}
- Follow the EXACT structure shown above
- Keep content concise and actionable
- Ensure all arrays have exactly the specified number of items
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You create global dialogue elements. Return ONLY valid JSON with English punctuation. No Arabic commas. No extra text."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_dialogue_content(self, topic, information, classification_result, personas_result, structure_result):
        """
        Main orchestration method: Coordinates all chunks with configurable enhancement levels
        """
        print(f"🔧 Starting sectional dialogue enhancement (Level: {self.enhancement_level})...")
        print("=" * 50)
        
        try:
            structure = json.loads(structure_result)
        except:
            raise ValueError("Invalid structure JSON provided")
        
        # Extract sections
        conv_flow = structure.get("conversation_flow", {})
        intro1 = conv_flow.get("intro1", {})
        intro2 = conv_flow.get("intro2", {})
        main_discussion = conv_flow.get("main_discussion", [])
        closing = conv_flow.get("closing", {})
        
        # Chunk 1: Enhance intro sections
        print("📝 Chunk 1: Enhancing intro sections...")
        try:
            enhanced_intros_json = self.enhance_intro_sections(
                topic, classification_result, personas_result, intro1, intro2
            )
            enhanced_intros = json.loads(enhanced_intros_json)
            
            # Update structure
            structure["conversation_flow"]["intro1"].update(enhanced_intros.get("intro1", {}))
            structure["conversation_flow"]["intro2"].update(enhanced_intros.get("intro2", {}))
            print("✅ Intro sections enhanced successfully")
            
        except Exception as e:
            print(f"⚠️ Error enhancing intros: {e}")
        
        # Small delay between chunks
        time.sleep(1)
        
        # Chunk 2: Enhance main discussion points (one by one)
        print("📝 Chunk 2: Enhancing main discussion points...")
        enhanced_discussion_points = []
        
        for i, point in enumerate(main_discussion):
            print(f"  Enhancing discussion point {i+1}/{len(main_discussion)}...")
            try:
                enhanced_point_json = self.enhance_main_discussion_point(
                    topic, classification_result, personas_result, point, i
                )
                enhanced_point = json.loads(enhanced_point_json)
                enhanced_discussion_points.append(enhanced_point)
                print(f"  ✅ Point {i+1} enhanced successfully")
                
                # Small delay between points
                time.sleep(0.5)
                
            except Exception as e:
                print(f"  ⚠️ Error enhancing point {i+1}: {e}")
                print(f"  🔄 Using fallback enhancement for point {i+1}...")
                
                # Try fallback enhancement for this point
                enhanced_point = self._create_fallback_discussion_point(
                    topic, classification_result, personas_result, point, i
                )
                enhanced_discussion_points.append(enhanced_point)
                print(f"  ✅ Point {i+1} enhanced with fallback method")
        
        # Update structure with enhanced discussion points
        structure["conversation_flow"]["main_discussion"] = enhanced_discussion_points
        print("✅ All main discussion points processed")
        
        # Small delay between chunks
        time.sleep(1)
        
        # Chunk 3: Enhance closing sections
        print("📝 Chunk 3: Enhancing closing sections...")
        try:
            enhanced_closing_json = self.enhance_closing_sections(
                topic, classification_result, personas_result, closing
            )
            enhanced_closing = json.loads(enhanced_closing_json)
            
            # Update structure
            structure["conversation_flow"]["closing"].update(enhanced_closing)
            print("✅ Closing sections enhanced successfully")
            
        except Exception as e:
            print(f"⚠️ Error enhancing closing: {e}")
        
        # Small delay between chunks  
        time.sleep(1)
        
        # Chunk 4: Create global elements
        print("📝 Chunk 4: Creating global elements...")
        try:
            global_elements_json = self.create_global_elements(
                topic, classification_result, personas_result
            )
            global_elements = json.loads(global_elements_json)
            
            # Add global elements to structure
            structure.update(global_elements)
            print("✅ Global elements created successfully")
            
        except Exception as e:
            print(f"⚠️ Error creating global elements: {e}")
            print("🔄 Attempting to create fallback global elements...")
            
            # Create fallback global elements
            try:
                fallback_elements = self._create_fallback_global_elements(
                    topic, classification_result, personas_result
                )
                structure.update(fallback_elements)
                print("✅ Fallback global elements created successfully")
            except Exception as fallback_error:
                print(f"⚠️ Fallback also failed: {fallback_error}")
                print("📝 Using minimal default global elements...")
                # Add minimal default elements so validation doesn't fail
                structure.update(self._get_minimal_global_elements())
        
        print("=" * 50)
        print(f"🎉 Sectional dialogue enhancement completed! (Level: {self.enhancement_level})")
        
        return json.dumps(structure, ensure_ascii=False, indent=2)

    def _clean_json_response(self, response):
        """Helper method to clean JSON response - Enhanced version"""
        response = response.strip()
        
        # Remove any text before first { and after last }
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            clean_json = response[start_idx:end_idx+1]
        else:
            clean_json = response
        
        # Replace Arabic commas and punctuation with English equivalents
        clean_json = clean_json.replace('،', ',')
        clean_json = clean_json.replace('"', '"')
        clean_json = clean_json.replace('"', '"')
        clean_json = clean_json.replace(''', "'")
        clean_json = clean_json.replace(''', "'")
        
        # Fix common JSON issues
        # Remove trailing commas before closing braces/brackets
        import re
        clean_json = re.sub(r',(\s*[}\]])', r'\1', clean_json)
        
        # Ensure proper quote escaping
        clean_json = clean_json.replace('\\"', '"')
        
        return clean_json

    def validate_enhanced_content(self, enhanced_json):
        """Validate the enhanced dialogue content (adapted for different levels)"""
        try:
            enhanced = json.loads(enhanced_json)
            
            missing_elements = []
            
            # Check global sections (varies by level)
            if self.enhancement_level == "minimal":
                required_global = ["spontaneous_moments", "dialogue_techniques"]
            elif self.enhancement_level == "standard":
                required_global = ["spontaneous_moments", "dialogue_techniques"]
            else:  # full
                required_global = ["spontaneous_moments", "personality_interactions", "dialogue_techniques"]
                
            for element in required_global:
                if element not in enhanced:
                    missing_elements.append(element)
            
            # Check enhanced conversation flow
            conv_flow = enhanced.get("conversation_flow", {})
            
            # Check intro1 enhancements
            intro1 = conv_flow.get("intro1", {})
            if "spontaneity_elements" not in intro1:
                missing_elements.append("intro1.spontaneity_elements")
            
            # Check intro2 enhancements  
            intro2 = conv_flow.get("intro2", {})
            if "cultural_connections" not in intro2:
                missing_elements.append("intro2.cultural_connections")
            
            # Check main discussion enhancements (varies by level)
            main_disc = conv_flow.get("main_discussion", [])
            if self.enhancement_level == "minimal":
                required_point_fields = ["spontaneous_triggers", "cultural_references"]
            elif self.enhancement_level == "standard":
                required_point_fields = ["spontaneous_triggers", "cultural_references", "natural_transitions"]
            else:  # full
                required_point_fields = ["spontaneous_triggers", "disagreement_points", "cultural_references", "natural_transitions", "emotional_triggers"]
            
            for i, point in enumerate(main_disc):
                for field in required_point_fields:
                    if field not in point:
                        missing_elements.append(f"main_discussion[{i}].{field}")
            
            # Check closing enhancements (varies by level)
            closing = conv_flow.get("closing", {})
            conclusion = closing.get("conclusion", {})
            outro = closing.get("outro", {})
            
            if "emotional_closure" not in conclusion:
                missing_elements.append("closing.conclusion.emotional_closure")
            if "memorable_ending" not in outro:
                missing_elements.append("closing.outro.memorable_ending")
                
            # Additional checks for standard/full levels
            if self.enhancement_level in ["standard", "full"]:
                if "key_insights" not in conclusion:
                    missing_elements.append("closing.conclusion.key_insights")
            if self.enhancement_level == "full":
                if "connection_building" not in outro:
                    missing_elements.append("closing.outro.connection_building")
            
            if missing_elements:
                return False, f"Missing enhanced elements: {missing_elements}"
            
            return True, f"Sectional dialogue content enhancement validation successful (Level: {self.enhancement_level})"
            
        except json.JSONDecodeError:
            return False, "Invalid JSON format in enhanced content"

    def _create_fallback_discussion_point(self, topic, classification_result, personas_result, discussion_point, point_index):
        """Create fallback enhancement for a single discussion point (level-aware)"""
        enhanced_point = discussion_point.copy()
        
        # Add minimal enhancements based on level
        if self.enhancement_level == "minimal":
            enhanced_point.update({
                "spontaneous_triggers": [
                    "هذا يثير تساؤلاً مهماً",
                    "دعني أشارككم تجربة في هذا المجال"
                ],
                "cultural_references": [
                    "كما يقول المثل: العلم نور",
                    "تراثنا يعلمنا أهمية التوازن في كل شيء"
                ]
            })
        elif self.enhancement_level == "standard":
            enhanced_point.update({
                "spontaneous_triggers": [
                    "هذا يثير تساؤلاً مهماً",
                    "دعني أشارككم تجربة في هذا المجال"
                ],
                "cultural_references": [
                    "كما يقول المثل: العلم نور",
                    "تراثنا يعلمنا أهمية التوازن في كل شيء"
                ],
                "natural_transitions": "هذا يقودنا إلى نقطة مهمة أخرى"
            })
        else:  # full
            enhanced_point.update({
                "spontaneous_triggers": [
                    "هذا يثير تساؤلاً مهماً",
                    "دعني أشارككم تجربة في هذا المجال"
                ],
                "disagreement_points": "قد تختلف وجهات النظر حول أفضل طريقة للتعامل مع هذه القضية",
                "cultural_references": [
                    "كما يقول المثل: العلم نور",
                    "تراثنا يعلمنا أهمية التوازن في كل شيء"
                ],
                "natural_transitions": "هذا يقودنا إلى نقطة مهمة أخرى",
                "emotional_triggers": "هذا الموضوع يلامس قلوب كل من يهتم بمستقبل ثقافتنا"
            })
        
        return enhanced_point

    def _create_fallback_global_elements(self, topic, classification_result, personas_result):
        """Create fallback global elements based on enhancement level"""
        try:
            personas = json.loads(personas_result)
        except:
            personas = {}
        
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')
        
        fallback_elements = {
            "spontaneous_moments": {
                "natural_interruptions": [
                    "اسمحوا لي أن أضيف نقطة هنا",
                    "هذا يذكرني بموقف مشابه"
                ],
                "emotional_reactions": [
                    "هذا مؤثر فعلاً",
                    "لم أفكر في الأمر من هذه الزاوية"
                ]
            },
            "dialogue_techniques": {
                "questioning_styles": [
                    "أسئلة مفتوحة لتعميق النقاش",
                    "أسئلة تحليلية للوصول للجذور"
                ],
                "audience_engagement": [
                    "طرح أسئلة يفكر فيها المستمع",
                    "استخدام أمثلة من الواقع"
                ]
            }
        }
        
        # Add more elements for standard/full levels
        if self.enhancement_level in ["standard", "full"]:
            fallback_elements["spontaneous_moments"]["personal_stories"] = [
                "أتذكر موقفاً مشابهاً حدث معي",
                "في تجربتي الشخصية وجدت أن"
            ]
            fallback_elements["dialogue_techniques"]["storytelling_moments"] = [
                "سرد تجارب شخصية ذات صلة",
                "قصص نجاح ملهمة"
            ]
        
        if self.enhancement_level == "full":
            fallback_elements["spontaneous_moments"]["humorous_moments"] = [
                "هذا يذكرني بنكتة لطيفة",
                "الموقف له جانب طريف"
            ]
            fallback_elements["personality_interactions"] = {
                "host_strengths": f"{host_name} ماهر في طرح الأسئلة المناسبة وتوجيه الحوار",
                "guest_expertise": f"{guest_name} يقدم معرفة عميقة في مجال تخصصه",
                "natural_chemistry": "يتفاعل المقدم والضيف بطريقة طبيعية ومريحة",
                "tension_points": "قد يختلفان في بعض وجهات النظر مما يثري النقاش",
                "collaboration_moments": "يبنيان على أفكار بعضهما البعض لإثراء المحتوى"
            }
            fallback_elements["dialogue_techniques"]["emotional_peaks"] = [
                "لحظات تأملية عميقة",
                "قصص مؤثرة تلامس القلب"
            ]
        
        return fallback_elements

    def _get_minimal_global_elements(self):
        """Return minimal default global elements"""
        if self.enhancement_level == "minimal":
            return {
                "spontaneous_moments": {
                    "natural_interruptions": [
                        "اسمحوا لي أن أضيف نقطة هنا",
                        "هذا يذكرني بموقف مشابه"
                    ],
                    "emotional_reactions": [
                        "هذا مؤثر فعلاً",
                        "لم أفكر في الأمر من هذه الزاوية"
                    ]
                },
                "dialogue_techniques": {
                    "questioning_styles": [
                        "أسئلة مفتوحة لتعميق النقاش",
                        "أسئلة تحليلية للوصول للجذور"
                    ],
                    "audience_engagement": [
                        "طرح أسئلة يفكر فيها المستمع",
                        "استخدام أمثلة من الواقع"
                    ]
                }
            }
        elif self.enhancement_level == "standard":
            return {
                "spontaneous_moments": {
                    "natural_interruptions": [
                        "اسمحوا لي أن أضيف نقطة هنا",
                        "هذا يذكرني بموقف مشابه"
                    ],
                    "emotional_reactions": [
                        "هذا مؤثر فعلاً",
                        "لم أفكر في الأمر من هذه الزاوية"
                    ],
                    "personal_stories": [
                        "أتذكر موقفاً مشابهاً حدث معي",
                        "في تجربتي الشخصية وجدت أن"
                    ]
                },
                "dialogue_techniques": {
                    "questioning_styles": [
                        "أسئلة مفتوحة لتعميق النقاش",
                        "أسئلة تحليلية للوصول للجذور"
                    ],
                    "storytelling_moments": [
                        "سرد تجارب شخصية ذات صلة",
                        "قصص نجاح ملهمة"
                    ],
                    "audience_engagement": [
                        "طرح أسئلة يفكر فيها المستمع",
                        "استخدام أمثلة من الواقع"
                    ]
                }
            }
        else:  # full
            return {
                "spontaneous_moments": {
                    "natural_interruptions": [
                        "اسمحوا لي أن أضيف نقطة هنا",
                        "هذا يذكرني بموقف مشابه",
                        "انتظر، هذا مهم جداً"
                    ],
                    "emotional_reactions": [
                        "هذا مؤثر فعلاً",
                        "لم أفكر في الأمر من هذه الزاوية",
                        "أتفق معك تماماً"
                    ],
                    "personal_stories": [
                        "أتذكر موقفاً مشابهاً حدث معي",
                        "في تجربتي الشخصية وجدت أن"
                    ],
                    "humorous_moments": [
                        "هذا يذكرني بنكتة لطيفة",
                        "الموقف له جانب طريف"
                    ]
                },
                "personality_interactions": {
                    "host_strengths": "المقدم ماهر في طرح الأسئلة المناسبة وتوجيه الحوار",
                    "guest_expertise": "الضيف يقدم معرفة عميقة في مجال تخصصه",
                    "natural_chemistry": "يتفاعل المقدم والضيف بطريقة طبيعية ومريحة",
                    "tension_points": "قد يختلفان في بعض وجهات النظر مما يثري النقاش",
                    "collaboration_moments": "يبنيان على أفكار بعضهما البعض لإثراء المحتوى"
                },
                "dialogue_techniques": {
                    "questioning_styles": [
                        "أسئلة مفتوحة لتعميق النقاش",
                        "أسئلة تحليلية للوصول للجذور",
                        "أسئلة شخصية لإضافة البعد الإنساني"
                    ],
                    "storytelling_moments": [
                        "سرد تجارب شخصية ذات صلة",
                        "قصص نجاح ملهمة"
                    ],
                    "audience_engagement": [
                        "طرح أسئلة يفكر فيها المستمع",
                        "استخدام أمثلة من الواقع",
                        "دعوة المستمعين للتفاعل"
                    ],
                    "emotional_peaks": [
                        "لحظات تأملية عميقة",
                        "قصص مؤثرة تلامس القلب"
                    ]
                }
            }

# Enhanced Testing Function with Level Selection
def test_enhanced_dialogue_content_enhancer(deployment, topic, information, classification_result, personas_result, structure_result, model_name="Fanar-C-1-8.7B", enhancement_level="minimal"):
    """
    Test the enhanced dialogue content enhancer with level selection
    """
    print(f"🧪 Testing Enhanced Dialogue Content Enhancer (Level: {enhancement_level})...")
    print("=" * 60)
    
    enhancer = SectionalDialogueContentEnhancer(deployment, model_name, enhancement_level)
    
    # Run enhancement
    enhanced_result = enhancer.enhance_dialogue_content(
        topic, information, classification_result, personas_result, structure_result
    )
    
    # Validate enhanced content
    is_valid, validation_message = enhancer.validate_enhanced_content(enhanced_result)
    
    print(f"\n📊 Enhancement Results (Level: {enhancement_level}):")
    print(f"Validation: {'✅ Valid' if is_valid else '❌ Invalid'}")
    print(f"Message: {validation_message}")
    
    # Quick content analysis
    try:
        enhanced_data = json.loads(enhanced_result)
        
        # Count enhancement fields
        conv_flow = enhanced_data.get("conversation_flow", {})
        intro1_enhancements = len([k for k in conv_flow.get("intro1", {}).keys() if k not in ["opening_line", "podcast_introduction", "episode_hook", "tone_guidance"]])
        intro2_enhancements = len([k for k in conv_flow.get("intro2", {}).keys() if k not in ["topic_introduction", "guest_welcome", "guest_bio_highlight", "transition_to_discussion"]])
        
        main_discussion = conv_flow.get("main_discussion", [])
        discussion_enhancements = 0
        for point in main_discussion:
            discussion_enhancements += len([k for k in point.keys() if k not in ["point_title", "personal_angle"]])
        
        closing_enhancements = 0
        closing = conv_flow.get("closing", {})
        conclusion = closing.get("conclusion", {})
        outro = closing.get("outro", {})
        closing_enhancements += len([k for k in conclusion.keys() if k not in ["main_takeaways", "guest_final_message", "host_closing_thoughts"]])
        closing_enhancements += len([k for k in outro.keys() if k not in ["guest_appreciation", "audience_thanks", "call_to_action", "final_goodbye"]])
        
        global_sections = len([k for k in enhanced_data.keys() if k not in ["episode_topic", "personas", "conversation_flow", "cultural_context", "language_style", "technical_notes"]])
        
        print(f"\n📈 Enhancement Statistics:")
        print(f"Intro1 Enhancements: {intro1_enhancements}")
        print(f"Intro2 Enhancements: {intro2_enhancements}")
        print(f"Discussion Enhancements: {discussion_enhancements}")
        print(f"Closing Enhancements: {closing_enhancements}")
        print(f"Global Sections Added: {global_sections}")
        
        # Estimate size reduction vs original
        total_enhancements = intro1_enhancements + intro2_enhancements + discussion_enhancements + closing_enhancements + global_sections
        
        if enhancement_level == "minimal":
            expected_vs_full = "~60% smaller than full enhancement"
        elif enhancement_level == "standard":
            expected_vs_full = "~40% smaller than full enhancement"
        else:
            expected_vs_full = "Full enhancement level"
        
        print(f"Total Enhancement Fields: {total_enhancements}")
        print(f"Size vs Full: {expected_vs_full}")
        
        # Show sample enhanced content
        print(f"\n🎯 Sample Enhanced Content:")
        intro1 = conv_flow.get("intro1", {})
        if "spontaneity_elements" in intro1:
            spont_elements = intro1["spontaneity_elements"]
            print(f"Intro1 Spontaneity: {len(spont_elements)} elements")
            for i, element in enumerate(spont_elements, 1):
                print(f"  {i}. {element[:60]}...")
        
        global_spont = enhanced_data.get("spontaneous_moments", {})
        if "natural_interruptions" in global_spont:
            interruptions = global_spont["natural_interruptions"]
            print(f"Natural Interruptions: {len(interruptions)} items")
            for i, interruption in enumerate(interruptions, 1):
                print(f"  {i}. {interruption[:60]}...")
        
    except Exception as e:
        print(f"⚠️ Error analyzing enhanced content: {e}")
    
    return enhanced_result

# Usage examples for different enhancement levels:
"""
# Minimal enhancement (fastest, most concise)
enhancer_minimal = SectionalDialogueContentEnhancer(deployment, "Fanar-C-1-8.7B", "minimal")
result_minimal = enhancer_minimal.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)

# Standard enhancement (balanced)
enhancer_standard = SectionalDialogueContentEnhancer(deployment, "Fanar-C-1-8.7B", "standard")
result_standard = enhancer_standard.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)

# Full enhancement (comprehensive but largest)
enhancer_full = SectionalDialogueContentEnhancer(deployment, "Fanar-C-1-8.7B", "full")
result_full = enhancer_full.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)

# Test with specific level
enhanced_result = test_enhanced_dialogue_content_enhancer(
    deployment, topic, information, classification_result, personas_result, structure_result, 
    model_name="Fanar-C-1-8.7B", enhancement_level="minimal"
)
"""

'\n# Minimal enhancement (fastest, most concise)\nenhancer_minimal = SectionalDialogueContentEnhancer(deployment, "Fanar-C-1-8.7B", "minimal")\nresult_minimal = enhancer_minimal.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)\n\n# Standard enhancement (balanced)\nenhancer_standard = SectionalDialogueContentEnhancer(deployment, "Fanar-C-1-8.7B", "standard")\nresult_standard = enhancer_standard.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)\n\n# Full enhancement (comprehensive but largest)\nenhancer_full = SectionalDialogueContentEnhancer(deployment, "Fanar-C-1-8.7B", "full")\nresult_full = enhancer_full.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)\n\n# Test with specific level\nenhanced_result = test_enhanced_dialogue_content_enhancer(\n    deployment, topic, information, classification_result, personas_result, structure

In [14]:
# Standard enhancement (balanced)
enhancer_standard = SectionalDialogueContentEnhancer(deployment, model, "standard")
result_standard = enhancer_standard.enhance_dialogue_content(topic, information, classification_result, personas_result, structure_result)
print("Standard Enhancement Result:")
print(result_standard)

🔧 Starting sectional dialogue enhancement (Level: standard)...
📝 Chunk 1: Enhancing intro sections...
✅ Intro sections enhanced successfully
📝 Chunk 2: Enhancing main discussion points...
  Enhancing discussion point 1/3...
  ✅ Point 1 enhanced successfully
  Enhancing discussion point 2/3...
  ✅ Point 2 enhanced successfully
  Enhancing discussion point 3/3...
  ⚠️ Error enhancing point 3: Expecting ',' delimiter: line 4 column 42 (char 153)
  🔄 Using fallback enhancement for point 3...
  ✅ Point 3 enhanced with fallback method
✅ All main discussion points processed
📝 Chunk 3: Enhancing closing sections...
✅ Closing sections enhanced successfully
📝 Chunk 4: Creating global elements...
✅ Global elements created successfully
🎉 Sectional dialogue enhancement completed! (Level: standard)
Standard Enhancement Result:
{
  "episode_topic": "نقاش حول الذكاء الاصطناعي والهوية العربية: كيف نحافظ على ثقافتنا في العصر الرقمي",
  "personas": {
    "host": {
      "name": "أحمد بن علي",
      "back

In [15]:
import json
import time

class MinimalPolishEnhancer:
    def __init__(self, deployment, model="gpt-4o"):
        self.model = model
        self.deployment = deployment

    def enhance_spontaneous_moments_values(self, topic, classification_result, personas_result, current_spontaneous_moments):
        """
        Chunk 1: Enhance values in existing spontaneous_moments (no new fields)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        prompt = f"""
You are an expert in enhancing Arabic podcast dialogue quality.

Task: Enhance ONLY the VALUES in the existing spontaneous_moments structure. Keep the exact same fields and array lengths.

Topic: {topic}
Style: {optimal_style}
Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Current spontaneous moments: {json.dumps(current_spontaneous_moments, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:
- Keep the EXACT same JSON structure and field names
- Keep the EXACT same number of items in each array
- ONLY enhance the quality and specificity of existing values
- Make each phrase more natural and topic-specific
- Connect phrases directly to the topic: {topic}
- Make content specific to {host_name} and {guest_name}'s backgrounds
- Ensure phrases sound more authentic and conversational

Return the enhanced structure with the same fields but better values:

{{
    [exact same structure as input, but with enhanced values]
}}

CRITICAL REQUIREMENTS:
- All enhanced values in Modern Standard Arabic (MSA)
- Make content specific to topic: {topic} and these exact personas
- Improve naturalness and authenticity of existing phrases
- Use English punctuation only (no ،)
- Return only valid JSON, no extra text
- Do NOT add new fields or arrays
- Do NOT change array lengths
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance existing values only. Style: {optimal_style}. Keep exact structure. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_cultural_context_values(self, topic, classification_result, current_cultural_context):
        """
        Chunk 2: Enhance values in existing cultural_context (no new fields)
        """
        try:
            classification = json.loads(classification_result)
        except:
            classification = {}
        
        optimal_style = classification.get("optimal_style", "")
        primary_category = classification.get("primary_category", "")

        prompt = f"""
You are an expert in enhancing Arabic cultural references.

Task: Enhance ONLY the VALUES in the existing cultural_context structure. Keep the exact same fields and array lengths.

Topic: {topic}
Category: {primary_category}

Current cultural context: {json.dumps(current_cultural_context, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:
- Keep the EXACT same JSON structure and field names
- Keep the EXACT same number of items in each array
- ONLY enhance the quality and relevance of existing values
- Make proverbs more directly relevant to the topic: {topic}
- Make regional references more specific and meaningful
- Ensure cultural authenticity and accuracy

Example enhancement:
- Before: "العلم نور"
- After: "العلم نور، والذكاء الاصطناعي يمكن أن يكون شمعة تضيء طريق الحفاظ على تراثنا"

Return the enhanced cultural context with better, more topic-specific values:

{{
    [exact same structure as input, but with enhanced values]
}}

CRITICAL REQUIREMENTS:
- All enhanced values in Modern Standard Arabic (MSA)
- Make all content directly relevant to topic: {topic}
- Maintain cultural authenticity and accuracy
- Use English punctuation only (no ،)
- Return only valid JSON, no extra text
- Do NOT add new fields or arrays
- Do NOT change array lengths
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance cultural context values only. Keep exact structure. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.6
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_dialogue_techniques_values(self, topic, classification_result, personas_result, current_dialogue_techniques):
        """
        Chunk 3: Enhance values in existing dialogue_techniques (no new fields)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        prompt = f"""
You are an expert in enhancing Arabic podcast dialogue techniques.

Task: Enhance ONLY the VALUES in the existing dialogue_techniques structure. Keep the exact same fields and array lengths.

Topic: {topic}
Style: {optimal_style}
Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Current dialogue techniques: {json.dumps(current_dialogue_techniques, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:
- Keep the EXACT same JSON structure and field names
- Keep the EXACT same number of items in each array
- ONLY enhance the quality and specificity of existing values
- Make techniques more specific to the topic: {topic}
- Tailor content to {host_name} and {guest_name}'s expertise
- Make techniques more actionable and practical

Example enhancement:
- Before: "أسئلة مفتوحة لتعميق النقاش"
- After: "أسئلة مفتوحة حول كيفية تطوير ذكاء اصطناعي يحافظ على جمالية اللغة العربية وعمقها الثقافي"

Return the enhanced dialogue techniques with better, more specific values:

{{
    [exact same structure as input, but with enhanced values]
}}

CRITICAL REQUIREMENTS:
- All enhanced values in Modern Standard Arabic (MSA)
- Make content specific to topic: {topic} and these personas
- Improve practicality and specificity of techniques
- Use English punctuation only (no ،)
- Return only valid JSON, no extra text
- Do NOT add new fields or arrays
- Do NOT change array lengths
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance dialogue technique values only. Style: {optimal_style}. Keep exact structure. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_main_discussion_values(self, topic, classification_result, personas_result, current_main_discussion):
        """
        Chunk 4: Enhance values in existing main_discussion points (no new fields)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        prompt = f"""
You are an expert in enhancing Arabic podcast discussion content.

Task: Enhance ONLY the VALUES in the existing main_discussion structure. Keep the exact same fields and array lengths.

Topic: {topic}
Style: {optimal_style}
Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Current main discussion: {json.dumps(current_main_discussion, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:
- Keep the EXACT same JSON structure and field names for each discussion point
- Keep the EXACT same number of items in each array
- ONLY enhance the quality and specificity of existing values
- Make spontaneous_triggers more natural and topic-specific
- Make cultural_references more directly relevant to the topic
- Make natural_transitions smoother and more contextual
- Ensure content reflects {host_name} and {guest_name}'s specific expertise

Example enhancement:
- Before: "هذا يثير تساؤلاً مهماً"
- After: "هذا يثير تساؤلاً مهماً حول قدرة الذكاء الاصطناعي على فهم السياق الثقافي وراء الكلمات العربية"

Return the enhanced main discussion with better, more specific values:

[
    {{
        [exact same structure as each input point, but with enhanced values]
    }},
    ...
]

CRITICAL REQUIREMENTS:
- All enhanced values in Modern Standard Arabic (MSA)
- Make content specific to topic: {topic} and these personas
- Improve naturalness and conversational flow
- Use English punctuation only (no ،)
- Return only valid JSON array, no extra text
- Do NOT add new fields to discussion points
- Do NOT change array lengths within points
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance discussion point values only. Style: {optimal_style}. Keep exact structure. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def enhance_intro_outro_values(self, topic, classification_result, personas_result, current_intro1, current_intro2, current_closing):
        """
        Chunk 5: Enhance values in existing intro and outro sections (no new fields)
        """
        try:
            classification = json.loads(classification_result)
            personas = json.loads(personas_result)
        except:
            raise ValueError("Invalid JSON provided")
        
        optimal_style = classification.get("optimal_style", "")
        host = personas.get("host", {})
        guest = personas.get("guest", {})
        host_name = host.get('name', 'المقدم')
        guest_name = guest.get('name', 'الضيف')

        prompt = f"""
You are an expert in enhancing Arabic podcast intros and outros.

Task: Enhance ONLY the VALUES in the existing intro1, intro2, and closing structures. Keep the exact same fields.

Topic: {topic}
Style: {optimal_style}
Host: {host_name} - {host.get('background', '')}
Guest: {guest_name} - {guest.get('background', '')}

Current intro1: {json.dumps(current_intro1, ensure_ascii=False)}
Current intro2: {json.dumps(current_intro2, ensure_ascii=False)}
Current closing: {json.dumps(current_closing, ensure_ascii=False)}

ENHANCEMENT REQUIREMENTS:
- Keep the EXACT same JSON structure and field names
- Keep the EXACT same number of items in each array (if any)
- ONLY enhance the quality and specificity of existing values
- Make opening_line more engaging and natural
- Make episode_hook more compelling and topic-specific
- Make guest_welcome more personal and authentic
- Make closing thoughts more memorable and impactful
- Ensure content reflects the personalities of {host_name} and {guest_name}

Example enhancement:
- Before: "أهلاً بكم مستمعينا الكرام"
- After: "أهلاً بكم مستمعينا الكرام في رحلة استكشافية مثيرة لنتعرف على كيفية جعل الذكاء الاصطناعي حارساً لتراثنا العربي"

Return the enhanced sections:

{{
    "intro1": {{
        [exact same structure as input, but with enhanced values]
    }},
    "intro2": {{
        [exact same structure as input, but with enhanced values]
    }},
    "closing": {{
        [exact same structure as input, but with enhanced values]
    }}
}}

CRITICAL REQUIREMENTS:
- All enhanced values in Modern Standard Arabic (MSA)
- Make content specific to topic: {topic} and these personas
- Improve engagement and naturalness
- Use English punctuation only (no ،)
- Return only valid JSON, no extra text
- Do NOT add new fields
- Do NOT change array lengths
"""

        response = self.deployment.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"You enhance intro/outro values only. Style: {optimal_style}. Keep exact structure. Return only valid JSON with English punctuation."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7
        )

        return self._clean_json_response(response.choices[0].message.content)

    def apply_minimal_polish(self, topic, information, classification_result, personas_result, enhanced_content_result):
        """
        Main orchestration method: Coordinates all value enhancement chunks
        """
        print("🎨 Starting minimal polish (value enhancement only)...")
        print("=" * 50)
        
        try:
            enhanced_content = json.loads(enhanced_content_result)
        except:
            raise ValueError("Invalid enhanced content JSON provided")
        
        # Chunk 1: Enhance spontaneous moments values
        print("✨ Chunk 1: Enhancing spontaneous moments values...")
        try:
            current_spontaneous = enhanced_content.get("spontaneous_moments", {})
            if current_spontaneous:  # Only enhance if exists
                enhanced_spontaneous_json = self.enhance_spontaneous_moments_values(
                    topic, classification_result, personas_result, current_spontaneous
                )
                enhanced_spontaneous = json.loads(enhanced_spontaneous_json)
                enhanced_content["spontaneous_moments"] = enhanced_spontaneous
                print("✅ Spontaneous moments values enhanced successfully")
            else:
                print("⏭️ No spontaneous moments to enhance")
        except Exception as e:
            print(f"⚠️ Error enhancing spontaneous moments: {e}")
        
        time.sleep(1)
        
        # Chunk 2: Enhance cultural context values
        print("✨ Chunk 2: Enhancing cultural context values...")
        try:
            current_cultural = enhanced_content.get("cultural_context", {})
            if current_cultural:  # Only enhance if exists
                enhanced_cultural_json = self.enhance_cultural_context_values(
                    topic, classification_result, current_cultural
                )
                enhanced_cultural = json.loads(enhanced_cultural_json)
                enhanced_content["cultural_context"] = enhanced_cultural
                print("✅ Cultural context values enhanced successfully")
            else:
                print("⏭️ No cultural context to enhance")
        except Exception as e:
            print(f"⚠️ Error enhancing cultural context: {e}")
        
        time.sleep(1)
        
        # Chunk 3: Enhance dialogue techniques values
        print("✨ Chunk 3: Enhancing dialogue techniques values...")
        try:
            current_dialogue = enhanced_content.get("dialogue_techniques", {})
            if current_dialogue:  # Only enhance if exists
                enhanced_dialogue_json = self.enhance_dialogue_techniques_values(
                    topic, classification_result, personas_result, current_dialogue
                )
                enhanced_dialogue = json.loads(enhanced_dialogue_json)
                enhanced_content["dialogue_techniques"] = enhanced_dialogue
                print("✅ Dialogue techniques values enhanced successfully")
            else:
                print("⏭️ No dialogue techniques to enhance")
        except Exception as e:
            print(f"⚠️ Error enhancing dialogue techniques: {e}")
        
        time.sleep(1)
        
        # Chunk 4: Enhance main discussion values
        print("✨ Chunk 4: Enhancing main discussion values...")
        try:
            conv_flow = enhanced_content.get("conversation_flow", {})
            current_main_discussion = conv_flow.get("main_discussion", [])
            if current_main_discussion:  # Only enhance if exists
                enhanced_discussion_json = self.enhance_main_discussion_values(
                    topic, classification_result, personas_result, current_main_discussion
                )
                enhanced_discussion = json.loads(enhanced_discussion_json)
                enhanced_content["conversation_flow"]["main_discussion"] = enhanced_discussion
                print("✅ Main discussion values enhanced successfully")
            else:
                print("⏭️ No main discussion to enhance")
        except Exception as e:
            print(f"⚠️ Error enhancing main discussion: {e}")
        
        time.sleep(1)
        
        # Chunk 5: Enhance intro and outro values
        print("✨ Chunk 5: Enhancing intro and outro values...")
        try:
            conv_flow = enhanced_content.get("conversation_flow", {})
            current_intro1 = conv_flow.get("intro1", {})
            current_intro2 = conv_flow.get("intro2", {})
            current_closing = conv_flow.get("closing", {})
            
            if current_intro1 or current_intro2 or current_closing:  # Only enhance if any exist
                enhanced_sections_json = self.enhance_intro_outro_values(
                    topic, classification_result, personas_result, 
                    current_intro1, current_intro2, current_closing
                )
                enhanced_sections = json.loads(enhanced_sections_json)
                
                if "intro1" in enhanced_sections and current_intro1:
                    enhanced_content["conversation_flow"]["intro1"] = enhanced_sections["intro1"]
                if "intro2" in enhanced_sections and current_intro2:
                    enhanced_content["conversation_flow"]["intro2"] = enhanced_sections["intro2"]
                if "closing" in enhanced_sections and current_closing:
                    enhanced_content["conversation_flow"]["closing"] = enhanced_sections["closing"]
                
                print("✅ Intro and outro values enhanced successfully")
            else:
                print("⏭️ No intro/outro sections to enhance")
        except Exception as e:
            print(f"⚠️ Error enhancing intro/outro: {e}")
        
        print("=" * 50)
        print("🎉 Minimal polish completed! Same structure, enhanced values.")
        
        return json.dumps(enhanced_content, ensure_ascii=False, indent=2)

    def _clean_json_response(self, response):
        """Enhanced JSON cleaning method"""
        response = response.strip()
        
        # Remove any text before first { and after last }
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            clean_json = response[start_idx:end_idx+1]
        else:
            clean_json = response
        
        # Handle arrays
        if clean_json.strip().startswith('['):
            start_idx = response.find('[')
            end_idx = response.rfind(']')
            if start_idx != -1 and end_idx != -1:
                clean_json = response[start_idx:end_idx+1]
        
        # Replace Arabic punctuation with English equivalents
        clean_json = clean_json.replace('،', ',')
        clean_json = clean_json.replace('"', '"')
        clean_json = clean_json.replace('"', '"')
        clean_json = clean_json.replace(''', "'")
        clean_json = clean_json.replace(''', "'")
        
        # Fix common JSON issues
        import re
        clean_json = re.sub(r',(\s*[}\]])', r'\1', clean_json)
        
        return clean_json

    def validate_polished_outline(self, original_json, polished_json):
        """
        Validate that polished outline has same structure but enhanced values
        """
        try:
            original = json.loads(original_json)
            polished = json.loads(polished_json)
            
            issues = []
            
            # Check that main structure is preserved
            original_keys = set(original.keys())
            polished_keys = set(polished.keys())
            
            if original_keys != polished_keys:
                issues.append(f"Main structure changed: {original_keys} vs {polished_keys}")
            
            # Check conversation flow structure
            orig_conv = original.get("conversation_flow", {})
            pol_conv = polished.get("conversation_flow", {})
            
            if set(orig_conv.keys()) != set(pol_conv.keys()):
                issues.append("Conversation flow structure changed")
            
            # Check main discussion array length
            orig_main = orig_conv.get("main_discussion", [])
            pol_main = pol_conv.get("main_discussion", [])
            
            if len(orig_main) != len(pol_main):
                issues.append(f"Main discussion length changed: {len(orig_main)} vs {len(pol_main)}")
            
            # Check that arrays within sections maintain length
            sections_to_check = ["spontaneous_moments", "dialogue_techniques", "cultural_context"]
            
            for section in sections_to_check:
                if section in original and section in polished:
                    orig_section = original[section]
                    pol_section = polished[section]
                    
                    if isinstance(orig_section, dict) and isinstance(pol_section, dict):
                        for key in orig_section:
                            if isinstance(orig_section[key], list) and isinstance(pol_section.get(key), list):
                                if len(orig_section[key]) != len(pol_section[key]):
                                    issues.append(f"{section}.{key} array length changed")
            
            # Check for quality improvement (simple heuristic)
            orig_text = json.dumps(original, ensure_ascii=False)
            pol_text = json.dumps(polished, ensure_ascii=False)
            
            if len(pol_text) < len(orig_text) * 0.95:  # Significant reduction might indicate loss of content
                issues.append("Polished content appears significantly shorter")
            
            if issues:
                return False, f"Structure validation issues: {issues}"
            
            return True, "Minimal polish validation successful - same structure, enhanced values"
            
        except json.JSONDecodeError as e:
            return False, f"JSON parsing error: {e}"

# Enhanced Testing Function
def test_minimal_polish_enhancer(deployment, topic, information, classification_result, personas_result, enhanced_content_result, model_name="Fanar-C-1-8.7B"):
    """
    Test the minimal polish enhancer
    """
    print("🧪 Testing Minimal Polish Enhancer (Value Enhancement Only)...")
    print("=" * 60)
    
    polisher = MinimalPolishEnhancer(deployment, model_name)
    
    # Store original for comparison
    original_json = enhanced_content_result
    
    # Run minimal polish
    polished_result = polisher.apply_minimal_polish(
        topic, information, classification_result, personas_result, enhanced_content_result
    )
    
    # Validate polished content
    is_valid, validation_message = polisher.validate_polished_outline(original_json, polished_result)
    
    print(f"\n📊 Polish Results:")
    print(f"Validation: {'✅ Valid' if is_valid else '❌ Invalid'}")
    print(f"Message: {validation_message}")
    
    # Quick comparison analysis
    try:
        original_data = json.loads(original_json)
        polished_data = json.loads(polished_result)
        
        # Compare sample values
        print(f"\n🔍 Sample Value Comparisons:")
        
        # Spontaneous moments comparison
        orig_spont = original_data.get("spontaneous_moments", {}).get("natural_interruptions", [])
        pol_spont = polished_data.get("spontaneous_moments", {}).get("natural_interruptions", [])
        
        if orig_spont and pol_spont:
            print(f"Natural Interruptions:")
            print(f"  Original: {orig_spont[0][:50]}...")
            print(f"  Polished: {pol_spont[0][:50]}...")
        
        # Cultural context comparison
        orig_cultural = original_data.get("cultural_context", {}).get("proverbs_sayings", [])
        pol_cultural = polished_data.get("cultural_context", {}).get("proverbs_sayings", [])
        
        if orig_cultural and pol_cultural:
            print(f"Proverbs:")
            print(f"  Original: {orig_cultural[0][:50]}...")
            print(f"  Polished: {pol_cultural[0][:50]}...")
        
        # Size comparison
        orig_size = len(json.dumps(original_data, ensure_ascii=False))
        pol_size = len(json.dumps(polished_data, ensure_ascii=False))
        size_change = ((pol_size - orig_size) / orig_size) * 100
        
        print(f"\n📈 Size Analysis:")
        print(f"Original Size: {orig_size:,} characters")
        print(f"Polished Size: {pol_size:,} characters")
        print(f"Size Change: {size_change:+.1f}%")
        print(f"Approach: {'✅ Value enhancement only' if abs(size_change) < 15 else '⚠️ Significant size change'}")
        
    except Exception as e:
        print(f"⚠️ Error analyzing polished content: {e}")
    
    return polished_result

# Usage:
# polisher = MinimalPolishEnhancer(deployment, "Fanar-C-1-8.7B")
# final_polished_outline = polisher.apply_minimal_polish(topic, information, classification_result, personas_result, enhanced_content_result)

# Test the polisher
# polished_result = test_minimal_polish_enhancer(
#     deployment, topic, information, classification_result, personas_result, enhanced_content_result
# )

In [16]:
# Usage:
polisher = MinimalPolishEnhancer(deployment, model)
final_polished_outline = polisher.apply_minimal_polish(topic, information, classification_result, personas_result, result_standard)
print("Final Polished Outline:")
print(final_polished_outline)
# Test the polisher
# polished_result = test_minimal_polish_enhancer(
#     deployment, topic, information, classification_result, personas_result, result_standard)

🎨 Starting minimal polish (value enhancement only)...
✨ Chunk 1: Enhancing spontaneous moments values...
✅ Spontaneous moments values enhanced successfully
✨ Chunk 2: Enhancing cultural context values...
✅ Cultural context values enhanced successfully
✨ Chunk 3: Enhancing dialogue techniques values...
⚠️ Error enhancing dialogue techniques: Expecting ',' delimiter: line 12 column 5 (char 772)
✨ Chunk 4: Enhancing main discussion values...
⚠️ Error enhancing main discussion: Expecting ',' delimiter: line 12 column 140 (char 877)
✨ Chunk 5: Enhancing intro and outro values...
✅ Intro and outro values enhanced successfully
🎉 Minimal polish completed! Same structure, enhanced values.
Final Polished Outline:
{
  "episode_topic": "نقاش حول الذكاء الاصطناعي والهوية العربية: كيف نحافظ على ثقافتنا في العصر الرقمي",
  "personas": {
    "host": {
      "name": "أحمد بن علي",
      "background": "مقدم برامج إذاعية معروف بشغفه بالتكنولوجيا والقضايا الاجتماعية",
      "speaking_style": "متفاعل مع ال

In [25]:
import json
import time
import re

class ImprovedMicroChunkScriptGenerator:
    def __init__(self, deployment, model="gpt-4o"):
        self.model = model
        self.deployment = deployment
        
        # Simple conversation templates for fallbacks
        self.fallback_templates = {
            "intro1": "{host_name}: مرحباً بكم مستمعينا الكرام في حلقة جديدة. اليوم سنتحدث عن {topic}. موضوع مهم ومثير للاهتمام.",
            "intro2": "{host_name}: معي اليوم ضيف متميز، {guest_name}. أهلاً وسهلاً بك.\n{guest_name}: أهلاً بك، شكراً على الاستضافة. سعيد بوجودي معكم.",
            "discussion": "{host_name}: {point_title}، ما رأيك في هذا الموضوع؟\n{guest_name}: موضوع مهم فعلاً. أعتقد أن هناك عدة جوانب يجب أن نفكر فيها.",
            "closing": "{host_name}: شكراً {guest_name} على هذا النقاش المفيد.\n{guest_name}: شكراً لك على الاستضافة.\n{host_name}: وشكراً لكم مستمعينا الكرام. نلقاكم في حلقة قادمة."
        }

    def generate_intro1_only(self, topic, intro1_data, host_persona):
        """
        Micro-Chunk 1: Generate only intro1 (host speaking alone)
        Enhanced with natural, conversational tone and spontaneity
        """
        host_name = host_persona.get('name', 'المقدم')
        host_bg = host_persona.get('background', '')
        host_style = host_persona.get('speaking_style', '')
        
        # Extract essential data
        opening_line = intro1_data.get('opening_line', '')
        episode_hook = intro1_data.get('episode_hook', '')
        
        # Get one cultural element if available
        cultural_elements = intro1_data.get('cultural_connections', [])
        cultural_touch = cultural_elements[0] if cultural_elements else ""

        prompt = f"""
Generate Arabic podcast opening. Host speaks alone.

Host: {host_name}
Topic: {topic}
Opening line: {opening_line}

Requirements:
- 2-3 short sentences maximum
- Natural Arabic conversation
- No emojis or symbols
- No meta-text or explanations
- Include topic naturally

Example:
{host_name}: مرحباً مستمعينا. موضوع اليوم هو {topic}. شيء مهم نحتاج نتكلم عنه.

Generate only the dialogue:
"""

        try:
            response = self.deployment.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are creating natural, spontaneous Arabic dialogue. Avoid formal speech patterns. Make it feel like real conversation with natural hesitations and genuine curiosity."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8
            )
            
            result = response.choices[0].message.content
            
            # Quality check
            if self._assess_quality(result) >= 75:
                return result
            else:
                return self._get_fallback_intro1(topic, host_name, opening_line)
                
        except Exception as e:
            print(f"Error generating intro1: {e}")
            return self._get_fallback_intro1(topic, host_name, opening_line)

    def generate_intro2_only(self, topic, intro2_data, host_persona, guest_persona, intro1_context):
        """
        Micro-Chunk 2: Generate guest introduction with natural, dynamic exchange
        """
        host_name = host_persona.get('name', 'المقدم')
        guest_name = guest_persona.get('name', 'الضيف')
        guest_bg = guest_persona.get('background', '')
        guest_style = guest_persona.get('speaking_style', '')
        
        # Get key topic from context
        context = intro1_context[-80:] if len(intro1_context) > 80 else intro1_context
        
        # Extract data
        guest_welcome = intro2_data.get('guest_welcome', '')
        guest_bio = intro2_data.get('guest_bio_highlight', '')

        prompt = f"""
Generate Arabic podcast dialogue. Host introduces guest.

Host: {host_name}
Guest: {guest_name}
Topic: {topic}

Requirements:
- 4-6 short exchanges
- Each person speaks 1-2 sentences maximum
- Natural conversation flow
- No emojis or symbols
- No meta-text or explanations

Example:
{host_name}: معي اليوم {guest_name}. أهلاً بك.
{guest_name}: أهلاً {host_name}. شكراً على الدعوة.
{host_name}: نتكلم اليوم عن {topic}. إيش رأيك؟
{guest_name}: موضوع مهم فعلاً.

Generate only the dialogue:
"""

        try:
            response = self.deployment.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Generate simple Arabic dialogue between two people. Short exchanges only. No emojis. No explanations."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.85
            )
            
            result = response.choices[0].message.content
            
            if self._assess_quality(result) >= 75:
                return result
            else:
                return self._get_fallback_intro2(host_name, guest_name, guest_welcome)
                
        except Exception as e:
            print(f"Error generating intro2: {e}")
            return self._get_fallback_intro2(host_name, guest_name, guest_welcome)

    def generate_discussion_point(self, topic, point_data, host_persona, guest_persona, previous_context=""):
        """
        Micro-Chunk 3-5: Generate discussion with natural disagreements and interruptions
        """
        host_name = host_persona.get('name', 'المقدم')
        guest_name = guest_persona.get('name', 'الضيف')
        host_style = host_persona.get('speaking_style', '')
        guest_style = guest_persona.get('speaking_style', '')
        
        # Extract content
        point_title = point_data.get('point_title', '')
        personal_angle = point_data.get('personal_angle', '')
        
        # Use only first elements to avoid overwhelming
        spontaneous_triggers = point_data.get('spontaneous_triggers', [])
        cultural_refs = point_data.get('cultural_references', [])
        
        trigger = spontaneous_triggers[0] if spontaneous_triggers else ""
        cultural_ref = cultural_refs[0] if cultural_refs else ""
        
        # Get context
        context = previous_context[-80:] if len(previous_context) > 80 else previous_context

        prompt = f"""
Generate Arabic podcast discussion between host and guest.

Host: {host_name}
Guest: {guest_name}
Discussion Topic: {point_title}

Requirements:
- 6-8 short exchanges
- Each person speaks 1-2 sentences only
- Include some disagreement or different views
- Natural conversation flow
- No emojis or symbols
- No meta-text or explanations

Example:
{host_name}: بالنسبة لـ{point_title}، إيش رأيك؟
{guest_name}: موضوع معقد. أعتقد إن...
{host_name}: ولكن ما تفكر إن...
{guest_name}: لا، مش بالضرورة.

Generate only the dialogue:
"""

        try:
            response = self.deployment.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Generate simple Arabic discussion. Short sentences. Include some disagreement. No emojis. No explanations."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.9
            )
            
            result = response.choices[0].message.content
            
            if self._assess_quality(result) >= 75:
                return result
            else:
                return self._get_fallback_discussion(point_title, host_name, guest_name, personal_angle)
                
        except Exception as e:
            print(f"Error generating discussion point: {e}")
            return self._get_fallback_discussion(point_title, host_name, guest_name, personal_angle)

    def generate_closing_only(self, topic, closing_data, host_persona, guest_persona, discussion_summary):
        """
        Micro-Chunk 6: Generate natural closing with honest reflections
        """
        host_name = host_persona.get('name', 'المقدم')
        guest_name = guest_persona.get('name', 'الضيف')
        host_style = host_persona.get('speaking_style', '')
        guest_style = guest_persona.get('speaking_style', '')
        
        # Extract closing elements
        conclusion = closing_data.get('conclusion', {})
        outro = closing_data.get('outro', {})
        
        main_takeaways = conclusion.get('main_takeaways', '')
        emotional_closure = conclusion.get('emotional_closure', '')
        memorable_ending = outro.get('memorable_ending', '')
        
        # Get discussion summary
        summary = discussion_summary[-100:] if len(discussion_summary) > 100 else discussion_summary

        prompt = f"""
Generate Arabic podcast closing dialogue.

Host: {host_name}
Guest: {guest_name}
Topic: {topic}

Requirements:
- 4-5 short exchanges
- Each person speaks 1-2 sentences maximum
- Thank each other simply
- End naturally
- No emojis or symbols
- No meta-text or explanations

Example:
{host_name}: كان نقاش مفيد يا {guest_name}.
{guest_name}: شكراً لك على الاستضافة.
{host_name}: شكراً لكم مستمعينا.
{guest_name}: نلقاكم قريباً.

Generate only the dialogue:
"""

        try:
            response = self.deployment.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "Generate simple Arabic closing dialogue. Keep it short and natural. No emojis. No explanations."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.75
            )
            
            result = response.choices[0].message.content
            
            if self._assess_quality(result) >= 75:
                return result
            else:
                return self._get_fallback_closing(host_name, guest_name, main_takeaways)
                
        except Exception as e:
            print(f"Error generating closing: {e}")
            return self._get_fallback_closing(host_name, guest_name, main_takeaways)

    def generate_complete_script(self, topic, final_outline_result):
        """
        Main orchestration: Generate complete script using enhanced micro-chunks
        """
        print("🎙️ Starting enhanced spontaneous script generation...")
        print("=" * 60)
        
        try:
            outline = json.loads(final_outline_result)
        except:
            raise ValueError("Invalid outline JSON format")
        
        # Extract data
        personas = outline.get("personas", {})
        conv_flow = outline.get("conversation_flow", {})
        
        host_persona = personas.get("host", {})
        guest_persona = personas.get("guest", {})
        
        intro1_data = conv_flow.get("intro1", {})
        intro2_data = conv_flow.get("intro2", {})
        main_discussion = conv_flow.get("main_discussion", [])
        closing_data = conv_flow.get("closing", {})
        
        print(f"📋 Host: {host_persona.get('name', 'Unknown')}")
        print(f"📋 Guest: {guest_persona.get('name', 'Unknown')}")
        print(f"📋 Discussion Points: {len(main_discussion)}")
        
        # Micro-Chunk 1: Natural Intro1
        print("\n📝 Chunk 1: Natural host introduction...")
        intro1_dialogue = self.generate_intro1_only(topic, intro1_data, host_persona)
        print("✅ Host introduction completed")
        
        time.sleep(0.5)
        
        # Micro-Chunk 2: Dynamic Intro2
        print("\n📝 Chunk 2: Dynamic guest introduction...")
        intro2_dialogue = self.generate_intro2_only(topic, intro2_data, host_persona, guest_persona, intro1_dialogue)
        print("✅ Guest introduction completed")
        
        time.sleep(0.5)
        
        # Micro-Chunks 3-5: Spontaneous Discussion Points
        discussion_parts = []
        previous_context = intro2_dialogue
        
        for i, point_data in enumerate(main_discussion):
            print(f"\n📝 Chunk {i+3}: Spontaneous discussion point {i+1}...")
            point_dialogue = self.generate_discussion_point(
                topic, point_data, host_persona, guest_persona, previous_context
            )
            discussion_parts.append(point_dialogue)
            previous_context = point_dialogue
            print(f"✅ Discussion point {i+1} completed")
            time.sleep(0.5)
        
        # Micro-Chunk 6: Honest Closing
        print(f"\n📝 Chunk {len(main_discussion)+3}: Honest closing...")
        discussion_summary = " ".join(discussion_parts[-2:])
        closing_dialogue = self.generate_closing_only(topic, closing_data, host_persona, guest_persona, discussion_summary)
        print("✅ Closing completed")
        
        # Combine all parts
        complete_intro = intro1_dialogue + "\n\n" + intro2_dialogue
        complete_discussion = "\n\n".join(discussion_parts)
        
        complete_script = f"""=== مقدمة البودكاست ===
{complete_intro}

=== النقاش الرئيسي ===
{complete_discussion}

=== ختام البودكاست ===
{closing_dialogue}"""
        
        print("\n" + "=" * 60)
        print("🎉 Enhanced spontaneous script generation completed!")
        
        # Enhanced quality assessment
        total_quality = self._assess_script_quality(complete_script, outline)
        
        return {
            "intro": complete_intro,
            "main_discussion": complete_discussion,
            "closing": closing_dialogue,
            "complete_script": complete_script,
            "script_length": len(complete_script),
            "estimated_duration": f"{len(main_discussion) * 2 + 4}-{len(main_discussion) * 3 + 6} minutes",
            "quality_score": total_quality,
            "generation_method": "enhanced-spontaneous-micro-chunks",
            "chunks_generated": len(main_discussion) + 3,
            "personas_used": {
                "host": host_persona.get('name', 'Unknown'),
                "guest": guest_persona.get('name', 'Unknown')
            },
            "cultural_elements_integrated": self._count_cultural_elements(complete_script),
            "spontaneity_level": "high",
            "natural_interruptions": self._count_interruptions(complete_script),
            "disagreement_instances": self._count_disagreements(complete_script),
            "enhancement_level": "spontaneous"
        }

    def _assess_quality(self, text):
        """Enhanced quality assessment including spontaneity markers"""
        if not text or len(text) < 50:
            return 0
            
        # Check for meta-text (penalty)
        meta_indicators = ['ملاحظة:', 'تنتهي', 'Note:', 'Format:', 'Generate', 'Requirements']
        has_meta = any(indicator in text for indicator in meta_indicators)
        
        # Check for natural conversation markers (bonus)
        natural_markers = ['يعني', 'بصراحة', 'أممم', 'لحظة', 'فعلاً؟', 'اعذرني', 'ولكن', 'نعم، ولكن']
        natural_score = sum(3 for marker in natural_markers if marker in text)
        
        # Check for excessive politeness (penalty)
        excessive_politeness = ['شكراً جزيلاً', 'أشكرك بالغ الشكر', 'ممتن للغاية', 'شرف عظيم']
        politeness_penalty = sum(5 for phrase in excessive_politeness if phrase in text)
        
        # Check spacing quality
        spacing_score = 80 if not re.search(r'[^\s]{30,}', text) else 40
        
        # Check Arabic content ratio
        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        total_chars = len(text)
        arabic_ratio = arabic_chars / total_chars if total_chars > 0 else 0
        
        # Check dialogue structure (more turns = better)
        dialogue_turns = text.count(':')
        structure_score = min(25, dialogue_turns * 3)  # Reward more turns
        
        # Calculate quality
        quality = spacing_score + (arabic_ratio * 30) + structure_score + natural_score
        
        # Apply penalties
        if has_meta:
            quality -= 40
        if arabic_ratio < 0.5:
            quality -= 20
        quality -= politeness_penalty
            
        return min(100, max(0, int(quality)))

    def _count_interruptions(self, script):
        """Count natural interruption markers"""
        interruption_markers = ['اعذرني', 'لحظة', 'نعم، ولكن', 'لا، لا', 'فعلاً؟']
        return sum(script.count(marker) for marker in interruption_markers)

    def _count_disagreements(self, script):
        """Count disagreement/challenge markers"""
        disagreement_markers = ['لا أتفق', 'ولكن', 'هذا صحيح، لكن', 'كيف تفسر', 'ألا تعتقد']
        return sum(script.count(marker) for marker in disagreement_markers)

    def _assess_script_quality(self, script, outline):
        """Enhanced script quality assessment with spontaneity metrics"""
        individual_quality = self._assess_quality(script)
        
        # Check persona usage
        personas = outline.get("personas", {})
        host_name = personas.get("host", {}).get("name", "")
        guest_name = personas.get("guest", {}).get("name", "")
        
        persona_score = 0
        if host_name and host_name in script:
            persona_score += 10
        if guest_name and guest_name in script:
            persona_score += 10
        
        # Check structure completeness
        required_sections = ["=== مقدمة البودكاست ===", "=== النقاش الرئيسي ===", "=== ختام البودكاست ==="]
        structure_score = sum(10 for section in required_sections if section in script)
        
        # Check dialogue balance (reward more turns)
        total_turns = script.count(':')
        balance_score = min(25, total_turns * 2)
        
        # Check spontaneity elements
        spontaneity_score = min(15, self._count_interruptions(script) * 2 + self._count_disagreements(script) * 3)
        
        total_score = min(100, individual_quality + persona_score + structure_score + balance_score + spontaneity_score)
        return total_score

    def _count_cultural_elements(self, script):
        """Count cultural elements in the script"""
        cultural_indicators = [
            'مثل', 'حكمة', 'تراث', 'ثقافة', 'عربي', 'إسلامي', 'تاريخ',
            'شوقي', 'ابن', 'قال', 'حديث', 'قرآن', 'شعر'
        ]
        return sum(1 for indicator in cultural_indicators if indicator in script)

    def _get_fallback_intro1(self, topic, host_name, opening_line=""):
        """Enhanced fallback with more natural tone"""
        base_opening = opening_line if opening_line else f"مرحباً مستمعينا"
        return f"{host_name}: {base_opening}... بصراحة، موضوع {topic} يشغل بالي من فترة. يعني، كلنا نواجه هذا الأمر بشكل أو بآخر، صح؟"

    def _get_fallback_intro2(self, host_name, guest_name, guest_welcome=""):
        """Enhanced fallback with quick exchanges"""
        return f"""{host_name}: معي اليوم {guest_name}. أهلاً بك.

{guest_name}: أهلاً {host_name}. بصراحة، الموضوع ده محتاج نقاش جدي.

{host_name}: فعلاً؟ يعني انت شايف إن...

{guest_name}: اعذرني، خليني أوضح وجهة نظري الأول."""

    def _get_fallback_discussion(self, point_title, host_name, guest_name, personal_angle=""):
        """Enhanced fallback with disagreement"""
        return f"""{host_name}: بالنسبة لموضوع {point_title}، إيش رأيك؟

{guest_name}: موضوع مهم، بس...

{host_name}: لحظة، "بس" إيش؟

{guest_name}: يعني، الناس تفهم الموضوع غلط أحياناً.

{host_name}: ولكن ألا تعتقد أن...

{guest_name}: لا، اعذرني، هذا مو صحيح تماماً."""

    def _get_fallback_closing(self, host_name, guest_name, main_takeaways=""):
        """Enhanced fallback with honest reflection"""
        return f"""{host_name}: بصراحة، النقاش كان مثير للجدل شوية.

{guest_name}: فعلاً، بس هذا شيء كويس.

{host_name}: إيش رأيكم انتوا، مستمعينا؟

{guest_name}: والله موضوع يستاهل نقاش أكثر.

{host_name}: نلقاكم قريب بإذن الله."""

    def validate_script_quality(self, script_result):
        """Enhanced validation including spontaneity metrics"""
        complete_script = script_result.get("complete_script", "")
        quality_score = script_result.get("quality_score", 0)
        
        validation = {
            "has_structure": all(section in complete_script for section in ["=== مقدمة البودكاست ===", "=== النقاش الرئيسي ===", "=== ختام البودكاست ==="]),
            "arabic_content": bool(re.search(r'[\u0600-\u06FF]', complete_script)),
            "no_meta_text": not any(indicator in complete_script for indicator in ['ملاحظة:', 'تنتهي', 'Note:', 'Format:', 'Generate']),
            "proper_spacing": not bool(re.search(r'[^\s]{40,}', complete_script)),
            "dialogue_balance": complete_script.count(':') >= 12,  # Higher threshold for more turns
            "persona_presence": script_result.get("personas_used", {}).get("host", "") in complete_script,
            "natural_interruptions": script_result.get("natural_interruptions", 0) >= 2,
            "disagreement_instances": script_result.get("disagreement_instances", 0) >= 1,
            "quality_score": quality_score,
            "quality_grade": "ممتاز" if quality_score >= 90 else "جيد جداً" if quality_score >= 85 else "جيد" if quality_score >= 80 else "مقبول" if quality_score >= 70 else "ضعيف"
        }
        
        validation["overall_valid"] = all([
            validation["has_structure"],
            validation["arabic_content"],
            validation["no_meta_text"],
            validation["proper_spacing"],
            validation["dialogue_balance"],
            validation["persona_presence"],
            quality_score >= 75
        ])
        
        return validation

# Enhanced Testing Function
def test_improved_script_generator(deployment, topic, final_outline_result, model_name="Fanar-C-1-8.7B"):
    """
    Test the improved spontaneous micro-chunk script generator
    """
    print("🧪 Testing Improved Spontaneous Script Generator...")
    print("=" * 60)
    
    generator = ImprovedMicroChunkScriptGenerator(deployment, model_name)
    
    # Generate script
    script_result = generator.generate_complete_script(topic, final_outline_result)
    
    # Validate script
    validation = generator.validate_script_quality(script_result)
    
    print(f"\n📊 Script Generation Results:")
    print(f"Quality Score: {script_result['quality_score']}/100")
    print(f"Quality Grade: {validation['quality_grade']}")
    print(f"Script Length: {script_result['script_length']:,} characters")
    print(f"Estimated Duration: {script_result['estimated_duration']}")
    print(f"Chunks Generated: {script_result['chunks_generated']}")
    print(f"Spontaneity Level: {script_result['spontaneity_level']}")
    print(f"Natural Interruptions: {script_result['natural_interruptions']}")
    print(f"Disagreement Instances: {script_result['disagreement_instances']}")
    
    print(f"\n📈 Validation Results:")
    print(f"Overall Valid: {'✅' if validation['overall_valid'] else '❌'}")
    print(f"Structure: {'✅' if validation['has_structure'] else '❌'}")
    print(f"Arabic Content: {'✅' if validation['arabic_content'] else '❌'}")
    print(f"No Meta Text: {'✅' if validation['no_meta_text'] else '❌'}")
    print(f"Proper Spacing: {'✅' if validation['proper_spacing'] else '❌'}")
    print(f"Dialogue Balance: {'✅' if validation['dialogue_balance'] else '❌'}")
    print(f"Natural Interruptions: {'✅' if validation['natural_interruptions'] else '❌'}")
    print(f"Disagreement Instances: {'✅' if validation['disagreement_instances'] else '❌'}")
    
    # Show sample dialogue
    print(f"\n🎙️ Sample Script Preview:")
    script_lines = script_result['complete_script'].split('\n')
    preview_lines = script_lines[:20]  # More lines to show spontaneity
    for line in preview_lines:
        if line.strip():
            print(f"  {line[:120]}...")
    
    if len(script_lines) > 20:
        print(f"  ... [+{len(script_lines)-20} more lines]")

In [26]:
# Generate enhanced script
generator = ImprovedMicroChunkScriptGenerator(deployment, model)
script_result = generator.generate_complete_script(topic, final_polished_outline)

# Test and validate
test_result = test_improved_script_generator(deployment, topic, final_polished_outline)

# Access enhanced results
print(f"Quality: {script_result['quality_score']}/100")
print(f"Duration: {script_result['estimated_duration']}")
print(f"Cultural Elements: {script_result['cultural_elements_integrated']}")
print(f"Personas: {script_result['personas_used']}")

🎙️ Starting enhanced spontaneous script generation...
📋 Host: أحمد بن علي
📋 Guest: د. فاتن راشد
📋 Discussion Points: 3

📝 Chunk 1: Natural host introduction...
✅ Host introduction completed

📝 Chunk 2: Dynamic guest introduction...
✅ Guest introduction completed

📝 Chunk 3: Spontaneous discussion point 1...
✅ Discussion point 1 completed

📝 Chunk 4: Spontaneous discussion point 2...
✅ Discussion point 2 completed

📝 Chunk 5: Spontaneous discussion point 3...
✅ Discussion point 3 completed

📝 Chunk 6: Honest closing...
✅ Closing completed

🎉 Enhanced spontaneous script generation completed!
🧪 Testing Improved Spontaneous Script Generator...
🎙️ Starting enhanced spontaneous script generation...
📋 Host: أحمد بن علي
📋 Guest: د. فاتن راشد
📋 Discussion Points: 3

📝 Chunk 1: Natural host introduction...
✅ Host introduction completed

📝 Chunk 2: Dynamic guest introduction...
✅ Guest introduction completed

📝 Chunk 3: Spontaneous discussion point 1...
✅ Discussion point 1 completed

📝 Chunk 4: 

In [27]:
print(script_result)

{'intro': 'أحمد بن علي: سلام عليكم جميعاً، ونرحب بكم في هذه الحلقة التي نغوص فيها بعمق لتفهم التوازي الحساس بين ثورتِ الذكاء الصناعي وثروتِ الهوية العربية الواعدة؛ دعونا نتشارك باهتمام نسبر فيه آفاق هذا المزج الفريد لمستقبلنا الثقافي المحلي.\n\n**أحمد بن علي:** ترحيب للدكتورة فاتن Рашид، من الرائع أنّها معنا اليوم لمناقشة مسألة حيوية حول الترابط بين تقنية الذكاء الاصطناعي وهویتَنا العربيّة.\n\n**د. فَتْن رَاشِد:** شكر لك، نبحث بالفعل تحديات مثيرة تتعلق بهويتنا التقليدية في عالم مترابط بشكل رقمي.\n\n**أحمد بن علي:** كيف بإمكاننا ضمّ تقنيّات المستقبل لدعم قيمنا ومعتقداتنا بدلاً من تهديد وجودها?\n\n**د. فَتْن رَاشِد:** يجب أن نركز على تطوير حلول ذكية تستلهم جذورها القيميّة الثقافيّة بينما ترنو إلى مستقبِل متنوِّر تكنولوجيًا وقائم علَى أسس أخلاقيِّة صلبة.\n\n**أحمد بن علي:** اقتراح ممتاز! هل يمكن ذكر مثال عملي لإدارة هذا الانصهار الناجح نحو آفاق واعدة للعالم العربِي؟\n\n**د. فَتْن رَاشِد:** نعم، علينا تشكيل نماذج التعليم الداخيلة المحترمة لروافدنا الفكريَّــة وتوظيف وسائل التواصل الرَّاقمي

In [28]:
import json
import re

class MicroChunkingAIScriptCleaner:
    def __init__(self, deployment, model="gpt-4o"):
        self.model = model
        self.deployment = deployment

    def clean_script_with_ai(self, script_result):
        """
        Micro-chunking approach: Clean script using surgical AI corrections
        """
        print("\n" + "🔬 MICRO-CHUNKING AI SCRIPT CLEANER".center(80, "="))
        print()
        
        try:
            # Extract complete script text
            if isinstance(script_result, dict):
                complete_script = script_result.get('complete_script', '')
            else:
                complete_script = str(script_result)
            
            if not complete_script or len(complete_script.strip()) < 50:
                print("❌ Script too short or empty, using fallback")
                return self._generate_complete_fallback()
            
            original_length = len(complete_script)
            print(f"📏 Original script length: {original_length:,} characters")
            
            # Step 1: Analyze script corruption
            print("🔍 CORRUPTION ANALYSIS".center(60, "-"))
            corruption_analysis = self.analyze_corruption_patterns(complete_script)
            self._print_corruption_summary(corruption_analysis)
            
            # Step 2: Create micro-chunks (small, focused chunks)
            print("\n📝 MICRO-CHUNKING STRATEGY".center(60, "-"))
            micro_chunks = self.create_micro_chunks(complete_script)
            print(f"    Created {len(micro_chunks)} micro-chunks (avg: {original_length//len(micro_chunks)} chars each)")
            
            # Step 3: Process each micro-chunk with surgical precision
            print("\n🔬 SURGICAL CLEANING PROCESS".center(60, "-"))
            cleaned_chunks = []
            
            for i, chunk_data in enumerate(micro_chunks):
                print(f"    Processing micro-chunk {i+1}/{len(micro_chunks)}... ", end="")
                
                # Quick corruption assessment
                corruption_level = self.assess_chunk_corruption(chunk_data['content'])
                
                if corruption_level == 'clean':
                    # Keep as-is
                    cleaned_chunks.append(chunk_data['content'])
                    print("✅ CLEAN (kept as-is)")
                elif corruption_level == 'minor':
                    # Light cleaning with regex
                    cleaned_chunk = self.light_clean_chunk(chunk_data['content'])
                    cleaned_chunks.append(cleaned_chunk)
                    print("🟡 LIGHT CLEAN")
                else:
                    # AI-powered surgical correction
                    cleaned_chunk = self.surgical_ai_correction(
                        chunk_data['content'], 
                        chunk_data['context'],
                        corruption_analysis
                    )
                    
                    # Validate result
                    if self.validate_micro_chunk(cleaned_chunk, chunk_data['content']):
                        cleaned_chunks.append(cleaned_chunk)
                        print("🔧 AI CORRECTED")
                    else:
                        # Fallback to light cleaning
                        fallback_chunk = self.light_clean_chunk(chunk_data['content'])
                        cleaned_chunks.append(fallback_chunk)
                        print("🔄 FALLBACK CLEAN")
            
            # Step 4: Reassemble with structure preservation
            print("\n🔧 REASSEMBLY WITH STRUCTURE CHECK".center(60, "-"))
            final_script = self.intelligent_reassembly(cleaned_chunks, micro_chunks, complete_script)
            
            # Step 5: Final quality and length check
            final_length = len(final_script)
            length_ratio = final_length / original_length if original_length > 0 else 0
            
            print(f"    Original structure preserved: {'✅' if self.verify_structure_preservation(complete_script, final_script) else '⚠️'}")
            print(f"    Length preservation: {length_ratio:.1%}")
            
            # Step 6: Light expansion if needed (without AI)
            if length_ratio < 0.90:  # Less than 90% retained
                print("📈 APPLYING LENGTH RECOVERY (NON-AI)".center(60, "-"))
                final_script = self.non_ai_length_recovery(final_script, original_length)
                final_length = len(final_script)
                length_ratio = final_length / original_length
            
            # Final validation
            final_valid = self.validate_final_script(final_script)
            
            print("\n" + "🎉 MICRO-CHUNKING SUMMARY".center(60, "-"))
            print(f"    Status: {'SUCCESS' if final_valid else 'PARTIAL SUCCESS'}")
            print(f"    Original Length: {original_length:,} characters")
            print(f"    Final Length: {final_length:,} characters")
            print(f"    Length Preserved: {length_ratio:.1%}")
            print(f"    Micro-chunks Processed: {len(micro_chunks)}")
            print(f"    Arabic Quality: {'✅ VALID' if final_valid else '⚠️ NEEDS REVIEW'}")
            print("=" * 80)
            
            return {
                'complete_script': final_script,
                'cleaning_method': 'micro_chunking_surgical',
                'micro_chunks_processed': len(micro_chunks),
                'cleaning_status': 'success' if final_valid else 'partial',
                'script_length': final_length,
                'original_length': original_length,
                'length_ratio': length_ratio,
                'corruption_analysis': corruption_analysis,
                'estimated_duration': self._estimate_duration(final_script)
            }
            
        except Exception as e:
            print(f"\n❌ CRITICAL ERROR: {e}")
            print("🔄 Using complete fallback script...")
            return self._generate_complete_fallback()

    def analyze_corruption_patterns(self, script_text):
        """
        Analyze what specific types of corruption exist in the script
        """
        analysis = {
            'total_chars': len(script_text),
            'foreign_patterns': {},
            'encoding_issues': 0,
            'concatenated_words': 0,
            'structural_issues': 0,
            'overall_corruption_level': 'clean'
        }
        
        # Foreign language patterns
        foreign_patterns = {
            'english_words': (r'\b[A-Za-z]{3,}\b', 'English words (3+ letters)'),
            'chinese_chars': (r'[\u4e00-\u9fff]', 'Chinese characters'),
            'hebrew_chars': (r'[\u0590-\u05ff]', 'Hebrew characters'),
            'japanese_chars': (r'[\u3040-\u309f\u30a0-\u30ff]', 'Japanese characters'),
            'problematic_punct': (r'[、！]', 'Problematic punctuation')
        }
        
        total_foreign_chars = 0
        for pattern_name, (pattern, description) in foreign_patterns.items():
            matches = re.findall(pattern, script_text)
            if matches:
                char_count = sum(len(match) for match in matches)
                analysis['foreign_patterns'][pattern_name] = {
                    'count': len(matches),
                    'chars': char_count,
                    'description': description,
                    'examples': matches[:3]  # First 3 examples
                }
                total_foreign_chars += char_count
        
        # Concatenated words (Arabic words without spaces)
        concatenated_matches = re.findall(r'[\u0600-\u06FF]{40,}', script_text)  # 40+ Arabic chars without spaces
        analysis['concatenated_words'] = len(concatenated_matches)
        
        # Encoding issues (mixed scripts in single words)
        encoding_issues = re.findall(r'[A-Za-z\u4e00-\u9fff\u0590-\u05ff]+[\u0600-\u06FF]+|[\u0600-\u06FF]+[A-Za-z\u4e00-\u9fff\u0590-\u05ff]+', script_text)
        analysis['encoding_issues'] = len(encoding_issues)
        
        # Structural issues
        if '***' in script_text or '**' in script_text:
            analysis['structural_issues'] += script_text.count('***') + script_text.count('**')
        
        # Overall corruption level
        foreign_ratio = total_foreign_chars / len(script_text) if script_text else 0
        if foreign_ratio > 0.15 or analysis['concatenated_words'] > 5:
            analysis['overall_corruption_level'] = 'heavy'
        elif foreign_ratio > 0.05 or analysis['concatenated_words'] > 2:
            analysis['overall_corruption_level'] = 'moderate'
        elif foreign_ratio > 0.01 or analysis['encoding_issues'] > 0:
            analysis['overall_corruption_level'] = 'light'
        
        return analysis

    def _print_corruption_summary(self, analysis):
        """Print corruption analysis summary"""
        print(f"    Overall corruption level: {analysis['overall_corruption_level'].upper()}")
        print(f"    Foreign patterns found: {len(analysis['foreign_patterns'])}")
        
        for pattern_name, details in analysis['foreign_patterns'].items():
            print(f"      - {details['description']}: {details['count']} instances")
            if details['examples']:
                examples_str = ', '.join(str(ex) for ex in details['examples'][:2])
                print(f"        Examples: {examples_str}")
        
        if analysis['concatenated_words'] > 0:
            print(f"    Concatenated word sequences: {analysis['concatenated_words']}")
        if analysis['encoding_issues'] > 0:
            print(f"    Mixed encoding issues: {analysis['encoding_issues']}")

    def create_micro_chunks(self, script_text):
        """
        Create small, focused micro-chunks (200-400 characters each)
        """
        micro_chunks = []
        
        # Split by sections first
        sections = self._identify_script_sections(script_text)
        
        for section_name, section_content in sections.items():
            if not section_content.strip():
                continue
            
            # Split section into dialogue exchanges
            dialogue_chunks = self._split_into_dialogue_exchanges(section_content, section_name)
            micro_chunks.extend(dialogue_chunks)
        
        return micro_chunks

    def _identify_script_sections(self, script_text):
        """Identify main sections of the script"""
        sections = {}
        
        if "=== مقدمة البودكاست ===" in script_text:
            # Structured script
            parts = script_text.split("=== مقدمة البودكاست ===")
            if len(parts) > 1:
                after_intro = parts[1]
                if "=== النقاش الرئيسي ===" in after_intro:
                    intro_parts = after_intro.split("=== النقاش الرئيسي ===")
                    sections['intro'] = intro_parts[0].strip()
                    if len(intro_parts) > 1:
                        after_main = intro_parts[1]
                        if "=== ختام البودكاست ===" in after_main:
                            main_parts = after_main.split("=== ختام البودكاست ===")
                            sections['main_discussion'] = main_parts[0].strip()
                            if len(main_parts) > 1:
                                sections['closing'] = main_parts[1].strip()
                        else:
                            sections['main_discussion'] = after_main.strip()
                else:
                    sections['intro'] = after_intro.strip()
        else:
            # Unstructured script
            sections['full_script'] = script_text
        
        return sections

    def _split_into_dialogue_exchanges(self, section_content, section_name):
        """Split section into small dialogue exchanges"""
        chunks = []
        
        # Split by speaker turns
        lines = section_content.split('\n')
        current_chunk = []
        current_length = 0
        target_chunk_size = 300  # Target 300 characters per micro-chunk
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Check if this is a speaker line
            is_speaker_line = ':' in line and not line.startswith('===')
            
            # If adding this line would exceed target size, finalize current chunk
            if current_length + len(line) > target_chunk_size and current_chunk and is_speaker_line:
                chunk_content = '\n'.join(current_chunk).strip()
                if chunk_content:
                    chunks.append({
                        'content': chunk_content,
                        'context': f'{section_name}_dialogue_exchange',
                        'type': 'dialogue_exchange',
                        'length': len(chunk_content)
                    })
                current_chunk = [line]
                current_length = len(line)
            else:
                current_chunk.append(line)
                current_length += len(line) + 1  # +1 for newline
        
        # Add final chunk
        if current_chunk:
            chunk_content = '\n'.join(current_chunk).strip()
            if chunk_content:
                chunks.append({
                    'content': chunk_content,
                    'context': f'{section_name}_dialogue_exchange',
                    'type': 'dialogue_exchange',
                    'length': len(chunk_content)
                })
        
        return chunks

    def assess_chunk_corruption(self, chunk_content):
        """Quick assessment of chunk corruption level"""
        if not chunk_content or len(chunk_content.strip()) < 10:
            return 'heavy'
        
        # Check for foreign characters
        foreign_chars = len(re.findall(r'[A-Za-z\u4e00-\u9fff\u0590-\u05ff\u3040-\u309f\u30a0-\u30ff、！]', chunk_content))
        total_chars = len(chunk_content)
        
        # Check for concatenated words
        concatenated = len(re.findall(r'[\u0600-\u06FF]{30,}', chunk_content))
        
        # Check for encoding issues
        encoding_issues = len(re.findall(r'[A-Za-z\u4e00-\u9fff]+[\u0600-\u06FF]+|[\u0600-\u06FF]+[A-Za-z\u4e00-\u9fff]+', chunk_content))
        
        if foreign_chars == 0 and concatenated == 0 and encoding_issues == 0:
            return 'clean'
        elif foreign_chars < 3 and concatenated == 0 and encoding_issues == 0:
            return 'minor'
        else:
            return 'heavy'

    def light_clean_chunk(self, chunk_content):
        """Light cleaning using regex only"""
        cleaned = chunk_content
        
        # Remove specific problematic punctuation
        cleaned = re.sub(r'[、！]', '', cleaned)
        
        # Remove short English words (1-2 letters)
        cleaned = re.sub(r'\b[A-Za-z]{1,2}\b', '', cleaned)
        
        # Remove Chinese and Hebrew characters
        cleaned = re.sub(r'[\u4e00-\u9fff\u0590-\u05ff\u3040-\u309f\u30a0-\u30ff]', '', cleaned)
        
        # Fix spacing issues
        cleaned = re.sub(r'\s+', ' ', cleaned)
        cleaned = re.sub(r'\n\s*\n+', '\n\n', cleaned)
        
        # Clean up markdown formatting
        cleaned = re.sub(r'\*{2,}', '', cleaned)
        
        return cleaned.strip()

    def surgical_ai_correction(self, chunk_content, context, corruption_analysis):
        """
        Surgical AI correction focused on specific problems
        """
        # Identify specific issues in this chunk
        chunk_issues = []
        
        if re.search(r'[A-Za-z]{3,}', chunk_content):
            chunk_issues.append("English words")
        if re.search(r'[\u4e00-\u9fff]', chunk_content):
            chunk_issues.append("Chinese characters")
        if re.search(r'[\u0590-\u05ff]', chunk_content):
            chunk_issues.append("Hebrew characters")
        if re.search(r'[\u0600-\u06FF]{30,}', chunk_content):
            chunk_issues.append("concatenated words")
        
        issues_description = ", ".join(chunk_issues) if chunk_issues else "minor formatting issues"
        
        prompt = f"""
You are an Arabic text correction specialist. Fix ONLY the specific issues in this small text chunk.

CHUNK CONTEXT: {context}
ISSUES TO FIX: {issues_description}
CHUNK LENGTH: {len(chunk_content)} characters

ORIGINAL CHUNK:
{chunk_content}

CORRECTION INSTRUCTIONS:
1. Replace English words with appropriate Arabic equivalents in context
2. Remove Chinese/Hebrew characters and replace with contextually appropriate Arabic
3. Fix concatenated Arabic words by adding proper spaces
4. Keep the EXACT same meaning and dialogue structure
5. Maintain or slightly increase the length
6. Do NOT change speaker names or dialogue flow

CRITICAL: Return ONLY the corrected Arabic text. No explanations or comments.
"""

        try:
            response = self.deployment.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an Arabic text correction specialist. Make minimal, precise corrections while preserving meaning and length."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.2,  # Low temperature for consistent corrections
                max_tokens=1000   # Limit tokens to prevent over-expansion
            )
            
            corrected = response.choices[0].message.content.strip()
            
            # Basic cleanup
            corrected = self.light_clean_chunk(corrected)
            
            return corrected
            
        except Exception as e:
            print(f" (AI failed: {e}) ", end="")
            return self.light_clean_chunk(chunk_content)

    def validate_micro_chunk(self, cleaned_chunk, original_chunk):
        """
        Validate that micro-chunk correction was successful
        """
        if not cleaned_chunk or len(cleaned_chunk.strip()) < 10:
            return False
        
        # Check that length wasn't reduced too much
        length_ratio = len(cleaned_chunk) / len(original_chunk) if original_chunk else 0
        if length_ratio < 0.7:  # Lost more than 30% of content
            return False
        
        # Check for remaining major foreign content
        major_foreign = re.findall(r'[\u4e00-\u9fff\u0590-\u05ff]{2,}|[A-Za-z]{4,}', cleaned_chunk)
        if len(major_foreign) > 1:  # Allow 1 instance (might be technical term)
            return False
        
        # Check for basic dialogue structure if original had it
        if ':' in original_chunk and ':' not in cleaned_chunk:
            return False
        
        return True

    def intelligent_reassembly(self, cleaned_chunks, original_micro_chunks, original_script):
        """
        Intelligently reassemble micro-chunks back into complete script
        """
        # Group chunks back into sections
        sections = {'intro': [], 'main_discussion': [], 'closing': [], 'other': []}
        
        for i, chunk in enumerate(cleaned_chunks):
            if not chunk.strip():
                continue
            
            context = original_micro_chunks[i]['context'] if i < len(original_micro_chunks) else 'other'
            
            if 'intro' in context:
                sections['intro'].append(chunk)
            elif 'main_discussion' in context:
                sections['main_discussion'].append(chunk)
            elif 'closing' in context:
                sections['closing'].append(chunk)
            else:
                sections['other'].append(chunk)
        
        # Reassemble with proper structure
        script_parts = []
        
        # Add intro section
        if sections['intro']:
            script_parts.append("=== مقدمة البودكاست ===")
            script_parts.extend([chunk for chunk in sections['intro'] if chunk.strip()])
        
        # Add main discussion
        if sections['main_discussion']:
            script_parts.append("=== النقاش الرئيسي ===")
            script_parts.extend([chunk for chunk in sections['main_discussion'] if chunk.strip()])
        
        # Add closing
        if sections['closing']:
            script_parts.append("=== ختام البودكاست ===")
            script_parts.extend([chunk for chunk in sections['closing'] if chunk.strip()])
        
        # Add other content
        if sections['other']:
            script_parts.extend([chunk for chunk in sections['other'] if chunk.strip()])
        
        # Join with proper spacing
        complete_script = '\n\n'.join([part for part in script_parts if part.strip()])
        
        # Clean up spacing
        complete_script = re.sub(r'\n{3,}', '\n\n', complete_script)
        
        return complete_script.strip()

    def verify_structure_preservation(self, original_script, final_script):
        """
        Verify that the original structure was preserved
        """
        # Check section headers
        original_sections = len(re.findall(r'===.*===', original_script))
        final_sections = len(re.findall(r'===.*===', final_script))
        
        if original_sections > 0 and final_sections < original_sections:
            return False
        
        # Check speaker preservation
        original_speakers = set(re.findall(r'^([^:]+):', original_script, re.MULTILINE))
        final_speakers = set(re.findall(r'^([^:]+):', final_script, re.MULTILINE))
        
        # Should preserve most speakers
        if len(final_speakers) < len(original_speakers) * 0.8:
            return False
        
        return True

    def non_ai_length_recovery(self, script, target_length):
        """
        Recover length using non-AI methods (pattern-based expansion)
        """
        current_length = len(script)
        if current_length >= target_length * 0.9:
            return script
        
        # Natural Arabic conversation extenders
        extenders = [
            ("د. ", "د. المحترم "),
            ("أستاذ ", "أستاذ فاضل "),
            ("نعم", "نعم بالتأكيد"),
            ("بالطبع", "بالطبع وبلا شك"),
            ("هذا", "هذا الأمر"),
            ("ممتاز", "ممتاز جداً"),
            ("صحيح", "صحيح تماماً"),
            ("أعتقد", "أعتقد بقوة"),
            ("يمكن", "يمكن بالفعل"),
            ("المهم", "والأمر المهم"),
        ]
        
        expanded = script
        chars_added = 0
        target_addition = min(target_length - current_length, current_length // 5)  # Max 20% expansion
        
        for original, replacement in extenders:
            if chars_added >= target_addition:
                break
            
            if original in expanded:
                # Replace some instances (not all to avoid repetition)
                count = expanded.count(original)
                replace_count = min(count // 3, 2)  # Replace 1/3, max 2 instances
                
                for _ in range(replace_count):
                    expanded = expanded.replace(original, replacement, 1)
                    chars_added += len(replacement) - len(original)
                    
                    if chars_added >= target_addition:
                        break
        
        return expanded

    def validate_final_script(self, script):
        """
        Final validation of the complete script
        """
        if not script or len(script.strip()) < 50:
            return False
        
        # Check for basic dialogue structure
        if ':' not in script:
            return False
        
        # Check for excessive foreign content (relaxed)
        foreign_chars = len(re.findall(r'[A-Za-z\u4e00-\u9fff\u0590-\u05ff]', script))
        total_chars = len(re.sub(r'[\s\n\t:=\-،.؟!"()[\]{}]', '', script))
        
        if total_chars > 0:
            foreign_ratio = foreign_chars / total_chars
            if foreign_ratio > 0.1:  # Allow up to 10% foreign (technical terms, etc.)
                return False
        
        return True

    def _estimate_duration(self, script_text):
        """Estimate podcast duration"""
        if not script_text:
            return "0-1 minutes"
        
        word_count = len(script_text.split())
        duration_minutes = word_count / 150
        
        min_duration = max(1, int(duration_minutes - 1))
        max_duration = int(duration_minutes + 2)
        
        return f"{min_duration}-{max_duration} minutes"

    def _generate_complete_fallback(self):
        """Generate complete fallback script"""
        fallback_script = """=== مقدمة البودكاست ===
د. فاطمة الزهراء: مرحباً بكم مستمعينا الكرام في حلقة جديدة من برنامجنا. اليوم نستضيف خبيراً متميزاً لنناقش موضوعاً مهماً يهم الجميع.

م. عبد الرحمن الهاشمي: أهلاً وسهلاً بكم، أشكركم على الاستضافة الكريمة. سعيد جداً بوجودي معكم اليوم.

د. فاطمة الزهراء: نحن سعداء بوجودك معنا. دعنا نبدأ هذا النقاش المفيد.

=== النقاش الرئيسي ===
د. فاطمة الزهراء: بداية، ما رأيك في أهمية هذا الموضوع في وقتنا الحالي؟

م. عبد الرحمن الهاشمي: موضوع في غاية الأهمية حقاً. نحن نواجه تحديات كبيرة تتطلب منا فهماً عميقاً ونظرة شاملة للأمور.

د. فاطمة الزهراء: ممكن تحدثنا أكثر عن هذه التحديات؟

م. عبد الرحمن الهاشمي: بالطبع. من أهم التحديات هو كيفية التوازن بين التطورات الحديثة والحفاظ على قيمنا وثوابتنا الأصيلة.

د. فاطمة الزهراء: نقطة مهمة جداً. وما هي الحلول العملية التي تقترحها؟

م. عبد الرحمن الهاشمي: أعتقد أن الحل يكمن في التعليم والتوعية، مع الاستفادة الذكية من التقنيات الحديثة بما يخدم مصالحنا وأهدافنا.

=== ختام البودكاست ===
د. فاطمة الزهراء: في ختام حلقتنا اليوم، أشكرك أستاذ عبد الرحمن على هذا النقاش الثري والمفيد.

م. عبد الرحمن الهاشمي: شكراً لك على الاستضافة الكريمة. كان نقاش ممتع ومثمر، وأتمنى أن يستفيد منه المستمعون.

د. فاطمة الزهراء: بالتأكيد. وشكراً لكم مستمعينا الكرام على متابعتكم الدائمة. نلقاكم في حلقة قادمة بإذن الله. إلى اللقاء."""
        
        return {
            'complete_script': fallback_script,
            'cleaning_method': 'micro_chunking_fallback',
            'micro_chunks_processed': 0,
            'cleaning_status': 'fallback_used',
            'script_length': len(fallback_script),
            'estimated_duration': self._estimate_duration(fallback_script)
        }


# Testing Function
def test_micro_chunking_cleaner(deployment, corrupted_script_result, model_name="Fanar-C-1-8.7B"):
    """
    Test the micro-chunking AI script cleaner
    """
    print("\n" + "🧪 TESTING MICRO-CHUNKING AI SCRIPT CLEANER".center(80, "="))
    print()
    
    cleaner = MicroChunkingAIScriptCleaner(deployment, model_name)
    
    # Show original script info
    if isinstance(corrupted_script_result, dict):
        original_script = corrupted_script_result.get('complete_script', '')
    else:
        original_script = str(corrupted_script_result)
    
    print("📊 ORIGINAL SCRIPT INFO".center(60, "-"))
    print(f"{'Original Length:':<25} {len(original_script):,} characters")
    print(f"{'Estimated Duration:':<25} {cleaner._estimate_duration(original_script)}")
    
    # Quick preview of corruption
    foreign_chars = len(re.findall(r'[A-Za-z\u4e00-\u9fff\u0590-\u05ff]', original_script))
    corruption_ratio = foreign_chars / len(original_script) if original_script else 0
    print(f"{'Apparent Corruption:':<25} {corruption_ratio:.1%} foreign content")
    
    # Clean the script
    cleaned_result = cleaner.clean_script_with_ai(corrupted_script_result)
    
    print("\n📊 MICRO-CHUNKING RESULTS".center(60, "-"))
    print(f"{'Method:':<25} {cleaned_result['cleaning_method']}")
    print(f"{'Status:':<25} {cleaned_result['cleaning_status']}")
    print(f"{'Micro-chunks Processed:':<25} {cleaned_result['micro_chunks_processed']}")
    print(f"{'Original Length:':<25} {cleaned_result.get('original_length', 'N/A'):,} characters")
    print(f"{'Final Length:':<25} {cleaned_result['script_length']:,} characters")
    print(f"{'Length Preserved:':<25} {cleaned_result.get('length_ratio', 0):.1%}")
    print(f"{'Duration:':<25} {cleaned_result['estimated_duration']}")
    
    # Length preservation status
    if 'length_ratio' in cleaned_result:
        if cleaned_result['length_ratio'] >= 0.90:
            print(f"{'Length Status:':<25} ✅ EXCELLENT PRESERVATION")
        elif cleaned_result['length_ratio'] >= 0.80:
            print(f"{'Length Status:':<25} ✅ GOOD PRESERVATION")
        elif cleaned_result['length_ratio'] >= 0.70:
            print(f"{'Length Status:':<25} ⚠️ MODERATE LOSS")
        else:
            print(f"{'Length Status:':<25} ❌ SIGNIFICANT LOSS")
    
    # Corruption analysis summary
    if 'corruption_analysis' in cleaned_result:
        corruption_info = cleaned_result['corruption_analysis']
        print(f"{'Corruption Detected:':<25} {corruption_info['overall_corruption_level'].upper()}")
        print(f"{'Foreign Patterns:':<25} {len(corruption_info['foreign_patterns'])} types found")
    
    # Display cleaned script with formatting
    print("\n" + "🎙️ MICRO-CLEANED SCRIPT OUTPUT".center(80, "="))
    print()
    
    cleaned_script = cleaned_result['complete_script']
    if cleaned_script:
        sections = cleaned_script.split('\n\n')
        
        for section in sections[:10]:  # Show first 10 sections to avoid overwhelming output
            section = section.strip()
            if not section:
                continue
            
            # Section headers
            if section.startswith('===') and section.endswith('==='):
                print(f"\n{section}")
                print("─" * len(section))
                continue
            
            # Format dialogue
            lines = section.split('\n')
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                
                if ':' in line and not line.startswith('==='):
                    speaker, dialogue = line.split(':', 1)
                    speaker = speaker.strip()
                    dialogue = dialogue.strip()
                    
                    print(f"\n{speaker}:")
                    
                    # Word wrap dialogue
                    words = dialogue.split()
                    current_line = ""
                    max_length = 70
                    
                    for word in words:
                        if len(current_line) + len(word) + 1 > max_length and current_line:
                            print(f"    {current_line}")
                            current_line = word
                        else:
                            current_line = current_line + " " + word if current_line else word
                    
                    if current_line:
                        print(f"    {current_line}")
                else:
                    print(f"    {line}")
    
    # Final validation results
    is_clean = cleaner.validate_final_script(cleaned_script)
    print(f"\n🔍 FINAL VALIDATION: {'✅ PASSED' if is_clean else '❌ NEEDS REVIEW'}")
    
    # Quality comparison
    if original_script and cleaned_script:
        original_foreign = len(re.findall(r'[A-Za-z\u4e00-\u9fff\u0590-\u05ff]', original_script))
        cleaned_foreign = len(re.findall(r'[A-Za-z\u4e00-\u9fff\u0590-\u05ff]', cleaned_script))
        
        print(f"🧹 CLEANING EFFECTIVENESS:")
        print(f"    Foreign chars removed: {original_foreign - cleaned_foreign:,}")
        print(f"    Cleaning efficiency: {((original_foreign - cleaned_foreign) / original_foreign * 100) if original_foreign > 0 else 0:.1f}%")
    
    print("\n" + "="*80)
    
    return cleaned_result


# Advanced Testing Function with Detailed Analysis
def detailed_micro_chunk_analysis(deployment, corrupted_script_result, model_name="Fanar-C-1-8.7B"):
    """
    Detailed analysis of micro-chunking performance
    """
    print("\n" + "🔬 DETAILED MICRO-CHUNKING ANALYSIS".center(80, "="))
    print()
    
    cleaner = MicroChunkingAIScriptCleaner(deployment, model_name)
    
    # Extract script
    if isinstance(corrupted_script_result, dict):
        original_script = corrupted_script_result.get('complete_script', '')
    else:
        original_script = str(corrupted_script_result)
    
    print("🔍 PRE-PROCESSING ANALYSIS")
    print("-" * 40)
    
    # Detailed corruption analysis
    corruption_analysis = cleaner.analyze_corruption_patterns(original_script)
    
    print(f"Script length: {corruption_analysis['total_chars']:,} characters")
    print(f"Overall corruption level: {corruption_analysis['overall_corruption_level'].upper()}")
    print()
    
    print("Foreign content breakdown:")
    for pattern_name, details in corruption_analysis['foreign_patterns'].items():
        print(f"  • {details['description']}: {details['count']} instances ({details['chars']} chars)")
        if details['examples']:
            examples_preview = [str(ex)[:10] + ('...' if len(str(ex)) > 10 else '') for ex in details['examples'][:2]]
            print(f"    Examples: {', '.join(examples_preview)}")
    
    if corruption_analysis['concatenated_words'] > 0:
        print(f"  • Concatenated word sequences: {corruption_analysis['concatenated_words']}")
    if corruption_analysis['encoding_issues'] > 0:
        print(f"  • Mixed encoding issues: {corruption_analysis['encoding_issues']}")
    
    print("\n🔬 MICRO-CHUNKING BREAKDOWN")
    print("-" * 40)
    
    # Create micro-chunks for analysis
    micro_chunks = cleaner.create_micro_chunks(original_script)
    
    print(f"Total micro-chunks created: {len(micro_chunks)}")
    if micro_chunks:
        avg_chunk_size = sum(chunk['length'] for chunk in micro_chunks) / len(micro_chunks)
        print(f"Average chunk size: {avg_chunk_size:.1f} characters")
        
        chunk_sizes = [chunk['length'] for chunk in micro_chunks]
        print(f"Chunk size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")
    
    # Analyze chunk corruption levels
    corruption_levels = {'clean': 0, 'minor': 0, 'heavy': 0}
    for chunk in micro_chunks:
        level = cleaner.assess_chunk_corruption(chunk['content'])
        corruption_levels[level] += 1
    
    print(f"\nCorruption level distribution:")
    print(f"  • Clean chunks: {corruption_levels['clean']} ({corruption_levels['clean']/len(micro_chunks)*100:.1f}%)")
    print(f"  • Minor issues: {corruption_levels['minor']} ({corruption_levels['minor']/len(micro_chunks)*100:.1f}%)")
    print(f"  • Heavy corruption: {corruption_levels['heavy']} ({corruption_levels['heavy']/len(micro_chunks)*100:.1f}%)")
    
    print("\n🎯 PROCESSING STRATEGY")
    print("-" * 40)
    print(f"Expected processing:")
    print(f"  • Keep as-is: {corruption_levels['clean']} chunks")
    print(f"  • Light regex cleaning: {corruption_levels['minor']} chunks") 
    print(f"  • AI surgical correction: {corruption_levels['heavy']} chunks")
    
    estimated_ai_calls = corruption_levels['heavy']
    print(f"  • Estimated AI API calls: {estimated_ai_calls}")
    
    if estimated_ai_calls > 0:
        print(f"  • Efficiency vs. large chunks: {estimated_ai_calls} calls vs ~3-4 calls (traditional)")
        print(f"  • Trade-off: More calls but higher precision per call")
    
    print(f"\n💡 PREDICTION:")
    clean_ratio = corruption_levels['clean'] / len(micro_chunks)
    if clean_ratio > 0.7:
        print("  Expected outcome: EXCELLENT length preservation (70%+ content kept as-is)")
    elif clean_ratio > 0.5:
        print("  Expected outcome: GOOD length preservation (50%+ content kept as-is)")
    else:
        print("  Expected outcome: MODERATE length preservation (heavy corruption detected)")
    
    print("\n" + "="*80)
    
    return {
        'total_chunks': len(micro_chunks),
        'corruption_analysis': corruption_analysis,
        'corruption_distribution': corruption_levels,
        'estimated_ai_calls': estimated_ai_calls,
        'clean_content_ratio': clean_ratio
    }


# Usage Examples:
"""
# Basic cleaning
cleaner = MicroChunkingAIScriptCleaner(deployment, "Fanar-C-1-8.7B")
cleaned_result = cleaner.clean_script_with_ai(corrupted_script_result)

# Testing with detailed output
test_result = test_micro_chunking_cleaner(deployment, corrupted_script_result)

# Detailed pre-processing analysis
analysis = detailed_micro_chunk_analysis(deployment, corrupted_script_result)
"""

'\n# Basic cleaning\ncleaner = MicroChunkingAIScriptCleaner(deployment, "Fanar-C-1-8.7B")\ncleaned_result = cleaner.clean_script_with_ai(corrupted_script_result)\n\n# Testing with detailed output\ntest_result = test_micro_chunking_cleaner(deployment, corrupted_script_result)\n\n# Detailed pre-processing analysis\nanalysis = detailed_micro_chunk_analysis(deployment, corrupted_script_result)\n'

In [29]:
# Basic cleaning
cleaner = MicroChunkingAIScriptCleaner(deployment, "Fanar-C-1-8.7B")
cleaned_result = cleaner.clean_script_with_ai(script_result)
print("Cleaned Result:", cleaned_result)



📏 Original script length: 2,913 characters
-------------------🔍 CORRUPTION ANALYSIS--------------------
    Overall corruption level: LIGHT
    Foreign patterns found: 1
      - English words (3+ letters): 1 instances
        Examples: strong
    Mixed encoding issues: 1
-----------------
📝 MICRO-CHUNKING STRATEGY-----------------
    Created 12 micro-chunks (avg: 242 chars each)
----------------
🔬 SURGICAL CLEANING PROCESS----------------
    Processing micro-chunk 1/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 2/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 3/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 4/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 5/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 6/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 7/12... 🔧 AI CORRECTED
    Processing micro-chunk 8/12... ✅ CLEAN (kept as-is)
    Processing micro-chunk 9/12... 🔧 AI CORRECTED
    Processing micro-chunk 10/12... ✅ CLEAN (kept as-is)
    P

In [22]:
print(cleaned_result)

{'complete_script': '=== مقدمة البودكاست ===\n\nأحمد بن علي: انطلق بنا مباشرة إلى قلب تحدٍ عصري يغوص في أعماق تقاطع الذكاء الاصطناعي والثقافة العربية, حيث سنضع كلانا أيديهما بقلب متنبه ومشوق.... بصراحة، موضوع الذكاء الاصطناعي والهوية العربية: كيف نحافظ على ثقافتنا في العصر الرقمي يشغل بالي من فترة. يعني، كلنا نواجه هذا الأمر بشكل أو بآخر، صح؟\n\n**أحمد بن علي:** معي اليوم الباحثة المتميزة التي لا تتوقف عن تحدينا بإسئلتها المحفزة حول الذكاء الاصطناعي وخصوصياتنا المعرفية، الدكتورَةُ فاتِنْ رَاشِدْ. د. فاتِنْ، هلاّ شرحت لي كده ليه عند البعض شعور إزّا تراثنة المعلومات اللكترونية تهدّد هويتنا القومية؟!\n\n**د. فاتِنْ:** سؤال جميل مش جديد لكنه حيوي. بالعكس، نحتمل أن يكون التعاون مع آليات التعلم الآلي وسيلة فعَّالة لحفظ تقاليدنا، شرط أن نوفر لها البيانات الضرورية بلغاتنا العريقة وبناء الفهم الصحيح لمدلولاتنا الحساسة.\n\n**أحمد بن علي:** كلام قوي. بس شوفينى، دي مفروض خطوة سيبرانية مجردا لأ تحميكو من سرقة الهويات أم إطلاق روابط جديدة بين الجيل القديم والأصغر؟\n\n**د. فاتِنْ:** حاجة كويسّة تسؤل 