# Multilingual AI Agent Demo

A specialized AI agent that detects and processes text in **50+ languages** using a hybrid approach combining statistical methods and OpenAI LLM.

## Features

- **50+ Languages**: English, Spanish, French, German, Chinese, Japanese, Arabic, Russian, and more
- **Hybrid Detection**: Combines `langdetect` (fast) with OpenAI (context-aware)
- **Confidence Scoring**: Provides reliability metrics for each detection
- **Batch Processing**: Efficiently process multiple texts
- **Translation Suggestions**: Get translations for detected text
- **Statistics Tracking**: Monitor detection history and language distribution

## Detection Methods

1. **Statistical Detection** (langdetect) - Fast, character n-gram based
2. **AI-Powered Detection** (OpenAI) - Context-aware, handles nuances
3. **Hybrid Mode** - Combines both for maximum accuracy


## 1. Setup and Imports


In [82]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [83]:
# Install required packages (uncomment if needed)
!pip install openai langdetect -q

import os
import json
from typing import Dict, List, Any, Optional
from datetime import datetime

# Import required libraries
try:
    from openai import OpenAI
    from langdetect import detect, detect_langs, LangDetectException
    print("All packages imported successfully")
except ImportError as e:
    print(f"Error: {e}")
    print("\nPlease install required packages:")
    print("  pip install openai langdetect")

# Initialize OpenAI client
# Make sure to set your API key: export OPENAI_API_KEY='your-key-here'
# client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) # Original line
from google.colab import userdata
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))


if userdata.get("OPENAI_API_KEY"):
    print("OpenAI API key found - ")
else:
    print(" Warning: OPENAI_API_KEY not set. Please set it before running AI detection.")

All packages imported successfully
OpenAI API key found - 


## 2. MultilingualAgent Class

The core class that handles language detection using multiple methods.


In [84]:
class MultilingualAgent:
    """
    AI Agent specialized in detecting and processing multiple languages.
    """

    # Comprehensive language mapping (50+ languages)
    LANGUAGE_NAMES = {
        'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
        'ca': 'Catalan', 'cs': 'Czech', 'cy': 'Welsh', 'da': 'Danish',
        'de': 'German', 'el': 'Greek', 'en': 'English', 'es': 'Spanish',
        'et': 'Estonian', 'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French',
        'gu': 'Gujarati', 'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian',
        'hu': 'Hungarian', 'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese',
        'kn': 'Kannada', 'ko': 'Korean', 'lt': 'Lithuanian', 'lv': 'Latvian',
        'mk': 'Macedonian', 'ml': 'Malayalam', 'mr': 'Marathi', 'ne': 'Nepali',
        'nl': 'Dutch', 'no': 'Norwegian', 'pa': 'Punjabi', 'pl': 'Polish',
        'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'sk': 'Slovak',
        'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian', 'sv': 'Swedish',
        'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu', 'th': 'Thai',
        'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian', 'ur': 'Urdu',
        'vi': 'Vietnamese', 'zh-cn': 'Chinese (Simplified)', 'zh-tw': 'Chinese (Traditional)'
    }

    def __init__(self, model: str = "gpt-4o-mini"):
        """
        Initialize the multilingual agent.

        Args:
            model: OpenAI model to use (default: gpt-4o-mini)
        """
        from google.colab import userdata
        self.client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))
        self.model = model
        self.detection_history: List[Dict] = []

        print(" Multilingual Agent initialized")
        print(f"   Model: {self.model}")
        print(f"   Languages supported: {len(self.LANGUAGE_NAMES)}+")

    def detect_language_basic(self, text: str) -> Dict[str, Any]:
        """
        Basic language detection using langdetect library.
        Fast and reliable for single-language text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with detection results
        """
        try:
            lang_probs = detect_langs(text)
            primary = lang_probs[0]

            result = {
                "method": "langdetect",
                "primary_language": {
                    "code": primary.lang,
                    "name": self.LANGUAGE_NAMES.get(primary.lang, primary.lang.upper()),
                    "confidence": round(primary.prob, 3)
                },
                "all_detected": [
                    {
                        "code": lp.lang,
                        "name": self.LANGUAGE_NAMES.get(lp.lang, lp.lang.upper()),
                        "confidence": round(lp.prob, 3)
                    }
                    for lp in lang_probs[:3]  # Top 3 candidates
                ],
                "text_length": len(text),
                "timestamp": datetime.now().isoformat()
            }

            return result

        except LangDetectException as e:
            return {
                "method": "langdetect",
                "error": str(e),
                "primary_language": {"code": "unknown", "name": "Unknown"}
            }

    def detect_language_ai(self, text: str) -> Dict[str, Any]:
        """
        AI-powered language detection using OpenAI.
        More context-aware and can handle mixed languages.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with AI detection results
        """
        prompt = f"""Analyze the following text and identify its language(s).

Text: "{text}"

Provide:
1. Primary language (code and name)
2. Confidence level (0-1)
3. Any secondary languages detected
4. Whether it's formal or informal
5. Any notable linguistic features

Respond in JSON format."""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a linguistic expert specialized in language identification. Respond only with valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            result = json.loads(response.choices[0].message.content)
            result["method"] = "openai_llm"
            result["timestamp"] = datetime.now().isoformat()

            return result

        except Exception as e:
            return {
                "method": "openai_llm",
                "error": str(e),
                "primary_language": {"code": "unknown", "name": "Unknown"}
            }

    def detect_language_hybrid(self, text: str, verbose: bool = True) -> Dict[str, Any]:
        """
        Hybrid detection: combines langdetect and OpenAI for best accuracy.

        Args:
            text: Text to analyze
            verbose: Print detection process

        Returns:
            Combined detection results
        """
        if verbose:
            print(f"\n{'='*70}")
            print("LANGUAGE DETECTION - HYBRID MODE")
            print(f"{'='*70}")
            print(f"\nText: \"{text[:100]}{'...' if len(text) > 100 else ''}\"")
            print(f"Length: {len(text)} characters\n")

        # Method 1: Fast detection with langdetect
        if verbose:
            print("Method 1: Statistical Detection (langdetect)")
        basic_result = self.detect_language_basic(text)

        if verbose and "error" not in basic_result:
            print(f"   Result: {basic_result['primary_language']['name']} "
                  f"({basic_result['primary_language']['confidence']} confidence)")

        # Method 2: AI-powered detection
        if verbose:
            print("\nMethod 2: AI-Powered Detection (OpenAI)")
        ai_result = self.detect_language_ai(text)

        if verbose and "error" not in ai_result:
            if isinstance(ai_result.get('primary_language'), dict):
                lang_name = ai_result['primary_language'].get('name', 'Unknown')
                ai_confidence = ai_result['primary_language'].get('confidence', 'N/A') # Safely access confidence
            else:
                lang_name = ai_result.get('primary_language', 'Unknown')
                ai_confidence = 'N/A' # Set confidence to N/A if result is not a dict
            print(f"   Result: {lang_name}")
            if ai_confidence != 'N/A':
                print(f"   Confidence: {ai_confidence}")


        # Combine results
        combined = {
            "text": text[:200] + "..." if len(text) > 200 else text,
            "statistical_detection": basic_result,
            "ai_detection": ai_result,
            "timestamp": datetime.now().isoformat(),
            "text_length": len(text)
        }

        # Determine final verdict
        if "error" not in basic_result and "error" not in ai_result:
            basic_lang = basic_result['primary_language']['code']
            ai_lang = ai_result.get('primary_language', {})
            ai_lang_code = ai_lang.get('code', '').lower() if isinstance(ai_lang, dict) else ''

            if basic_lang == ai_lang_code:
                combined["verdict"] = "Both methods agree"
                combined["confidence"] = "high"
            else:
                combined["verdict"] = "Methods differ - review recommended"
                combined["confidence"] = "medium"
        else:
            combined["verdict"] = "Detection issues encountered"
            combined["confidence"] = "low"

        # Store in history
        self.detection_history.append(combined)

        if verbose:
            print(f"\n{combined['verdict']}")
            print(f"{'='*70}\n")

        return combined

    def detect_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Detect languages for multiple texts.

        Args:
            texts: List of texts to analyze

        Returns:
            List of detection results
        """
        print(f"\nProcessing {len(texts)} texts...")
        results = []

        for i, text in enumerate(texts, 1):
            print(f"\n[{i}/{len(texts)}] ", end="")
            result = self.detect_language_hybrid(text, verbose=False)
            results.append(result)

            # Print summary
            basic = result['statistical_detection']
            if 'error' not in basic:
                lang = basic['primary_language']
                print(f"✓ {lang['name']} ({lang['confidence']})")
            else:
                print(f"✗ Detection failed")

        return results

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get statistics about detected languages.

        Returns:
            Statistics dictionary
        """
        if not self.detection_history:
            return {"message": "No detections performed yet"}

        lang_counts = {}
        total_confidence = 0
        successful_detections = 0

        for detection in self.detection_history:
            basic = detection.get('statistical_detection', {})
            if 'error' not in basic:
                lang = basic['primary_language']['name']
                lang_counts[lang] = lang_counts.get(lang, 0) + 1
                total_confidence += basic['primary_language']['confidence']
                successful_detections += 1

        return {
            "total_detections": len(self.detection_history),
            "successful_detections": successful_detections,
            "languages_detected": len(lang_counts),
            "language_breakdown": lang_counts,
            "average_confidence": round(total_confidence / successful_detections, 3) if successful_detections > 0 else 0
        }

    def translate_suggestion(self, text: str, target_language: str = "English") -> str:
        """
        Suggest translation for detected text.

        Args:
            text: Text to potentially translate
            target_language: Target language for translation

        Returns:
            Translation suggestion
        """
        # First detect the language
        detection = self.detect_language_basic(text)
        source_lang = detection['primary_language']['name']

        if source_lang.lower() == target_language.lower():
            return f"Text is already in {target_language}"

        prompt = f"""Translate the following text from {source_lang} to {target_language}:

"{text}"

Provide only the translation, nothing else."""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a professional translator."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            return response.choices[0].message.content

        except Exception as e:
            return f"Translation error: {e}"

print("✓ MultilingualAgent class defined")

✓ MultilingualAgent class defined


## 3. Initialize Agent

Create an instance of the MultilingualAgent.


In [85]:
# Initialize the agent
agent = MultilingualAgent(model="gpt-4o-mini")


 Multilingual Agent initialized
   Model: gpt-4o-mini
   Languages supported: 55+


## 4. Demo: Individual Language Detection

Test the agent with various languages one by one.


In [86]:
print("="*70)
print("TEST 1: INDIVIDUAL LANGUAGE DETECTION")
print("="*70)

# Test English
print("\n[Test 1] Expected: English")
result = agent.detect_language_hybrid("Hello, how are you today?")


TEST 1: INDIVIDUAL LANGUAGE DETECTION

[Test 1] Expected: English

LANGUAGE DETECTION - HYBRID MODE

Text: "Hello, how are you today?"
Length: 25 characters

Method 1: Statistical Detection (langdetect)
   Result: Somali (0.571 confidence)

Method 2: AI-Powered Detection (OpenAI)

Detection issues encountered



In [87]:
# Test French
print("\n[Test 2] Expected: French")
result = agent.detect_language_hybrid("Bonjour, comment allez-vous aujourd'hui?")



[Test 2] Expected: French

LANGUAGE DETECTION - HYBRID MODE

Text: "Bonjour, comment allez-vous aujourd'hui?"
Length: 40 characters

Method 1: Statistical Detection (langdetect)
   Result: French (1.0 confidence)

Method 2: AI-Powered Detection (OpenAI)

Detection issues encountered



In [88]:
# Test Spanish
print("\n[Test 3] Expected: Spanish")
result = agent.detect_language_hybrid("Hola, ¿cómo estás hoy?")



[Test 3] Expected: Spanish

LANGUAGE DETECTION - HYBRID MODE

Text: "Hola, ¿cómo estás hoy?"
Length: 22 characters

Method 1: Statistical Detection (langdetect)
   Result: Spanish (1.0 confidence)

Method 2: AI-Powered Detection (OpenAI)

Detection issues encountered



In [None]:
# Test Japanese
print("\n[Test 4] Expected: Japanese")
result = agent.detect_language_hybrid("こんにちは、今日はお元気ですか？")



[Test 4] Expected: Japanese

LANGUAGE DETECTION - HYBRID MODE

Text: "こんにちは、今日はお元気ですか？"
Length: 16 characters

Method 1: Statistical Detection (langdetect)
   Result: Japanese (1.0 confidence)

Method 2: AI-Powered Detection (OpenAI)


In [None]:
# Test German
print("\n[Test 5] Expected: German")
result = agent.detect_language_hybrid("Guten Tag, wie geht es Ihnen heute?")


## 5. Demo: Batch Processing

Process multiple texts efficiently.


In [None]:
print("="*70)
print("TEST 2: BATCH PROCESSING")
print("="*70)

batch_texts = [
    "Привет, как дела сегодня?",  # Russian
    "你好，你今天好吗？",  # Chinese
    "مرحبا، كيف حالك اليوم؟",  # Arabic
    "Ciao, come stai oggi?",  # Italian
    "Olá, como você está hoje?"  # Portuguese
]

batch_results = agent.detect_batch(batch_texts)


## 6. Demo: Translation

Detect language and translate to English.


In [None]:
print("="*70)
print("TEST 3: TRANSLATION")
print("="*70)

spanish_text = "Hola, necesito ayuda con mi computadora"
print(f"\nOriginal (Spanish): {spanish_text}")

translation = agent.translate_suggestion(spanish_text, "English")
print(f"Translation (English): {translation}")


In [None]:
# Try another language
french_text = "Je voudrais réserver une table pour deux personnes"
print(f"\nOriginal (French): {french_text}")

translation = agent.translate_suggestion(french_text, "English")
print(f"Translation (English): {translation}")


## 7. Statistics

View detection history and language distribution.


In [None]:
print("="*70)
print("TEST 4: STATISTICS")
print("="*70)

stats = agent.get_statistics()

print("\n Detection Statistics:")
print(f"   Total detections: {stats['total_detections']}")
print(f"   Successful: {stats['successful_detections']}")
print(f"   Unique languages: {stats['languages_detected']}")
print(f"   Average confidence: {stats['average_confidence']}")

print("\n   Language breakdown:")
for lang, count in sorted(stats['language_breakdown'].items(), key=lambda x: x[1], reverse=True):
    print(f"      • {lang}: {count}")

print("\n" + "="*70)
print("DEMONSTRATION COMPLETE")
print("="*70)


## 8. Try Your Own Text

Test with your own text samples below.


In [None]:
# Test with your own text
your_text = "yêu hòa bình và niềm vui"

# Uncomment to test:
result = agent.detect_language_hybrid(your_text)

print(result)


## 9. More Examples

Additional language samples to test.


In [None]:
# Test more languages
more_languages = {
    "Hindi": "नमस्ते, आप कैसे हैं?",
    "Korean": "안녕하세요, 어떻게 지내세요?",
    "Turkish": "Merhaba, nasılsın?",
    "Thai": "สวัสดี คุณเป็นอย่างไรบ้าง?",
    "Vietnamese": "Xin chào, bạn khỏe không?",
    "Dutch": "Hallo, hoe gaat het met je?",
    "Swedish": "Hej, hur mår du?",
    "Polish": "Cześć, jak się masz?"
}

print("\nTesting Additional Languages:\n")
for expected_lang, text in more_languages.items():
    print(f"Expected: {expected_lang}")
    print(f"Text: {text}")
    result = agent.detect_language_basic(text)
    detected = result['primary_language']
    print(f"✓ Detected: {detected['name']} (confidence: {detected['confidence']})")
    print("-" * 70)


## 10. View Detection History

Examine all detections performed in this session.


In [None]:
# View detection history
print(f"Detection History ({len(agent.detection_history)} total)")
print("="*70)

for i, detection in enumerate(agent.detection_history[-5:], 1):  # Show last 5
    print(f"\n[{i}] Text: {detection['text'][:60]}...")
    stat = detection['statistical_detection']
    if 'error' not in stat:
        lang = stat['primary_language']
        print(f"    Language: {lang['name']} ({lang['confidence']} confidence)")
        print(f"    Verdict: {detection['verdict']}")
