In [2]:
# In a notebook cell prefix with ! to run shell commands
!pip install --upgrade pip
!pip install transformers sentencepiece torch langdetect ipywidgets sacrebleu
# Optional if you want OpenAI-based responses:
!pip install openai
# Optional for nicer text display
!pip install rich




In [3]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from langdetect import detect, DetectorFactory
import torch
from IPython.display import display, Markdown
import ipywidgets as widgets
import sacrebleu

# Pin langdetect's seed so repeated runs report the same language for the same text
DetectorFactory.seed = 0  # make langdetect deterministic


In [4]:
# Checkpoint to load: a many-to-many multilingual translation model
model_name = "facebook/m2m100_418M"
# Prefer the GPU when torch can see one, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
print(f"Model loaded on {device}")


Model loaded on cpu


In [5]:
def detect_language(text):
    """Guess the language of ``text`` with langdetect.

    Returns the code reported by ``detect`` (ISO 639-1 style, e.g. 'fr', 'hi', 'es'),
    or the string "unknown" if detection raises for any reason.
    """
    try:
        return detect(text)
    except Exception:
        return "unknown"

def translate_to_english(text, src_lang=None, max_length=256):
    """
    Translate ``text`` to English with the M2M100 model loaded in this notebook.

    Args:
        text: Source text to translate.
        src_lang: Language code of ``text``. When omitted, it is detected
            with ``detect_language``.
        max_length: Passed to ``model.generate`` as the cap on generated length.

    Returns:
        The decoded English translation (first item of the generated batch).

    langdetect can report region-tagged codes such as 'zh-cn', and
    ``detect_language`` may return a failure sentinel; neither is a key of
    ``tokenizer.lang_code_to_token``. Such codes are reduced to their base
    language and, if still unsupported, replaced by 'en' so the call does not
    fail on an unknown language code.
    """
    if not src_lang:
        src_lang = detect_language(text)

    supported = tokenizer.lang_code_to_token
    if src_lang not in supported:
        # 'zh-cn' / 'pt_BR' -> 'zh' / 'pt'
        base = str(src_lang).replace("_", "-").split("-")[0].lower()
        # Last resort: treat the input as English (the output may be an identity translation)
        src_lang = base if base in supported else "en"

    # tokenizer needs language codes as per model (e.g., 'fr', 'hi', 'es')
    tokenizer.src_lang = src_lang
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    # force target language to English
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
        max_length=max_length,
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]


In [6]:
# Improved detect + translate for M2M100 with robust code-mapping

# One-off inspection: print the first 60 language codes the tokenizer knows
print(
    "Supported tokenizer language codes (sample):",
    list(tokenizer.lang_code_to_token)[:60],
    sep="\n",
)

# Same import and seed as the notebook's import cell, repeated so this cell can run on its own
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # keep langdetect deterministic

def normalize_lang_for_tokenizer(raw_lang, tokenizer):
    """
    Map a raw language tag (e.g. 'zh-cn', 'pt_BR') onto a code the tokenizer knows.

    Args:
        raw_lang: Language tag as produced by langdetect or supplied by a caller.
            ``None``, blank strings and non-string values yield ``None``.
        tokenizer: Any object exposing a ``lang_code_to_token`` mapping whose
            keys are the supported language codes.

    Returns:
        The first candidate that is a key of ``tokenizer.lang_code_to_token``,
        or ``None`` when nothing matches so the caller can choose a default.

    Candidates are tried from most to least specific:
      1. the full tag in lower/upper case with '-' or '_' separators,
      2. the full tag with only the region upper-cased ('zh-CN', 'zh_CN'),
      3. an explicit alias from ``special_map``,
      4. the bare language code ('zh' from 'zh-cn') in lower/upper case.
    """
    if not isinstance(raw_lang, str):
        return None

    # normalize surrounding whitespace, separators and case
    tag = raw_lang.strip().replace('_', '-').lower()
    if not tag:
        return None

    supported = tokenizer.lang_code_to_token
    lang_only, _, region = tag.partition('-')
    tag_unders = tag.replace('-', '_')

    # specific convenient mappings (add as needed); checked before the bare
    # language fallback so an override placed here actually takes effect
    special_map = {
        "zh-cn": "zh",    # map Chinese (simplified) to 'zh'
        "zh-tw": "zh",    # traditional -> also 'zh' (or change to 'zh_TW' if tokenizer expects)
        "zh-hans": "zh",
        "zh-hant": "zh",
        "pt-br": "pt",    # portuguese brazil -> 'pt'
        "en-gb": "en",    # region variants
        "en-us": "en",
        "es-es": "es",
        "es-mx": "es"
    }

    candidates = [tag, tag.upper(), tag_unders, tag_unders.upper()]
    if region:
        # some tokenizers keep the language lower-case and the region upper-case
        region_upper = region.upper()
        candidates.append(f"{lang_only}-{region_upper}")
        candidates.append(f"{lang_only}_{region_upper.replace('-', '_')}")
    if tag in special_map:
        candidates.append(special_map[tag])
    candidates.extend([lang_only, lang_only.upper()])

    for candidate in candidates:
        if candidate in supported:
            return candidate

    # last resort: return None so caller can choose a default
    return None

def detect_language(text):
    """Return langdetect's language code for ``text``, or ``None`` if detection raises."""
    try:
        code = detect(text)
    except Exception:
        code = None
    return code

def translate_to_english(text, src_lang=None, max_length=256, debug=False):
    """
    Translate ``text`` to English with M2M100, tolerating imperfect language codes.

    Args:
        text: Source text. Blank or non-string input returns "" without
            touching the tokenizer or the model.
        src_lang: Optional language tag; detected with ``detect_language`` when omitted.
        max_length: Passed to ``model.generate`` as the cap on generated length.
        debug: When true, print the raw and the mapped language codes.

    Returns:
        The decoded English translation, or "" for blank input.

    The tag is mapped to a tokenizer code by ``normalize_lang_for_tokenizer``,
    which already tries the bare language code (e.g. 'zh' for 'zh-cn'). If
    nothing matches, 'en' is used as the source language so the call still
    succeeds, although the output may then be an identity translation.
    """
    # Nothing to translate: mirror the blank-input handling used by the labelled wrapper
    if not isinstance(text, str) or not text.strip():
        return ""

    if not src_lang:
        src_lang = detect_language(text)
    if debug:
        print("raw detected language:", src_lang)

    mapped = normalize_lang_for_tokenizer(src_lang, tokenizer)
    if debug:
        print("mapped tokenizer code:", mapped)

    # ultimate fallback: choose 'en' as src so model will treat input as English (not ideal)
    if mapped is None:
        mapped = "en"
        if debug:
            print("ultimate fallback to 'en' as src_lang (translation may be identity)")

    # set tokenizer source language (this triggers M2M100 special token config)
    tokenizer.src_lang = mapped

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
        max_length=max_length,
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# Smoke-test sentences in several languages and scripts
examples = [
    "Hola, ¿cómo puedo cambiar mi contraseña?",
    "मेरे ऑर्डर में देरी हो रही है। कृपया मदद करें।",
    "Bonjour, je n'arrive pas à me connecter.",
    "我无法登录我的账户。",  # Chinese
    "這是繁體中文的測試。",  # Traditional Chinese
]

# For each sentence show the input, the raw detection and the translation
# (debug=True also prints the raw and mapped language codes).
for txt in examples:
    print(f"SOURCE: {txt}")
    print(f"DETECTED: {detect_language(txt)}")
    print(f"ENGLISH: {translate_to_english(txt, debug=True)}")
    print("---")


Supported tokenizer language codes (sample):
['af', 'am', 'ar', 'ast', 'az', 'ba', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'ceb', 'cs', 'cy', 'da', 'de', 'el', 'en', 'es', 'et', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'ilo', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'lb', 'lg', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr']
SOURCE: Hola, ¿cómo puedo cambiar mi contraseña?
DETECTED: es
raw detected language: es
mapped tokenizer code: es
ENGLISH: How can I change my password?
---
SOURCE: मेरे ऑर्डर में देरी हो रही है। कृपया मदद करें।
DETECTED: hi
raw detected language: hi
mapped tokenizer code: hi
ENGLISH: My order is delayed. please help.
---
SOURCE: Bonjour, je n'arrive pas à me connecter.
DETECTED: fr
raw detected language: fr
mapped tokenizer code: fr
ENGLISH: Hi, I can’t get connected.
---
SOURCE: 我无法登录我的账户。
DETECTED: zh-cn
raw detected language: zh-cn
mapped tokenizer code: zh
ENGLISH: I cannot log i

In [7]:
# --- Add language-name + translation wrapper ---
# Requires: langdetect (already used), optionally langcodes or pycountry for nice names.
# If neither langcodes nor pycountry is installed, the function will return the language code as the name.

# Choose the best available source of human-readable language names:
# langcodes first, then pycountry, then the raw code itself.
try:
    import langcodes  # pip install langcodes

    def get_language_name(code):
        """Return a display name for ``code`` via langcodes.

        Returns "Unknown" for empty/None input and ``code`` itself when
        langcodes cannot parse or name the tag.
        """
        if not code:
            return "Unknown"
        try:
            # standardize_tag is langcodes' documented tag normaliser
            # (e.g. 'eng_US' -> 'en-US').
            tag = langcodes.standardize_tag(code)
            name = str(langcodes.Language.get(tag).display_name())
        except Exception:
            return code
        # Upper-case only the first character: str.capitalize() would also
        # lower-case the rest and mangle multi-word names.
        return name[:1].upper() + name[1:]
except Exception:
    try:
        import pycountry  # pip install pycountry

        def get_language_name(code):
            """Return the pycountry name for ``code``.

            The region suffix is dropped ('zh-cn' / 'zh_TW' -> 'zh'), then the
            base code is looked up as alpha-2 and as alpha-3. Returns "Unknown"
            for empty/None input and ``code`` itself when nothing is found.
            """
            if not code:
                return "Unknown"
            base = code.replace('_', '-').split('-')[0].lower()
            for field in ("alpha_2", "alpha_3"):
                try:
                    lang = pycountry.languages.get(**{field: base})
                except Exception:
                    # best effort: a failed lookup just moves on to the next field
                    continue
                if lang and getattr(lang, "name", None):
                    return lang.name
            return code
    except Exception:
        # no helper libraries available — return the code as name
        def get_language_name(code):
            """Return ``code`` unchanged, or "Unknown" for empty/None input."""
            return code or "Unknown"

# Wrapper function that returns a structured result
def translate_with_language_label(text, detect_fn=detect_language,
                                  normalize_fn=normalize_lang_for_tokenizer,
                                  translator_fn=translate_to_english,
                                  debug=False):
    """
    Detect the language of ``text``, translate it to English and describe both.

    Returns a dict with the keys:
      "original"          - the input text
      "raw_detected_code" - code returned by ``detect_fn`` (or None)
      "tokenizer_code"    - code handed to the translator as ``src_lang``
      "language_name"     - result of ``get_language_name`` for that code
      "translation"       - English text returned by ``translator_fn``

    Blank or non-string input short-circuits to a result with
    language "Unknown" and an empty translation.
    """
    result = {
        "original": text,
        "raw_detected_code": None,
        "tokenizer_code": None,
        "language_name": "Unknown",
        "translation": ""
    }
    if not (isinstance(text, str) and text.strip()):
        return result

    raw_code = detect_fn(text)
    mapped_code = normalize_fn(raw_code, tokenizer)

    # Normaliser found nothing: retry with the part before the first '-'
    if mapped_code is None and raw_code:
        base = raw_code.split('-')[0].lower() if isinstance(raw_code, str) else None
        mapped_code = base if base in tokenizer.lang_code_to_token else None

    # Still nothing usable: treat the input as English (not ideal)
    if mapped_code is None:
        mapped_code = "en"

    lang_name = get_language_name(mapped_code)
    # Pass the mapped code along so the translator configures the tokenizer with it
    translation = translator_fn(text, src_lang=mapped_code)

    if debug:
        for label, value in (
            ("Raw detected", raw_code),
            ("Mapped tokenizer code", mapped_code),
            ("Language name", lang_name),
            ("Translation", translation),
        ):
            print(f"{label}: {value}")

    result.update(
        raw_detected_code=raw_code,
        tokenizer_code=mapped_code,
        language_name=lang_name,
        translation=translation,
    )
    return result

# --- Demo on your earlier examples ---
# Same sample sentences as the earlier translation demo
examples = [
    "Hola, ¿cómo puedo cambiar mi contraseña?",
    "मेरे ऑर्डर में देरी हो रही है। कृपया मदद करें।",
    "Bonjour, je n'arrive pas à me connecter.",
    "我无法登录我的账户。",
    "這是繁體中文的測試。",
]

# Print a small labelled report per sentence
for txt in examples:
    res = translate_with_language_label(txt)
    print(
        f"Original : {res['original']}",
        f"Language : {res['language_name']} ({res['tokenizer_code']})",
        f"English  : {res['translation']}",
        "---",
        sep="\n",
    )


Original : Hola, ¿cómo puedo cambiar mi contraseña?
Language : Spanish (es)
English  : How can I change my password?
---
Original : मेरे ऑर्डर में देरी हो रही है। कृपया मदद करें।
Language : Hindi (hi)
English  : My order is delayed. please help.
---
Original : Bonjour, je n'arrive pas à me connecter.
Language : French (fr)
English  : Hi, I can’t get connected.
---
Original : 我无法登录我的账户。
Language : Chinese (zh)
English  : I cannot log in to my account.
---
Original : 這是繁體中文的測試。
Language : Chinese (zh)
English  : This is a Chinese test.
---


In [8]:
!pip install pycountry




In [9]:
import pycountry

def get_language_name(code):
    """
    Convert a language code such as 'zh', 'fr' or 'hi' into a full language name.

    Region suffixes are ignored, whichever separator is used
    ('zh-cn' and 'zh_TW' are both looked up as 'zh').

    Lookup order: pycountry alpha-2, pycountry alpha-3, then a small manual table.

    Returns:
        "Unknown" for empty/None input, the language name when one is found,
        otherwise the original ``code``.
    """
    if not code:
        return "Unknown"

    # If code has a region (zh-cn, zh_TW), keep only 'zh'
    clean_code = str(code).strip().replace("_", "-").split("-")[0].lower()

    # Try alpha-2 (most common), then alpha-3 (e.g., 'zho' for Chinese)
    for field in ("alpha_2", "alpha_3"):
        try:
            lang = pycountry.languages.get(**{field: clean_code})
        except LookupError:
            # NOTE(review): some pycountry releases are reported to raise KeyError
            # for unknown codes instead of returning None — confirm for the pinned version
            lang = None
        if lang and hasattr(lang, 'name'):
            return lang.name

    # Manual overrides for codes the pycountry lookups above did not resolve
    manual_map = {
        "zh": "Chinese",
        "jw": "Javanese",
        "ceb": "Cebuano",
        "ilo": "Ilocano",
    }
    if clean_code in manual_map:
        return manual_map[clean_code]

    # Last fallback
    return code


def translate_with_language_label(text, detect_fn=detect_language,
                                  normalize_fn=normalize_lang_for_tokenizer,
                                  translator_fn=translate_to_english,
                                  debug=False):
    """
    Detect the language of ``text``, translate it to English and label the result.

    Args:
        text: Message to process. Blank or non-string input returns a result
            with language "Unknown" and an empty translation.
        detect_fn: Callable returning a raw language code for a text (or None).
        normalize_fn: Callable ``(raw_code, tokenizer)`` returning a tokenizer
            code or None.
        translator_fn: Callable ``(text, src_lang=...)`` returning the English text.
        debug: When true, print the detected code, the mapped code, the
            language name and the translation.

    Returns:
        dict with the keys "original", "raw_detected_code", "tokenizer_code",
        "language_name" and "translation".
    """
    if not isinstance(text, str) or not text.strip():
        return {
            "original": text,
            "raw_detected_code": None,
            "tokenizer_code": None,
            "language_name": "Unknown",
            "translation": ""
        }

    raw_code = detect_fn(text)
    mapped_code = normalize_fn(raw_code, tokenizer)

    # Normaliser found nothing: retry with the part before the first '-'.
    # The isinstance check keeps a non-string detector result from crashing here.
    if mapped_code is None and raw_code and isinstance(raw_code, str):
        candidate = raw_code.split("-")[0].lower()
        if candidate in tokenizer.lang_code_to_token:
            mapped_code = candidate

    # Final fallback: treat the input as English (not ideal)
    if mapped_code is None:
        mapped_code = "en"

    # Full language name for the code that is actually used
    lang_name = get_language_name(mapped_code)

    translation = translator_fn(text, src_lang=mapped_code)

    # Honour the debug flag (callers pass debug=True to trace the pipeline)
    if debug:
        print(f"Raw detected: {raw_code}")
        print(f"Mapped tokenizer code: {mapped_code}")
        print(f"Language name: {lang_name}")
        print(f"Translation: {translation}")

    return {
        "original": text,
        "raw_detected_code": raw_code,
        "tokenizer_code": mapped_code,
        "language_name": lang_name,        # Full language name added
        "translation": translation
    }


# --- Demo ---
# Sample sentences for the labelled-translation demo
examples = [
    "Hola, ¿cómo puedo cambiar mi contraseña?",
    "मेरे ऑर्डर में देरी हो रही है। कृपया मदद करें।",
    "Bonjour, je n'arrive pas à me connecter.",
    "我无法登录我的账户。",
    "這是繁體中文的測試。",
]

# One labelled report per sentence
for txt in examples:
    res = translate_with_language_label(txt)
    print(
        f"Original : {res['original']}",
        f"Language : {res['language_name']} ({res['tokenizer_code']})",
        f"English  : {res['translation']}",
        "---",
        sep="\n",
    )


Original : Hola, ¿cómo puedo cambiar mi contraseña?
Language : Spanish (es)
English  : How can I change my password?
---
Original : मेरे ऑर्डर में देरी हो रही है। कृपया मदद करें।
Language : Hindi (hi)
English  : My order is delayed. please help.
---
Original : Bonjour, je n'arrive pas à me connecter.
Language : French (fr)
English  : Hi, I can’t get connected.
---
Original : 我无法登录我的账户。
Language : Chinese (zh)
English  : I cannot log in to my account.
---
Original : 這是繁體中文的測試。
Language : Chinese (zh)
English  : This is a Chinese test.
---


In [10]:
# Ask for a message on stdin, then show its detected language and translation
text = input("Enter a customer message in any language: ")
res = translate_with_language_label(text)

print(f"\nDetected Language: {res['language_name']} ({res['tokenizer_code']})")
print(f"English Translation: {res['translation']}")


Enter a customer message in any language:  這是繁體中文的測試。



Detected Language: Chinese (zh)
English Translation: This is a Chinese test.


In [11]:
def simple_reply(english_text):
    """
    Pick a canned support reply from keywords found in an English message.

    Args:
        english_text: Message in English (e.g. a translation). Matching is
            case-insensitive. ``None`` or non-string input gets the generic reply.

    Returns:
        The password/login reply if an account keyword is present, otherwise
        the order reply if an order keyword is present, otherwise the generic reply.
    """
    text = english_text.lower() if isinstance(english_text, str) else ""

    # "log in" / "sign in" are included because translations often use the
    # two-word form (e.g. "I cannot log in to my account."), which the single
    # keyword "login" does not match.
    account_keywords = ("password", "login", "log in", "logging in", "sign in")
    order_keywords = ("order", "delivery")

    if any(keyword in text for keyword in account_keywords):
        return "Please reset your password using the link or contact support."
    if any(keyword in text for keyword in order_keywords):
        return "Your order seems delayed. Please provide your order ID."
    return "Thank you for your message. How can I assist you further?"


In [12]:
# Canned reply for the most recent translation held in `res`
reply = simple_reply(res['translation'])
print(f"\nAuto Reply: {reply}")



Auto Reply: Thank you for your message. How can I assist you further?


In [13]:
import ipywidgets as widgets
from IPython.display import display

# Widgets: an output area, a trigger button and a free-text input box
out = widgets.Output()
btn = widgets.Button(description="Translate")
inp = widgets.Textarea(
    value='',
    placeholder='Type or paste message here...',
    description='Message:',
    layout=widgets.Layout(width='100%', height='120px'),
)


def on_click(b):
    """Button handler: translate the textarea content and print the result into ``out``."""
    with out:
        # Replace whatever the previous click printed
        out.clear_output()
        res = translate_with_language_label(inp.value)
        language, code, english = res['language_name'], res['tokenizer_code'], res['translation']
        print(f"Detected Language: {language} ({code})")
        print(f"English Translation:\n{english}\n")
        print(f"Auto Reply: {simple_reply(english)}")


btn.on_click(on_click)

display(inp, btn, out)


Textarea(value='', description='Message:', layout=Layout(height='120px', width='100%'), placeholder='Type or p…

Button(description='Translate', style=ButtonStyle())

Output()

In [14]:
# Sanity check: report which of the notebook's key names exist in the kernel namespace
print(
    "Kernel OK. Model in globals?", "model" in globals(),
    *(
        item
        for name in (
            "tokenizer",
            "translate_with_language_label",
            "translate_to_english",
            "simple_reply",
        )
        for item in (f"{name}?", name in globals())
    ),
)


Kernel OK. Model in globals? True tokenizer? True translate_with_language_label? True translate_to_english? True simple_reply? True


In [None]:
# Trace the full pipeline (debug output enabled) for two sample messages
for s in ("Hola, ¿cómo puedo cambiar mi contraseña?", "我无法登录我的账户。"):
    print(f"INPUT: {s}")
    res = translate_with_language_label(s, debug=True)
    print(f"-> Detected: {res['language_name']} ({res['tokenizer_code']})")
    print(f"-> English: {res['translation']}")
    print(40 * "-")


INPUT: Hola, ¿cómo puedo cambiar mi contraseña?
