In [16]:
import os

import easyocr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Lookup table: ISO 639-1 two-letter code -> English language name.
# Entries are grouped by the first letter of the code; add more as needed.
LANGUAGE_CODE_MAP = {
    # a
    "af": "Afrikaans", "ar": "Arabic", "as": "Assamese", "az": "Azerbaijani",
    # b
    "be": "Belarusian", "bg": "Bulgarian", "bn": "Bengali",
    # c
    "ca": "Catalan", "ch": "Chamorro", "cs": "Czech", "cy": "Welsh",
    # d
    "da": "Danish", "de": "German",
    # e
    "el": "Greek", "en": "English", "es": "Spanish", "et": "Estonian",
    # f
    "fa": "Persian", "fi": "Finnish", "fr": "French",
    # g
    "ga": "Irish", "gu": "Gujarati",
    # h
    "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "ht": "Haitian Creole",
    "hu": "Hungarian",
    # i
    "id": "Indonesian", "is": "Icelandic", "it": "Italian",
    # j
    "ja": "Japanese", "jv": "Javanese",
    # k
    "ka": "Georgian", "kk": "Kazakh", "ko": "Korean",
    # l
    "lt": "Lithuanian", "lv": "Latvian",
    # m
    "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi",
    "ms": "Malay", "my": "Burmese",
    # n
    "ne": "Nepali", "nl": "Dutch", "no": "Norwegian",
    # p
    "pa": "Punjabi", "pl": "Polish", "ps": "Pashto", "pt": "Portuguese",
    # r
    "ro": "Romanian", "ru": "Russian", "rw": "Kinyarwanda",
    # s
    "sd": "Sindhi", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian",
    "so": "Somali", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish",
    "sw": "Swahili",
    # t
    "ta": "Tamil", "te": "Telugu", "th": "Thai", "tr": "Turkish",
    # u
    "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek",
    # v
    "vi": "Vietnamese",
    # z
    "zh": "Chinese",
}

# Step 1: Compatible language groups for OCR
# Every group pairs English with a set of additional languages; each resulting
# list is handed to easyocr.Reader as-is, and the groups are tried in the
# order listed here.
_extra_langs_per_group = (
    ("fr", "de", "es", "pt", "it"),
    ("hi", "mr"),
    ("bn", "as"),
    ("th",),
    ("ja",),
    ("ch_sim",),
    ("ch_tra",),
    ("ar", "fa", "ur", "ug"),
    ("ru", "uk"),
)
language_groups = [["en", *extras] for extras in _extra_langs_per_group]

# Step 2: Read image
# Try each OCR language group in order and keep the text produced by the first
# group that recognises anything. `reader`, `results` and `extracted_text`
# stay in the notebook namespace for later cells.
image_path = '/kaggle/input/multilingual-meme-datasets/datasets/datasets/eng594.png'
extracted_text = ""

if not os.path.isfile(image_path):
    # A missing image would fail for every language group, so report it once
    # instead of constructing a Reader per group only to hit the same error
    # each time.
    print(f"❌ Image file not found: {image_path}")
else:
    for langs in language_groups:
        try:
            # A separate Reader is built for each language list.
            reader = easyocr.Reader(langs, gpu=False)
            results = reader.readtext(image_path)
            # Index 1 of every result entry holds the recognised text.
            extracted_text = " ".join(res[1] for res in results)
            if extracted_text.strip():
                print(f"✅ OCR succeeded with languages: {langs}")
                print("Extracted Text:", extracted_text)
                break
        except Exception as e:
            # Best effort: a group that cannot be loaded or run is skipped so
            # the remaining groups still get a chance.
            print(f"⚠️ Skipping language group {langs} due to error: {e}")

# Step 3: Detect Language (Hugging Face)
# Classify the OCR text with a language-identification model, then map the
# predicted label to a readable name through LANGUAGE_CODE_MAP.
if extracted_text.strip():
    model_name = "papluca/xlm-roberta-base-language-detection"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    lang_detect = pipeline("text-classification", model=model, tokenizer=tokenizer)

    # truncation=True is forwarded to the tokenizer so that OCR text longer
    # than the model's maximum sequence length is cut instead of raising.
    lang_result = lang_detect(extracted_text, truncation=True)[0]
    lang_code = lang_result['label']
    # Labels missing from the map fall back to a placeholder name.
    lang_name = LANGUAGE_CODE_MAP.get(lang_code, "Unknown Language")

    print(f"🌐 Detected Language Code: {lang_code}")
    print(f"🗣️ Full Language Name: {lang_name}")
else:
    print("❌ No text could be extracted from the image.")

✅ OCR succeeded with languages: ['en', 'fr', 'de', 'es', 'pt', 'it']
Extracted Text: DuniyaRe MeraRaaj Ye TohapnaRàju Hai ReBabao Bola tha na babu Bhaiya; ye Raju paiso ke liye kuch bhi kar sakta hai . Hoga


Device set to use cuda:0


🌐 Detected Language Code: hi
🗣️ Full Language Name: Hindi


In [None]:
# Read image
# NOTE: this cell uses the `reader` object left in the kernel by the OCR loop
# in an earlier cell; it does not create one itself.
results = reader.readtext('image.jpg')
# Stitch the text field (index 1) of every OCR hit into one space-separated string.
extracted_text = " ".join(
    hit[1]
    for hit in results
)
print("Extracted Text:", extracted_text)

In [None]:
# Step 2: Language Detection
# Load the checkpoint's classification model and tokenizer, then wrap both in
# a text-classification pipeline exposed as `lang_detect`.
model_name = "papluca/xlm-roberta-base-language-detection"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
lang_detect = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Detect language
# Only classify when OCR actually produced text (same guard as the combined
# cell above); `lang` is always defined and stays an empty list otherwise.
lang = []
if extracted_text.strip():
    # truncation=True is forwarded to the tokenizer so overly long OCR text is
    # cut to the model's maximum sequence length instead of raising.
    lang = lang_detect(extracted_text, truncation=True)
    print("Detected Language:", lang)
else:
    print("❌ No text could be extracted from the image.")