<a href="https://colab.research.google.com/github/aelkhodary/AI_ML_Coders/blob/main/Translate_Ar_To_En.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install ENV

In [None]:
# Install the runtime dependencies with %pip so they land in the environment of
# the running kernel (a bare `!pip` can target a different Python).
# sentencepiece is required for tokenization; openpyxl is the engine pandas uses
# to read and write .xlsx files.
%pip install -q transformers torch pandas openpyxl sentencepiece

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Import necessary libraries:

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer  # Note the corrected class name
import torch
import pandas as pd
from typing import List, Union

# Load the model and tokenizer:

In [None]:
def initialize_model():
    """Load the Helsinki-NLP/opus-mt-ar-en translation model and its tokenizer.

    The target device is chosen first (CUDA when available, otherwise CPU) and
    the weight precision is matched to it: float16 on GPU to save memory,
    float32 on CPU, where half precision is slow and not supported by every op.

    Returns:
        tuple: ``(model, tokenizer)`` with the model already moved to the
        selected device.
    """
    model_name = "Helsinki-NLP/opus-mt-ar-en"

    # Decide where the model will run before loading so the dtype can follow.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    print("Loading model...")
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=dtype  # Half precision only where it actually helps (GPU)
    )

    # Move model to the selected device (Colab provides a GPU if enabled)
    model = model.to(device)
    print(f"Model loaded on: {device}")

    return model, tokenizer

# Create translation function:

In [None]:
def translate_text(text: str, model, tokenizer, max_length: int = 128) -> str:
    """Translate a single Arabic value to English.

    The raw text is passed to the model unchanged. The model loaded by this
    notebook is a plain translation model, not an instruction-following one, so
    wrapping the input in an English instruction corrupts the source sentence
    instead of guiding the translation.

    Args:
        text: The value to translate. Missing values (``None``/``NaN``) and
            blank strings yield an empty string without calling the model.
        model: A seq2seq model exposing ``generate`` and ``device``.
        tokenizer: The tokenizer paired with ``model``.
        max_length: Token limit applied both to the truncated input and to the
            generated output.

    Returns:
        The decoded translation with surrounding whitespace removed, or ``""``
        when there is nothing to translate.
    """
    if pd.isna(text):
        return ""

    source_text = str(text).strip()
    if not source_text:
        # Nothing to translate; avoid asking the model to generate from an empty input.
        return ""

    # Tokenize the raw source text (no prompt prefix)
    inputs = tokenizer(source_text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = inputs.to(model.device)

    # Deterministic decoding. `temperature` is intentionally not passed: it only
    # applies when sampling is enabled, which it is not here.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=False
        )

    # Decode and return translation
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation.strip()

# Main processing function:

In [None]:
def process_excel_file(input_file: str, output_file: str):
    """Translate the name columns of an Excel file and save an augmented copy.

    For every column listed in ``name_columns`` that exists in the workbook, a
    new ``<column>_ENGLISH`` column is added holding the translation of each
    cell. Columns that are absent only produce a warning.

    Each distinct value is sent to the model once: results are memoized and
    reused for repeated values, within a column and across columns. Decoding is
    deterministic, so the output is the same as translating every cell.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path the translated workbook is written to (no index column).

    Raises:
        Exception: Any error raised while loading, translating or saving is
            printed and then re-raised unchanged.
    """
    try:
        # Initialize model and tokenizer
        model, tokenizer = initialize_model()

        # Read Excel file
        print("Reading Excel file...")
        df = pd.read_excel(input_file)

        # Columns to translate
        name_columns = ['Q2_MEM_NAME_FIRST', 'Q3_MEM_NAME_FATHER', 'Q4_MEM_NAME_GRAND']

        # Memo of already translated values, keyed by their text form.
        translation_cache = {}

        def translate_cached(value):
            # Missing cells translate to "" without touching the model.
            if pd.isna(value):
                return ""
            key = str(value)
            if key not in translation_cache:
                translation_cache[key] = translate_text(value, model, tokenizer)
            return translation_cache[key]

        # Process each column
        for col in name_columns:
            if col in df.columns:
                print(f"\nTranslating {col}...")
                df[f'{col}_ENGLISH'] = df[col].apply(translate_cached)
            else:
                print(f"Warning: Column {col} not found in Excel file")

        # Save results
        print("\nSaving results...")
        df.to_excel(output_file, index=False)
        print(f"Translations saved to {output_file}")

    except Exception as e:
        print(f"Error: {str(e)}")
        raise

# Run the translation:

# Define your input and output file paths (placeholders: replace with real .xlsx paths)
input_file = "path/to/your/InputFile_S.xlsx"
output_file = "path/to/your/translated_names.xlsx"

# Run the translation: reads input_file and writes the translated workbook to output_file
process_excel_file(input_file, output_file)

In [None]:
# 4. Simple translation function
def test_model(input_text: str, model, tokenizer, max_length: int = 128) -> str:
    # Prepare the prompt
    prompt = f"Translate this Arabic Name to English : {input_text}"

    print(f"Input prompt: {prompt}")

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = inputs.to(model.device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=False
        )

    # Decode and return result
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result.strip()


# 5. Test with a simple input
def run_test():
    """Load the model, translate one hard-coded sample and print the outcome."""
    print("Initializing model...")
    mt_model, mt_tokenizer = initialize_model()

    test_input = "منصور"  # Replace with your test input

    print("\nProcessing test input...")
    result = test_model(test_input, model=mt_model, tokenizer=mt_tokenizer)

    # Report the input next to what the model produced for it.
    for report_line in ("\nResults:", f"Input: {test_input}", f"Output: {result}"):
        print(report_line)

# 6. Run the test (run_test loads the model itself, then translates one sample input)
run_test()


Initializing model...
Loading tokenizer...
Loading model...
Model loaded on: cpu

Processing test input...
Input prompt: Translate this Arabic Name to English : منصور

Results:
Input: منصور
Output: == sync, corrected by elderman == @elder_man


# Translate the raw text directly (no prompt, no LangChain)

In [None]:
def translate_arabic_to_english(text, model, tokenizer):
    """Translate ``text`` with ``model``, falling back to the input on failure.

    The text is tokenized as given, moved to the device of the model's first
    parameter, and generated with a 128-token limit. Any exception raised along
    the way is printed and the original ``text`` is returned unchanged.

    Args:
        text: The source text to translate.
        model: A model exposing ``parameters()`` and ``generate``.
        tokenizer: The tokenizer paired with ``model``.

    Returns:
        The stripped translation, or ``text`` itself if translation failed.
    """
    try:
        # The model's own parameters tell us where the inputs must live.
        target_device = next(model.parameters()).device

        encoded = tokenizer(text, return_tensors="pt", padding=True)
        encoded = encoded.to(target_device)

        generated = model.generate(**encoded, max_length=128)

        return tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return text

# Test the function
def test_translation():
    """Translate a few sample Arabic strings and print each source/result pair."""
    mt_model, mt_tokenizer = initialize_model()

    # Sample inputs: names, a greeting and a short question.
    arabic_texts = ["منصور", "مرحبا", "محمد", "كيف حالك"]

    separator = "-" * 40

    print("Testing translations:")
    print(separator)
    for text in arabic_texts:
        translation = translate_arabic_to_english(text, mt_model, mt_tokenizer)
        print(f"Arabic: {text}")
        print(f"English: {translation}")
        print(separator)

# Run the test only when this code is executed directly rather than imported as a module
if __name__ == "__main__":
    test_translation()

Loading tokenizer...
Loading model...
Model loaded on: cpu
Testing translations:
----------------------------------------
Arabic: منصور
English: Mansour
----------------------------------------
Arabic: مرحبا
English: Hey.
----------------------------------------
Arabic: محمد
English: Muhammad.
----------------------------------------
Arabic: كيف حالك
English: How are you?
----------------------------------------
