In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import time

In [None]:
# Path of the parallel-corpus text file that is later passed to load_dataset().
# NOTE(review): "/content/..." looks like a Colab-style absolute path — adjust when running elsewhere.
path_to_file = "/content/ara_.txt"

**1- Data Preprocessing:**
The code begins by importing the necessary libraries and defining the data-preprocessing functions `unicode_to_ascii` and `preprocess_sentence`.
The `create_dataset` function reads the dataset file, preprocesses the sentences, and returns the word pairs for up to the specified number of examples.

**2- Language Indexing and Tensor Generation:**
The LanguageIndex class is defined to create an index for each language, including word-to-index and index-to-word mappings.
The load_dataset function processes the word pairs, generates input and target tensors, and pads the sequences to a maximum length.

**3- Training and Validation Split:**
The code then splits the input and target tensors into training and validation sets using an 80-20 split.

**4- Machine Translation using the mBART-50 Model:**
It then imports the pretrained mBART-50 model and tokenizer from the Hugging Face transformers library. This model works directly on raw text, so it does not use the tensors built in steps 1–3.
The `translate_with_mbart` function takes a sentence in the source language, encodes it using the tokenizer, and generates the translated text using the mBART model.

**5- Example Translation:**
An example translation is performed using the MBart model for a given input sentence in Arabic, and the translated output is printed.


In [None]:
def unicode_to_ascii(s):
    """Remove combining marks (Unicode category ``Mn``) from a string.

    The text is first decomposed with NFD normalization, which splits an
    accented letter into its base character plus a combining mark; every
    combining mark is then dropped and the remaining characters are rejoined.

    Example:
        Input: "Café"
        Output: "Cafe"
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

In [None]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps, in order: lowercase and trim the text, strip combining marks via
    ``unicode_to_ascii``, surround punctuation with spaces so each mark
    becomes its own token, replace every run of characters other than Latin
    letters, the Arabic block (U+0600-U+06FF) and the kept punctuation with
    a single space, and finally add the start/end markers.

    Example:
        Input: "Hello! How are you?"
        Output: "<start> hello ! how are you ? <end>"

    Args:
        w: Raw sentence text.

    Returns:
        The cleaned sentence, tokens separated by single spaces, framed by
        ``<start>`` and ``<end>``. An input with no usable characters gives
        ``"<start> <end>"``.
    """
    w = unicode_to_ascii(w.lower().strip())
    # The Arabic comma (U+060C), semicolon (U+061B) and question mark (U+061F)
    # sit inside the Arabic block, so the whitelist below keeps them; unless
    # they are spaced out here they stay glued to the neighbouring word.
    w = re.sub(r"([?.!,¿\u060C\u061B\u061F])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z\u0600-\u06FF?.!,¿]+", " ", w)
    w = w.strip()
    # Joining only the non-empty parts avoids a double space (and therefore an
    # empty token after split(' ')) when nothing is left of the sentence.
    return ' '.join(part for part in ('<start>', w, '<end>') if part)

In [None]:
def create_dataset(path, num_examples=None):
    """Read a tab-separated corpus file and preprocess its sentences.

    The file is decoded as UTF-8 (a leading BOM is tolerated through the
    ``utf-8-sig`` codec), split into lines, and each of the first
    ``num_examples`` lines is split on tabs; every resulting field is passed
    through ``preprocess_sentence``. The total line count and the number of
    lines actually used are printed.

    Args:
        path: Path of the corpus file.
        num_examples: Maximum number of lines to use; ``None`` uses all lines.

    Returns:
        A list with one entry per used line, each entry being the list of
        preprocessed fields of that line.
    """
    # The context manager closes the file handle as soon as the text is read.
    with open(path, encoding='utf-8-sig') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    selected_lines = lines[:num_examples]
    word_pairs = [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in selected_lines
    ]
    print(len(lines))
    print(len(selected_lines))
    return word_pairs

In [None]:
class LanguageIndex():
    """Vocabulary index for one language.

    Builds a sorted vocabulary from the space-separated tokens of the given
    phrases and two lookup tables: ``word2idx`` (token -> integer id) and
    ``idx2word`` (integer id -> token). Id 0 is assigned to ``'<pad>'``.

    Example:
        english_phrases = ["hello world", "how are you", "world"]
        english_index = LanguageIndex(english_phrases)
        english_index.word2idx
        # {'<pad>': 0, 'are': 1, 'hello': 2, 'how': 3, 'world': 4, 'you': 5}
        english_index.idx2word
        # {0: '<pad>', 1: 'are', 2: 'hello', 3: 'how', 4: 'world', 5: 'you'}
    """

    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()

    def create_index(self):
        """Populate ``vocab``, ``word2idx`` and ``idx2word`` from ``self.lang``."""
        # Collect every distinct token, then freeze the vocabulary as a sorted list.
        all_tokens = (token for phrase in self.lang for token in phrase.split(' '))
        self.vocab.update(all_tokens)
        self.vocab = sorted(self.vocab)

        # Id 0 goes to the padding token; vocabulary words are numbered from 1.
        self.word2idx['<pad>'] = 0
        self.word2idx.update(zip(self.vocab, range(1, len(self.vocab) + 1)))

        # The reverse table mirrors word2idx.
        self.idx2word.update((position, token) for token, position in self.word2idx.items())

In [None]:
def max_length(tensor):
    """Return the length of the longest sequence contained in ``tensor``.

    Example:
        tensor_list = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
        max_length(tensor_list)  # 4
    """
    return max(map(len, tensor))

In [None]:
def load_dataset(path, num_examples):
    """Load the corpus and turn it into padded integer sequences.

    Each entry returned by ``create_dataset`` is unpacked into exactly two
    sentences: the first field is used as the target sentence and the second
    as the input sentence. A ``LanguageIndex`` is built for each side, every
    sentence is mapped to a list of token ids, and both sides are post-padded
    to the length of their longest sentence.

    Args:
        path: Path of the corpus file.
        num_examples: Maximum number of lines to load.

    Returns:
        A tuple ``(input_tensor, target_tensor, inp_lang, targ_lang,
        max_length_inp, max_length_tar)``.

    Raises:
        ValueError: If no sentence pairs were loaded, or if an entry does not
            contain exactly two fields.
    """
    pairs = create_dataset(path, num_examples)
    if not pairs:
        # Without this check the failure surfaces later as an opaque max() error.
        raise ValueError(f"No sentence pairs could be loaded from {path!r}")

    targ_sentences = [targ for targ, _ in pairs]
    inp_sentences = [inp for _, inp in pairs]

    # Lists rather than one-shot generators, so the ``lang`` attribute kept by
    # each LanguageIndex can still be iterated after the vocabulary is built.
    inp_lang = LanguageIndex(inp_sentences)
    targ_lang = LanguageIndex(targ_sentences)

    input_tensor = [[inp_lang.word2idx[tok] for tok in sent.split(' ')] for sent in inp_sentences]
    target_tensor = [[targ_lang.word2idx[tok] for tok in sent.split(' ')] for sent in targ_sentences]

    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp, padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar, padding='post')
    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [None]:
# Upper bound on the number of corpus lines to load; try experimenting with it.
num_examples = 30000
# Load the dataset: input tensor, target tensor, input-language index,
# target-language index, max input length, max target length.
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)
# 80-20 train/validation split. The fixed random_state makes the split (and
# everything derived from it) identical on every run of the notebook.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)


10742
10742


In [None]:
# Sizes of the splits, in order: train inputs, train targets, validation inputs, validation targets.
tuple(
    len(split)
    for split in (input_tensor_train, target_tensor_train, input_tensor_val, target_tensor_val)
)


(8593, 8593, 2149, 2149)

In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBartTokenizer
from transformers import MBart50Tokenizer

In [None]:
# Load the pretrained mBART-50 many-to-many translation model and its tokenizer.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# This checkpoint was saved with MBart50Tokenizer. Loading it through
# MBartTokenizer triggers a tokenizer-class mismatch warning and may result in
# unexpected tokenization, so the matching class is used.
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


In [None]:
# Define a function for translation using the MBart model
def translate_with_mbart(sentence, source_lang, target_lang):
    """Translate ``sentence`` using the notebook-level ``model`` and ``tokenizer``.

    Input formatting:
        The source language is declared on the tokenizer (``src_lang``) rather
        than written into the text.
    Encoding and generation:
        The sentence is tokenized (truncated to 1024 tokens) and the model
        generates output whose first token is forced to the target-language code.
    Decoding and output:
        The first generated sequence is decoded with special tokens removed.

    Args:
        sentence: Text to translate.
        source_lang: Language code of the input, e.g. ``"ar_AR"``.
        target_lang: Language code of the output, e.g. ``"en_XX"``.

    Returns:
        The translated text.

    Raises:
        KeyError: If ``target_lang`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    # Resolve the target code first so an unknown code fails before any
    # tokenizer state is touched.
    target_lang_id = tokenizer.lang_code_to_id[target_lang]

    # Prefixing the text with "<code>: " makes the model treat the prefix as
    # content to translate. Setting src_lang lets the tokenizer add the
    # language-code special token itself. Note this updates the shared tokenizer.
    tokenizer.src_lang = source_lang
    encoded = tokenizer(sentence, return_tensors="pt", max_length=1024, truncation=True)

    # mBART-50 selects the output language through the first generated token,
    # hence forced_bos_token_id rather than decoder_start_token_id.
    translated = model.generate(**encoded, forced_bos_token_id=target_lang_id)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

In [None]:
# Example translation using the MBart model
source_lang = "ar_AR"  # Source language code (Arabic)
target_lang = "en_XX"  # Target language code (English)
example_sentence = "مرحبا بالعالم"  # Input sentence in Arabic
translated_sentence = translate_with_mbart(example_sentence, source_lang, target_lang)
print(f"Input: {example_sentence}")
print(f"Translation: {translated_sentence}")

Input: مرحبا بالعالم
Translation: MR: Hello world.
