In [1]:
import csv
import pandas as pd
pd.set_option('display.max_colwidth', None)
from datasets import load_dataset

In [2]:
# Path to the dataset files
en_file_path = "../Datasets/WikiMatrix/WikiMatrix.en-hi.en"
hi_file_path = "../Datasets/WikiMatrix/WikiMatrix.en-hi.hi"
output_file_path = "wikimatrix_en_hi.csv"

In [3]:
# Read the files and write to CSV
with open(en_file_path, "r", encoding="utf-8") as en_file, open(hi_file_path, "r", encoding="utf-8") as hi_file, open(output_file_path, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["English", "Hindi"])  # Write header
    for en_sentence, hi_sentence in zip(en_file, hi_file):
        writer.writerow([en_sentence.strip(), hi_sentence.strip()])

In [4]:
raw_data = pd.read_csv(output_file_path)
raw_data.head()

Unnamed: 0,English,Hindi
0,Recite in the name of your Lord who created—Created man from a clinging substance.,"अपने परवरदिगार का नाम ले कर पढ़ो, जिसने (दुनिया को) पैदा ‎किया।"
1,They were tenants to their lord.,अतः वे अपने इष्ट परम प्रभु की उपासना में ही दत्तचित्त रहते थे।
2,Indeed your Lord is the All-beneficent.,तुम्हारा रब एक है।
3,"I mean, we all lived in this century.","मेरा मतलब है, हम सभी को इस सदी में रहते थे।"
4,Be steadfastly righteous!,अतः तुम वही करो जो उचित है।


In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231460 entries, 0 to 231459
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   English  231459 non-null  object
 1   Hindi    231459 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


In [6]:
# Read the files and write to CSV
with open(en_file_path, "r", encoding="utf-8") as en_file, open(hi_file_path, "r", encoding="utf-8") as hi_file, open(output_file_path, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["English", "Hindi"])  # Write header
    
    for en_sentence, hi_sentence in zip(en_file, hi_file):
        # Strip the sentences and check if they are not empty
        en_sentence = en_sentence.strip()
        hi_sentence = hi_sentence.strip()
        
        # Skip writing to the CSV if either sentence is empty or None
        if en_sentence and hi_sentence:
            writer.writerow([en_sentence, hi_sentence])
# Load the dataset
dataset = load_dataset("csv", data_files={"train": "wikimatrix_en_hi.csv"})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 231459
    })
})

In [7]:
from datasets import load_dataset
from datasets import DatasetDict

# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "wikimatrix_en_hi.csv"}, split="train[:50]")

# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 40
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 5
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 5
    })
})


In [8]:
print(dataset['train'][0])
print(dataset['train'][1])
print(dataset['train'][2])
print(dataset['train'][3])

{'English': 'Death is seen as a boundary to another world.', 'Hindi': 'मौत एक और दुनिया के लिए एक सीमा के रूप में देखा जाता है।'}
{'English': '(He later returns to Africa, but this part of his life is not recorded in this book.)', 'Hindi': '(बाद में वे अफ्रीका वापस आते हैं, लेकिन उनके जीवन के इस भाग को इस पुस्तक में दर्ज नहीं किया गया है।'}
{'English': '10 December 2001), who was a British woman.', 'Hindi': '10 दिसंबर 2001) से हुआ था, जो एक ब्रिटिश महिला थीं।'}
{'English': 'Can you follow me or shall I follow you?"', 'Hindi': 'यानि तू मुझ पर हँसा या उस कुम्हार (ईश्वर) पर?'}


In [9]:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer

model_ID = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="hi")


In [10]:
max_input_length = 128
max_target_length = 128

source_lang = "English"
target_lang = "Hindi"

def preprocess_function(examples):
    # Extract English and Hindi from the dataset's features
    English = examples[source_lang]
    Hindi = examples[target_lang]

    # Tokenize the input text
    model_inputs = tokenizer(English, max_length=max_input_length, truncation=True)

    # Tokenize the target text
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(Hindi, max_length=max_target_length, truncation=True)

    # Add tokenized targets as labels
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [11]:
# Map the preprocessing function to the DatasetDict
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Inspect the tokenized dataset
print(tokenized_datasets)


Map:   0%|          | 0/40 [00:00<?, ? examples/s]



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
    validation: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})


In [12]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_ID)

In [13]:
import evaluate

metric = evaluate.load("sacrebleu")

In [14]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


training_args = Seq2SeqTrainingArguments(
    output_dir="../Modesl",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=model_ID,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  0%|          | 0/6 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.2680480480194092, 'eval_bleu': 20.2156, 'eval_gen_len': 23.0, 'eval_runtime': 8.0105, 'eval_samples_per_second': 0.624, 'eval_steps_per_second': 0.125, 'epoch': 1.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.2237743139266968, 'eval_bleu': 33.5161, 'eval_gen_len': 21.8, 'eval_runtime': 7.6274, 'eval_samples_per_second': 0.656, 'eval_steps_per_second': 0.131, 'epoch': 2.0}
{'train_runtime': 59.0222, 'train_samples_per_second': 1.355, 'train_steps_per_second': 0.102, 'train_loss': 2.1934680938720703, 'epoch': 2.0}


TrainOutput(global_step=6, training_loss=2.1934680938720703, metrics={'train_runtime': 59.0222, 'train_samples_per_second': 1.355, 'train_steps_per_second': 0.102, 'total_flos': 4469653241856.0, 'train_loss': 2.1934680938720703, 'epoch': 2.0})

In [16]:
trainer.save_model("tfmodel/")

In [21]:
from transformers import pipeline
text = 'are u dick'
translator = pipeline("translation_en_to_hi", model="tfmodel/")
translator(text)

Device set to use cuda:0


[{'translation_text': 'तुम मूर्ख हो'}]