In [2]:
import numpy as np

In [6]:
from datasets import Dataset
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import random

# -------------------------
# 1. Load your dataset (FAST)
# -------------------------
pairs = []
with open("data.txt", "r", encoding="utf-8") as f:
    for line in f:
        if "\t" in line:
            en, fr = line.strip().split("\t")
            pairs.append({"en": en, "fr": fr})

dataset = Dataset.from_list(pairs)
dataset = dataset.train_test_split(test_size=0.05)

# -------------------------
# 2. Load pre-trained EN→FR model
# -------------------------
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# -------------------------
# 3. Tokenization
# -------------------------
def preprocess(batch):
    inputs = tokenizer(batch["en"], padding="max_length", truncation=True, max_length=64)
    labels = tokenizer(batch["fr"], padding="max_length", truncation=True, max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized = dataset.map(preprocess, batched=True)

# -------------------------
# 4. Very small training (FAST)
# -------------------------
args = Seq2SeqTrainingArguments(
    output_dir="nmt_fast_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=1,          # SUPER FAST
    max_steps=50,                # <--- LIMIT TRAINING TO 50 STEPS ONLY
    logging_steps=10,
    save_total_limit=1,
    evaluation_strategy="no",    
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
)

trainer.train()

model.save_pretrained("nmt_fast_model")
tokenizer.save_pretrained("nmt_fast_model")

print("Training finished FAST!")


FileNotFoundError: [Errno 2] No such file or directory: 'data.txt'

In [7]:
from transformers import MarianTokenizer, MarianMTModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import random

# Load your dataset (very simple parser)
pairs = []
with open("File 1.txt", "r", encoding="utf-8") as f:
    for line in f:
        if "\t" in line:
            en, fr = line.strip().split("\t")
            pairs.append({"en": en, "fr": fr})

dataset = Dataset.from_list(pairs).train_test_split(test_size=0.05)

# Load pretrained EN→FR model
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization function
def tokenize(batch):
    inp = tokenizer(batch["en"], padding="max_length", truncation=True, max_length=64)
    out = tokenizer(batch["fr"], padding="max_length", truncation=True, max_length=64)
    inp["labels"] = out["input_ids"]
    return inp

tokenized = dataset.map(tokenize, batched=True)

# SUPER FAST TRAINING (only 30 seconds)
args = Seq2SeqTrainingArguments(
    output_dir="nmt_model",
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    max_steps=30,        # <-- FASTEST POSSIBLE
    logging_steps=5,
    save_total_limit=1
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
)

trainer.train()
model.save_pretrained("nmt_model")
tokenizer.save_pretrained("nmt_model")
print("FAST TRAINING DONE!")


Map: 100%|██████████| 158773/158773 [00:10<00:00, 15451.34 examples/s]
Map: 100%|██████████| 8357/8357 [00:00<00:00, 16287.28 examples/s]


Step,Training Loss
5,3.0376
10,1.2213
15,1.2337
20,0.9797
25,0.9064
30,0.7766




FAST TRAINING DONE!


In [8]:
!pip install sacrebleu -q


In [9]:
from transformers import MarianMTModel, MarianTokenizer
from datasets import Dataset
import sacrebleu

model = MarianMTModel.from_pretrained("nmt_model")
tokenizer = MarianTokenizer.from_pretrained("nmt_model")




In [10]:
pairs = []
with open("File 1.txt", "r", encoding="utf-8") as f:
    for line in f:
        if "\t" in line:
            en, fr = line.strip().split("\t")
            pairs.append({"en": en, "fr": fr})

# 5% test split
test_size = max(1, int(len(pairs) * 0.05))
test_pairs = pairs[:test_size]
print("Test samples:", len(test_pairs))


Test samples: 8356


In [11]:
def translate(sentence):
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True)
    out = model.generate(**tokens, max_length=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)


In [None]:
import random

sample_pairs = random.sample(pairs, 100)   # only 10 sentences
import sacrebleu

preds = []
refs = []

for item in sample_pairs:
    pred = translate(item["en"])
    preds.append(pred)
    refs.append([item["fr"]])

bleu = sacrebleu.corpus_bleu(preds, refs)
print(" BLEU score:", bleu.score)


Quick BLEU score: 68.037493331712
