In [1]:
#FineTune your huggingface model for language translation task. In this notebook I am finetuning a NLLB model which 
#is released by Meta. You must have sentence and its approperiate translation in your dataset.


In [1]:
#if you have dataset in your google drive, then mount with drive else no need of it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install datasets sacrebleu transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 28.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 80.2 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 80.3 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 72.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, sentencepiece
Successfully installed huggingface-hub-0.10.0 sentencepiece-0.1.97 tokenizers-0.

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [4]:
import transformers

print(transformers.__version__)

4.22.2


In [5]:
#Give your model path or name which you want to finetune.
model_checkpoint = "facebook/nllb-200-distilled-600M"

In [9]:
#Load your dataset through load_dataset library.
#In my case i have csv files. If you have json file you can pass "josn" as first parameter in load_dataset and path in 2nd.
#you can provide test data too how we provided train, and validation data. We can also go with only train data.
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("csv", data_files={"train":'Language_Translation_en2de/train_data.csv', 
                                                'validation':"Language_Translation_en2de/validation_data.csv"})
metric = load_metric("sacrebleu")



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-84c0c30681590751/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-84c0c30681590751/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  """


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [10]:
#This shows how much data do you have as train, test and validation data.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['english', 'translation'],
        num_rows: 911769
    })
    validation: Dataset({
        features: ['english', 'translation'],
        num_rows: 113971
    })
})

In [11]:
raw_datasets['validation'][0]

{'english': 'You were great!', 'translation': 'Du warst großartig!'}

In [12]:
#Downloading Tokenizer from huggingface.
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,model_max_length=128)

Downloading:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [6]:
#This is necessary if you are training your model betweeen t5-small, t5-base, t5-larg, t5-3b, t5-11b if not then no need of it.
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "translate English to German: "
else:
    prefix = ""

In [7]:
#you can take the max_input_length and max_target_length as hyperparameter.
max_input_length = 128
max_target_length = 128

#Place your column name here.
source_lang = "english"
target_lang = "translation"

def preprocess_function(examples):
    examples = {"translation" : examples}
    inputs = [prefix + ex for ex in examples["translation"]['english']]
    targets = [ex for ex in examples["translation"]['translation']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    #print(model_inputs)
    return model_inputs

In [None]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[117, 1398, 29549, 87, 659, 14173, 405, 3559, 202, 109233, 16616, 3423, 10095, 248079, 59279, 143078, 9, 66481, 248075, 2, 256047], [4110, 15520, 26556, 2156, 7506, 8625, 202, 13144, 3559, 6158, 796, 248116, 248066, 6399, 202, 109182, 9, 31734, 3559, 281, 349, 22253, 1662, 248079, 46945, 351, 239, 248130, 2, 256047]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[4628, 41294, 44049, 36759, 248079, 4404, 109938, 5139, 1593, 128946, 41380, 56517, 248079, 82089, 14194, 35382, 48982, 248075, 2, 256047], [128326, 26518, 26556, 2156, 113325, 178615, 248079, 6094, 335, 1876, 248079, 7458, 111336, 5137, 760, 43839, 43439, 1932, 116610, 4386, 1839, 5588, 12203, 1932, 155119, 248130, 2, 256047]]}

In [17]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/912 [00:00<?, ?ba/s]

  0%|          | 0/114 [00:00<?, ?ba/s]

In [13]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

In [14]:
#Try to reduce the batch_size if you get error here.
batch_size = 32
source_lang = "English"
target_lang = "German"
model_name = model_checkpoint.split("/")[-1]

#First parameter is the path where you want to save your checkpoints.
#save_steps shows after how much iteration you want to save your checkpoint.
#you can also change the evaluation_strategy to 'iter'
#save_total_limit parameter will limit your checkpoints to the number you provide on disk or gdrive.
#num_train_epochs tells about the number of iteration you want to perform.

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    save_steps=15000,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

NameError: ignored

In [None]:
#If you want to resume finetuning pass the checkpoint path in train API.
trainer.train()


Loading model from /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-720000.
The following columns in the training set don't have a corresponding argument in `M2M100ForConditionalGeneration.forward` and have been ignored: english, translation. If english, translation are not expected by `M2M100ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 911769
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 911772
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 3
  Continuing training from global step 720000
  Will skip the first 3 epochs then the first 36171 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your lau

  0%|          | 0/36171 [00:00<?, ?it/s]

You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
4,0.4396,0.48925,61.7452,19.756


Saving model checkpoint to /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-735000
Configuration saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-735000/config.json
Model weights saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-735000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-735000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-735000/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/LanguageTranslationEn2De_awais/nllb-200-distilled-600M-finetuned-English-to-German/checkpoint-750000
Configuration saved 

TrainOutput(global_step=911772, training_loss=0.09020404589998143, metrics={'train_runtime': 74140.2708, 'train_samples_per_second': 49.192, 'train_steps_per_second': 12.298, 'total_flos': 1.9666551509594112e+17, 'train_loss': 0.09020404589998143, 'epoch': 4.0})

In [None]:
#To save the model after finetuning.
trainer.save_model("Language_Translation_en2de/full_nllb")

Saving model checkpoint to /content/drive/MyDrive/LanguageTranslationEn2De_awais/Language_Translation_en2de/full_nllb
Configuration saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/Language_Translation_en2de/full_nllb/config.json
Model weights saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/Language_Translation_en2de/full_nllb/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/Language_Translation_en2de/full_nllb/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/LanguageTranslationEn2De_awais/Language_Translation_en2de/full_nllb/special_tokens_map.json


In [3]:
#Load the model you saved.
from transformers import pipeline, set_seed
text_En2De= pipeline('translation', model='Language_Translation_en2de/full_nllb', tokenizer="/content/drive/MyDrive/LanguageTranslationEn2De_awais/Language_Translation_en2de/full_nllb")


In [4]:
text_En2De("Jessica looks at you for a long moment. Eventually, her breathing starts to even out.",src_lang='en',tgt_lang='de')[0]['translation_text']

'de Augenblick lang sieht Jessica dich an. Irgendwann beginnt sich ihr Atem wieder zu beruhigen.'