<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Mistral-Fine-Tuning-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning CuatroLLM for adaptive MT

# Installations

In [1]:
# Check GPU
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-266c6601-7b9c-de1d-cc5a-58fa2e80c2d2)
GPU 1: NVIDIA A100-SXM4-40GB (UUID: GPU-bf2fa8dc-a4d4-d105-96bd-b5d85dcd6ceb)
GPU 2: NVIDIA A100-SXM4-40GB (UUID: GPU-d383ddd2-e51e-7a81-e088-27de39359ce3)
GPU 3: NVIDIA A100-SXM4-40GB (UUID: GPU-7207f21a-3f9d-bf47-9aed-653a8cf1d6dd)
GPU 4: NVIDIA A100-SXM4-40GB (UUID: GPU-2bb4de11-4f06-690b-d7c2-d436923e0eb2)
GPU 5: NVIDIA A100-SXM4-40GB (UUID: GPU-ca641382-344c-a857-1149-c8c02d9cce77)
GPU 6: NVIDIA A100-SXM4-40GB (UUID: GPU-4cf45d03-96d1-57e4-f842-ebf2c5bd000f)
GPU 7: NVIDIA A100-SXM4-40GB (UUID: GPU-790dbb1e-3bde-795d-a439-05d85c3f4bde)


In [2]:
import os

# no nvlink
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
# use a specific GPU
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [3]:
!pip3 install datasets transformers accelerate bitsandbytes peft trl -q

# Loading the data

In [5]:
# import os

# data_path = "/content/drive/MyDrive/data/"
# directory = os.path.join(data_path, "spanish")

# os.chdir(directory)
# os.getcwd()

In [5]:
# Load the training dataset

source_train_file = "/data/students/alister/Adaptive-MT-LLM-Fine-tuning/data/es-en/small-train/all-filtered.es.real.smalltrain"
target_train_file = "/data/students/alister/Adaptive-MT-LLM-Fine-tuning/data/es-en/small-train/all-filtered.en.real.smalltrain"

with open(source_train_file, encoding="utf-8") as source, open(target_train_file, encoding="utf-8") as target:
  source_sentences = [sent.strip() for sent in source.readlines()]
  target_sentences = [sent.strip() for sent in target.readlines()]

print(source_sentences[0])
print(target_sentences[0])

El consumo nocivo de alcohol es responsable por cerca de 3% de todas las muertes que ocurren en el planeta, incluyendo desde cirrosis y cáncer hepático hasta accidentes, caídas, intoxicaciones y homicidios.
The harmful use of alcohol is responsible for about 3% of all deaths that occur on the planet, ranging from liver cancer and cirrhosis to accidents, falls, poisoning and murder.


In [6]:
# Load the fuzzy matches from unique Context Dataset

context_train_file = "/data/students/alister/Adaptive-MT-LLM-Fine-tuning/data/es-en/small-train/all-filtered.esen.ms-multi-12.online.smalltrain"

with open(context_train_file, encoding="utf-8") as context:
  lines = [line.strip().split(" ||| ") for line in context.readlines()]
  scores = [float(line[0].strip()) for line in lines]
  fuzzy_source_sentences = [line[1].strip() for line in lines]
  online_source_sentences = [line[2].strip() for line in lines]
  fuzzy_target_prefixes = [line[3].strip() for line in lines]

n = 9999
print(fuzzy_source_sentences[n])
print(online_source_sentences[n])
print(fuzzy_target_prefixes[n])

Aceite de ricino, hidrogenado
Amyvid contiene etanol y sodio
Castor oil, hydrogenated


# Create the prompts

In [7]:
# Function to create zero-shot and one-shot prompts

def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  new_targets,
                  one_shot=True
                  ):

  prompts = []

  if one_shot:
    for fuzzy_src, fuzzy_tgt, new_src, new_tgt in zip(fuzzy_sources, fuzzy_targets, new_sources, new_targets):
      fuzzy_src = source_lang + ": " + fuzzy_src
      fuzzy_tgt = target_lang + ": " + fuzzy_tgt
      new_src = source_lang + ": " + new_src
      src_segment = fuzzy_src + "\n" + fuzzy_tgt + "\n" + new_src + "\n" + target_lang + ":"
      prompt = src_segment + " " + new_tgt
      # prompt_dict = {"prompt": src_segment,
      #                "response": new_tgt,
      #                "source": "Medical"
      #               }
      prompts.append(prompt)
  else:
    for new_src, new_tgt in zip(new_sources, new_targets):
      new_src = source_lang + ": " + new_src
      src_segment = new_src + "\n" + target_lang + ":"
      prompt = src_segment + " " + new_tgt
      # prompt_dict = {"prompt": src_segment,
      #                "response": new_tgt,
      #                "source": "Medical"
      #               }
      prompts.append(prompt)

  return prompts

In [8]:
source_lang = "Spanish"
target_lang = "English"

In [9]:
prompts_zero_shot = create_prompt(source_lang,
                                  target_lang,
                                  fuzzy_source_sentences,
                                  fuzzy_target_prefixes,
                                  online_source_sentences,
                                  target_sentences,
                                  one_shot=False
                                  )

prompts_one_shot = create_prompt(source_lang,
                        target_lang,
                        fuzzy_source_sentences,
                        fuzzy_target_prefixes,
                        online_source_sentences,
                        target_sentences,
                        one_shot=True
                        )


prompts = prompts_zero_shot + prompts_one_shot

print(len(prompts))

20000


In [10]:
print(prompts[0], "\n")
print(prompts[-1])

Spanish: El consumo nocivo de alcohol es responsable por cerca de 3% de todas las muertes que ocurren en el planeta, incluyendo desde cirrosis y cáncer hepático hasta accidentes, caídas, intoxicaciones y homicidios.
English: The harmful use of alcohol is responsible for about 3% of all deaths that occur on the planet, ranging from liver cancer and cirrhosis to accidents, falls, poisoning and murder. 

Spanish: Aceite de ricino, hidrogenado
English: Castor oil, hydrogenated
Spanish: Amyvid contiene etanol y sodio
English: Amyvid contains ethanol and sodium


In [11]:
# Shuffle the prompts
import random
random.shuffle(prompts)

print(prompts[0], "\n")
print(prompts[-1])

Spanish: Si toma más Harvoni del que debe
English: If you take more Harvoni than you should
Spanish: Si toma más Jakavi del que debe
English: If you take more Jakavi than you should 

Spanish: • Administración de vacunas vivas.
English: • Administration of live vaccine


# Fine-tuning with Huggingface

## Create the dataset

In [12]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    "train": Dataset.from_dict({"text": prompts[:19000]}),
    "validation": Dataset.from_dict({"text": prompts[19000:]})
})

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [14]:
from pprint import pprint

pprint(dataset['train'][1001])

{'text': 'Spanish: Se ha notificado que el zumo de pomelo eleva el nivel '
         'sanguíneo de tacrolimus, y por lo tanto debe evitarse.\n'
         'English: Grapefruit juice has been reported to increase the blood '
         'level of tacrolimus and should therefore be avoided.'}


## Load the model

In [15]:
import os
# from google.colab import userdata

# shared_drive = userdata.get("shared_drive")

# model_directory = os.path.join(shared_drive, "models")

# os.chdir(model_directory)
# os.getcwd()

model_directory = "/data/students/alister/NLP_Project"

os.chdir(model_directory)
os.getcwd()

'/raid/students/alister/NLP_Project'

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch

cache_dir = model_directory

model_name = "britllm/CuatroLLM"

nf4_config = BitsAndBytesConfig(
                                load_in_4bit=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_compute_dtype=torch.bfloat16
                                )

model = AutoModelForCausalLM.from_pretrained(
                                            model_name,
                                            device_map='auto',
                                            quantization_config=nf4_config,
                                            use_cache=False,
                                            cache_dir=cache_dir
                                            )

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          cache_dir=cache_dir,
                                          add_bos_token = True,
                                          add_eos_token = False  # False if trainer's dataset packing is True
                                          )

In [17]:
# By default, SFTTrainer adds eos_token </s> if dataset "packing=True".
# If packing is False, then "add_eos_token" should be set to True.
# Check out: https://github.com/huggingface/trl/issues/1283

tokenizer.add_bos_token, tokenizer.add_eos_token

(True, False)

In [18]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [19]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
                        lora_alpha=16,
                        lora_dropout=0.1,
                        r=64,
                        bias="none",
                        task_type="CAUSAL_LM"
                        )

In [20]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

## Train the model

In [21]:
model.gradient_checkpointing_enable()

In [22]:
output_directory = "britllm/CuatroLLM_spanish__"

In [23]:
from transformers import TrainingArguments

training_args = TrainingArguments(
                                  output_dir = output_directory,
                                  num_train_epochs=1,
                                  #max_steps = 594, # comment out this line if you want to train in epochs
                                  per_device_train_batch_size = 32,
                                  per_device_eval_batch_size = 32,
                                  warmup_steps = 0,
                                  logging_steps=20,
                                  save_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  #evaluation_strategy="steps",
                                  #eval_steps=20, # comment out this line if you want to evaluate at the end of each epoch
                                  #eval_accumulation_steps=4,
                                  learning_rate=2e-3,  # 2e-4 # lower LE for smaller batch sizes
                                  bf16=True,
                                  lr_scheduler_type='constant',
                                )



In [24]:
from trl import SFTTrainer

max_seq_length = 512  # increase if needed

trainer = SFTTrainer(
                    model=model,
                    peft_config=peft_config,
                    max_seq_length=max_seq_length,
                    tokenizer=tokenizer,
                    packing=True,
                    dataset_text_field="text",
                    args=training_args,
                    train_dataset=dataset["train"],
                    eval_dataset=dataset["validation"],
                  )


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 4823 examples [00:02, 1648.03 examples/s]
Generating train split: 251 examples [00:00, 3057.07 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [25]:
# Check out the tokenized dataset (packing=True)
tokenizer.decode(trainer.train_dataset["input_ids"][0])

'aparecer tras la reinstauración de cada tratamiento por separado (ver sección 4.5).\nEnglish: This myelotoxicity was reversible within 4 to 6 weeks upon withdrawal of HCV antiviral therapy and concomitant azathioprine and did not recur upon reintroduction of either treatment alone (see section</s><s> Spanish: Sin embargo, en base a los resultados del estudio 1 reseñados anteriormente, se derivó una relación entre leche y plasma de 0,3 para el clorsulón.\nEnglish: However, based on the results of study 1 reported above, a milk to plasma ratio of 0.3 was derived for the clorsulon.\nSpanish: De los factores asociados a la hipotermia encontrados en este estudio, la variable IMC > 30 no se correlaciona con lo publicado.\nEnglish: Among the factors associated with hypothermia found in this study, the vari-ableofaBMI> 30 did not correlate with the published data.</s><s> Spanish: Fiasp no debe utilizarse si la solución no tiene un aspecto transparente e incoloro.\nEnglish: Fiasp must not be u

In [26]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.562,1.550051


TrainOutput(global_step=151, training_loss=1.6296406297494244, metrics={'train_runtime': 244.7851, 'train_samples_per_second': 19.703, 'train_steps_per_second': 0.617, 'total_flos': 1.914957042234163e+16, 'train_loss': 1.6296406297494244, 'epoch': 1.0})

In [27]:
# Save the log history
import json
import os

logs =trainer.state.log_history
logs_path = os.path.join(output_directory, "logs.json")

with open(logs_path, "w") as log:
  log.write(json.dumps(logs, indent=2))

# Test generation with Hugging Face

In [29]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import os


peft_model_path = os.path.join(output_directory, "checkpoint-151")  # change checkpoint path

peftconfig = PeftConfig.from_pretrained(peft_model_path)

model_base = AutoModelForCausalLM.from_pretrained(peftconfig.base_model_name_or_path,
                                             device_map = "auto",
                                             cache_dir = cache_dir
                                            )

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          cache_dir=cache_dir,
                                          add_bos_token = True,
                                          add_eos_token = False  # always False for inference
                                          )

new_model = PeftModel.from_pretrained(model_base, peft_model_path)

print("Peft model loaded")

Peft model loaded


In [30]:
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=20,
                                 min_new_tokens=1,
                                 do_sample=False,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

In [31]:
prompt = """Spanish: Período de validez después de abierto el envase: 10 horas.
English:"""

In [32]:
generate_response(prompt, new_model)

'<s>  Shelf-life after opening the package: 10 hours.\nSpanish: Perí'

# Convert the fine-tuned model to CTranslate2

* https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Convert-Mistral-Finetuned-CTranslate2.ipynb