In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

print(torch.cuda.is_available())

True


In [2]:
data = pd.read_csv('eng_-french.csv')
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "Helsinki-NLP/opus-mt-en-fr"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [4]:
eng = data['English words/sentences'][:5000]
fr = data['French words/sentences'][:5000]

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

eng_train, eng_test, fr_train, fr_test = train_test_split(eng, fr, test_size=0.2, random_state=42)


def tokenize(sentences, max_length=50):
    return tokenizer(sentences, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

def preprocessed_fn(data):
    src = data['en']
    tgt = data['fr']
    src_enc = tokenize(src)
    tgt_enc = tokenize(tgt) 
    model_inputs = {
        "input_ids" : src_enc["input_ids"],
        "attention_mask" : src_enc["attention_mask"],
        "labels" : tgt_enc["input_ids"]
    }
    return model_inputs


In [None]:
batch_size = 16

from datasets import Dataset
train_dataset = Dataset.from_dict({"en": eng_train, "fr": fr_train})
test_dataset = Dataset.from_dict({"en": eng_test, "fr": fr_test})

tokenized_train = train_dataset.map(preprocessed_fn, batched=True, remove_columns=['en', 'fr'])
tokenized_test = test_dataset.map(preprocessed_fn, batched=True, remove_columns=['en', 'fr'])

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
tokenized_train

Dataset({
    features: ['inpu_ids', 'attention_mask', 'labels'],
    num_rows: 4000
})

In [14]:
for name, module in model.named_modules():
    print(name)


base_model
base_model.model
base_model.model.model
base_model.model.model.shared
base_model.model.model.encoder
base_model.model.model.encoder.embed_positions
base_model.model.model.encoder.layers
base_model.model.model.encoder.layers.0
base_model.model.model.encoder.layers.0.self_attn
base_model.model.model.encoder.layers.0.self_attn.k_proj
base_model.model.model.encoder.layers.0.self_attn.k_proj.base_layer
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_dropout
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_dropout.default
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_A
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_A.default
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_B
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_B.default
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_embedding_A
base_model.model.model.encoder.layers.0.self_attn.k_proj.lora_embedding_B
base_model.model.mo

In [15]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

lora_config = LoraConfig(
    r=8,  
    lora_alpha=32,  
    target_modules=["q_proj", "v_proj", "k_proj"],  
    lora_dropout=0.1, 
    bias="none",  
    task_type="SEQ_2_SEQ_LM" 
)

model = get_peft_model(model, lora_config)

model = prepare_model_for_kbit_training(model)

In [16]:
from transformers import Seq2SeqTrainingArguments,Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="marianmt-lora-finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10, 
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
    fp16=True,
    push_to_hub=False,
)



In [17]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['inpu_ids', 'attention_mask', 'labels']

In [None]:
model.save_pretrained("marianmt-lora-finetuned")
tokenizer.save_pretrained("marianmt-lora-finetuned")

In [5]:
from datasets import load_dataset

# Load your dataset
data = load_dataset("csv", data_files="eng_-french.csv")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"translate English to French: {text}" for text in examples["English words/sentences"]]
    targets = examples["French words/sentences"]
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

tokenized_data = preprocess_function(data)

KeyError: 'English words/sentences'

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

# LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Target attention layers
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",  # No bias
    task_type="SEQ_2_SEQ_LM"  # Task type for sequence-to-sequence models
)

# Prepare the model for LoRA
model = get_peft_model(model, lora_config)

# For QLoRA (4-bit quantization)
model = prepare_model_for_int8_training(model)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="marianmt-lora-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Fewer epochs due to fine-tuning
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    fp16=True,  # Use mixed precision for faster training
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()

In [None]:
model.save_pretrained("marianmt-lora-finetuned")
tokenizer.save_pretrained("marianmt-lora-finetuned")

In [None]:
from transformers import pipeline

# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("marianmt-lora-finetuned")
tokenizer = AutoTokenizer.from_pretrained("marianmt-lora-finetuned")

# Create a translation pipeline
translator = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer)

# Translate a sentence
english_sentence = "Hello, how are you?"
french_translation = translator(english_sentence)
print(f"English: {english_sentence}")
print(f"French: {french_translation[0]['translation_text']}")