In [1]:
# Load model directly
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B")

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-1.5B", device_map="auto", torch_dtype=torch.float16)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
from datasets import load_dataset

dataset = load_dataset("Helsinki-NLP/opus-100", "en-tr")

In [5]:
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [6]:
dataset["train"][0]

{'translation': {'en': 'I got something.', 'tr': 'Bir şey buldum.'}}

In [7]:
def format_as_chat(example):
    messages = [
        {"role": "system", "content": "You are a helpful assistant that translates English to Turkish."},
        {"role": "user", "content": f"Translate this to Turkish: {example['translation']['en']}"},
        {"role": "assistant", "content": example['translation']['tr']}
    ]
    
    formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False)
    
    return {"text": formatted_chat}

chat_dataset = dataset.map(format_as_chat, remove_columns=["translation"])

In [8]:
chat_dataset["train"][0]

{'text': '<|im_start|>system\nYou are a helpful assistant that translates English to Turkish.<|im_end|>\n<|im_start|>user\nTranslate this to Turkish: I got something.<|im_end|>\n<|im_start|>assistant\nBir şey buldum.<|im_end|>\n'}

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import torch
from transformers import Trainer, TrainingArguments


def tokenize_function(examples):
    outputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding=False,  
        return_tensors=None
    ) 
    outputs["labels"] = outputs["input_ids"].copy()
    
    return outputs

tokenized_dataset = chat_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True, 
    return_tensors="pt"
)

In [10]:
tokenized_dataset

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [11]:
print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"]))

<|im_start|>system
You are a helpful assistant that translates English to Turkish.<|im_end|>
<|im_start|>user
Translate this to Turkish: I got something.<|im_end|>
<|im_start|>assistant
Bir şey buldum.<|im_end|>



In [12]:
train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(500))
validation_dataset = tokenized_dataset["validation"].shuffle(seed=42).select(range(100))
test_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(100))

In [13]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # Rank of the update matrices
    lora_alpha=32,  # Alpha parameter for LoRA scaling
    lora_dropout=0.1,  # Dropout probability for LoRA layers
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Adjust based on model architecture
    bias="none",
)

In [14]:
# Create LoRA model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Display percentage of trainable parameters

trainable params: 2,179,072 || all params: 1,545,893,376 || trainable%: 0.1410


In [15]:
training_args = TrainingArguments(
    output_dir="/qwen2-tr-en-lora",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=5,
    save_steps=50,
    save_total_limit=3,
    group_by_length=True, 
    logging_strategy="steps",
    report_to="wandb", 
    bf16=True,
    run_name="qwen2-tr-en-lora",  
    label_names=["labels"], 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,  
)

In [16]:
torch.cuda.empty_cache()

In [17]:
# Train the model
trainer.train()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: aysenurciftcieee (aysenurciftci) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Step,Training Loss
5,2.1279
10,1.3555
15,1.1297
20,1.125
25,1.3547
30,1.0086
35,1.1595
40,0.9597
45,1.0558
50,1.3351


TrainOutput(global_step=93, training_loss=1.1747801585863995, metrics={'train_runtime': 376.5379, 'train_samples_per_second': 3.984, 'train_steps_per_second': 0.247, 'total_flos': 633978505912320.0, 'train_loss': 1.1747801585863995, 'epoch': 2.9206349206349205})

In [22]:
trainer.save_model("./fine_tuned_qwen2-tr-en-lora")
tokenizer.save_pretrained("./fine_tuned_qwen2-tr-en-lora")

('./fine_tuned_qwen2-tr-en-lora\\tokenizer_config.json',
 './fine_tuned_qwen2-tr-en-lora\\special_tokens_map.json',
 './fine_tuned_qwen2-tr-en-lora\\vocab.json',
 './fine_tuned_qwen2-tr-en-lora\\merges.txt',
 './fine_tuned_qwen2-tr-en-lora\\added_tokens.json',
 './fine_tuned_qwen2-tr-en-lora\\tokenizer.json')

In [None]:
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and model from your saved directory
tokenizer = AutoTokenizer.from_pretrained("/fine_tuned_qwen2-tr-en-lora")
model = AutoModelForCausalLM.from_pretrained("/fine_tuned_qwen2-tr-en-lora",
                                             device_map="cpu,
                                             torch_dtype=torch.float16)

# Function to translate English to Turkish
def translate_en_to_tr(english_text):
    # Format as chat
    messages = [
        {"role": "system", "content": "You are a helpful assistant that translates English to Turkish."},
        {"role": "user", "content": f"Translate this to Turkish: {english_text}"}
    ]
    
    # Apply chat template
    formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False)
    
    # Tokenize
    inputs = tokenizer(formatted_chat, return_tensors="pt").to(model.device)
    
    # Generate translation
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=128,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode the generated tokens
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the assistant's response (the translation)
    # This depends on the chat template format, you might need to adjust
    translation = generated_text.split("assistant")[-1].strip()
    
    return translation

# Test with a few examples from your test dataset
for i in range(5):  # Test with first 5 examples
    example = test_dataset[i]
    original_text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    
    # Extract the original English text
    english_text = original_text.split("user")[-1].split("assistant")[0].strip()
    if "Translate this to Turkish:" in english_text:
        english_text = english_text.split("Translate this to Turkish:")[1].strip()
    
    # Get the expected Turkish translation
    expected_translation = original_text.split("assistant")[-1].strip()
    
    # Get the model's translation
    model_translation = translate_en_to_tr(english_text)
    
    print(f"English: {english_text}")
    print(f"Expected: {expected_translation}")
    print(f"Model output: {model_translation}")
    print("-" * 50)
"""