In [1]:
# !pip install liger-kernel

In [2]:
import os

# GPU selection is done through the environment, so these assignments are placed
# ahead of the ML-library imports below (CUDA reads them when it is initialised).
# Only physical GPUs 4 and 5 are exposed to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5"
# Debugging aid: makes CUDA kernel launches synchronous so device-side errors are
# reported at the failing call. NOTE(review): this slows training noticeably;
# consider removing it once the run is stable.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from trl import SFTTrainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hub id of the base model that is fine-tuned in this notebook.
model_name = "Behzadshomali/Teuken3.7B"
# NOTE(review): `device` is only referenced from commented-out `.to(device)` calls in
# the visible cells; the model is placed via `device_map="auto"` instead.
device = "cuda:0"

# Grade-school math instruction data; the visible output shows a single `train`
# split with the columns INSTRUCTION / RESPONSE / SOURCE.
dataset = load_dataset("qwedsacf/grade-school-math-instructions")
# `trust_remote_code=True` runs tokenizer code shipped with the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [4]:
# Reload the tokenizer from a local fine-tuning checkpoint.
# NOTE(review): this replaces the `tokenizer` loaded from `model_name` in the previous
# cell, so every later cell uses the checkpoint's vocabulary and chat template.
# NOTE(review): absolute, machine-specific path — consider moving it to a config value.
t = "/raid/s3/opengptx/behzad_shomali/instruction_tuning/Teuken3.73T_IT_GSM8K_socratic/22_04_11/checkpoint-1125"
tokenizer = AutoTokenizer.from_pretrained(t)

In [7]:
tokenizer.get_chat_template()

'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0].role == \'system\' %}\n        {{- messages[0].content + \'\\n\\n\' }}\n    {%- endif %}\n    {{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0].role == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0].content + \'<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messa

In [66]:
dataset

DatasetDict({
    train: Dataset({
        features: ['INSTRUCTION', 'RESPONSE', 'SOURCE'],
        num_rows: 8792
    })
})

In [3]:
dataset['train'][0]

{'INSTRUCTION': 'This math problem has got me stumped: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nCan you show me the way?',
 'RESPONSE': 'Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.',
 'SOURCE': 'grade-school-math'}

In [4]:
# Plain-text turn templates built from <|start_header_id|> / <|end_header_id|> /
# <|eot_id|> markers. Each template is a sequence of lines; joining them with "\n"
# (plus the empty last element) yields one trailing newline per line.
instruction_template = "\n".join(
    [
        # Optional system preamble, currently disabled:
        # "<|begin_of_text|>\n"
        # "<|start_header_id|>system<|end_header_id|>\n"
        # "Always provide accurate, logical, and well-structured answers that are clear, concise, and fair. Adapt your style to the user’s needs (simple, technical, detailed, or short). For trivial factual queries, answer directly. For complex tasks, follow a Reason–Solve–Check approach: reason with a brief plan and sub-steps solve step-by-step, and finally check your answer with a quick sanity or consistency check. If uncertain, admit it and suggest how to reduce uncertainty. Never produce unsafe or harmful content, and use disclaimers for sensitive advice.\n"
        # "<|eot_id|>\n"
        "<|start_header_id|>user<|end_header_id|>",
        "{instruction}",  # filled via .format(instruction=...)
        "<|eot_id|>",
        "",  # produces the final newline
    ]
)

output_template = "\n".join(
    [
        "<|start_header_id|>assistant<|end_header_id|>",
        "{output}",  # filled via .format(output=...)
        "<|eot_id|>",
        "",  # produces the final newline
    ]
)

In [5]:
# def preprocess_function(example):
#     return {
#         "prompt": [{
#             "role": "user", 
#             # "content": instruction_template.format(instruction=example["INSTRUCTION"])
#             "content": example["INSTRUCTION"]
#         }],
#         "completion": [{
#             "role": "assistant", 
#             # "content": output_template.format(output=example["RESPONSE"])
#             "content": example["RESPONSE"]
#         }],
#     }


def preprocess_function(example):
    """Turn one raw row into a two-turn conversation.

    Args:
        example: Mapping holding the raw ``"INSTRUCTION"`` and ``"RESPONSE"`` strings.

    Returns:
        A dict with a single ``"messages"`` key: a user turn carrying the
        instruction followed by an assistant turn carrying the response.
    """
    user_turn = dict(role="user", content=example["INSTRUCTION"])
    assistant_turn = dict(role="assistant", content=example["RESPONSE"])
    return {"messages": [user_turn, assistant_turn]}

In [6]:
# Convert every split to the conversational "messages" format and drop the raw columns.
# The cell rebinds `dataset`, so a second run would look for columns that have
# already been removed and fail; the guard makes re-running the cell a no-op.
already_converted = all(
    "messages" in columns for columns in dataset.column_names.values()
)
if not already_converted:
    dataset = dataset.map(preprocess_function, remove_columns=['INSTRUCTION', 'RESPONSE', 'SOURCE'])

In [6]:
dataset['train'][0]

{'messages': [{'content': 'This math problem has got me stumped: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nCan you show me the way?',
   'role': 'user'},
  {'content': 'Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.',
   'role': 'assistant'}]}

In [7]:
# dataset["train"][0]

In [8]:
# small_train = dataset["train"].shuffle(seed=42).select(range(1000))
# small_eval = dataset["test"].shuffle(seed=42).select(range(1000))

In [7]:
# Load the base causal LM. `trust_remote_code=True` runs modelling code shipped with
# the Hub repo; `device_map="auto"` leaves device placement to the loader rather
# than an explicit `.to(device)` (kept commented out at the end of the line).
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")#.to(device)

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.95s/it]


In [13]:
def format_row_as_instruction_prompt(example):
    """Render one raw dataset row as a single prompt string.

    Args:
        example: Mapping with the raw ``'INSTRUCTION'`` and ``'RESPONSE'`` fields.

    Returns:
        The filled-in ``instruction_template``, a newline, then the filled-in
        ``output_template``.

    Raises:
        KeyError: If ``example`` lacks ``'INSTRUCTION'`` or ``'RESPONSE'``.
    """
    primer_prompt = instruction_template.format(instruction=example['INSTRUCTION'])
    # `output_template` declares the placeholder `{output}`; passing `response=...`
    # left it unfilled and raised `KeyError: 'output'` on every call.
    response_prompt = output_template.format(output=example['RESPONSE'])

    return f"{primer_prompt}\n{response_prompt}"

In [14]:
# # Test with an example dictionary
# test_example = dataset['train'][10]

# print(format_row_as_instruction_prompt(test_example))

In [15]:
# tokenizer.add_special_tokens({
#     "additional_special_tokens": [
#         "<|begin_of_text|>",
#         "<|end_of_text|>",
#         "<|start_header_id|>",
#         "<|end_header_id|>",
#         "<|eot_id|>"
#     ]
# })

In [16]:
# tokenizer.push_to_hub(model_name, use_auth_token=True)

In [17]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter settings used when the model is wrapped with `get_peft_model`.
peft_config = LoraConfig(
    # With rank 32 and alpha 16 the standard LoRA scaling alpha / r is 0.5.
    lora_alpha=16,
    lora_dropout=0.1,
    r=32,
    # Bias terms are not trained.
    bias="none",
    task_type="CAUSAL_LM",
    # Adapt every linear projection; the model printout shows `lora.Linear`
    # wrappers on the attention projections as a result.
    target_modules="all-linear"
)

# k-bit preparation is disabled: the model in this notebook is loaded without
# quantisation arguments, so `prepare_model_for_kbit_training` is imported but unused.
# model = prepare_model_for_kbit_training(model)

In [18]:
from transformers import TrainingArguments

# Earlier TrainingArguments draft, kept for reference; the active configuration is
# the SFTConfig defined in a later cell.
# training_args = TrainingArguments(
#     output_dir="decilm6b_open_instruct",
#     # just for demo purposes
#     num_train_epochs=1,
#     # trying to max out resources on colab
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=10,
#     gradient_checkpointing=True,
#     optim="paged_adamw_32bit",
#     logging_steps=25,
#     save_strategy="steps",
#     save_steps=100,
#     learning_rate=3e-5,
#     bf16=True,
#     tf32=True,
#     max_grad_norm=0.3,
#     warmup_ratio=0.03,
#     lr_scheduler_type="linear",
#     disable_tqdm=False
# )

# Every id the tokenizer can emit needs an embedding row. The model printout shows
# Embedding(250880, ...) while the tokenizer decodes id 250880, i.e. one past the
# last valid row; feeding such an id makes the embedding lookup fail with
# "vectorized gather kernel index out of bounds". Grow the embeddings *before*
# wrapping with LoRA so the adapter is built on the final shapes.
# NOTE(review): the base weights are frozen by LoRA, so newly added rows keep their
# initial values unless the embeddings are made trainable in `peft_config` — confirm
# whether that is needed for the added special tokens.
embedding_rows = model.get_input_embeddings().num_embeddings
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

model = get_peft_model(model, peft_config)#.to(device)

In [19]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2ForCausalLM(
      (model): GPT2Model(
        (embed_tokens): Embedding(250880, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x GPT2DecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

In [20]:
# Training configuration for the SFTTrainer run.
sft_args = SFTConfig(
    # Pack several conversations into each training sequence.
    packing=True,
    # Compute the loss on assistant turns only.
    # NOTE(review): per the TRL docs this relies on the chat template marking assistant
    # spans with {% generation %} ... {% endgeneration %} — confirm the template
    # referenced below contains them, and that this option is supported together
    # with `packing` and `use_liger_kernel` in the installed TRL version.
    assistant_only_loss=True,
    use_liger_kernel=True,
    # NOTE(review): absolute, user-specific path — consider making it configurable.
    chat_template_path="/home/behzad_shomali/modalities/src/modalities/instruction_finetuning/chat_template.jinja",


    # NOTE(review): the directory name looks inherited from another experiment
    # (it does not mention the model trained here) — confirm before a long run.
    output_dir="decilm6b_open_instruct",
    # Single pass over the training data.
    num_train_epochs=1,
    # 2 sequences per device x 10 accumulation steps = 20 sequences per optimizer
    # step on each device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=10,
    gradient_checkpointing=False,
    # NOTE(review): paged optimizers presumably require bitsandbytes — confirm it is installed.
    optim="paged_adamw_32bit",
    logging_steps=25,
    # Checkpoint every 100 optimizer steps.
    save_strategy="steps",
    save_steps=100,
    learning_rate=3e-5,
    # Mixed-precision settings; both depend on hardware support.
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    # Linear schedule with warmup over the first 3% of steps.
    warmup_ratio=0.03,
    lr_scheduler_type="linear",
    disable_tqdm=False
)

In [21]:
dataset['train']

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 8792
})

In [22]:
from trl import SFTTrainer

# Fail fast on a vocabulary/embedding mismatch. If the tokenizer can emit ids beyond
# the embedding matrix, training only dies later inside a CUDA kernel
# ("vectorized gather kernel index out of bounds"), which is hard to read and leaves
# the CUDA context unusable until the kernel is restarted.
embedding_rows = model.get_input_embeddings().num_embeddings
if len(tokenizer) > embedding_rows:
    raise ValueError(
        f"Tokenizer defines {len(tokenizer)} tokens but the model has only "
        f"{embedding_rows} embedding rows; call "
        "model.resize_token_embeddings(len(tokenizer)) before building the trainer."
    )

trainer = SFTTrainer(
    model=model,#.to(device),
    train_dataset=dataset['train'],
    # `model` is already wrapped by get_peft_model, so no peft_config is passed here.
    # peft_config=peft_config,
    args=sft_args,
    # Hand over the tokenizer prepared in this notebook. Without it the trainer loads
    # its own tokenizer from the model's name, ignoring the one configured above.
    processing_class=tokenizer,
)



In [91]:
dataset['train'][0]

{'messages': [{'content': 'This math problem has got me stumped: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nCan you show me the way?',
   'role': 'user'},
  {'content': 'Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.',
   'role': 'assistant'}]}

In [45]:
# Draft Jinja chat template: each message is wrapped in <|im_start|>{role} ... <|im_end|>.
# NOTE(review): the {% generation %} / {% endgeneration %} pair is empty and sits only
# inside the `add_generation_prompt` branch, so it wraps no assistant content; an
# assistant-token mask derived from this template would presumably be empty — confirm
# before using it with assistant-only loss.
# NOTE(review): the generation prompt is followed by <|im_end|>, which closes the
# assistant turn before anything is generated — verify this is intended.
chat_template = """\
{% for message in messages %}
<|im_start|>{{ message['role'] }}
{{ message['content'] }}
<|im_end|>
{% endfor %}
{% if add_generation_prompt %}
<|im_start|>assistant
{% generation %}
{% endgeneration %}
<|im_end|>
{% endif %}
"""

# Clear the tokenizer's current chat template. The `chat_template` string defined
# above is NOT attached here; it is unused in the visible cells.
# NOTE(review): presumably the reset lets a later helper install its own template —
# confirm, or assign `chat_template` if the draft above is meant to be used.
tokenizer.chat_template = None

In [13]:
from trl import clone_chat_template
# Copy the chat template of "Qwen/Qwen3-0.6B" onto this model/tokenizer pair; the
# call returns the updated model and tokenizer plus the tokens it added.
# NOTE(review): `added_tokens` is not used in the visible cells. If tokens were added,
# check that the model's embedding matrix covers every tokenizer id before training.
model, tokenizer, added_tokens = clone_chat_template(model, tokenizer, "Qwen/Qwen3-0.6B")

In [16]:
# Render the first training conversation through the current chat template as a
# sanity check. Index the row first (as the later preview cell does) rather than
# reading the whole "messages" column just to take its first element.
first_conversation = dataset['train'][0]['messages']
print(tokenizer.apply_chat_template(first_conversation, tokenize=False, enable_thinking=False))

<|im_start|>user
This math problem has got me stumped: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Can you show me the way?<|im_end|>
<|im_start|>assistant
<think>

</think>

Natalia sold 48/2 = 24 clips in May.
Natalia sold 48+24 = 72 clips altogether in April and May.<|im_end|>



In [None]:
dataset['train'].train_test_split(test_size=0.01, seed=42)['train']


KeyError: "Invalid key: ['train', 'test']. Please first select a split. For example: `my_dataset_dictionary['train'][['train', 'test']]`. Available splits: ['test', 'train']"

In [27]:
x

'train'

In [17]:
print(tokenizer.get_chat_template())

{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 

In [33]:
tokenizer.decode([250880])

'<|begin_of_text|>'

In [24]:
trainer.train()

/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [219,1,0], thread: [64,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [219,1,0], thread: [65,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [219,1,0], thread: [66,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [219,1,0], thread: [67,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [219,1,0], thread: [68,0,0] 

AcceleratorError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [11]:
from trl import SFTTrainer, SFTConfig, setup_chat_format

In [12]:
# Install TRL's default chat format on the model/tokenizer pair. The log printed
# below this cell shows that new embedding and lm_head rows were initialised, i.e.
# the vocabulary was extended and the model resized.
# NOTE(review): another cell applies `clone_chat_template` to the same objects; only
# one of the two template setups should be active in a clean top-to-bottom run.
model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [16]:
dataset['train'][0]

{'messages': [{'content': 'This math problem has got me stumped: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nCan you show me the way?',
   'role': 'user'},
  {'content': 'Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.',
   'role': 'assistant'}]}

In [18]:
print(tokenizer.apply_chat_template(dataset['train'][0]['messages'], tokenize=False))

<|im_start|>user
This math problem has got me stumped: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Can you show me the way?<|im_end|>
<|im_start|>assistant
Natalia sold 48/2 = 24 clips in May.
Natalia sold 48+24 = 72 clips altogether in April and May.<|im_end|>



In [32]:
from utils import load_config

In [33]:
config = load_config("/home/behzad_shomali/modalities/src/modalities/instruction_finetuning/instruction_tuning_config.yaml")

In [35]:
# The config dump shows 'learning_rate' loaded as the string '3e-5', so it has to be
# cast to float before being passed to the trainer.
# NOTE(review): writing the value as 3.0e-5 in the YAML file would presumably let the
# loader parse it as a float directly — confirm against the loader used by `load_config`.
float(config['learning_rate'])

3e-05

In [34]:
config

{'output_dir': '/raid/s3/opengptx/behzad_shomali/instruction_tuning/12_25_38',
 'num_train_epochs': 2,
 'per_device_train_batch_size': 4,
 'gradient_accumulation_steps': 4,
 'gradient_checkpointing': False,
 'optim': 'adamw_torch',
 'logging_steps': 5,
 'save_strategy': 'steps',
 'save_steps': 100,
 'learning_rate': '3e-5',
 'bf16': True,
 'tf32': True,
 'disable_tqdm': False,
 'packing': True,
 'assistant_only_loss': False,
 'use_liger_kernel': False,
 'report_to': 'wandb'}

In [1]:
from utils import load_config

In [2]:
config = load_config("/home/behzad_shomali/modalities/src/modalities/instruction_finetuning/instruction_tuning_config.yaml")

In [6]:
type(config['dataset']['remove_columns'])

list