In [3]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast, BitsAndBytesConfig
from typing import Dict, List
import torch

In [None]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# The tokenizer is used by the tokenization and collation cells further down,
# so it has to be created here for the notebook to run on a fresh kernel.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Replace the checkpoint's chat template with the custom one stored on disk.
# NOTE(review): the path is resolved against the current working directory;
# confirm new_chat_template.txt is available wherever the notebook is run.
with open("new_chat_template.txt", encoding="utf-8") as template_file:
    tokenizer.chat_template = template_file.read()

# 4-bit quantization settings passed to the model loader below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # or "fp4"
)

# Load the causal-LM weights with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

Fetching 2 files: 100%|██████████| 2/2 [00:06<00:00,  3.09s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.57s/it]


In [1]:
import torch

# Report whether the CUDA setup supports bfloat16, the compute dtype
# requested in the quantization config.
bf16_supported = torch.cuda.is_bf16_supported()
print(bf16_supported)


True


In [3]:
# Load the training data by its dataset repository id.
dataset = load_dataset(
    path="FractalAIResearch/Fathom-V0.6-Iterative-Curriculum-Learning",
)

In [4]:
# example = dataset['train'][0]

# chat = [
#     {"role": "system", "content" : example['system prompt']},
#     {"role": "user", "content": example['problem']},
#     {"role": "assistant", "content": example['solution']},
# ]

# chat

In [5]:
# formatted_prompt = tokenizer.apply_chat_template(chat, tokenize=False)

# formatted_prompt

In [16]:
def tokenize_fn(examples: Dict[str, List], max_length: int = 16000):
    """Render a batch of examples as chat transcripts and tokenize them.

    Written for ``Dataset.map(..., batched=True)``: each value in ``examples``
    is a list with one entry per row of the batch.

    Args:
        examples: Batch containing the columns ``'system prompt'``,
            ``'problem'`` and ``'solution'``.
        max_length: Maximum number of tokens per sequence; longer sequences
            are truncated. Defaults to 16000.

    Returns:
        The output of the module-level ``tokenizer`` for the rendered texts.
        No ``labels`` field is added here.
    """
    input_texts = []
    for system_prompt, problem, solution in zip(
        examples['system prompt'], examples['problem'], examples['solution']
    ):
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": problem},
            {"role": "assistant", "content": solution},
        ]
        input_texts.append(tokenizer.apply_chat_template(chat, tokenize=False))

    # The text was already rendered through the chat template, which is
    # expected to contain every special token it needs; letting the tokenizer
    # add its own again can duplicate them (e.g. a second BOS).
    # NOTE(review): confirm the custom chat template emits the BOS token itself.
    return tokenizer(
        input_texts,
        truncation=True,
        max_length=max_length,
        add_special_tokens=False,
    )

In [17]:
# Tokenize the train split in batches of 100 rows and drop all of the
# original columns so that only the tokenizer's output fields remain.
train_split = dataset["train"]
tokenized_train = train_split.map(
    tokenize_fn,
    batched=True,
    batch_size=100,
    remove_columns=train_split.column_names,
)

Map:   0%|          | 0/5041 [00:00<?, ? examples/s]

In [None]:
# tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Peek at the first 100 token ids of the first tokenized example.
first_example = tokenized_train[0]
first_example["input_ids"][:100]

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling


# Causal-LM collator (mlm=False). It takes care of:
#     1. Padding every batch dynamically
#     2. Building the "labels" tensor (copy of input_ids, with -100 at pad positions)
# NOTE(review): if the tokenizer's pad token shares its id with the EOS token,
# real EOS positions may be masked in the labels as well — confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

# Shuffled training batches of 8 examples, collated by the collator above.
train_dataloader = DataLoader(
    dataset=tokenized_train,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)