## setup

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from huggingface_hub import login
from dotenv import load_dotenv
from datasets import load_dataset
import os
import wandb
from peft import LoraConfig, get_peft_model
from pipeline.main import run_eval
import json

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
load_dotenv()
login(token = os.environ['HF_TOKEN'])
wandb.login()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[34m[1mwandb[0m: Currently logged in as: [33matharva_nihalani[0m ([33matharva_nihalani-brown-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## FT Dataset

### Load Model / Dataset

In [3]:
'''
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=t.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_threshold=6.0,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, pad_side="left")
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto", 
    quantization_config=bnb_config,
)
tokenizer.pad_token = tokenizer.eos_token'''

# model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct'
model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'


tokenizer = AutoTokenizer.from_pretrained(model_name, pad_side="left")
tokenizer.pad_token = tokenizer.eos_token


model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto", 
    torch_dtype="auto",
)

In [4]:
with open("/root/srf-project/data/codenet_questions/filtered_problem_descriptions.json", "r") as f:
    problem_descriptions = json.load(f)

def get_filtered_dataset(lang):
    dataset = load_dataset('iNeil77/CodeNet', lang, split='train')
    dataset = dataset.select_columns(['p_id', 'language', 'status', 'code'])
    dataset = dataset.filter(lambda x: x['status']=='Accepted')
    dataset = dataset.filter(lambda x: x['p_id'] in problem_descriptions.keys())

    return dataset

def get_train_test(dataset, train_size=10000, test_size=500):
    shuffled = dataset.shuffle(seed=47)
    train_set = shuffled.select(range(train_size))
    test_set = shuffled.select(range(train_size, train_size + test_size))

    return train_set, test_set

filtered = get_filtered_dataset('Java')
train_set, test_set = get_train_test(filtered)

In [15]:
def add_description(row):
    description = problem_descriptions[row['p_id']]
    row['description'] = description
    return row

train_set = train_set.map(add_description)
test_set = test_set.map(add_description)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [28]:
def add_final_prompt(row):
    description = row['description'].strip()
    code = row['code']

    final_prompt = '\n'.join([description, '<answer>', code, '</answer>'])
    row['final_prompt'] = final_prompt

    return row

train_set = train_set.map(add_final_prompt)
test_set = test_set.map(add_final_prompt)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [31]:
test_set

Dataset({
    features: ['p_id', 'language', 'status', 'code', 'description', 'final_prompt'],
    num_rows: 500
})

In [32]:
def tokenize(record):
    final_prompt = record['final_prompt']
    msg = [
        {'role': 'user', 'content': final_prompt}
    ]

    tokens = tokenizer.apply_chat_template(msg, add_generation_prompt=True, return_dict=True)

    return tokens

train_set = train_set.map(tokenize, batched=True, num_proc=32)
# train_set = train_set.select_columns(['input_ids', 'attention_mask'])

test_set = test_set.map(tokenize, batched=True, num_proc=32)
# test_set = test_set.select_columns(['input_ids', 'attention_mask'])


Map (num_proc=32):   0%|          | 0/10000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (417257 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (406031 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (410544 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (458779 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (425275 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified m

ArrowInvalid: Column 6 named input_ids expected length 313 but got length 377097

### Chat Template

In [None]:
chat = [
    # {'role': 'system', 'content': 'You are a friendly, helpful chatbot.'},
    {'role': 'user', 'content': 'Hey, how are you?'},
]

out = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
print(out)

In [None]:
tokenizer

### FineTuning

In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],  
    lora_dropout=0.00,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(model, lora_config)

In [None]:
training_args = TrainingArguments(
    output_dir="./llama3-java-finetune",
    eval_strategy='steps',
    eval_steps=0.1,
    eval_on_start=True,
    per_device_train_batch_size=8, 
    # auto_find_batch_size=True,
    gradient_accumulation_steps=1,
    dataloader_num_workers=16,
    dataloader_persistent_workers=True,
    learning_rate=5e-4,
    num_train_epochs=1,  
    bf16=True,
    save_steps=0.2,
    save_total_limit=3,
    logging_steps=0.02,
    report_to="wandb",
    logging_first_step=True,
    run_name='quantized',
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # For causal LM
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    data_collator=data_collator,
)

In [None]:
trainer.train()

In [None]:
wandb.finish()

### RunEval

In [None]:
args = {
    'model': 'hf/local',
    'model_path': '/root/srf-project/test_dir',
    'device': 'auto',
    'torch_dtype': 'auto'
}

run_eval('java', args, samples=164)

In [None]:
def get_dataset_new(lang):
    dataset = load_dataset('iNeil77/CodeNet', lang, split='train')
    dataset = dataset.select_columns(['p_id', 'language', 'status', 'code'])
    dataset = dataset.filter(lambda x: x['status']=='Accepted')
    shuffled = dataset.shuffle(seed=47).select(range(10500))

    return shuffled

In [None]:
dataset = get_dataset_new('Java')

In [None]:
out = dataset[300]['code']
print(out)

### Misc

In [None]:
trainer.train()

In [None]:
wandb.finish()

In [None]:
args = {
    'model': 'hf/local',
    'model_path': '/root/srf-project/llama3-java-finetune/checkpoint-1250',
    'device': 'auto',
    'torch_dtype': 'auto'
}

run_eval('java', model_args=args, epochs=3)

### GPU Deets

In [None]:
import torch as t
import gc

free_memory, total_memory = t.cuda.mem_get_info()

# Convert bytes to GB
free_memory_gb = free_memory / (1024 * 1024 * 1024)
total_memory_gb = total_memory / (1024 * 1024 * 1024)
mem_used = t.cuda.device_memory_used() / (1024 ** 3)

print(f"Free GPU Memory: {free_memory_gb:.2f} GB")
print(f"Total GPU Memory: {total_memory_gb:.2f} GB")
print(f'Memory Used: {mem_used:.2f} GB')

In [None]:
print(t.cuda.memory_allocated() / 1024**2, "MB allocated")
print(t.cuda.memory_reserved() / 1024**2, "MB reserved")

In [None]:
print(t.cuda.memory_allocated() / 1024**2, "MB allocated")
print(t.cuda.memory_reserved() / 1024**2, "MB reserved")