## Setup

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from huggingface_hub import login
from dotenv import load_dotenv
from datasets import load_dataset
import os
import wandb
from peft import LoraConfig, get_peft_model
from pipeline.main import run_eval
import re

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Authenticate with the Hugging Face Hub; raises KeyError if HF_TOKEN is not set.
login(token = os.environ['HF_TOKEN'])
# Authenticate with Weights & Biases. As the last expression of the cell, its
# return value is what the notebook displays.
wandb.login()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[34m[1mwandb[0m: Currently logged in as: [33matharva_nihalani[0m ([33matharva_nihalani-brown-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
def get_dataset_new(lang, train_size=10000, test_size=500):
    """Return the accepted CodeNet submissions for one language.

    Loads the ``train`` split of the ``iNeil77/CodeNet`` configuration named
    ``lang``, keeps only the ``p_id``, ``language``, ``status`` and ``code``
    columns, and drops every row whose ``status`` is not ``'Accepted'``.

    ``train_size`` and ``test_size`` are accepted but not used here: the
    whole filtered dataset is returned.
    """
    wanted_columns = ['p_id', 'language', 'status', 'code']

    def is_accepted(row):
        return row['status'] == 'Accepted'

    submissions = load_dataset('iNeil77/CodeNet', lang, split='train')
    return submissions.select_columns(wanted_columns).filter(is_accepted)

dataset = get_dataset_new('Java')

README.md: 0.00B [00:00, ?B/s]

Java/train-00000-of-00003.parquet:   0%|          | 0.00/99.3M [00:00<?, ?B/s]

Java/train-00001-of-00003.parquet:   0%|          | 0.00/138M [00:00<?, ?B/s]

Java/train-00002-of-00003.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/696249 [00:00<?, ? examples/s]

Filter:   0%|          | 0/696249 [00:00<?, ? examples/s]

In [9]:
dataset

Dataset({
    features: ['p_id', 'language', 'status', 'code'],
    num_rows: 348362
})

In [10]:
# Distinct problem ids in first-seen order (inspected by later cells).
ids = []
# The same ids as a set, so the membership test below is O(1) instead of a
# linear scan of `ids` for every row.
_seen_ids = set()

def my_filter(row):
    """Keep only the first row encountered for each ``p_id``.

    Returns True (and records the id in ``ids``) the first time a given
    ``row['p_id']`` is seen, and False for every later row with that id.

    The function is stateful: it relies on the module-level ``ids`` /
    ``_seen_ids`` containers, so the result depends on the order in which
    rows are visited, and the defining cell must be re-run to reset the
    state before filtering again.
    """
    p_id = row['p_id']
    if p_id in _seen_ids:
        return False
    _seen_ids.add(p_id)
    ids.append(p_id)
    return True
    
filtered = dataset.filter(my_filter)

Filter:   0%|          | 0/348362 [00:00<?, ? examples/s]

In [12]:
len(ids)

3192

In [58]:
from pathlib import Path
folder = Path('problem_descriptions/')

# Collect a (file name, file contents) pair for every problem description.
questions = []

# sorted(): iterdir() yields entries in arbitrary order, so sort to make the
# resulting list (and anything saved from it) reproducible between runs.
for file in sorted(folder.iterdir()):
    # Skip sub-directories (e.g. a notebook checkpoint folder); read_text()
    # can only open regular files.
    if not file.is_file():
        continue
    name = file.name
    # Decode explicitly instead of relying on the locale default. Undecodable
    # bytes become U+FFFD rather than aborting the whole scan; such text is
    # non-ASCII and is therefore dropped by the ASCII-only filter applied later.
    question = file.read_text(encoding='utf-8', errors='replace')
    questions.append((name, question))

In [59]:
def filter_images(question):
    """Return True when a problem description has no image or link markup.

    ``question`` is a ``(name, html)`` pair; only ``question[1]`` is examined.
    The description is rejected (False) when its text contains the substring
    ``'<img'`` or ``'href'``.
    """
    html = question[1]
    blocked_markers = ('<img', 'href')  # images and links
    return not any(marker in html for marker in blocked_markers)

no_images = list(filter(filter_images, questions))

In [60]:
# Matches text made up solely of tab, line feed, carriage return and the
# printable ASCII range 0x20-0x7E (the empty string also matches).
ascii_with_newlines_pattern = re.compile(r'^[\x09\x0A\x0D\x20-\x7E]*$')

def is_english(question):
    """Return True when the description text is plain ASCII.

    ``question`` is a ``(name, html)`` pair; only ``question[1]`` is tested.
    Despite the name, no language detection happens: the check is purely
    "every character is tab/LF/CR or printable ASCII".
    """
    text = question[1]
    return ascii_with_newlines_pattern.match(text) is not None

no_images_english = list(filter(is_english, no_images))

In [63]:
no_images_english[:40]

[('p00000.html',
  '\n\n<H1>QQ</H1>\n\n\n<p>\nWrite a program which prints multiplication tables in the following format:\n</p>\n\n<pre>\n1x1=1\n1x2=2\n.\n.\n9x8=72\n9x9=81\n</pre>\n\n<H2>Input</H2>\n\n<p>\nNo input.\n</p>\n\n<H2>Output</H2>\n\n<pre>\n1x1=1\n1x2=2\n.\n.\n9x8=72\n9x9=81\n</pre>\n\n\n<H2>Template for C</H2>\n\n<pre>\n#include&lt;stdio.h&gt;\n\nint main(){\n\n    return 0;\n}\n</pre>\n\n<H2>Template for C++</H2>\n\n<pre>\n#include&lt;iostream&gt;\nusing namespace std;\n\nint main(){\n\n    return 0;\n}\n</pre>\n\n\n<H2>Template for Java</H2>\n\n<pre>\nclass Main{\n    public static void main(String[] a){\n\n    }\n}\n</pre>'),
 ('p00002.html',
  '\n<H1>Digit Number</H1>\n\n<p>\nWrite a program which computes the digit number of sum of two integers <var>a</var> and <var>b</var>.\n</p>\n\n<H2>Input</H2>\n\n<p>\nThere are several test cases. Each test case consists of two non-negative integers <var>a</var> and <i>b</i> which are separeted by a space in a line. The input term

In [None]:
import json
# Persist the filtered (name, html) pairs. JSON has no tuple type, so each
# pair is written as a two-item list.
serializable_pairs = list(map(list, no_images_english))
with open("problem_descriptions.json", "w") as out_file:
    json.dump(serializable_pairs, out_file)

In [65]:
# Read the saved pairs back. JSON yields lists, so rebuild (name, html) tuples.
# NOTE(review): this absolute path differs from the relative path the save cell
# writes to, and a later cell overwrites this same file with a dict, after which
# this cell no longer yields (name, html) pairs. Confirm the intended file.
with open("/root/srf-project/data/codenet_questions/problem_descriptions.json", "r") as in_file:
    loaded = json.load(in_file)
data = list(map(tuple, loaded))

In [74]:
def remove_html(row):
    """Strip the file extension from a ``(file name, text)`` pair.

    Returns a new 2-tuple whose first item is the part of ``row[0]`` before
    its first ``'.'`` (e.g. ``'p00000.html'`` -> ``'p00000'``) and whose
    second item is ``row[1]`` unchanged. The text itself is not modified;
    only the ``.html`` suffix of the name is removed.
    """
    file_name, text = row[0], row[1]
    stem, _, _ = file_name.partition('.')
    return (stem, text)

cleaned_data = list(map(remove_html, data))

In [76]:
my_dict = dict(cleaned_data)

In [78]:
# my_dict

# Write the {problem id: description} mapping to disk as a JSON object.
# NOTE(review): this is the same path an earlier cell reads as a list of
# [name, html] pairs, so after this write that cell no longer round-trips on a
# re-run. Confirm that overwriting in place is intended.
with open("/root/srf-project/data/codenet_questions/problem_descriptions.json", "w") as f:
    json.dump(my_dict, f)

In [79]:
# Read the JSON file back into `loaded`.
with open("/root/srf-project/data/codenet_questions/problem_descriptions.json", "r") as f:
    loaded = json.load(f)

1549

## FT Dataset

### Load Model / Dataset

In [None]:
# Disabled alternative, kept as an unused string literal: quantize the original
# checkpoint at load time with bitsandbytes instead of downloading a
# pre-quantized one.
'''
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=t.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_threshold=6.0,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, pad_side="left")
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto", 
    quantization_config=bnb_config,
)
tokenizer.pad_token = tokenizer.eos_token'''

# model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct'
model_name = 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'


# `padding_side` is the tokenizer option that selects where padding is added.
# The previous keyword, `pad_side`, is not a tokenizer option, so the requested
# left padding was never applied.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


# Load the checkpoint, letting the library choose device placement and dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)

In [None]:
def get_datasets(lang, train_size=10000, test_size=500):
    """Build disjoint train/test subsets of accepted CodeNet submissions.

    Loads the ``train`` split of the ``iNeil77/CodeNet`` configuration named
    ``lang``, keeps the ``p_id``, ``language``, ``status`` and ``code``
    columns, drops rows whose ``status`` is not ``'Accepted'`` and shuffles
    the remainder with a fixed seed (47).

    Returns ``(train_set, test_set)``: the first ``train_size`` shuffled rows
    and the following ``test_size`` rows.
    """
    accepted = (
        load_dataset('iNeil77/CodeNet', lang, split='train')
        .select_columns(['p_id', 'language', 'status', 'code'])
        .filter(lambda row: row['status'] == 'Accepted')
    )
    shuffled = accepted.shuffle(seed=47)

    # Contiguous, non-overlapping index ranges over the shuffled rows.
    train_rows = range(train_size)
    test_rows = range(train_size, train_size + test_size)
    return shuffled.select(train_rows), shuffled.select(test_rows)

train_set, test_set = get_datasets('Java')

In [None]:
def tokenize(record):
    """Tokenize the ``code`` field of a record (or batch of records).

    Delegates to the notebook-level ``tokenizer`` with truncation enabled and
    a maximum length of 1024, and returns the tokenizer's output unchanged.
    """
    return tokenizer(record['code'], truncation=True, max_length=1024)

# Tokenize each split in batches across 32 worker processes, then keep only
# the columns the model consumes.
# NOTE: this cell rebinds train_set/test_set, so it cannot be re-run without
# first re-running the cell that creates them (the 'code' column is gone).
model_input_columns = ['input_ids', 'attention_mask']

train_set = train_set.map(tokenize, batched=True, num_proc=32).select_columns(model_input_columns)
test_set = test_set.map(tokenize, batched=True, num_proc=32).select_columns(model_input_columns)


In [None]:
# for i in range(10):
#     idx = random.randint(0, dataset_cpp.num_rows)
#     code = dataset_cpp[idx]['code']
#     print(code)
#     print('--x---x---x--\n')

In [None]:
# api = wandb.Api()
# run = api.run("atharva_nihalani-brown-university/huggingface/diw9fexc")
# metrics_dataframe = run.history()
# # metrics_dataframe.to_csv("metrics.csv")

In [None]:
# # Access the GPU metrics
# gpu_memory_util = metrics_dataframe.get("gpu.0.memory")  # For the first GPU
# gpu_memory_alloc = metrics_dataframe.get("gpu.0.memoryAllocated")
# print(gpu_memory_util)
# print(gpu_memory_alloc)

In [None]:
# print(metrics_dataframe.columns)

### FT Configs

In [None]:
# Hyper-parameter sets for the fine-tuning runs, labelled A-F. The FT sections
# read an entry by index and use: r / alpha / dropout for the LoRA adapter,
# accum_steps for gradient accumulation, lr for the learning rate.
configs = [
    dict(r=4,  alpha=16, dropout=0.00, accum_steps=1, lr=1e-4),  # A
    dict(r=4,  alpha=32, dropout=0.10, accum_steps=2, lr=2e-4),  # B
    dict(r=8,  alpha=32, dropout=0.05, accum_steps=4, lr=1e-4),  # C
    dict(r=16, alpha=64, dropout=0.00, accum_steps=1, lr=5e-4),  # D
    dict(r=16, alpha=32, dropout=0.10, accum_steps=2, lr=2e-4),  # E
    dict(r=8,  alpha=64, dropout=0.05, accum_steps=1, lr=5e-5),  # F
]

### FT #A

In [None]:
# Run A uses the first entry of `configs`.
run_cfg = configs[0]

# LoRA adapter on the attention query/value projections, no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=run_cfg['r'],
    lora_alpha=run_cfg['alpha'],
    lora_dropout=run_cfg['dropout'],
    bias="none",
)

# NOTE(review): the other FT sections pass this same `model` object to
# get_peft_model. If several sections are run in one kernel, confirm each run
# starts from a clean base model (PEFT documents `.unload()` for removing
# previously injected LoRA layers).
peft_model = get_peft_model(model, lora_config)

In [None]:
# Hyper-parameters for run A.
run_cfg = configs[0]

# NOTE(review): output_dir and run_name are identical in the FT #A, #B and #C
# sections, so their checkpoints share one directory and their W&B runs share
# one display name. Confirm that is intended.
training_args = TrainingArguments(
    # where results go
    output_dir="./llama3-java-finetune",
    run_name='quantized',
    report_to="wandb",
    # optimisation
    num_train_epochs=1,
    learning_rate=run_cfg['lr'],
    gradient_accumulation_steps=run_cfg['accum_steps'],
    # per_device_train_batch_size=8,
    auto_find_batch_size=True,
    bf16=True,
    # data loading
    dataloader_num_workers=16,
    dataloader_persistent_workers=True,
    # evaluation / checkpointing / logging: float step values below 1 are
    # interpreted by TrainingArguments as a ratio of the total training steps
    eval_strategy='steps',
    eval_steps=0.1,
    eval_on_start=True,
    save_steps=0.2,
    save_total_limit=3,
    logging_steps=0.02,
    logging_first_step=True,
)

# Collator for causal language modelling (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,  # For causal LM
    tokenizer=tokenizer,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=test_set,
)

In [None]:
trainer.train()

In [None]:
wandb.finish()

### FT #B

In [None]:
# Run B uses the second entry of `configs`.
run_cfg = configs[1]

# LoRA adapter on the attention query/value projections, no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=run_cfg['r'],
    lora_alpha=run_cfg['alpha'],
    lora_dropout=run_cfg['dropout'],
    bias="none",
)

# NOTE(review): the other FT sections pass this same `model` object to
# get_peft_model. If several sections are run in one kernel, confirm each run
# starts from a clean base model (PEFT documents `.unload()` for removing
# previously injected LoRA layers).
peft_model = get_peft_model(model, lora_config)

In [None]:
# Hyper-parameters for run B.
run_cfg = configs[1]

# NOTE(review): output_dir and run_name are identical in the FT #A, #B and #C
# sections, so their checkpoints share one directory and their W&B runs share
# one display name. Confirm that is intended.
training_args = TrainingArguments(
    # where results go
    output_dir="./llama3-java-finetune",
    run_name='quantized',
    report_to="wandb",
    # optimisation
    num_train_epochs=1,
    learning_rate=run_cfg['lr'],
    gradient_accumulation_steps=run_cfg['accum_steps'],
    # per_device_train_batch_size=8,
    auto_find_batch_size=True,
    bf16=True,
    # data loading
    dataloader_num_workers=16,
    dataloader_persistent_workers=True,
    # evaluation / checkpointing / logging: float step values below 1 are
    # interpreted by TrainingArguments as a ratio of the total training steps
    eval_strategy='steps',
    eval_steps=0.1,
    eval_on_start=True,
    save_steps=0.2,
    save_total_limit=3,
    logging_steps=0.02,
    logging_first_step=True,
)

# Collator for causal language modelling (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,  # For causal LM
    tokenizer=tokenizer,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=test_set,
)

In [None]:
trainer.train()

In [None]:
wandb.finish()

### FT #C

In [None]:
# Run C uses the third entry of `configs`.
run_cfg = configs[2]

# LoRA adapter on the attention query/value projections, no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=run_cfg['r'],
    lora_alpha=run_cfg['alpha'],
    lora_dropout=run_cfg['dropout'],
    bias="none",
)

# NOTE(review): the other FT sections pass this same `model` object to
# get_peft_model. If several sections are run in one kernel, confirm each run
# starts from a clean base model (PEFT documents `.unload()` for removing
# previously injected LoRA layers).
peft_model = get_peft_model(model, lora_config)

In [None]:
# Hyper-parameters for run C.
run_cfg = configs[2]

# NOTE(review): output_dir and run_name are identical in the FT #A, #B and #C
# sections, so their checkpoints share one directory and their W&B runs share
# one display name. Confirm that is intended.
training_args = TrainingArguments(
    # where results go
    output_dir="./llama3-java-finetune",
    run_name='quantized',
    report_to="wandb",
    # optimisation
    num_train_epochs=1,
    learning_rate=run_cfg['lr'],
    gradient_accumulation_steps=run_cfg['accum_steps'],
    # per_device_train_batch_size=8,
    auto_find_batch_size=True,
    bf16=True,
    # data loading
    dataloader_num_workers=16,
    dataloader_persistent_workers=True,
    # evaluation / checkpointing / logging: float step values below 1 are
    # interpreted by TrainingArguments as a ratio of the total training steps
    eval_strategy='steps',
    eval_steps=0.1,
    eval_on_start=True,
    save_steps=0.2,
    save_total_limit=3,
    logging_steps=0.02,
    logging_first_step=True,
)

# Collator for causal language modelling (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,  # For causal LM
    tokenizer=tokenizer,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=test_set,
)

In [None]:
trainer.train()

In [None]:
wandb.finish()

### RunEval

In [None]:
# Settings describing the locally saved model to evaluate.
args = {
    'model': 'hf/local',
    'model_path': '/root/srf-project/test_dir',
    'device': 'auto',
    'torch_dtype': 'auto'
}

# NOTE(review): `args` is passed positionally here but as `model_args=` in the
# later evaluation cell; confirm both match run_eval's signature
# (pipeline.main, not shown in this notebook).
run_eval('java', args, samples=164)

In [None]:
def get_dataset_new(lang, train_size=10000, test_size=500):
    """Return a fixed-size shuffled sample of accepted CodeNet submissions.

    Loads the ``train`` split of the ``iNeil77/CodeNet`` configuration named
    ``lang``, keeps the ``p_id``, ``language``, ``status`` and ``code``
    columns, drops rows whose ``status`` is not ``'Accepted'``, shuffles with
    a fixed seed (47) and returns the first ``train_size + test_size`` rows.

    With the defaults this is 10500 rows, the size that was previously
    hard-coded. The optional parameters mirror ``get_datasets`` and keep the
    signature compatible with the earlier definition of this function.

    NOTE: this redefines the ``get_dataset_new`` from the setup section, which
    returned the full, unshuffled accepted set. After this cell runs, the
    name refers to this sampled version.
    """
    dataset = load_dataset('iNeil77/CodeNet', lang, split='train')
    dataset = dataset.select_columns(['p_id', 'language', 'status', 'code'])
    dataset = dataset.filter(lambda x: x['status']=='Accepted')

    n_rows = train_size + test_size
    shuffled = dataset.shuffle(seed=47).select(range(n_rows))

    return shuffled

In [None]:
dataset = get_dataset_new('Java')

In [None]:
out = dataset[300]['code']
print(out)

### Trial Sweep

In [None]:
# Fixed adapter settings for the trial sweep (not part of the search space).
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebinds `model` to the adapter-wrapped model, so this cell is not idempotent:
# running it again wraps the already-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
# Baseline arguments for the sweep; the searched hyper-parameters
# (learning rate, batch size, accumulation steps) start from these values.
training_args = TrainingArguments(
    # where results go
    output_dir="./llama3-java-finetune",
    run_name='temp_run',
    report_to="wandb",
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    bf16=True,
    # data loading
    dataloader_num_workers=16,
    dataloader_persistent_workers=True,
    # evaluation / checkpointing / logging: float step values below 1 are
    # interpreted by TrainingArguments as a ratio of the total training steps
    eval_strategy='steps',
    eval_steps=0.1,
    eval_on_start=True,
    save_steps=0.2,
    save_total_limit=3,
    logging_steps=0.02,
    logging_first_step=True,
)

# Collator for causal language modelling (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,  # For causal LM
    tokenizer=tokenizer,
)

In [None]:
def model_init(trial):
    """Build a fresh LoRA-wrapped model for one hyper-parameter trial.

    Trainer calls this for every trial so each starts from the same initial
    weights. ``trial`` is supplied by the search backend and is not used.

    The checkpoint named by ``model_name`` is reloaded and then wrapped with
    the notebook-level ``lora_config``. Previously the bare checkpoint was
    returned, so trials did not train the LoRA adapter that every other run in
    this notebook uses (and ``model_name`` is a pre-quantized 4-bit
    checkpoint, which is not meant to be fine-tuned without an adapter).
    """
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype="auto",
    )
    return get_peft_model(base_model, lora_config)


# The model comes from `model_init` rather than a fixed instance so that the
# hyper-parameter search can re-create it for each trial.
trainer = Trainer(
    # model=model,
    model_init=model_init,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    data_collator=data_collator,
)

In [None]:
def wandb_hp_space(trial):
    """Return the W&B sweep configuration used for the hyper-parameter search.

    ``trial`` is accepted for the backend's callback signature and is unused.
    The sweep samples randomly and minimizes the metric named ``objective``
    over learning rate, per-device batch size and gradient accumulation steps.
    """
    objective = {"name": "objective", "goal": "minimize"}
    search_space = {
        "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
        "per_device_train_batch_size": {"values": [4, 8]},
        "gradient_accumulation_steps": {"values": [1, 2, 4, 8]},
        # LoRA settings are currently excluded from the search:
        # "r": {"values": [2, 4, 8, 16]},
        # "lora_alpha": {"values": [16, 32, 64, 128]},
        # "lora_dropout": {"min": 0.0, "max": 0.2},
    }
    return {"method": "random", "metric": objective, "parameters": search_space}

In [None]:
# Run 4 trials of the W&B-backed search over `wandb_hp_space`, minimizing the
# objective. No custom compute_objective is passed, so the Trainer default is used.
best_trials = trainer.hyperparameter_search( 
    direction="minimize",
    backend="wandb",
    hp_space=wandb_hp_space,
    n_trials=4,
    # compute_objective=compute_objective,
)

### Misc

In [None]:
trainer.train()

In [None]:
wandb.finish()

In [None]:
# Settings pointing the evaluation at a saved fine-tuning checkpoint.
args = {
    'model': 'hf/local',
    'model_path': '/root/srf-project/llama3-java-finetune/checkpoint-1250',
    'device': 'auto',
    'torch_dtype': 'auto'
}

# NOTE(review): the earlier evaluation cell passes `args` positionally with
# `samples=164`; this one uses `model_args=` and `epochs=3`. Confirm both match
# run_eval's signature (pipeline.main, not shown in this notebook).
run_eval('java', model_args=args, epochs=3)

### GPU Details

In [None]:
import torch as t
import gc

# Free and total device memory, in bytes, as reported by CUDA.
free_memory, total_memory = t.cuda.mem_get_info()

# Convert bytes to GB (1 GB = 1024**3 bytes here)
bytes_per_gb = 1024 ** 3
free_memory_gb = free_memory / bytes_per_gb
total_memory_gb = total_memory / bytes_per_gb
mem_used = t.cuda.device_memory_used() / bytes_per_gb

for label, value in (
    ("Free GPU Memory", free_memory_gb),
    ("Total GPU Memory", total_memory_gb),
    ("Memory Used", mem_used),
):
    print(f"{label}: {value:.2f} GB")

In [None]:
# PyTorch allocator statistics, converted from bytes to MB (1024**2 bytes).
print(t.cuda.memory_allocated() / 1024**2, "MB allocated")
print(t.cuda.memory_reserved() / 1024**2, "MB reserved")

In [None]:
# Same allocator statistics as the previous cell (bytes -> MB); re-run to
# compare the numbers at a different point in the session.
print(t.cuda.memory_allocated() / 1024**2, "MB allocated")
print(t.cuda.memory_reserved() / 1024**2, "MB reserved")