In [1]:
import os
os.environ["AMD_SERIALIZE_KERNEL"] = "3" 
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "11.0.0"
os.environ["HIP_VISIBLE_DEVICES"] = "0"

In [2]:
import os
import torch
from datetime import datetime
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

In [3]:
def format_llama_prompt(example):
    """Format data according to Llama's template"""
    return f"[INST] {example['instruction']} [/INST]\n{example['output']}"

In [4]:
def setup_model_and_tokenizer():
    model_name = "unsloth/Llama-3.2-1B-Instruct"
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    
    # Load model with AMD-specific settings
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        use_cache=False,
        torch_dtype=torch.float32,  # Use full precision for AMD
        # Add these for AMD compatibility
        use_flash_attention_2=False,  # Disable flash attention
        rope_scaling=None,  # Disable RoPE scaling
    )
    
    # Configure LoRA
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj"
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    model = get_peft_model(model, lora_config)
    return model, tokenizer

In [5]:
def setup_training_args():
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"./llama_finetuned_{current_time}"
    
    return TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,  # Reduced for 1B model
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=False,  # For AMD GPU
        logging_steps=10,
        save_steps=50,
        save_total_limit=3,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        load_best_model_at_end=True,
        optim="adamw_torch",
        warmup_steps=30,
        evaluation_strategy="steps",
        eval_steps=30,
        per_device_eval_batch_size=2,
        remove_unused_columns=False
    )


In [14]:
def prepare_dataset():
    # Load dataset
    dataset = load_dataset("kejian/arxiv-physics-debug-v0")
    
    def tokenize_function(examples):
        # Format prompts first - create a list of strings
        formatted_texts = [format_llama_prompt({
            "instruction": instruction,
            "output": output
        }) for instruction, output in zip(examples["instruction"], examples["output"])]
        
        # Tokenize the formatted texts
        tokenized = tokenizer(
            formatted_texts,
            truncation=True,
            max_length=2048,
            padding="max_length",
        )
        
        return tokenized
    
    # Split dataset first
    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    
    print("Example of formatted text before tokenization:")
    print(format_llama_prompt(split_dataset["train"][0]))
    
    # Tokenize both splits
    train_tokenized = split_dataset["train"].map(
        tokenize_function,
        batched=True,
        remove_columns=split_dataset["train"].column_names,
        desc="Tokenizing train split"
    )
    
    eval_tokenized = split_dataset["test"].map(
        tokenize_function,
        batched=True,
        remove_columns=split_dataset["test"].column_names,
        desc="Tokenizing validation split"
    )
    
    print("\nTraining set size:", len(train_tokenized))
    print("Validation set size:", len(eval_tokenized))
    
    return train_tokenized, eval_tokenized

In [7]:
global tokenizer  # Add this line
model, tokenizer = setup_model_and_tokenizer()

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [15]:
train_dataset, eval_dataset = prepare_dataset()

Example of formatted text before tokenization:
[INST] Electron phonon transport in one dimensional junctions? [/INST]
Electron-phonon transport in one-dimensional (1D) junctions refers to the transfer of energy and momentum between electrons and phonons in a narrow region of a 1D structure. This is an important phenomenon in nanoelectronics and semiconductor research, where the control and manipulation of electron and phonon transport is critical for developing high-performance devices.

In 1D junctions, phonons (lattice vibrations) can scatter electrons, resulting in energy loss and a reduction in the electron's momentum. This process can result in a decrease in electrical conductivity and an increase in thermal conductivity of the junction.

The strength of electron-phonon interactions in 1D junctions depends on various factors, such as the material properties, temperature, and the length scale of the junction. In general, electron-phonon scattering is more significant in materials w

Tokenizing train split:   0%|          | 0/450 [00:00<?, ? examples/s]

Tokenizing validation split:   0%|          | 0/50 [00:00<?, ? examples/s]


Training set size: 450
Validation set size: 50


In [9]:
lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj"
            ],  # Llama attention modules
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
model = get_peft_model(model, lora_config)

In [18]:
training_args = setup_training_args()
trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,  # Add this line
            data_collator=DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False
            ),
        )



In [19]:
trainer.train()

Step,Training Loss,Validation Loss
50,1.2667,1.323192


TrainOutput(global_step=84, training_loss=1.2797694546835763, metrics={'train_runtime': 2338.0306, 'train_samples_per_second': 0.577, 'train_steps_per_second': 0.036, 'total_flos': 1.6099718731923456e+16, 'train_loss': 1.2797694546835763, 'epoch': 2.986666666666667})

In [27]:
trainer.save_model("/mnt/largefs/models/final_model")

In [30]:
peft_model_path = "/mnt/largefs/models/final_model"
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)
trainer.model.config.save_pretrained(peft_model_path)

In [22]:
from huggingface_hub import create_repo
create_repo("benhaotang/llama3.2-1B-physics-finetuned", private=False)

RepoUrl('https://huggingface.co/benhaotang/llama3.2-1B-physics-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='benhaotang/llama3.2-1B-physics-finetuned')

In [31]:
adapter_path = "/mnt/largefs/models/final_model"  
base_model_name = "unsloth/Llama-3.2-1B-Instruct"
new_model_name = "benhaotang/llama3.2-1B-physics-finetuned"  

In [33]:
print("Loading adapter model...")
from peft import AutoPeftModelForCausalLM
base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            torch_dtype=torch.float32
        )
from peft import PeftModel
model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    device_map="auto"
)

Loading adapter model...


In [34]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [35]:
print("Merging model with adapter...")
merged_model = model.merge_and_unload()

Merging model with adapter...


In [36]:
# Create model card
model_card = f"""
# Llama 3.2 Physics Fine-tuned Model

This model is a fine-tuned version of {base_model_name} on kejian/arxiv-physics-debug-v0.

## Model description
- Base model: {base_model_name}
- Training data: kejian/arxiv-physics-debug-v0
- Fine-tuning type: LoRA
- Use case: Physics domain questions

## Usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{new_model_name}")
tokenizer = AutoTokenizer.from_pretrained("{new_model_name}")

# Example usage
text = "[INST] What is rg flow? [/INST]"
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
"""
        
# Save model card
with open("README.md", "w") as f:
    f.write(model_card)

In [37]:
print("Pushing to Hugging Face Hub...")
merged_model.push_to_hub(
    new_model_name,
    use_auth_token=True,
    max_shard_size="1GB",  # Split into smaller files
    safe_serialization=True
)
tokenizer.push_to_hub(new_model_name, use_auth_token=True)

print(f"Model successfully pushed to: https://huggingface.co/{new_model_name}")

Pushing to Hugging Face Hub...




model-00002-of-00005.safetensors:   0%|          | 0.00/998M [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/973M [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/931M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model successfully pushed to: https://huggingface.co/benhaotang/llama3.2-1B-physics-finetuned


In [38]:
import gc
torch.cuda.empty_cache()
gc.collect()

232

In [45]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("benhaotang/llama3.2-1B-physics-finetuned")
tokenizer = AutoTokenizer.from_pretrained("benhaotang/llama3.2-1B-physics-finetuned")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Example usage
text = "Give me a short intodcution to renormalization group(RG) flow in physcis?\n"
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=2048)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Give me a short intodcution to renormalization group(RG) flow in physcis?
I'll start by explaining the concept of renormalization group flow, then we can discuss the different types of renormalization group flow and their applications.

## Step 1: Introduction to Renormalization Group (RG) Flow
The Renormalization Group (RG) is a theoretical framework used in physics to study the behavior of physical systems at very small distances or high energies. It is a powerful tool for understanding the dynamics of systems that exhibit scale invariance, meaning that their properties remain the same under certain transformations of their parameters.

## Step 2: RG Flow in Physics
In physics, the RG flow is a mathematical transformation that describes how physical parameters, such as energy or length, evolve over time or space in a system. It is typically represented as a map between a set of initial parameters and a set of final parameters, where the map is defined by a set of equations that descr