In [2]:
# Kill all processes on the GPU.
# `fuser -k` kills every process that has one of the /dev/nvidia* device files
# open (SIGKILL unless another signal is given); `-v` lists what was found.
# NOTE(review): this would presumably also kill this kernel if it already
# holds the GPU, so it only makes sense as the first cell of a fresh session.
!fuser -v /dev/nvidia* -k

In [3]:
%%capture
# Install required libraries (optimized for Colab/Kaggle notebooks)
import os
if 'COLAB_' not in ''.join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth

In [7]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from unsloth import FastLanguageModel
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTTrainer
from peft import LoraConfig, AutoPeftModelForCausalLM
from huggingface_hub import snapshot_download
from safetensors.torch import load_file
from pprint import pprint

In [4]:
# ---------------------------------------------------------------------------
# Project settings
# ---------------------------------------------------------------------------
seed = 69  # project-wide seed value

# ---------------------------------------------------------------------------
# Model settings
# NOTE(review): none of the three values below is referenced by the cells
# visible in this notebook -- confirm whether they are still needed.
# ---------------------------------------------------------------------------
max_seq_length = 1024
# None means auto-detect; otherwise float16 for Tesla T4 / V100 and bfloat16
# for Ampere or newer GPUs.
dtype = None
# 4-bit quantization lowers memory usage; may be set to False.
load_in_4bit = True

# ---------------------------------------------------------------------------
# LoRA adapter
# ---------------------------------------------------------------------------
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-en-LoRA-v20250305134947'
# Local folder name = the part of the repo id after the last '/'.
lora_dir = hf_lora_id.rsplit('/', 1)[-1]

In [None]:
# Download the trained LoRA adapter repository `hf_lora_id` from the
# Hugging Face Hub into the local folder `lora_dir`.
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    # Optional: uncomment to skip files that live under `checkpoint-*/` folders.
    # ignore_patterns='checkpoint-*/*',
)

In [7]:
# Read the adapter configuration saved in `lora_dir` and pretty-print all of
# its attributes for inspection.
lora_config = LoraConfig.from_pretrained(lora_dir)
pprint(vars(lora_config))

{'_custom_modules': None,
 'alpha_pattern': {},
 'auto_mapping': None,
 'base_model_name_or_path': 'unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit',
 'bias': 'none',
 'eva_config': None,
 'exclude_modules': None,
 'fan_in_fan_out': False,
 'inference_mode': True,
 'init_lora_weights': True,
 'layer_replication': None,
 'layers_pattern': None,
 'layers_to_transform': None,
 'loftq_config': {},
 'lora_alpha': 16,
 'lora_bias': False,
 'lora_dropout': 0,
 'megatron_config': None,
 'megatron_core': 'megatron.core',
 'modules_to_save': None,
 'peft_type': <PeftType.LORA: 'LORA'>,
 'r': 8,
 'rank_pattern': {},
 'revision': None,
 'runtime_config': LoraRuntimeConfig(ephemeral_gpu_offload=False),
 'target_modules': {'down_proj',
                    'gate_proj',
                    'k_proj',
                    'o_proj',
                    'q_proj',
                    'up_proj',
                    'v_proj'},
 'task_type': 'CAUSAL_LM',
 'use_dora': False,
 'use_rslora': False}


In [6]:
# Load the causal-LM model from the adapter folder `lora_dir` via PEFT's
# auto class.
# NOTE(review): `dtype`, `load_in_4bit` and `max_seq_length` from the config
# cell are not passed here -- confirm that relying on the defaults is intended.
model = AutoPeftModelForCausalLM.from_pretrained(lora_dir)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [8]:
# Load the tokenizer from the 'unsloth/Meta-Llama-3.1-8B' Hub repository.
# NOTE(review): this id is hard-coded and is not the base model recorded in
# the adapter config ('unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit');
# presumably both repositories ship the same tokenizer -- confirm.
tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-8B')

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [10]:
# Put the model on the GPU and switch it to evaluation mode in one step
# (both calls hand back the module itself, so they can be chained).
model = model.to('cuda').eval()

In [11]:
# Smoke test: let the adapted model continue a fixed prompt and print the text.
inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt")

# Seed the RNGs with the configured project seed so the continuation is
# reproducible in case the model's generation config enables sampling.
torch.manual_seed(seed)

# Pass the attention mask produced by the tokenizer along with the input ids;
# omitting it makes `generate` guess the mask and emit a reliability warning.
outputs = model.generate(
    input_ids=inputs["input_ids"].to("cuda"),
    attention_mask=inputs["attention_mask"].to("cuda"),
    max_new_tokens=50,
)

# `batch_decode` accepts the output tensor directly, so no detach/cpu/numpy
# round-trip is needed. `[0]` selects the single sequence in the batch.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

Preheat the oven to 350 degrees and place the cookie dough on a baking sheet.  Bake for 10-12 minutes, until the edges are just barely golden.  Remove from the oven and allow to cool for 5 minutes.  Remove the cookies from the baking sheet and place on a wire rack
