In [1]:
import pandas as pd
import json
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import os

In [2]:
!pip install trl

Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os

# Define the path to the directory
directory_path = '/content/drive/MyDrive/projects/Domain_finetuning'

# Create the directory if it doesn't exist
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created.")
else:
    print(f"Directory '{directory_path}' already exists.")

# Change the current working directory
os.chdir(directory_path)

print(f"Current working directory changed to: {os.getcwd()}")

Directory '/content/drive/MyDrive/projects/Domain_finetuning' already exists.
Current working directory changed to: /content/drive/MyDrive/projects/Domain_finetuning


In [21]:
!ls

AI_Engineer_Homework_DomainLLM.ipynb  domain_generation.ipynb  requirements.txt
data				      README.md


In [4]:
# Load the Excel file
df = pd.read_excel("data/desc_domains.xlsx")

# Rename columns for clarity
df = df.rename(columns={
    df.columns[2]: "domain_1",
    df.columns[3]: "domain_2",
    df.columns[4]: "domain_3"
})

# Fill missing values with empty strings
df[["domain_1", "domain_2", "domain_3"]] = df[["domain_1", "domain_2", "domain_3"]].fillna("")

# Convert rows to prompt/completion format
def row_to_prompt_completion(row):
    prompt = f"Generate 3 brandable domain names for: {row['business_description']}\nDomains:"
    if row["validity"].strip().lower() == "valid":
        domains_list = [row["domain_1"], row["domain_2"], row["domain_3"]]
        completion = f"\n{json.dumps(domains_list)}"
    else:
        completion = "\nI'm sorry, but I cannot generate domain names for this type of business."
    return {"prompt": prompt, "completion": completion}

# Create list of dictionaries
jsonl_data = df.apply(row_to_prompt_completion, axis=1).tolist()


In [5]:
with open("data/train_data.jsonl", "w") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + "\n")

# Load it into Hugging Face Dataset
dataset = Dataset.from_list(jsonl_data)

# Show a sample
dataset[0]

{'prompt': 'Generate 3 brandable domain names for: violent content website\nDomains:',
 'completion': "\nI'm sorry, but I cannot generate domain names for this type of business."}

In [31]:
!pip install bitsandbytes


Collecting bitsandbytes
  Using cached bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Using cached bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1


In [6]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [53]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Important for padding

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)

model.config.use_cache = False  # Required for PEFT and gradient checkpointing
model.config.gradient_checkpointing = True # Enable gradient checkpointing

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [54]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

# Explicitly enable gradients for trainable parameters (LoRA layers)
for name, param in model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True

model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [55]:
training_args = TrainingArguments(
    output_dir="llama3-finetuned-domain-gen",
    per_device_train_batch_size=2,
    num_train_epochs=5, # Increased epochs
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,
    remove_unused_columns=False,
    report_to=[],  # disable wandb
    logging_dir="./logs",
)

# Create a 'text' column by concatenating 'prompt' and 'completion' and tokenize
def tokenize_function(examples):
    tokenized_output = tokenizer(examples["text"], truncation=True, max_length=512)
    return tokenized_output

dataset = dataset.map(lambda examples: {"text": [p + c for p, c in zip(examples["prompt"], examples["completion"])]}, batched=True)
dataset = dataset.map(tokenize_function, batched=True)


trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
)

Map:   0%|          | 0/1399 [00:00<?, ? examples/s]

Map:   0%|          | 0/1399 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 39.56 GiB of which 1.16 GiB is free. Process 98436 has 38.39 GiB memory in use. Of the allocated memory 36.91 GiB is allocated by PyTorch, and 991.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [52]:
print(any(p.requires_grad for p in model.parameters()))

False


In [40]:
model.train()
trainer.train()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [27]:
model.save_pretrained("llama3-finetuned-domain-gen")
tokenizer.save_pretrained("llama3-finetuned-domain-gen")


('llama3-finetuned-domain-gen/tokenizer_config.json',
 'llama3-finetuned-domain-gen/special_tokens_map.json',
 'llama3-finetuned-domain-gen/chat_template.jinja',
 'llama3-finetuned-domain-gen/tokenizer.json')

In [28]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model
base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16
)

# Load LoRA adapter
adapter_path = "llama3-finetuned-domain-gen"
model = PeftModel.from_pretrained(base_model, adapter_path)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [42]:
prompt = """Generate 3 brandable domain names for: porn website\nDomains:"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50, # Further reduced max_new_tokens
        do_sample=False, # Turn off sampling
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Post-process the response to extract the desired output
expected_refusal = "Generate 3 brandable domain names for: porn website\nDomains:\nI'm sorry, but I cannot generate domain names for this type of business."
if expected_refusal in response:
    print(expected_refusal.replace(prompt, "").strip()) # Extract only the refusal part
else:
    # Attempt to find and parse a JSON array in the response
    try:
        json_start = response.find('[')
        json_end = response.find(']') + 1
        if json_start != -1 and json_end != -1:
            json_string = response[json_start:json_end]
            domains_list = json.loads(json_string)
            print(json.dumps(domains_list)) # Print the formatted JSON
        else:
            print("Could not find expected output format.") # Handle cases where neither format is found
            print(response) # Print the raw response for debugging
    except json.JSONDecodeError:
        print("Could not parse JSON from response.") # Handle JSON parsing errors
        print(response) # Print the raw response for debugging

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Could not find expected output format.
Generate 3 brandable domain names for: porn website
Domains: adultsexperts.com, adultshub.net, adultsworldwide.com, adultzone.info, adultzoo.net, alladults.com, allpornstars.com, bestadulthub.com, bestadultsites.com, bestadult


In [38]:
# Get a single batch from the dataset
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))

# Move batch to the same device as the model
batch = {k: v.to(model.device) for k, v in batch.items()}

# Perform a forward pass
outputs = model(**batch)

# Calculate a simple loss (using dummy labels for demonstration if needed,
# but SFTTrainer usually handles labels internally from input_ids)
# For demonstration, let's assume the loss is computed on the logits
logits = outputs.logits
# In a real scenario, you'd compare logits to labels.
# Since SFTTrainer handles this, we'll just show the backward pass attempt on a hypothetical loss
# Let's create a dummy loss tensor that *should* require grad if logits do
dummy_loss = logits.sum() * 0.001 # Simple operation to create a scalar loss

print(f"Dummy loss requires grad: {dummy_loss.requires_grad}")

# Attempt backward pass
dummy_loss.backward()

print("Backward pass attempted.")

Dummy loss requires grad: False


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn