<a href="https://colab.research.google.com/github/abdelrhmannaser845/the_ai_model_of_grad_project/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install -U transformers accelerate peft bitsandbytes



In [1]:
import torch as th
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# 1️⃣ Fetch the HTML/CSS instruction dataset from the Hugging Face Hub
ds = load_dataset("IyedLahiani/HTML_CSS_CodeDataSet_100k_formatted_and_split")

# Show the split layout, then the raw prompt and the pre-formatted text of the
# first training row.
print(ds)
first_row = ds["train"][0]
print(first_row["prompt"])
print(first_row["text"])

# 2️⃣ Load Tokenizer & Model (1.3B works on Kaggle 16GB)
model_name = "deepseek-ai/deepseek-coder-1.3b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit NF4 quantization through bitsandbytes. The compute dtype is float16 so
# that the de-quantized matmuls run in the same precision as the fp16 mixed
# precision requested in TrainingArguments (fp16=True); bfloat16 here would
# mix two different half-precision formats in one training run.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.float16,
)

# device_map="auto" lets accelerate place the quantized weights on the
# available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# 3️⃣ Setup LoRA for fine-tuning
# The base model was loaded in 4-bit, so run PEFT's k-bit preparation step
# before attaching adapters: it freezes the base weights and applies the
# dtype / gradient settings PEFT documents for training quantized models
# (by default this also turns on gradient checkpointing).
model = prepare_model_for_kbit_training(model)

# Rank-16 adapters on the attention query/value projections only; biases stay
# frozen (bias="none").
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# 4️⃣ Preprocess dataset
def preprocess(examples):
    """Tokenize one batch of dataset rows for causal-LM training.

    Each training string is the row's ``prompt`` column, a newline, then its
    ``example`` column. Sequences longer than 1024 tokens are truncated.

    Args:
        examples: A batch mapping column names to lists of values (the form
            ``Dataset.map`` passes when ``batched=True``).

    Returns:
        The tokenizer output for the joined strings.
    """
    joined = [
        prompt + "\n" + example_text
        for prompt, example_text in zip(examples["prompt"], examples["example"])
    ]
    return tokenizer(joined, truncation=True, max_length=1024)


# Apply the tokenization to every split, one batch of rows at a time.
tokenized_ds = ds.map(preprocess, batched=True)

# 5️⃣ Setup Trainer
# mlm=False selects causal language modeling collation (no token masking).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Effective train batch size per device is 2 * 4 = 8 sequences
# (per-device batch size times gradient accumulation steps).
training_args = TrainingArguments(
    output_dir="./lora-html-model",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    # Without an explicit strategy the Trainer never evaluates, so eval_steps
    # and eval_dataset were silently ignored.
    # NOTE(review): the validation split printed by this notebook has 10000
    # rows, i.e. 5000 eval batches per pass at batch size 2 -- consider a
    # smaller eval subset or a larger eval_steps if this is too slow.
    eval_strategy="steps",
    eval_steps=200,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    data_collator=data_collator,
)

# 6️⃣ Run the fine-tuning loop
trainer.train()

# 7️⃣ Write the LoRA adapter to disk
adapter_dir = "html_lora_adapter"
model.save_pretrained(adapter_dir)
print("LoRA adapter saved successfully!")

# 8️⃣ Use the fine-tuned model to generate code
model.eval()
prompt = """
Generate a complete, minimal, retro command-line interface (CLI) themed website UI.
Return a single HTML file with embedded <style> and <script>.
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling-based generation; no gradients are needed at inference time.
with th.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=800,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

# For a decoder-only model, generate() returns the prompt tokens followed by
# the new tokens. Decode only the continuation so the instruction text is not
# written into the HTML file.
prompt_length = inputs["input_ids"].shape[1]
result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

with open("generated_page.html", "w", encoding="utf-8") as f:
    f.write(result)

print("HTML file generated successfully from fine-tuned model!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/615 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt_id', 'prompt', 'example', 'text'],
        num_rows: 80000
    })
    validation: Dataset({
        features: ['prompt_id', 'prompt', 'example', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['prompt_id', 'prompt', 'example', 'text'],
        num_rows: 10000
    })
})
Design an HTML form with fields for name, email, and message, styled with CSS.
### Instruction:
Design an HTML form with fields for name, email, and message, styled with CSS.

### Response:

    <!DOCTYPE html>
    <html lang='en'>
    <head>
        <meta charset='UTF-8'>
        <meta name='viewport' content='width=device-width, initial-scale=1.0'>
        <title>Random HTML Form</title>
        <style>
        
        body { font-family: Arial, sans-serif; background-color: #f4f4f4; }
        form { background-color: white; padding: 2em; margin: 2em auto; max-width: 400px; border-radius: 8px; box-shadow: 0 0 10px rgba(0, 0, 0

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]



ImportError: Using `bitsandbytes` 4-bit quantization requires bitsandbytes: `pip install -U bitsandbytes>=0.46.1`