<a href="https://colab.research.google.com/github/Yazeedx0/GenAI/blob/main/LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **First LoRA**

In [None]:
!pip install unsloth
!pip install torch

In [None]:
!pip install safetensors

In [None]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='/content/dataset-2K.csv', split='train')

print(dataset[0])


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, PreTrainedTokenizerFast
from peft import get_peft_model, LoraConfig

model = GPT2LMHeadModel.from_pretrained("/content/Model")
tokenizer = PreTrainedTokenizerFast.from_pretrained("/content/Model")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(model, lora_config)

print(model)


In [30]:
def preprocess_data(examples):
    questions = examples['Question']
    answers = examples['Answer']

    inputs = [question + " " + answer for question, answer in zip(questions, answers)]

    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    model_inputs["labels"] = model_inputs["input_ids"]

    return model_inputs

train_dataset = dataset.map(preprocess_data, batched=True)


Map:   0%|          | 0/1806 [00:00<?, ? examples/s]

In [32]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.053
1000,0.0511


TrainOutput(global_step=1356, training_loss=0.05171299543352605, metrics={'train_runtime': 736.1848, 'train_samples_per_second': 7.36, 'train_steps_per_second': 1.842, 'total_flos': 1420588773015552.0, 'train_loss': 0.05171299543352605, 'epoch': 3.0})

In [33]:
model.save_pretrained('/content/Model-LoRA')
tokenizer.save_pretrained('/content/Model-LoRA')


('/content/Model-LoRA/tokenizer_config.json',
 '/content/Model-LoRA/special_tokens_map.json',
 '/content/Model-LoRA/tokenizer.json')

# **Second Train**

In [None]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='/content/Data-HU-23K.csv', split='train')

print(dataset[0])


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, PreTrainedTokenizerFast
from peft import get_peft_model, LoraConfig

model = GPT2LMHeadModel.from_pretrained("/content/Model-LoRA")
tokenizer = PreTrainedTokenizerFast.from_pretrained("/content/Model-LoRA")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(model, lora_config)

print(model)


In [46]:
def preprocess_data(examples):
    questions = examples['Question']
    answers = examples['Answer']

    inputs = [question + " " + answer for question, answer in zip(questions, answers)]

    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    model_inputs["labels"] = model_inputs["input_ids"]

    return model_inputs

train_dataset = dataset.map(preprocess_data, batched=True)


Map:   0%|          | 0/15135 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments, AdamW

training_args = TrainingArguments(
    output_dir="./results-3",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_strategy="epoch",
    fp16=True,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()


In [58]:
model.save_pretrained('/content/Model-LoRA-2')
tokenizer.save_pretrained('/content/Model-LoRA-2')


('/content/Model-LoRA-2/tokenizer_config.json',
 '/content/Model-LoRA-2/special_tokens_map.json',
 '/content/Model-LoRA-2/tokenizer.json')

In [None]:
import gradio as gr
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
import torch

model = GPT2LMHeadModel.from_pretrained("/content/Model-LoRA-2")
tokenizer = PreTrainedTokenizerFast.from_pretrained("/content/Model-LoRA-2")

tokenizer.pad_token = tokenizer.eos_token

def generate_text(prompt, temperature=1.0):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=150,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            do_sample=True
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if generated_text.lower().startswith(prompt.lower()):
        generated_text = generated_text[len(prompt):].strip()

    return generated_text

interface = gr.Interface(
    fn=generate_text,
    inputs=[gr.Textbox(label="Prompt"), gr.Slider(0, 2, step=0.1, value=1.0, label="Temperature")],
    outputs="text",
    title="اهلا بك في مساعد للجامعة الهاشمية تفضل!",
    description="أدخل نصًا للبدء بتوليد نصوص مشابهة بناءً على النموذج المدرب."
)

interface.launch()


# **Thaird LoRA**