In [None]:
# Step 0 — Install Libraries
!pip install -q transformers accelerate bitsandbytes peft datasets


In [None]:
# Step 1 — Load the Hugging Face dataset
# Source: rjac/e-commerce-customer-support-qa

from datasets import load_dataset

# Select the "train" split straight away so `hf_data` is the split itself.
hf_data = load_dataset("rjac/e-commerce-customer-support-qa")["train"]

# Report the number of rows, then show the first record as the cell output.
print(f"HF Dataset size: {len(hf_data)}")
hf_data[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


HF Dataset size: 1000


{'issue_area': 'Login and Account',
 'issue_category': 'Mobile Number and Email Verification',
 'issue_sub_category': 'Verification requirement for mobile number or email address during login',
 'issue_category_sub_category': 'Mobile Number and Email Verification -> Verification requirement for mobile number or email address during login',
 'customer_sentiment': 'neutral',
 'product_category': 'Appliances',
 'product_sub_category': 'Oven Toaster Grills (OTG)',
 'issue_complexity': 'medium',
 'agent_experience_level': 'junior',
 'agent_experience_level_desc': 'handles customer inquiries independently, possess solid troubleshooting skills, and seek guidance from more experienced team members when needed.',
 'conversation': "Agent: Thank you for calling BrownBox Customer Support. My name is Tom. How may I assist you today?\n\nCustomer: Hi Tom, I'm trying to log in to my account to purchase an Oven Toaster Grill (OTG), but I'm unable to proceed as it's asking for mobile number or email ver

In [None]:
import json

# Hand-written seed Q&A pairs. Each tuple is (question, answer) and is expanded
# into the {"question": ..., "answer": ...} record shape used by the rest of
# the notebook.
initial_data = [
    {"question": prompt, "answer": reply}
    for prompt, reply in (
        ("How can I track my order?",
         "You can track your order by logging into your account and visiting the 'Orders' section."),
        ("What is your return policy?",
         "Our return policy allows returns within 30 days of purchase."),
        ("How do I cancel my order?",
         "To cancel your order, go to your account's 'Orders' section and click 'Cancel' next to the order."),
        ("What payment methods do you accept?",
         "We accept credit/debit cards, PayPal, and other online payment options."),
        ("Do you offer international shipping?",
         "Yes, we ship internationally. Shipping costs vary depending on your location."),
        ("How do I change my shipping address?",
         "You can update your shipping address in your account settings before placing an order."),
        ("How can I contact customer support?",
         "You can contact our support team via email, live chat, or phone during business hours."),
        ("When will my order be delivered?",
         "Delivery times depend on your location and chosen shipping method. You can check estimated delivery in your account."),
        ("Do you offer gift wrapping?",
         "Yes, gift wrapping is available for an additional fee at checkout."),
        ("How do I apply a discount code?",
         "You can enter the discount code during checkout in the 'Promo Code' field."),
        ("What should I do if I received a damaged item?",
         "Please contact our support team immediately with your order number and photos of the damaged item."),
        ("Can I exchange an item?",
         "Yes, exchanges are possible within 30 days of purchase. Contact support to initiate the process."),
        ("How do I reset my password?",
         "Click 'Forgot Password' on the login page and follow the instructions to reset your password."),
    )
]

# Serialize first, then write the text in one go (UTF-8, non-ASCII kept as-is).
with open("customer_support.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(initial_data, ensure_ascii=False, indent=2))

print("Created customer_support.json with your provided data.")


Created customer_support.json with your provided data.


In [None]:
# Step 3 — Normalize the format and merge the datasets

# Read the hand-written Q&A pairs back from the JSON file on disk.
with open("customer_support.json", mode="r", encoding="utf-8") as f:
    user_data = json.loads(f.read())

# Convert the Hugging Face records into {"question", "answer"} pairs.
def _knowledge_entries(qa_json):
    """Return the list of knowledge entries in a parsed 'qa' payload, or [] if none.

    Two layouts are accepted:
      * a top-level "knowledge" list, and
      * a JSON-schema-style wrapper where the same entries sit under
        "properties" -> "knowledge" -> "items".
    Anything else (including a non-dict payload) yields an empty list.
    """
    if not isinstance(qa_json, dict):
        return []

    top_level = qa_json.get("knowledge")
    if isinstance(top_level, list) and top_level:
        return top_level

    wrapper = qa_json.get("properties")
    nested = wrapper.get("knowledge") if isinstance(wrapper, dict) else None
    items = nested.get("items") if isinstance(nested, dict) else None
    if isinstance(items, list) and items:
        return items
    return []


hf_list = []
for i, item in enumerate(hf_data):
    raw_qa = item["qa"]
    # Show only the start of the payload in warnings; full payloads flood the cell output.
    preview = str(raw_qa)[:200]
    try:
        qa_json = json.loads(raw_qa)  # 'qa' is stored as a JSON string
        entries = _knowledge_entries(qa_json)
        if not entries:
            print(f"Warning: Skipping item {i} due to missing or empty 'knowledge' key in 'qa': {preview}...")
            continue
        # Only the first knowledge entry of each record is used.
        first = entries[0]
        hf_list.append({
            "question": first["customer_summary_question"],
            "answer": first["agent_summary_solution"]
        })
    except json.JSONDecodeError:
        print(f"Warning: Skipping item {i} due to JSONDecodeError for 'qa': {preview}...")
        continue
    except KeyError as e:
        print(f"Warning: Skipping item {i} due to KeyError: {e} in 'qa': {preview}...")
        continue
    except TypeError as e:
        # Raised when 'qa' is not a string (e.g. None) or an entry is not a mapping.
        print(f"Warning: Skipping item {i} due to TypeError: {e} in 'qa': {preview}...")
        continue

# Keep only the two fields used downstream; a record lacking either key raises KeyError.
clean_user = [
    {"question": record["question"], "answer": record["answer"]}
    for record in user_data
]

# Merge: hand-written pairs first, then the pairs extracted from the Hugging Face data.
merged = [*clean_user, *hf_list]

print(f"Total merged size: {len(merged)}")

# Persist the combined list as JSON so later steps can reload it from disk.
with open("merged_customer_support.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(merged, ensure_ascii=False, indent=2))

  "$defs": {
    "CustomerAgent": {
      "properties": {
        "customer_summary_question": {
          "title": "Customer Summary Question",
          "type": "string"
        },
        "agent_summary_solution": {
          "title": "Agent Summary Solution",
          "type": "string"
        }
      },
      "required": [
        "customer_summary_question",
        "agent_summary_solution"
      ],
      "title": "CustomerAgent",
      "type": "object"
    }
  },
  "properties": {
    "knowledge": {
      "items": [
        {
          "customer_summary_question": "What is the tracking information for my order number 789012?",
          "agent_summary_solution": "The order was returned to the warehouse due to a courier service issue. The customer's shipping address has been updated, and the order will be reshipped within 24 hours. A new tracking number will be provided via email once the order has been shipped."
        },
        {
          "customer_summary_question": "Can I 

In [None]:
# Step 5 — Load the model, attach LoRA adapters, and set up for training
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import Dataset

model_name = "Qwen/Qwen2.5-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS only when the tokenizer has no pad token of its own, so an
# existing dedicated pad token is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Choose the 4-bit compute dtype from the hardware: GPUs with compute capability
# 8.0+ (Ampere and newer) support bfloat16 natively; older GPUs use float16.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    else torch.float16
)

# Load the base model quantized to 4-bit (NF4 with double quantization).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config
)

# Attach LoRA adapters to the attention projection layers.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 29,491,200 || all params: 3,115,429,888 || trainable%: 0.9466


In [None]:
# Step 6 — Prepare the data for training
# Reload the merged Q&A pairs from disk and wrap the list of dicts in a Dataset.
with open("merged_customer_support.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

dataset = Dataset.from_list(data)

def preprocess(batch):
    """Tokenize a batch of Q&A pairs into fixed-length causal-LM training examples.

    Args:
        batch: Mapping with parallel "question" and "answer" lists (the batched
            form passed in by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal the
        input ids at real-token positions and -100 at padding positions.
    """
    # End every example with EOS so the end of the answer is an explicit training target.
    eos = tokenizer.eos_token or ""
    text = [
        f"Question: {q}\nAnswer: {a}{eos}"
        for q, a in zip(batch["question"], batch["answer"])
    ]
    tokenized = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256
    )
    # -100 is the label value the loss ignores. Masking by attention_mask (not by
    # token id) keeps a real EOS as a target even when pad and EOS share an id.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize in batches and drop the original text columns from the result.
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names,
)


Map:   0%|          | 0/1005 [00:00<?, ? examples/s]

In [None]:

# Step 7 — Training
import torch
from transformers import TrainingArguments, Trainer

# Choose the mixed-precision mode from the hardware instead of hard-coding fp16:
# bf16 on GPUs with compute capability 8.0+ (Ampere and newer), fp16 otherwise.
# Hard-coded fp16 disagrees with a bfloat16 compute dtype on GPUs that support bf16.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./qwen_qa_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # effective batch size per device: 2 * 2 = 4
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=not use_bf16,
    save_strategy="epoch",
    logging_steps=20,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()


Step,Training Loss
20,14.4725
40,1.2683
60,0.5355
80,0.4627
100,0.4608
120,0.4679
140,0.4351
160,0.3991
180,0.4414
200,0.43


TrainOutput(global_step=756, training_loss=0.7778247714673401, metrics={'train_runtime': 1106.9299, 'train_samples_per_second': 2.724, 'train_steps_per_second': 0.683, 'total_flos': 1.29866632003584e+16, 'train_loss': 0.7778247714673401, 'epoch': 3.0})

In [None]:

# Step 8 — Save the final model
# Write the PEFT-wrapped model and the tokenizer files into the same folder.
model.save_pretrained(save_directory="./final_qwen_lora")
tokenizer.save_pretrained(save_directory="./final_qwen_lora")


('./final_qwen_lora/tokenizer_config.json',
 './final_qwen_lora/special_tokens_map.json',
 './final_qwen_lora/chat_template.jinja',
 './final_qwen_lora/vocab.json',
 './final_qwen_lora/merges.txt',
 './final_qwen_lora/added_tokens.json',
 './final_qwen_lora/tokenizer.json')

In [None]:

# Step 9 — Test the model
def answer(question):
    """Generate an answer to `question` with the fine-tuned model.

    Args:
        question: The customer question as plain text.

    Returns:
        The text generated after the "Answer:" prompt, with surrounding
        whitespace stripped.
    """
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Put the model in eval mode so dropout (including LoRA dropout) is off,
    # in case it is still in training mode.
    model.eval()
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,  # temperature / top_p only apply when sampling
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" returns the wrong part when the generation contains that marker.
    new_tokens = output[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# Questions used for a quick manual check of the fine-tuned model.
test_questions = [
    "How can I return a product?",
    "Do you ship internationally?",
    "What is your refund policy?",
]

# Print each question, the generated answer, and a 40-dash divider.
for q in test_questions:
    print(f"Q: {q}")
    print(f"A: {answer(q)}")
    print(40 * "-")

Q: How can I return a product?
A: You can return the faulty microwave oven, which is covered under warranty, within 30 days of purchase. The return process is seamless and hassle-free, and we will provide you with a pickup code for your convenience.
----------------------------------------
Q: Do you ship internationally?
A: Yes, I ship internationally to all countries.
----------------------------------------
Q: What is your refund policy?
A: I cannot process the refund for the wrong-sized headphones because it's an international order and the seller is not available to make a refund. However, I can assist with finding alternative solutions like shipping the correct size or offering a discount on future purchases.
----------------------------------------


In [None]:
# ================= Upload Final Model to Google Drive ==================

import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy the saved folder into Drive. Unlike `!mkdir` / `!cp`, copytree raises on
# failure, so the message below prints only after a successful copy.
# dirs_exist_ok=True allows re-running the cell when the target already exists.
shutil.copytree(
    "./final_qwen_lora",
    "/content/drive/MyDrive/final_qwen_model",
    dirs_exist_ok=True,
)

print("Model uploaded to Google Drive → MyDrive/final_qwen_model")


Mounted at /content/drive
Model uploaded to Google Drive → MyDrive/final_qwen_model
