In [None]:
import os
import re

import bitsandbytes as bnb
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import torch
import wandb
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
    pipeline,
)
from trl import SFTConfig, SFTTrainer
# Authenticate against the Hugging Face Hub without embedding a secret in the notebook.
# The access token is read from the HF_TOKEN environment variable; when the variable
# is unset, `token=None` makes huggingface_hub fall back to its interactive prompt.
# NOTE(review): the token that used to be hardcoded on this line has been exposed to
# everyone with access to this notebook's history and should be revoked.
login(token=os.environ.get("HF_TOKEN"))

In [None]:

# Read the question/answer table from the Excel file in the working directory.
# formatting_prompts_func reads its "Question" and "Answer" columns.
dataset = pd.read_excel("train_split.xlsx")
# System instruction that formatting_prompts_func inserts verbatim into the system
# turn of every training example. The literal starts and ends with a newline
# (triple-quoted string), and those newlines are part of the text that is emitted.
system_prompt = """
You are a Palestinian historian and geographer specialising in the land, people, and cultural heritage of Palestine.

General task
Answer questions or carry out instructions accurately and neutrally while preserving the authentic Palestinian narrative.
Use Modern Standard Arabic unless the user asks for another language.

Narrative-integrity rules
Give the Arabic place-name first; add other names in brackets when needed.
Avoid terms that erase Palestinian identity (e.g. “Judea and Samaria”) unless requested for comparison.
Present the Palestinian account first, then mention other viewpoints briefly if relevant.
Replace every standalone occurrence of “Israel” with “the Israeli occupation” or “the occupying entity”.

Accuracy rules
Admit when you don't know the answer. If the information is not in the provided context or your internal knowledge, state that you cannot provide a factual answer.

Output format
If asked to “extract a span”, return that text only.
Otherwise:
• Optional one-line intro, then short bullets or mini paragraphs.
Do not exceed 150 words unless the user asks for more.

Uncertainty
If no trustworthy data exist, state: “No reliable data available as of the last update.”
"""
def formatting_prompts_func(batch):
    """Render a batch of question/answer rows as chat-formatted training strings.

    Args:
        batch: Mapping that provides parallel "Question" and "Answer" sequences.

    Returns:
        A list with one string per (question, answer) pair. Each string is the
        concatenation of a system turn (carrying the module-level
        ``system_prompt``), a user turn and a chatbot turn, every turn wrapped
        in the START/END_OF_TURN markers. Pairs are formed with ``zip``, so
        the result is as long as the shorter of the two sequences.
    """

    def as_chat(question, answer):
        # Three adjacent f-strings are joined into a single line-break-free string.
        return (
            f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{system_prompt}<|END_OF_TURN_TOKEN|>"
            f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{question}<|END_OF_TURN_TOKEN|>"
            f"<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{answer}<|END_OF_TURN_TOKEN|>"
        )

    qa_pairs = zip(batch["Question"], batch["Answer"])
    return [as_chat(question, answer) for question, answer in qa_pairs]

In [None]:
# Hub id of the base checkpoint; used for both the model and the tokenizer.
MODEL_NAME = "CohereForAI/aya-expanse-8b"

# Hyperparameter configuration

# --- Memory / throughput ---
QUANTIZE_4BIT = True            # load the base model through a 4-bit BitsAndBytesConfig
USE_GRAD_CHECKPOINTING = True   # forwarded to SFTConfig(gradient_checkpointing=...)
USE_FLASH_ATTENTION = True      # NOTE(review): not read by any cell visible in this notebook
TRAIN_MAX_SEQ_LENGTH = 1024     # forwarded to SFTConfig(max_seq_length=...)
GROUP_BY_LENGTH = True          # batching switch intended for SFTConfig(group_by_length=...)

# --- Batch sizes ---
# TRAIN_BATCH_SIZE * GRAD_ACC_STEPS = 256 sequences per device per optimizer step.
TRAIN_BATCH_SIZE = 4
GRAD_ACC_STEPS = 64
PER_DEVICE_EVAL_BATCH_SIZE = 4
# Legacy mixed-case spelling, kept so existing references keep working.
PER_DEVICE_EVAL_BATCH_Size = PER_DEVICE_EVAL_BATCH_SIZE

# --- Optimisation schedule ---
NUM_TRAIN_EPOCHS = 1
LEARNING_RATE = 4e-4
OPTIM = "paged_adamw_8bit"
WARMUP_RATIO = 0.08
WEIGHT_DECAY = 0.001

# --- LoRA adapter ---
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.1

In [None]:

# This cell used to be a byte-for-byte copy of the data-loading / prompt cell near
# the top of the notebook: it re-read train_split.xlsx and re-declared
# `system_prompt` and `formatting_prompts_func`. Keeping two copies means the later
# one silently shadows the earlier one as soon as either is edited, so the duplicate
# definitions are removed and the names defined in that earlier cell are reused.
#
# What remains is a cheap schema check: formatting_prompts_func indexes the
# "Question" and "Answer" columns, so fail here with a clear message rather than
# with a KeyError in the middle of trainer setup.
REQUIRED_COLUMNS = ("Question", "Answer")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in dataset.columns]
if missing_columns:
    raise ValueError(
        f"train_split.xlsx is missing required column(s): {missing_columns}"
    )

In [None]:
# Load Model
# Quantization settings are only built when QUANTIZE_4BIT is set; otherwise None is
# passed through and the checkpoint is loaded without a quantization config.
quantization_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    if QUANTIZE_4BIT
    else None
)

# device_map="auto" leaves weight placement to the library.
# NOTE(review): USE_FLASH_ATTENTION is defined in the config cell but is not
# forwarded here — confirm whether that is intentional.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:

# Tokenizer is loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Hold out 20% of the rows for evaluation. Shuffling with a fixed random_state
# keeps the split identical across kernel restarts.
# (`train_test_split` is imported with the other dependencies at the top of the
# notebook rather than in the middle of this cell.)
train_df, val_df = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Convert both pandas frames to `datasets.Dataset`; preserve_index=False leaves
# the (shuffled) pandas index out of the resulting datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False)

In [None]:


# Pick the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

# Trainer hyperparameters. Everything tunable comes from the config cell so that
# editing a constant there actually changes the run.
training_arguments = SFTConfig(
    output_dir="results",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    # Previously omitted, so evaluation ran with the library default batch size
    # instead of the value configured for it.
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_Size,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    gradient_checkpointing=USE_GRAD_CHECKPOINTING,
    optim=OPTIM,
    save_steps=50,
    logging_steps=10,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    warmup_ratio=WARMUP_RATIO,
    # Previously hardcoded to True, which made the GROUP_BY_LENGTH constant a no-op.
    group_by_length=GROUP_BY_LENGTH,
    lr_scheduler_type="linear",
    report_to="none",
    max_seq_length=TRAIN_MAX_SEQ_LENGTH,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
)

# LoRA adapter applied to the attention projection modules listed below.
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    r=LORA_R,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

# `model`, `tokenizer`, `train_dataset`, `eval_dataset` and
# `formatting_prompts_func` all come from earlier cells.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=formatting_prompts_func,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run the fine-tuning loop on the `trainer` built in the previous cell.
trainer.train()


In [None]:
# Save the trained model (the LoRA-wrapped `trainer.model`) to disk.
# The target directory must not be MODEL_NAME itself: that would create a local
# folder named exactly like the Hub repo id, and on the next run from the same
# working directory `from_pretrained(MODEL_NAME)` would resolve to that local
# folder instead of the Hub checkpoint. Derive a distinct directory name instead,
# e.g. "aya-expanse-8b-lora-adapter".
# NOTE(review): anything downstream that loaded from the old path must be updated.
ADAPTER_OUTPUT_DIR = MODEL_NAME.rsplit("/", 1)[-1] + "-lora-adapter"
trainer.model.save_pretrained(save_directory=ADAPTER_OUTPUT_DIR)
# Turn the generation cache back on — presumably it is off during training
# because of gradient checkpointing; confirm.
model.config.use_cache = True