In [1]:
!pip install datasets



In [2]:

from datasets import load_dataset

data_files = {
    "train": "train-open.json",
    "validation": "val-open.json",
    "test": "test-open.json"
}
dataset = load_dataset("json", data_files=data_files)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!pip install transformers



In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")

# Assign a padding token
tokenizer.pad_token = tokenizer.eos_token


In [5]:
def preprocess_function(examples):
    inputs = examples["question"]
    targets = examples["answer"]

    model_inputs = tokenizer(
        inputs, max_length=128, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        targets, max_length=128, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [6]:
!pip install torch



In [7]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-base")


In [8]:
!pip install accelerate



In [9]:
!pip install --upgrade accelerate



In [11]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./model1",          # Save directory
    evaluation_strategy="epoch",    # Evaluate at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Reduce batch size
    num_train_epochs=1,             # Reduce number of epochs
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",           # Directory for logs
    logging_steps=10,
    warmup_steps=100,               # Warmup steps for LR scheduler
    gradient_accumulation_steps=4,  # Increase gradient accumulation steps
    fp16=True,                      # Enable mixed precision
    run_name="QA-Model"             # Experiment name
)



In [12]:
!pip install wandb



In [13]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [14]:
import torch 
print(torch.cuda.is_available())

True


In [15]:
from transformers import Trainer, DataCollatorWithPadding

# Prepare data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Initialize Trainer with a smaller subset of the dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(1000)),  # Use a subset of the dataset
    eval_dataset=tokenized_datasets["validation"].select(range(200)),  # Use a subset of the dataset
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmohammedberrhazi003[0m ([33mmohammedberrhazi003-universit-internationale-de-rabat[0m). Use [1m`wandb login --relogin`[0m to force relogin


  8%|▊         | 10/125 [07:09<1:21:32, 42.55s/it]

{'loss': 32.9763, 'grad_norm': 553.0879516601562, 'learning_rate': 2.5e-06, 'epoch': 0.08}


 16%|█▌        | 20/125 [14:20<1:15:14, 42.99s/it]

{'loss': 25.641, 'grad_norm': 555.3907470703125, 'learning_rate': 7.5e-06, 'epoch': 0.16}


 24%|██▍       | 30/125 [21:26<1:07:00, 42.32s/it]

{'loss': 10.8084, 'grad_norm': 78.3990478515625, 'learning_rate': 1.2e-05, 'epoch': 0.24}


 32%|███▏      | 40/125 [28:34<1:00:38, 42.81s/it]

{'loss': 6.4925, 'grad_norm': 28.152421951293945, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.32}


 40%|████      | 50/125 [35:45<54:04, 43.25s/it]  

{'loss': 4.9401, 'grad_norm': 11.791635513305664, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.4}


 48%|████▊     | 60/125 [43:16<48:14, 44.54s/it]

{'loss': 4.9333, 'grad_norm': 12.258994102478027, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.48}


 56%|█████▌    | 70/125 [50:40<40:43, 44.43s/it]

{'loss': 5.1303, 'grad_norm': 12.370987892150879, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.56}


 64%|██████▍   | 80/125 [58:51<40:47, 54.40s/it]

{'loss': 4.114, 'grad_norm': 18.5513973236084, 'learning_rate': 3.7e-05, 'epoch': 0.64}


 72%|███████▏  | 90/125 [1:07:51<27:56, 47.91s/it]

{'loss': 4.778, 'grad_norm': 7.627808570861816, 'learning_rate': 4.2e-05, 'epoch': 0.72}


 80%|████████  | 100/125 [1:16:24<19:52, 47.70s/it]

{'loss': 4.6031, 'grad_norm': 8.239971160888672, 'learning_rate': 4.7e-05, 'epoch': 0.8}


 88%|████████▊ | 110/125 [1:23:22<10:14, 40.96s/it]

{'loss': 4.3139, 'grad_norm': 5.128443241119385, 'learning_rate': 4.2e-05, 'epoch': 0.88}


 96%|█████████▌| 120/125 [1:30:08<03:22, 40.56s/it]

{'loss': 4.5437, 'grad_norm': 11.689654350280762, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.96}


                                                   
100%|██████████| 125/125 [1:38:42<00:00, 47.38s/it]

{'eval_loss': 1.0425506830215454, 'eval_runtime': 298.6805, 'eval_samples_per_second': 0.67, 'eval_steps_per_second': 0.084, 'epoch': 1.0}
{'train_runtime': 5924.4652, 'train_samples_per_second': 0.169, 'train_steps_per_second': 0.021, 'train_loss': 9.258237518310548, 'epoch': 1.0}





TrainOutput(global_step=125, training_loss=9.258237518310548, metrics={'train_runtime': 5924.4652, 'train_samples_per_second': 0.169, 'train_steps_per_second': 0.021, 'total_flos': 65323008000000.0, 'train_loss': 9.258237518310548, 'epoch': 1.0})

In [49]:
# Save the base model
model.base_model.save_pretrained("./arabicaqa_model-base")

# Save the tokenizer
tokenizer.save_pretrained("./arabicaqa_model-base")

('./arabicaqa_model-base\\tokenizer_config.json',
 './arabicaqa_model-base\\special_tokens_map.json',
 './arabicaqa_model-base\\vocab.json',
 './arabicaqa_model-base\\merges.txt',
 './arabicaqa_model-base\\added_tokens.json',
 './arabicaqa_model-base\\tokenizer.json')

In [None]:
# Define DifferentialAttention Class
class DifferentialAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.qkv_proj = nn.Linear(d_model, d_model * 3)
        self.lambda_param = nn.Parameter(torch.tensor(0.8))

    def forward(self, x):
        qkv = self.qkv_proj(x).chunk(3, dim=-1)
        q, k, v = qkv
        attention_scores = torch.softmax((q @ k.transpose(-2, -1)) / math.sqrt(k.size(-1)), dim=-1)
        return attention_scores @ v

# Define Custom Output Class
@dataclass
class CustomCausalLMOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

# Define Custom Model Class
class CustomModel(nn.Module):
    def __init__(self, base_model, d_model, num_heads):
        super().__init__()
        self.base_model = base_model
        self.differential_attention = DifferentialAttention(d_model, num_heads)
        self.vocab_projection = nn.Linear(d_model, base_model.config.vocab_size)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids, attention_mask=attention_mask, labels=labels, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]  # Last hidden state
        attention_output = self.differential_attention(hidden_states)
        attention_output = self.vocab_projection(attention_output)

        if labels is not None:
            print(f"attention_output shape: {attention_output.shape}")
            print(f"labels shape: {labels.shape}")

            shift_logits = attention_output[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            print(f"shift_logits shape: {shift_logits.shape}")
            print(f"shift_labels shape: {shift_labels.shape}")

            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            return CustomCausalLMOutput(loss=loss, logits=attention_output)

        return CustomCausalLMOutput(logits=attention_output)


In [None]:
# Load Pre-trained Model
base_model = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-base")
base_model.config.output_hidden_states = True  # Enable hidden states

# Define Custom Model
d_model = base_model.config.hidden_size
num_heads = base_model.config.num_attention_heads
model = CustomModel(base_model, d_model, num_heads)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./QAMODELLLLLL",  # Save directory
    eval_strategy="epoch",           # Evaluate at the end of each epoch
    learning_rate=3e-5,              # Adjust learning rate
    per_device_train_batch_size=2,   # Reduce batch size
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",            # Directory for logs
    logging_steps=10,
    warmup_steps=100,                # Warmup steps
    gradient_accumulation_steps=4,   # Gradient accumulation
    fp16=False,                      # Disable mixed precision
    no_cuda=False,                   # Enable GPU
    run_name="ffffffinetuning" # Experiment name
)

In [None]:
# Define Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(1000)),  # Subset for training
    eval_dataset=tokenized_datasets["validation"].select(range(200)),  # Subset for evaluation
    data_collator=data_collator
)

# Train the Model
trainer.train()

In [None]:
# Save the Model and Components
model.base_model.save_pretrained("./arabicaqa_model-base22")
tokenizer.save_pretrained("./arabicaqa_model-base22")
torch.save(model.differential_attention.state_dict(), "./arabicaqa_model-base/differential_attention.pth")

In [None]:
# Test the Model
def generate_answer(question):
    # Tokenize the input question
    inputs = tokenizer(question, return_tensors="pt", max_length=128, truncation=True, padding="max_length").to(model.base_model.device)
    
    # Generate a response using max_new_tokens
    outputs = model.base_model.generate(**inputs, max_new_tokens=50)  # Generate up to 50 new tokens
    
    # Decode the output to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
question ="ما هو عدد سكان إستونيا؟"
answer = generate_answer(question)
print(f"Question: {question}")
print(f"Answer: {answer}")