In [None]:
# ! pip install datasets -qU
# ! pip install arabic-reshaper -qU
# ! pip install python-bidi -qU
# ! pip install tiktoken -qU
# ! pip install lightning -qU
! pip install wandb -qU
# ! pip install peft -qU
# ! pip install evaluate rouge_score -qU

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.8/313.8 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
from torch.utils.data import Dataset,DataLoader
import torch
import pandas as pd
from transformers import AutoModel,AutoTokenizer,AutoModelForCausalLM
import arabic_reshaper
from bidi.algorithm import get_display
from IPython.display import Markdown, display
from typing import Callable
import tiktoken
from functools import partial
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
import lightning as L
from peft import LoraModel, LoraConfig,get_peft_model
from lightning.pytorch.callbacks import ModelCheckpoint
from tqdm import tqdm
from evaluate import load

In [None]:
repo_id = "ahmedelsayed/xlsum-arabic"
data_path = "sft"
ds = load_dataset(data_dir=data_path,
                  path=repo_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/88.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.89M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.88M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37516 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4689 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4689 [00:00<?, ? examples/s]

In [None]:
def alpaca_input_format(entry):
    instruction_text = (
      f"فيما يلي تعليمات تصف مهمة ما. "
      f"اكتب ردًا يكمل الطلب بشكل مناسب."
      f"\n\n### تعليمات:\n{entry['instruction']}"
    )

    input_text = (
        f"\n\n### النص:\n{entry['input']}" if entry["input"] else ""
    )
    return instruction_text + input_text

def display_text(text):
    display(Markdown(text))

In [None]:
class CustomInstructDataset(Dataset):
    def __init__(self,data,tokenizer,formater:Callable[[str],str]):
        super(CustomInstructDataset,self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.formater = formater
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        entry = self.data[index]
        model_instruction = self.formater(entry)
        desired_response = f"\n\n### الرد المناسب:\n{entry['response']}"
        return (model_instruction,entry['response'])



def custom_collate(batch,tokenizer):
    # Separate model instructions and responses
    model_instructions, responses = zip(*batch)

    # Tokenize model instructions
    tokenized_inputs = tokenizer(list(model_instructions),  # Ensure inputs are a list
                                  padding=True,
                                  truncation=True,
                                  return_tensors="pt")

    # Return the tokenized inputs and responses
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'responses': list(responses)  # Ensure responses are a list
    }

In [None]:
model_path = "inceptionai/Jais-family-256m"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
custom_collate_fn = partial(custom_collate,tokenizer=tokenizer)

In [None]:
batch_size = 1
num_workers=0
test_ds = CustomInstructDataset(ds['test'],
                                 tokenizer,
                                 alpaca_input_format)
test_loader = DataLoader(
  test_ds,
  batch_size=batch_size,
  collate_fn=custom_collate_fn,
  shuffle=False,
  drop_last=False,
  num_workers=num_workers
)


# Load Checkpoint

In [None]:
import wandb
run = wandb.init()


artifact = run.use_artifact('gp234/SFT-Jais-2/model-derc9xfk:v0', type='model')
artifact_dir = artifact.download()


run.finish()


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Downloading large artifact model-derc9xfk:v0, 1123.65MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:21.9


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
artifact_dir

'/content/artifacts/model-derc9xfk:v0'

# Model Structure

In [None]:
class LitLLM(L.LightningModule):
    def __init__(self,model_path,lora_config):
        super(LitLLM,self).__init__()
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                             device_map=self.device,
                                             trust_remote_code=True)
        for param in model.parameters():
            param.requires_grad = False
#         model.enable_input_require_grads()
#         model.gradient_checkpointing_enable()
        self.model = get_peft_model(model, lora_config)
        self.metric = torch.nn.CrossEntropyLoss()
    def training_step(self,batch,batch_idx):
      loss = self._shared_eval(batch,batch_idx,"train")
      return loss

    def validation_step(self,batch,batch_idx):
      loss = self._shared_eval(batch,batch_idx,"val")
      return loss
    def test_step(self,batch,batch_idx):
      loss = self._shared_eval(batch,batch_idx,"test")
      return loss
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        return self.model(batch)

    def _shared_eval(self, batch, batch_idx, prefix):
        x, y = batch
        x = x.to(self.device)
        y = y.to(self.device)

        pred = self.model(x.to(self.device))
        loss = self.metric(pred.logits.flatten(0,1),
                y.flatten())
        self.log(f"{prefix}_loss",loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        return loss
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(),
                                      lr=4.21e-05,
                                      weight_decay=0.1)
#         lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=163*2)
        return optimizer
    def predict(self,batch):

        generate_ids = self.model.generate(
                              batch.to(self.device),
                              top_p=0.9,
                              temperature=0.3,
                              max_length=2048,
                              repetition_penalty=1.2,
                              do_sample=True,
                        )
        response = tokenizer.batch_decode(
            generate_ids, skip_special_tokens=True,
        )
        return response
    def _get_ids(self,x:torch.tensor):
        return x.cpu().numpy().tolist()[0]

In [None]:
target_modules = [
                    "c_attn",
                    "c_proj",
                    "c_fc"
]
config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=target_modules,
    lora_dropout=0.1,
    bias="none", task_type="CAUSAL_LM",
    fan_in_fan_out=True
)
# litmodel = LitLLM(model_path,config)

In [None]:
base_model = "inceptionai/Jais-family-256m"
checkpoint = "/content/artifacts/model-derc9xfk:v0/model.ckpt"
litmodel = LitLLM.load_from_checkpoint(checkpoint,model_path=base_model,lora_config=config)

In [None]:
litmodel.eval()

# Inference

In [None]:
reference_summary= []
pred_summary = []
for batch in tqdm(test_loader):
    y_hat = litmodel.predict(batch['input_ids'])
    pred_summary.extend(y_hat)
    reference_summary.extend(batch['responses'])

100%|██████████| 4689/4689 [1:28:19<00:00,  1.13s/it]


In [None]:
df = pd.DataFrame({"prediction":pred_summary,
                   "reference":reference_summary})
df.loc[:,'prediction'] = df['prediction'].apply(lambda x: x.split("### الرد المناسب:\n")[-1])

In [None]:
df

Unnamed: 0,prediction,reference
0,رفض قاض عسكري بريطاني حكما بالسجن مدى الحياه ل...,تنظر محكمه عسكريه امريكيه في وقت لاحق من اليوم...
1,قالت المستشارة الالمانيه انجيلا ميركل انه لا ي...,كشفت المانيا النقاب عن خطط لاضافه الجزائر والم...
2,قال التلفزيون الرسمي السوري ان اشتباكات تدور ب...,قال التليفزيون السوري ان قوات الحكومه استعادت ...
3,حقق نادي ارسنال الانجليزي فوزا كبيرا علي ضيفه ...,توج فريق الارسنال ببطوله كاس انجلترا لكره القد...
4,يشهد العراق اوضاعا عصيبه، حيث تواجه الجماعات ا...,يوضع الصراع في العراق غالبا في اطار صراع بين ا...
...,...,...
4684,شهدت اسواق الجزيره العربيه في العاصمه المصريه ...,مجموعه مختاره من افضل الصور في القاره الافريقي...
4685,اعلنت الولايات المتحده انها ستجلس الى جانب الط...,استانف الرئيس الفلسطيني محمود عباس ورئيس الوزر...
4686,طالبت مراهقه لبنانيتان الرئيس اللبناني ميشال ع...,هناك الكثير مما يجمع كارول نحاس بزوجها كارلوس ...
4687,اعلنت الحركه الاسلاميه التي يتزعمها القيادي ال...,افرجت السلطات السودانيه عن 57 من معتقلي حركه ا...


# Evaluate Using Rouge Metric

In [None]:
metric = load('rouge')

In [None]:
scores = metric.compute(predictions=pred_summary,
                        references=reference_summary)

INFO:absl:Using default tokenizer.
I1008 08:21:39.543975 875 rouge_scorer.py:83] Using default tokenizer.


In [None]:
scores

{'rouge1': 0.024213605715402403,
 'rouge2': 0.0014741946283852877,
 'rougeL': 0.024084952075629662,
 'rougeLsum': 0.02407977715402647}

In [None]:
import os
path = "/content/drive/MyDrive/Arabic-Text-Summarization/evaluation/model-derc9xfk:v0/"
if not os.path.exists(path):
    os.makedirs(path)
df.to_csv(path+"pred.csv")

In [None]:
import json
# Serializing json
json_object = json.dumps(scores, indent=4)

# Writing to sample.json
with open(path+"scores.json", "w") as outfile:
    outfile.write(json_object)