In [12]:
import torch

from transformers import HfArgumentParser, Seq2SeqTrainingArguments,EarlyStoppingCallback

import logging

from dataclasses import dataclass, field
from typing import Callable, Dict, Optional
from datasets import load_dataset, concatenate_datasets,Value
import numpy as np
from typing import Union, Optional
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, GlueDataset, AutoModel
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    #glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)
from arguments import ModelArguments, DataArguments
import wandb
from nltk.tokenize import sent_tokenize
import nltk

nltk.download("punkt")
logger = logging.getLogger(__name__)
from transformers import (RobertaForMultipleChoice, RobertaTokenizer, Trainer,
                          TrainingArguments, XLMRobertaForMultipleChoice,
                          XLMRobertaTokenizer)

import pathlib
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from transformers import TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training


from utils import *
import numpy as np
from peft import PeftModel    
import logging
import os

# import evaluate 
from evaluate import load 
from torch.utils.data import DataLoader
from tqdm import tqdm


def split_sequence(sequence, chunk_size):
    chunks=[]
    for i in range(0, len(sequence), chunk_size):
        chunks.append(sequence[i: i + chunk_size])
    return chunks
		

def calc_results(prediction, truth, save_file, chunk_size=100):
    

    global bleu_score
    
    if (len(truth) != len(prediction)):
        print ("both files must have same number of instances")
        exit()

    truth_chunks= split_sequence(truth, chunk_size)

    truth_Egyptain=truth_chunks[0]
    truth_Emirati=truth_chunks[1]
    truth_Jordanian=truth_chunks[2]
    truth_Palestinian=truth_chunks[3]

    prediction_chunks= split_sequence(prediction, chunk_size)

    prediction_Egyptain=prediction_chunks[0]
    prediction_Emirati=prediction_chunks[1]
    prediction_Jordanian=prediction_chunks[2]
    prediction_Palestinian=prediction_chunks[3]

    ### get scores
    results_Egyptain = bleu_score.compute(predictions=prediction_Egyptain, references=truth_Egyptain)
    results_Emirati = bleu_score.compute(predictions=prediction_Emirati, references=truth_Emirati)
    results_Jordanian = bleu_score.compute(predictions=prediction_Jordanian, references=truth_Jordanian)
    results_Palestinian = bleu_score.compute(predictions=prediction_Palestinian, references=truth_Palestinian)
    overall_results = bleu_score.compute(predictions=prediction, references=truth)

    #write to a text file
    print('Scores:')
    scores = {
            'Overall': overall_results['bleu']*100,
            'Egyptain': results_Egyptain['bleu']*100,
            'Emirati': results_Emirati['bleu']*100,
            'Jordanian': results_Jordanian['bleu']*100,
            'Palestinian': results_Palestinian['bleu']*100, 
            }
    print(scores)

    with open(save_file, 'w') as score_file:
        score_file.write("Overall: %0.12f\n" % scores["Overall"])
        score_file.write("Egyptain: %0.12f\n" % scores["Egyptain"])
        score_file.write("Emirati: %0.12f\n" % scores["Emirati"])
        score_file.write("Jordanian: %0.12f\n" % scores["Jordanian"])
        score_file.write("Palestinian: %0.12f\n" % scores["Palestinian"])







[nltk_data] Downloading package punkt to
[nltk_data]     /home/abdelrahman.sadallah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:

model_name_or_path='core42/jais-13b'
dataset = 'boda/nadi2024'
prompt_key="prompt"
chunk_size =100 
split="dev"
per_device_eval_batch_size=4
save_file='outputs/jais_val'
checkpoint_path='/l/users/abdelrahman.sadallah/nadi/core42/jais-13b/best/'






bleu_score = load("bleu")


print(f"Loading the   {split} datasets")
dataset = get_dataset(
    dataset_name = dataset,
    split=split,
    field=prompt_key)


save_file = save_file


val_dataloader = DataLoader(dataset, batch_size=per_device_eval_batch_size, shuffle=False)  



tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,padding_side='left')
tokenizer.pad_token = tokenizer.eos_token


Loading the   dev datasets


In [15]:

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
    return_dict=True,
    load_in_4bit=True,
    device_map="auto",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    # bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)
if checkpoint_path:
    print(f'Loading model from {checkpoint_path}')
    adapter_checkpoint  = checkpoint_path
    model = PeftModel.from_pretrained(model, adapter_checkpoint,quantization_config=bnb_config)

else:
    print(f'Loading Base Model {model_name_or_path}')


model = model.eval()

# Define PAD Token = BOS Token
model.config.pad_token_id = model.config.bos_token_id


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Loading model from /l/users/abdelrahman.sadallah/nadi/core42/jais-13b/best/


In [58]:
def inference(prompts, tokenizer, model):
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # encoding = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outs = []
    for p in prompts:
        with torch.no_grad():
            # outputs = model.generate(
            #     **encoding,
            #     max_new_tokens=128,
            #     do_sample=False,
            #     temperature=0.4,
            #     pad_token_id=tokenizer.eos_token_id,
            # )  
            
            encoding = tokenizer(p, return_tensors="pt").to(model.device)
            input_ids = tokenizer(p, return_tensors="pt").input_ids
            inputs = input_ids.to(device)
            input_len = inputs.shape[-1]
    
            outputs = model.generate(
                **encoding,
                top_p=0.6,
                temperature=0.1,
                max_new_tokens=512,
                repetition_penalty=1.2,
                do_sample=True,
            )
    
        answer_tokens = outputs[:, encoding.input_ids.shape[1] :]
        output_text = tokenizer.batch_decode(answer_tokens, skip_special_tokens=True)

        outs.append(output_text)

    

    return outs
        

In [59]:



predictions = []
labels = []


torch.cuda.empty_cache()


for batch in tqdm(val_dataloader):

    prompts = batch['prompt']
    ans = []

    labels.extend(batch['target'])

    output_text = inference(prompts=prompts, tokenizer=tokenizer, model=model)

    predictions.extend(output_text)

    print(predictions)
    break
assert (len(predictions) == len(labels))



save_file =   save_file + '_results.txt'

preds_file = save_file + '_predictions.txt'

with open(preds_file, 'w') as f:
    for item in predictions:
        f.write("%s\n" % item)

calc_results(predictions, labels, save_file,chunk_size)
    

  0%|                                                                                                                                                                 | 0/100 [00:00<?, ?it/s]


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [36]:
output_text

['', '', '', '']

In [30]:
train_dataset = get_dataset(
        dataset_name = 'boda/nadi2024',
        split='train',
        field='prompt')

In [32]:
train_dataset['prompt']

['### Instruction: The following is a sentence in Egyptian Arabic dialect. Please translate it to Modern Standard Arabic (MSA).\n\n### Input:\n إنها في أخر القاعة . سوف آتي لك ببعض منها الآن . إذا أردت أي شيئاً آخر فقط أعلمني . وعندما عادت، وجدته قد غادر دون أن يودع. شعرت بالحيرة والقلق، فقد كانت تحمل له هديته المفضلة. بدأت تبحث عنه في كل أنحاء المبنى، تنادي اسمه بصوت عالٍ. وأخيرًا، وجدته في الحديقة ينظر إلى النجوم، فابتسمت وقدمت له الهدية. \n\n### Response: هي فِ آخر القاعة. حجيبلك شوية منها دلوقتي. لو عايز حاجة تاني بس قوللي. ولما رجعت، لاقته مشي من غير ما يقولها سلام. حست بحيرة وقلق، كانت جايباله الهدية اللي بيحبها. بدأت تدور عليه في كل مكان في العمارة، بتنده عليه بصوت عالي. وأخيرًا، لاقته في الجنينة بيبص للنجوم، ابتسمت وادتله الهدية. ',
 '### Instruction: The following is a sentence in Egyptian Arabic dialect. Please translate it to Modern Standard Arabic (MSA).\n\n### Input:\n في أحد الأيام، كان هناك رجل يعمل في النجارة يسمى أحمد. كان دائمًا يسأل زبائنه "هل تقومون بعمل تعديلات؟" ل