In [None]:
import argparse
import json
import os

import pandas as pd
import torch
from huggingface_hub import login
from peft import PeftModel # Import PeftModel
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. Any token that was ever written into
# this file must be considered leaked and should be revoked on the Hub.
# When HF_TOKEN is unset, `token=None` is passed through.
# NOTE(review): huggingface_hub is expected to fall back to its interactive
# prompt in that case -- confirm this is acceptable for unattended runs.
login(token=os.environ.get("HF_TOKEN"))

# 1. Base model to evaluate (a Hub repo id or a local path).
base_model_id = "CohereForAI/aya-expanse-8b"

# Optional LoRA adapter directory. Disabled: the base model is evaluated as-is.
# lora_adapter_path = "LLaMAX3-qlora"  # replace with the actual adapter path

# 2. Load the tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 3. Load the base model with 4-bit quantization via BitsAndBytesConfig.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # torch.float16 is the usual alternative
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",  # let the library place the weights on the available devices
)

# 4. (Disabled) Wrap the base model with the LoRA adapter.
# model = PeftModel.from_pretrained(model, lora_adapter_path)

# 5. (Disabled) Merge the adapter weights into the base model so that PEFT is
# not needed at inference time. Merging might not be possible or necessary
# with 4-bit quantization.
# model = model.merge_and_unload()

# Switch the model to evaluation mode before generating.
model.eval()

# Helper: load the question table, choosing the pandas reader by file extension.
def _read_question_table(input_file):
    """Return the input table as a DataFrame.

    A path ending in ``.csv`` (case-insensitive) is read with ``pd.read_csv``;
    every other path is read with ``pd.read_excel``.
    """
    if str(input_file).lower().endswith(".csv"):
        return pd.read_csv(input_file)
    return pd.read_excel(input_file)


# Function to generate responses using Hugging Face pipeline
def generate_responses(input_file, output_file,
                       max_length=1024, temperature=0.2, top_p=1.0, top_k=0,
                       repetition_penalty=1.2, do_sample=True, num_return_sequences=1,
                       max_new_tokens=1024):
    """Answer every question in ``input_file`` and save the answers as JSON.

    The module-level ``model`` and ``tokenizer`` are wrapped in a
    ``text-generation`` pipeline. Each value of the ``Question`` column is
    embedded in a fixed prompt, and the text generated after the prompt is
    stored.

    Args:
        input_file: Path to the question table (Excel, or CSV when the name
            ends in ``.csv``). It must contain a ``Question`` column.
        output_file: Path of the JSON file to write. The file holds a list of
            ``{"instruction": ..., "generated_output": ...}`` objects, one per
            returned sequence, in input order.
        max_length: Total length limit. It is only forwarded to the pipeline
            when ``max_new_tokens`` is ``None``.
        temperature, top_p, top_k, repetition_penalty, do_sample,
        num_return_sequences: Forwarded unchanged to the pipeline call.
        max_new_tokens: Limit on the number of generated tokens. This is the
            limit that is forwarded by default; pass ``None`` to use
            ``max_length`` instead.

    Raises:
        KeyError: If the table has no ``Question`` column.
    """
    print("Loading the Hugging Face text-generation pipeline...")
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    print("Pipeline loaded successfully.")

    print(f"Reading input file: {input_file}")
    df = _read_question_table(input_file)
    # Empty cells arrive as NaN and numeric cells as numbers; normalise both to
    # text so slicing/stripping below cannot fail. Rows are never dropped, so
    # the output stays aligned with the input row order.
    inputs = df['Question'].fillna("").astype(str).tolist()
    print(f"Found {len(inputs)} instructions in the input file.")

    # Forward exactly one length limit. Sending both `max_length` and
    # `max_new_tokens` is contradictory; `max_new_tokens` is the one that was
    # effectively applied, so it remains the default.
    if max_new_tokens is not None:
        length_kwargs = {"max_new_tokens": max_new_tokens}
    else:
        length_kwargs = {"max_length": max_length}

    outputs = []
    print("Generating responses...")

    for idx, input_text in enumerate(inputs, start=1):
        # Show only the first 50 characters as a progress preview.
        print(f"Processing instruction {idx}/{len(inputs)}: {input_text[:50]}...")

        question = input_text.strip()
        prompt = f"Answer the following question concisely and directly, providing only the requested information without any extra text or conversational filler:\n\nQuestion: {question}\n\nAnswer:"

        generated_responses = generator(prompt,
                                        temperature=temperature,
                                        top_p=top_p,
                                        top_k=top_k,
                                        repetition_penalty=repetition_penalty,
                                        do_sample=do_sample,
                                        num_return_sequences=num_return_sequences,
                                        **length_kwargs)

        # One entry per returned sequence (several when num_return_sequences > 1).
        for response in generated_responses:
            output_text = response["generated_text"]
            # The returned text starts with the prompt; keep only what follows it.
            output_without_instruction = output_text[len(prompt):].strip()
            print(f"output_without_instruction: {output_without_instruction}")
            outputs.append({
                "instruction": question,
                "generated_output": output_without_instruction
            })

    print(f"Generated {len(outputs)} responses.")

    # ensure_ascii=False keeps non-ASCII answers readable in the saved file.
    print(f"Saving results to JSON file: {output_file}")
    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(outputs, f, ensure_ascii=False, indent=4)
    print(f"Results saved successfully to {output_file}.")

# Entry point: preview the input table, then run generation end to end.
def main(input_file="test_split.xlsx", output_file="results-aya-gen.json"):
    """Preview the question table and generate answers for it.

    Settings are plain keyword arguments (no argparse), so the cell can be run
    directly. The same ``input_file`` is used for the preview, for the error
    message and for generation.

    Args:
        input_file: Path to the question table. A name ending in ``.csv`` is
            previewed with ``pd.read_csv``, anything else with
            ``pd.read_excel``.
        output_file: Path of the JSON file that ``generate_responses`` writes.

    Returns:
        None. If the preview cannot read ``input_file``, an error is printed
        and generation is skipped.
    """
    print("Starting inference process...")

    # Preview the first rows so an obviously wrong file is caught before the
    # generation loop starts.
    reader = pd.read_csv if str(input_file).lower().endswith(".csv") else pd.read_excel
    try:
        df_to_print = reader(input_file)

        print("\n--- Input Dataset (First 5 rows) ---")
        print(df_to_print.head())
        print("\n--------------------------------------\n")
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_file}")
        return
    except Exception as e:
        # Best-effort guard: report any other read problem and stop, rather
        # than start generation on an unreadable input.
        print(f"An error occurred while reading the input file: {e}")
        return

    # Generate responses with the fixed decoding settings used for this run.
    generate_responses(
        input_file=input_file,
        output_file=output_file,
        max_length=1024,
        temperature=0.2,
        top_p=1,
        top_k=0,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

if __name__ == "__main__":
    main()

In [None]:
!pip install bert_score
!pip install evaluate

In [None]:

from evaluate import load
import pandas as pd

# Score the generated answers against the gold answers with BERTScore.
# The predictions are read from "results-aya-gen.json", the file the
# generation step writes.
reference_dataset = pd.read_excel("test_split.xlsx")
predictions_dataset = pd.read_json("results-aya-gen.json")  # the model responses

# Normalise both sides to plain strings: empty cells load as NaN, and
# numeric-looking values may be loaded as numbers.
references = reference_dataset['Answer'].fillna("").astype(str).tolist()
predictions = predictions_dataset['generated_output'].fillna("").astype(str).tolist()

# The two lists are compared pairwise, so they must line up one-to-one.
if len(predictions) != len(references):
    raise ValueError(
        f"Got {len(predictions)} predictions but {len(references)} references; "
        "the results file does not match test_split.xlsx."
    )

bertscore = load("bertscore")
results = bertscore.compute(predictions=predictions, references=references, lang="ar")

# Report the mean of each per-example score list so the cell shows a result.
for metric_name in ("precision", "recall", "f1"):
    scores = results[metric_name]
    mean_score = sum(scores) / len(scores) if scores else float("nan")
    print(f"BERTScore {metric_name}: {mean_score:.4f}")