### Installing and importing libraries

In [3]:
# Notebook shell magics: install third-party packages into the kernel's environment.
# bitsandbytes is imported below as `bnb`; accelerate is presumably needed for
# `device_map='auto'` model loading -- confirm against the transformers docs.
!pip install -q bitsandbytes accelerate
# NOTE(review): evaluate and deepeval are installed but not imported in the visible cells.
!pip install evaluate
!pip install deepeval

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm

# Fetch the tokenizer stored in the same repository as the fine-tuned checkpoint
tokenizer = AutoTokenizer.from_pretrained("shlokjain0177/SuperiorLLM")

# Load the fine-tuned model with float16 weights; device_map='auto' leaves
# the placement of the weights to the library
model = AutoModelForCausalLM.from_pretrained(
    "shlokjain0177/SuperiorLLM", device_map='auto', torch_dtype=torch.float16
)

### Loading the HellaSwag dataset

In [None]:
# Only the validation split is loaded; it is the split evaluated below
dataset = load_dataset("hellaswag", split="validation")

### Evaluating on HellaSwag dataset

In [None]:
def evaluate_model_on_hellaswag(model, tokenizer, dataset):
    """Return the model's accuracy on HellaSwag-style multiple-choice examples.

    Every candidate ending is appended to the example's context and scored by
    the mean log-likelihood the model assigns to the ending's tokens given the
    context. The ending with the highest score is the prediction.

    Args:
        model: Causal language model. It is called as
            ``model(input_ids=..., attention_mask=...)``, must return an
            object with a ``logits`` attribute, and must expose ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Iterable of examples with the keys ``'ctx_a'``, ``'ctx_b'``,
            ``'endings'`` and ``'label'``. The label may be an int or a
            numeric string.

    Returns:
        Fraction of examples whose gold ending received the highest score,
        or ``0.0`` when ``dataset`` yields no examples.

    Raises:
        ValueError: If a label cannot be converted to ``int`` (for example an
            empty label on an unlabeled split).
    """
    correct = 0
    total = 0

    for example in tqdm(dataset):
        # Join the two context pieces with a space and skip an empty piece,
        # so the words do not run together and no double space appears.
        context = " ".join(
            part for part in (example['ctx_a'], example['ctx_b']) if part
        )
        # Number of leading tokens that belong to the context. Tokenizing the
        # context on its own is the usual approximation of the split point.
        context_len = len(tokenizer(context).input_ids)

        # Labels may be stored as strings; normalise so the comparison with
        # the predicted index is int-to-int.
        gold_option = int(example["label"])

        scores = []
        for ending in example['endings']:
            # A single sequence needs no padding, and asking for it fails on
            # tokenizers that define no pad token.
            encoding = tokenizer(
                context + " " + ending, return_tensors="pt", truncation=True
            )
            input_ids = encoding.input_ids.to(model.device)
            attention_mask = encoding.attention_mask.to(model.device)

            with torch.no_grad():
                logits = model(
                    input_ids=input_ids, attention_mask=attention_mask
                ).logits

            # logits[t] predicts token t + 1, so pair logits[:-1] with ids[1:].
            # Upcast to float32: log-softmax in float16 loses precision.
            log_probs = torch.log_softmax(logits[0, :-1].float(), dim=-1)
            targets = input_ids[0, 1:].to(log_probs.device)
            token_log_probs = log_probs.gather(
                -1, targets.unsqueeze(-1)
            ).squeeze(-1)

            # Keep only the positions whose target is an ending token.
            ending_log_probs = token_log_probs[max(context_len - 1, 0):]
            if ending_log_probs.numel() == 0:
                # Truncation removed the whole ending; it cannot win.
                scores.append(float("-inf"))
            else:
                scores.append(ending_log_probs.mean().item())

        # Index of the best-scoring ending (first one wins ties)
        predicted_option = scores.index(max(scores))

        if predicted_option == gold_option:
            correct += 1
        total += 1

    # Guard against an empty dataset instead of dividing by zero
    return correct / total if total else 0.0

# Evaluate the fine-tuned model on the HellaSwag validation split and print
# the returned accuracy (a fraction) with four decimal places
accuracy = evaluate_model_on_hellaswag(model, tokenizer, dataset)
print(f"HellaSwag Accuracy: {accuracy:.4f}")


In [None]:
# Clear the GPU before loading the next model: drop this notebook's reference
# to the current model and run the garbage collector so its tensors are freed.
del model
import gc
gc.collect()
# gc.collect() only frees the Python objects; PyTorch keeps the freed CUDA
# blocks in its caching allocator until they are explicitly released.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# evaluating SuperLLM on HellaSwag dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np

# Checkpoint evaluated in this cell
model_name = "qu-bit/SuperLLM"  # Replace with the correct model path or name

# Load the model with float16 weights and switch it to evaluation mode
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map='auto', torch_dtype=torch.float16
)
model.eval()

# Load the tokenizer from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the HellaSwag validation split
dataset = load_dataset("hellaswag", split="validation")

def _hellaswag_batch_columns(dataset, start, stop):
    """Return (ctx_a, ctx_b, endings, labels) lists for rows start..stop-1."""
    batch = dataset[start:stop]
    if isinstance(batch, dict):
        # Slicing a Hugging Face Dataset yields a dict of column lists,
        # not a list of example dicts.
        return batch['ctx_a'], batch['ctx_b'], batch['endings'], batch['label']
    # Otherwise treat the slice as a plain sequence of example dicts.
    return (
        [example['ctx_a'] for example in batch],
        [example['ctx_b'] for example in batch],
        [example['endings'] for example in batch],
        [example['label'] for example in batch],
    )


def _score_endings_batch(model, tokenizer, contexts, texts, max_length=512):
    """Return the mean per-token log-likelihood of each text's ending.

    ``contexts[i]`` is the prefix of ``texts[i]``; only the tokens that follow
    the prefix contribute to the score of row ``i``.
    """
    # Tokenizing each context on its own approximates where its ending starts.
    context_lens = torch.tensor(
        [len(ids) for ids in tokenizer(contexts).input_ids]
    )

    encodings = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    input_ids = encodings.input_ids.to(model.device)
    attention_mask = encodings.attention_mask.to(model.device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # logits[:, t] predicts token t + 1, so pair logits[:, :-1] with ids[:, 1:].
    # Upcast to float32: log-softmax in float16 loses precision.
    log_probs = torch.log_softmax(logits[:, :-1].float(), dim=-1)
    device = log_probs.device
    targets = input_ids[:, 1:].to(device)
    token_log_probs = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)

    mask = attention_mask.to(device)
    # Index of the first real token in each row (non-zero only when the
    # tokenizer pads on the left); argmax returns the first maximal index.
    first_real = mask.argmax(dim=1)
    ending_start = first_real + context_lens.to(device)
    positions = torch.arange(mask.size(1), device=device)
    ending_mask = (
        positions.unsqueeze(0) >= ending_start.unsqueeze(1)
    ) & mask.bool()
    # Drop column 0 so the mask lines up with the shifted targets.
    ending_mask = ending_mask[:, 1:]

    counts = ending_mask.sum(dim=1)
    sums = token_log_probs.masked_fill(~ending_mask, 0.0).sum(dim=1)
    scores = sums / counts.clamp(min=1)
    # An ending that was truncated away entirely cannot win.
    scores = scores.masked_fill(counts == 0, float("-inf"))
    return scores.tolist()


def evaluate_model_on_hellaswag(model, tokenizer, dataset, batch_size=8):
    """Return the model's accuracy on HellaSwag-style examples, in batches.

    Every candidate ending is appended to its example's context and scored by
    the mean log-likelihood the model assigns to the ending's tokens given the
    context. The ending with the highest score is the prediction.

    Args:
        model: Causal language model. It is called as
            ``model(input_ids=..., attention_mask=...)``, must return an
            object with a ``logits`` attribute, and must expose ``device``.
        tokenizer: Tokenizer matching ``model``. If it has no pad token, its
            ``pad_token`` is set to its ``eos_token`` (a side effect on the
            object passed in) so that batches can be padded.
        dataset: Sized, sliceable collection of examples with the keys
            ``'ctx_a'``, ``'ctx_b'``, ``'endings'`` and ``'label'``. Either a
            Hugging Face ``Dataset`` or a list of example dicts. Labels may be
            ints or numeric strings.
        batch_size: Number of examples per forward pass; each example
            contributes one sequence per ending.

    Returns:
        Fraction of examples whose gold ending received the highest score,
        or ``0.0`` when ``dataset`` is empty.

    Raises:
        ValueError: If a label cannot be converted to ``int`` (for example an
            empty label on an unlabeled split).
    """
    # Batched tokenization needs a pad token; fall back to EOS if none is set.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    correct = 0
    total = 0

    for start in tqdm(range(0, len(dataset), batch_size)):
        ctx_a_list, ctx_b_list, endings_list, labels = _hellaswag_batch_columns(
            dataset, start, start + batch_size
        )

        # Flatten the batch into one (context, context + ending) pair per option.
        contexts = []
        texts = []
        for ctx_a, ctx_b, endings in zip(ctx_a_list, ctx_b_list, endings_list):
            # Join with a space and skip an empty piece, so the words do not
            # run together and no double space appears.
            context = " ".join(part for part in (ctx_a, ctx_b) if part)
            for ending in endings:
                contexts.append(context)
                texts.append(context + " " + ending)

        scores = _score_endings_batch(model, tokenizer, contexts, texts)

        # Walk the flat score list, one group of options per example.
        offset = 0
        for endings, label in zip(endings_list, labels):
            option_scores = scores[offset:offset + len(endings)]
            offset += len(endings)
            # Index of the best-scoring ending (first one wins ties)
            predicted_option = max(
                range(len(option_scores)), key=option_scores.__getitem__
            )
            # Labels may be stored as strings; compare int-to-int.
            if predicted_option == int(label):
                correct += 1
            total += 1

    # Guard against an empty dataset instead of dividing by zero
    return correct / total if total else 0.0

# Evaluate the model loaded from `model_name` with the default batch size and
# print the returned accuracy (a fraction) with four decimal places
accuracy = evaluate_model_on_hellaswag(model, tokenizer, dataset)
print(f"HellaSwag Accuracy: {accuracy:.4f}")
