### Installing and importing libraries

In [None]:
pip install -i https://pypi.org/simple/ bitsandbytes
!pip install accelerate

In [5]:
import json
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

In [6]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Loading the model in quantized (4-bit NF4) format

In [None]:
# 4-bit NF4 quantization settings (double quantization, bfloat16 compute dtype)
# handed to the model loader below.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# NOTE(review): trust_remote_code=True lets code shipped in the hub repository
# run locally - confirm the repository is trusted before executing this cell.
tokenizer = AutoTokenizer.from_pretrained(
    'shlokjain0177/SuperiorLLM',
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "shlokjain0177/SuperiorLLM",
    trust_remote_code=True,
    device_map="auto",
    quantization_config=nf4_config,
)

### Loading the dataset

In [None]:
# Read the question/answer records used for evaluation. Later cells iterate
# over `dataset_facts` and read the "question" and "answer" keys of each item.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default.
with open('/datasets/world_facts_qna.json', encoding='utf-8') as f:
    dataset_facts = json.load(f)

### Function to get text embeddings from the model

In [9]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def get_llm_embedding(text):
    """Embed ``text`` by mean-pooling the model's final hidden states.

    The text is tokenized with the module-level ``tokenizer``, run through the
    module-level ``model`` without gradient tracking, and the last entry of
    ``hidden_states`` is averaged over the token axis.

    Args:
        text: Input accepted by ``tokenizer`` (a string; a list of strings
            yields one embedding per row).

    Returns:
        numpy.ndarray: float32 embedding with size-1 dimensions squeezed out,
        so a single input text produces a 1-D vector.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Tokenizer output lives on the CPU; move it to wherever the model's
    # first parameters were placed so the forward pass sees matching devices.
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Upcast first: numpy has no bfloat16, and pooling in float32 is more accurate.
    last_hidden = outputs.hidden_states[-1].float()

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        embeddings = last_hidden.mean(dim=1)
    else:
        # Average over real tokens only so padding does not dilute the embedding.
        mask = attention_mask.to(device=last_hidden.device, dtype=last_hidden.dtype).unsqueeze(-1)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on empty rows
        embeddings = (last_hidden * mask).sum(dim=1) / token_counts

    # .cpu() is required before .numpy() when the model runs on an accelerator.
    return embeddings.squeeze().cpu().numpy()

### Function to get cosine similarity between two text embeddings

In [10]:
def cosine_similarity_llm(generated_answer, actual_answer):
    """Return the cosine similarity between the embeddings of two answers.

    Each answer is embedded with ``get_llm_embedding`` (generated answer
    first), reshaped to a single-row matrix, and the pair is compared with
    ``cosine_similarity``; the only cell of the resulting matrix is returned.
    """
    # cosine_similarity works on 2-D inputs, hence the (1, n_features) reshape.
    row_vectors = [
        get_llm_embedding(answer_text).reshape(1, -1)
        for answer_text in (generated_answer, actual_answer)
    ]
    similarity_matrix = cosine_similarity(row_vectors[0], row_vectors[1])
    return similarity_matrix[0][0]

### Functions to score generated answers for given question–answer pairs (the "probability score" is an embedding cosine similarity)

In [38]:
from nltk.translate.bleu_score import sentence_bleu

def bleu_score(generated_answer, actual_answer):
    """Score a generated answer against the reference with ``sentence_bleu``.

    Both strings are lower-cased and split on whitespace; the reference is
    wrapped in a list because ``sentence_bleu`` takes a list of references.
    """
    reference_tokens = actual_answer.lower().split()
    hypothesis_tokens = generated_answer.lower().split()
    return sentence_bleu([reference_tokens], hypothesis_tokens)

# Budget for newly generated tokens per prompt. The previous `max_length=100`
# also counted the prompt tokens, so a long question could use up the whole
# budget before any answer token was produced; `max_new_tokens` limits only
# the generated continuation.
MAX_NEW_TOKENS = 64

# Text-generation pipeline built on the already loaded quantized model.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=MAX_NEW_TOKENS)

def get_probability_score(question, answer):
    """Generate an answer to ``question`` and score it against ``answer``.

    The question is wrapped in an instruction prompt and sent through the
    module-level ``pipe``. The text after the first "Answer: " marker is
    trimmed to at most as many whitespace-separated words as ``answer`` has,
    and that trimmed text is compared to ``answer`` with
    ``cosine_similarity_llm``.

    Returns:
        tuple: ``(similarity score, trimmed generated answer)``.
    """
    prompt = f"<s>[INST] Answer the question in ONE WORD. [/INST] Question: {question} \n Answer: "
    completion = pipe(prompt)[0]['generated_text'].split("Answer: ")[1].strip()
    word_budget = len(answer.split())
    # A slice past the end simply returns every word, so shorter generations
    # need no separate branch.
    generated_ans = " ".join(completion.split()[:word_budget])
    probability_score = cosine_similarity_llm(generated_ans, answer)
    return probability_score, generated_ans

### Calculating average probability for the whole dataset

In [None]:
# Score every question/answer record and report the mean similarity.
# `avg` holds the running sum of scores until the final division.
avg = 0
print(len(dataset_facts))
for i, data in enumerate(dataset_facts):
    question = data["question"]
    answer = data["answer"]
    prob_score, generated_answer = get_probability_score(question, answer)
    avg += prob_score
    print(i)  # progress indicator: index of the record just scored

# Guard the division: an empty dataset would otherwise raise ZeroDivisionError.
if len(dataset_facts) > 0:
    print(avg/len(dataset_facts))
else:
    print("dataset_facts is empty; no average score to report")
    