In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the model and tokenizer
model_path = "./fine_tuned_mistralai"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.92s/it]


In [4]:
# Sample input text
input_text = "SUMMARIZE THIS TEXT: POLICE NARRATIVE WHILE UNIT1 WAS TURNING ONTO W44_, PEDESTRIAN UNIT2 WAS-CROSSING-THE-STREET- UTILIZING-THE -CROSSWALK WHILE ATTEMPTING TO EXECUTE THE TURN AT A LOW SPEED, UNIT? STRUCK PEDESTRIAN UNIT2 CAUSING HER TO FALL TO THE GROUND (FAILURE TO YIELD TO CROSS PEDESTRIAN) GCAT INDICATORS:  AT INTERSECTION TURN INVOLVED"
reference_text = "A driver of a commercial truck turned left and struck a person crossing an intersection. Police officers did not ticket the driver."

In [5]:
# Function to generate text
def generate_text(prompt, max_new_tokens=50, num_return_sequences=1):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,  # Limit only the output length
            num_return_sequences=num_return_sequences,
            do_sample=True,  # Enable sampling
            top_k=50,        # Control randomness
            top_p=0.95       # Control randomness
        )
    
    # Decode the generated text
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts[0]

In [6]:
def evaluate_sari(prompt, generated_texts, references):
    """
    Evaluate the SARI score between the source (prompt), generated text, and references.
    
    Args:
    - prompt: Original input sentence (source)
    - generated_texts: List of generated sentences by the model
    - references: List of simplified reference sentences
    
    Returns:
    - SARI score
    """
    sari_scores = []
    for generated_text in generated_texts:
        # Evaluate using the SARI score
        sari_score = corpus_sari([prompt], [generated_text], [references])
        sari_scores.append(sari_score)
        print(f"SARI score: {sari_score}")
    
    return sari_scores



In [7]:
# Example usage
generated_texts = generate_text(input_text, max_new_tokens=100, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [36]:
from evaluate import load
sari = load("sari")
sari_score = sari.compute(sources=[input_text], predictions=[generated_texts], references=[[reference_text]])
print(input_text)
print(generated_texts)
print(reference_text)
sari_score

SUMMARIZE THIS TEXT: POLICE NARRATIVE WHILE UNIT1 WAS TURNING ONTO W44_, PEDESTRIAN UNIT2 WAS-CROSSING-THE-STREET- UTILIZING-THE -CROSSWALK WHILE ATTEMPTING TO EXECUTE THE TURN AT A LOW SPEED, UNIT? STRUCK PEDESTRIAN UNIT2 CAUSING HER TO FALL TO THE GROUND (FAILURE TO YIELD TO CROSS PEDESTRIAN) GCAT INDICATORS:  AT INTERSECTION TURN INVOLVED
SUMMARIZE THIS TEXT: POLICE NARRATIVE WHILE UNIT1 WAS TURNING ONTO W44_, PEDESTRIAN UNIT2 WAS-CROSSING-THE-STREET- UTILIZING-THE -CROSSWALK WHILE ATTEMPTING TO EXECUTE THE TURN AT A LOW SPEED, UNIT? STRUCK PEDESTRIAN UNIT2 CAUSING HER TO FALL TO THE GROUND (FAILURE TO YIELD TO CROSS PEDESTRIAN) GCAT INDICATORS:  AT INTERSECTION TURN INVOLVED an. police car in involved bicycle injuries involved pedest vehicle struck adult a driver injuries passenger crossing the of a crash in caused A intersection person crossing a a The a driver A the The person driver driver passenger passenger. crash the crash intersection an injuries the struck The injuries driv

{'sari': 38.63365590352072}

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset

# Load the HellaSwag dataset and a pre-trained model
dataset = load_dataset("hellaswag")

In [16]:
from datasets import DatasetDict, Dataset
data = {
    'input': [input_text],
    'generated': [generated_texts],
    'reference': [reference_text]
}

# Convert the dictionary to a Hugging Face Dataset object
dataset = Dataset.from_dict(data)

dataset_dict = DatasetDict({"validation": dataset})

# Display the dataset
print(dataset_dict['validation'])

Dataset({
    features: ['input', 'generated', 'reference'],
    num_rows: 1
})


In [21]:

def evaluate_hellaswag(dataset, model, tokenizer):
    correct_predictions = 0
    total = 0

    # Iterate through the dataset
    for example in dataset:
        context = example['input']
        choices = [example['generated']]  # Assuming generated text is a single choice (for multiple choices, adjust accordingly)
        reference = example['reference']
        
        choice_scores = []
        
        # Evaluate each choice
        for choice in choices:
            input_text = context + " " + choice  # Concatenate context with choice
            input_ids = tokenizer(input_text, return_tensors="pt").input_ids

            # Generate outputs (logits)
            with torch.no_grad():
                outputs = model(input_ids)
                logits = outputs.logits

            # Compute the average score for the completion
            choice_score = logits.mean().item()  # Or use logits.sum() depending on your scoring method
            choice_scores.append(choice_score)

        # Select the best choice (highest score)
        predicted_choice_idx = torch.argmax(torch.tensor(choice_scores))

        # Compare with the reference (assuming reference is the correct index or correct choice)
        if predicted_choice_idx == reference:
            correct_predictions += 1
        total += 1

    accuracy = correct_predictions / total
    print(f"Accuracy on HellaSwag: {accuracy * 100:.2f}%")

# Now run the evaluation function
evaluate_hellaswag(dataset, model, tokenizer)



Accuracy on HellaSwag: 0.00%
