# Python Installs

In [None]:
!pip install datasets
!pip install ai2-olmo
!pip install datasets transformers torch bert_score
!pip install tf-keras
!pip install torch torchvision accelerate
!pip install hf_olmo
!pip install tabulate
!pip install scikit-learn
!pip install sentence-transformers
!pip install bleurt
!pip install scipy
!pip install krippendorfffrom google.colab import drive

ERROR: Could not find a version that satisfies the requirement hf_olmo (from versions: none)
ERROR: No matching distribution found for hf_olmo
ERROR: Could not find a version that satisfies the requirement bleurt (from versions: none)
ERROR: No matching distribution found for bleurt
ERROR: Could not find a version that satisfies the requirement krippendorfffrom (from versions: none)
ERROR: No matching distribution found for krippendorfffrom

# Get Models from Google Drive

In [None]:
from google.colab import drive

# Expose Google Drive under /content/drive; the model checkpoint paths used
# later in the notebook live below this directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


# Imports

In [None]:
import torch
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tabulate import tabulate
import pandas as pd

# Model Paths and Initialization

In [None]:
# Checkpoint locations on the mounted Drive (edit these to match your Drive layout)
t5_model_path = "/content/drive/My Drive/fine_tuned_flan_t5_base"
llama_model_path = "/content/drive/My Drive/meta_llama_2-7b-hf"

# T5: load the tokenizer, then the seq2seq model, and move the model to the GPU
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model = t5_model.to('cuda')

# LLaMA: load the causal LM in half precision and move it to the GPU
llama_load_kwargs = {"torch_dtype": torch.float16}
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_path, **llama_load_kwargs)
llama_model = llama_model.to('cuda')

# LLaMA tokenizer: reuse the EOS token for padding, since the tokenizer is later
# called with padding=True
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
llama_tokenizer.pad_token = llama_tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Load Dataset and Preprocessing

In [None]:
# Load the "train" split of the ACORN dataset
data = load_dataset("anab/ACORN", split="train")

# Number of trailing entries used as the evaluation set
NUM_EVAL_ENTRIES = 500

# Evaluation set: the last NUM_EVAL_ENTRIES entries of the split (the whole split
# if it is shorter than that)
last_entries = list(data)[-NUM_EVAL_ENTRIES:]

# Helper Functions

In [None]:
# Ask the T5 model which of the given choices is correct
def predict_choice_t5(question, ground_truth_explanation, choices):
    """Return T5's decoded answer for a single multiple-choice question.

    The question, its explanation and the comma-separated choices are packed
    into one prompt. The first generated sequence (generation is capped with
    max_length=50) is decoded with special tokens removed and returned as-is.
    """
    joined_choices = ', '.join(choices)
    prompt = (
        f"Question: {question} "
        f"Explanation: {ground_truth_explanation} "
        f"Choices: {joined_choices} "
        "Which is the correct choice?"
    )
    encoded = t5_tokenizer(prompt, return_tensors="pt").to('cuda')
    generated = t5_model.generate(**encoded, max_length=50)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Build the few-shot prompt that is fed to LLaMA
def create_llama_prompt(question, ground_truth_explanation, choices, few_shot_examples):
    """Assemble a few-shot prompt for one multiple-choice question.

    Each item of `few_shot_examples` is a mapping with the keys 'question',
    'explanation', 'choices' and 'correct_choice'; it is rendered as a solved
    demonstration followed by a blank line. The target question is appended
    last, ending with the instruction line the model is expected to complete.
    """
    demonstration_blocks = [
        f"Question: {demo['question']}\n"
        f"Explanation: {demo['explanation']}\n"
        f"Choices: {', '.join(demo['choices'])}\n"
        f"Correct Choice: {demo['correct_choice']}\n\n"
        for demo in few_shot_examples
    ]
    query_block = (
        f"Question: {question}\n"
        f"Explanation: {ground_truth_explanation}\n"
        f"Choices: {', '.join(choices)}\n"
        "Select one of the above as the Correct Choice:"
    )
    return "".join(demonstration_blocks) + query_block

# Map LLaMA's free-text output onto one of the given choices
def validate_llama_prediction(predicted_choice, choices):
    """Return the choice that the model's output most plausibly refers to.

    Matching is case-insensitive and ignores surrounding whitespace. Among all
    choices that occur in the prediction, the one mentioned earliest wins; if
    several start at the same position, the longest one wins, so "car park" is
    not misread as "car". Remaining ties keep the order of `choices`.

    Args:
        predicted_choice: Raw text produced by the model.
        choices: Candidate answer strings.

    Returns:
        The matching element of `choices`, unchanged, or the string
        "Invalid Prediction" when no non-empty choice occurs in the prediction.
    """
    normalized_prediction = predicted_choice.strip().lower()
    best_choice = None
    best_rank = None
    for choice in choices:
        normalized_choice = choice.strip().lower()
        if not normalized_choice:
            # An empty string is a substring of everything; never count it as a match.
            continue
        position = normalized_prediction.find(normalized_choice)
        if position == -1:
            continue
        # Lower rank is better: earlier position first, then longer match.
        rank = (position, -len(normalized_choice))
        if best_rank is None or rank < best_rank:
            best_rank = rank
            best_choice = choice
    if best_choice is None:
        return "Invalid Prediction"  # Handle invalid outputs
    return best_choice

# Processing Entries and Predictions

In [None]:
# Few-shot sizes to compare. Demonstrations come from the head of the dataset,
# while the evaluation entries (`last_entries`) come from its tail.
few_shot_counts = [1, 3, 5, 7, 10]
all_examples = list(data)[:max(few_shot_counts)]  # enough examples for the largest few-shot count

results_by_shots = {}  # kept so the name stays defined; not populated in this cell
t5_results = []
llama_results = {}


def _accuracy_percent(num_correct, num_total):
    """Return accuracy as a percentage, or 0.0 when there is nothing to score."""
    return (num_correct / num_total) * 100 if num_total else 0.0


def _predict_choice_llama(question, ground_truth_explanation, choices, few_shot_examples):
    """Generate LLaMA's answer for one entry and map it onto one of `choices`."""
    llama_prompt = create_llama_prompt(question, ground_truth_explanation, choices, few_shot_examples)
    llama_inputs = llama_tokenizer(llama_prompt, return_tensors="pt", padding=True).to('cuda')
    # NOTE(review): temperature/top_p/top_k only take effect when sampling is enabled;
    # whether it is depends on the checkpoint's generation config -- confirm, and
    # consider setting do_sample explicitly (plus a seed) for reproducible scores.
    llama_outputs = llama_model.generate(
        **llama_inputs,
        max_new_tokens=20,
        temperature=0.1,
        top_p=0.85,
        top_k=40,
        repetition_penalty=1.1,
    )
    # Decode only the newly generated tokens. Decoding prompt + continuation and
    # splitting on the last "Correct Choice:" picks up the wrong text whenever the
    # model goes on to invent another solved example.
    prompt_length = llama_inputs["input_ids"].shape[1]
    continuation = llama_tokenizer.decode(llama_outputs[0][prompt_length:], skip_special_tokens=True)
    # Drop anything after the model starts a new "Question:" block of its own.
    predicted_choice_llama = continuation.split("Question:")[0].strip()
    return validate_llama_prediction(predicted_choice_llama, choices)


# T5 does not use few-shot examples, so it is scored once and its accuracy is
# repeated for every few-shot count to give a flat reference line.
t5_correct_count = 0
for entry in last_entries:
    choices = entry['choices']
    correct_choice = choices[entry['label']]
    predicted_choice_t5 = predict_choice_t5(entry['question'], entry['explanation'], choices)
    if predicted_choice_t5.strip() == correct_choice.strip():
        t5_correct_count += 1

t5_accuracy = _accuracy_percent(t5_correct_count, len(last_entries))
t5_results = [t5_accuracy] * len(few_shot_counts)

# LLaMA is scored once per few-shot count.
for few_shot_count in few_shot_counts:
    few_shot_examples = [
        {
            "question": entry["question"],
            "choices": entry["choices"],
            "correct_choice": entry["choices"][entry["label"]],
            "explanation": entry["explanation"],
        }
        for entry in all_examples[:few_shot_count]
    ]

    llama_correct_count = 0
    for entry in last_entries:
        choices = entry['choices']
        correct_choice = choices[entry['label']]
        validated_choice_llama = _predict_choice_llama(
            entry['question'], entry['explanation'], choices, few_shot_examples
        )
        if validated_choice_llama.strip() == correct_choice.strip():
            llama_correct_count += 1

    llama_accuracy = _accuracy_percent(llama_correct_count, len(last_entries))
    llama_results[few_shot_count] = llama_accuracy

# Plot accuracy against the number of few-shot examples for both models
fig, ax = plt.subplots(figsize=(10, 6))
llama_accuracies = [llama_results[k] for k in few_shot_counts]
ax.plot(few_shot_counts, t5_results, marker='o', label='T5', linestyle='-')
ax.plot(few_shot_counts, llama_accuracies, marker='s', label='LLaMA', linestyle='-')
ax.set_xlabel('Number of Few-Shot Examples')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Model Accuracy vs Number of Few-Shot Examples')
ax.legend()
ax.grid(True)
plt.show()

# Report the accuracies as text: T5 once, LLaMA once per few-shot count
summary_lines = [
    "\nAccuracy Results:",
    f"T5 Accuracy: {t5_results[0]:.2f}%",
    "\nLLaMA Accuracy by Few-Shot Count:",
]
summary_lines.extend(
    f"{count} shots: {llama_results[count]:.2f}%" for count in few_shot_counts
)
print("\n".join(summary_lines))

# Persist the per-few-shot-count accuracies of both models as a CSV file
accuracy_columns = {
    'Few_Shot_Count': few_shot_counts,
    'T5_Accuracy': t5_results,
    'LLaMA_Accuracy': [llama_results[shots] for shots in few_shot_counts],
}
results_df = pd.DataFrame(accuracy_columns)
results_df.to_csv('accuracy_comparison.csv', index=False)
print("\nResults have been saved to 'accuracy_comparison.csv'")