<a href="https://colab.research.google.com/github/automix-llm/automix/blob/main/Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Scripts for adding scores to outputs

In [1]:
import re
import string
from collections import Counter
import pandas as pd

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def normalize_answer(s):
    """Lower text and remove punctuation, articles, and extra whitespace."""
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def calculate_f1_for_models(df, model_sizes, ground_truth_col='output'):
    """
    Calculates F1 score for different model sizes and adds the results as new columns in the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing prediction data.
    - model_sizes (list of str): List containing strings that denote model sizes.
      Used to create column names dynamically.
    - ground_truth_col (str, optional): The name of the column containing ground truth data.
      Defaults to 'output'.

    Returns:
    - pd.DataFrame: The original DataFrame with added columns for the F1 scores.
    """
    for size in model_sizes:
        pred_col = f'llama{size}_pred_ans'
        f1_col = f'llama{size}_f1'
        df[f1_col] = df.apply(
            lambda r: f1_score(prediction=r[pred_col], ground_truth=r[ground_truth_col]),
            axis=1
        )
    return df



#### For quality, LLAMA2-13b sometimes generates only the option (e.g., a). Simple matching with output won't work, so we have to do map the generated option to the correct answer and do the matching.


In [2]:
import pandas as pd
import re
from typing import List

def extract_option(row: pd.Series) -> str:
    """
    Extracts the correct option from the provided row.

    Parameters:
        row (pd.Series): A row of a DataFrame, expected to contain 'question' and 'output' columns.

    Returns:
        str: The letter of the correct option, or None if not found.
    """
    options = re.findall(r'\((\w)\) ([\w\s]+)', row['question'])
    for option, value in options:
        if value.strip() == row['output'].strip():
            return option
    return None

def extract_option_from_prediction(pred: str) -> str:
    """
    Extracts the selected option letter from a prediction string.

    Parameters:
        pred (str): The prediction string, expected to start with an option letter.

    Returns:
        str: The extracted option letter, or None if not found or if `pred` is empty.
    """
    if len(pred.strip()) == 0:
        return None

    option = pred.split()[0]
    for char in option:
        if char in ['A', 'B', 'C', 'D']:
            return char
    return None

def calculate_f1_for_multi_choice(df: pd.DataFrame, model_sizes: List[str], datasets: List[str]=["quality"]) -> pd.DataFrame:
    """
    Computes F1 scores for predictions in multiple-choice format.

    It extracts correct and predicted options and computes F1 scores, with special handling
    for certain datasets. This function mutates the input DataFrame by adding new columns
    for extracted options and possibly modifying F1 scores.

    Parameters:
        df (pd.DataFrame): The DataFrame containing prediction and ground truth data.
            Expected to contain columns in the format 'llama{size}_pred_ans'.
        model_sizes (List[str]): List of strings indicating the model sizes for which
            predictions are available in `df` (e.g., ['13b', '70b']).
        datasets (List[str], optional): List of dataset names that require special handling.
            Defaults to ["quality"].

    Returns:
        pd.DataFrame: The original DataFrame with additional/modified columns for extracted
            options and potentially modified F1 scores.
    """
    df['correct_option'] = df.apply(extract_option, axis=1)

    for size in model_sizes:
        pred_ans_col = f'llama{size}_pred_ans'
        pred_option_col = f'llama{size}_pred_option'
        f1_col = f'llama{size}_f1'

        # Remove single quotes from predictions for specified datasets
        df[pred_ans_col] = df.apply(lambda r: r[pred_ans_col] if r["dataset"] not in datasets else r[pred_ans_col].replace("'", ""), axis=1)

        # Extract the option from the prediction
        df[pred_option_col] = df[pred_ans_col].apply(extract_option_from_prediction)

        # Compute the F1 score: if dataset is in `datasets`, F1 is 1 if predicted option matches correct option, else it's 0
        df[f1_col] = df.apply(lambda r: r[pred_option_col] == r['correct_option'] if r["dataset"] in datasets else r[f1_col], axis=1)

    return df


In [3]:
inputs_with_predictions = pd.read_json("data/automix_llama2_outputs_ver_n32.jsonl",
                                       orient="records", lines=True)

In [4]:
model_sizes = ['13b', '70b']

# Calculating F1 scores for each model size
inputs_with_predictions = calculate_f1_for_models(inputs_with_predictions, model_sizes)

# Further processing and calculating F1 scores for multi-choice questions
inputs_with_predictions = calculate_f1_for_multi_choice(inputs_with_predictions, model_sizes)

In [11]:
inputs_with_predictions = inputs_with_predictions[['id', 'pid', 'base_ctx', 'question', 'output', 'dataset',
       'llama13b_pred_ans', 'llama70b_pred_ans', 'llama13b_ver', 'split',
       'p_ver_13b', 'llama13b_ver_n32', 'p_ver_13b_n32', 'llama13b_f1',
       'llama70b_f1']]

In [29]:
inputs_with_predictions.groupby("dataset")[['llama13b_f1', 'llama70b_f1']].mean()

Unnamed: 0_level_0,llama13b_f1,llama70b_f1
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
cnli,0.393899,0.553111
coqa,0.478972,0.611645
narrative_qa,0.205739,0.265013
qasper,0.151614,0.286654
quality,0.242174,0.32587
