Install required Libraries

In [None]:
!pip install -q transformers accelerate einops langchain bitsandbytes xformers
!pip install -q python-Levenshtein

In [None]:
import pandas as pd
import os

In [None]:
# Locate the train and test splits of the dataset inside the data directory.
# Replace the value of 'DATA_DIR' with the path to the data directory.
DATA_DIR = "path/to/dataset"
# Maps a split name to its path under DATA_DIR. The paths carry no file
# extension: read_dataset and get_subject_counts append ".csv" themselves.
ds_split_map = {
    "train":os.path.join(DATA_DIR, "Kwame AI - NSMQ Riddles Train"),
    "test":os.path.join(DATA_DIR, "Kwame AI - NSMQ Riddles Test"),
    "test_2019":os.path.join(DATA_DIR, "NSMQ Contest with Video Links - 2019"),
    "test_2020":os.path.join(DATA_DIR, "NSMQ Contest with Video Links - 2020"),
    "test_2019_with_contexts":os.path.join(DATA_DIR, "NSMQ Contest with Video Links - 2019_with_contexts"),
    "test_2020_with_contexts":os.path.join(DATA_DIR, "NSMQ Contest with Video Links - 2020_with_contexts"),
}

In [None]:
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [None]:
# Fetch Mistral-7B-Instruct-v0.1 model
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# Short model name (the part after the last '/'); used as the key of the
# score dictionaries and inside the result file names.
model_name = model_id.split('/')[-1]

# NOTE(review): no dtype is requested here, so the weights are presumably
# loaded in the library's default precision - confirm. If half precision is
# intended, `from_pretrained` is presumably the place to request it.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Text-generation pipeline wrapping the already-instantiated model/tokenizer.
pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        use_cache=True,
        # NOTE(review): the model object was already placed with
        # device_map="auto" above; this argument looks redundant - confirm.
        device_map="auto",
        # NOTE(review): presumably bounds prompt + generated tokens together;
        # the few-shot prompt plus accumulated clues may get close to it -
        # verify, and consider `max_new_tokens` if truncation is observed.
        max_length=500,
        # NOTE(review): with do_sample=False decoding is presumably greedy and
        # the temperature has no effect - confirm.
        temperature = 1,
        do_sample=False,
        #top_k=1,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # The EOS token id is reused as the padding token id.
        pad_token_id=tokenizer.eos_token_id,
        # NOTE(review): `model` is passed as an object, so this dtype may not
        # be applied to the loaded weights - TODO confirm.
        torch_dtype=torch.float16,
)

In [None]:
# LangChain wrapper around the Hugging Face text-generation pipeline.
llm = HuggingFacePipeline(pipeline=pipe)

# Function to get mistral answer for live riddles
def answer_with_mistral(riddle):
  """
    Ask the model for a one-word answer to `riddle`.

    The riddle text is inserted into a one-shot instruction prompt (one worked
    example followed by the riddle), and the prompt is run through an LLMChain
    built on the module-level `llm`.

    Args:
      riddle: str - riddle text substituted for the `{riddle}` placeholder

    Returns:
      the value returned by `LLMChain.run` (presumably the generated text: str)
  """
  # NOTE(review): the prompt ends with "[/INST] </s>" and starts with an
  # explicit "<s>". Presumably the end-of-sequence marker should not follow
  # the instruction, and the tokenizer may add its own BOS token - confirm
  # against the Mistral instruct prompt format before changing, since any
  # edit to this string changes the evaluation results.
  template = """  <s>[INST]
      You are a science prodigy currently competing in a National Science competition. \
      You are now in the fifth round, where you must provide a one-word answer to a riddle. \
      Remember, your answer should consist of just the term the riddle is pointing to, and nothing else. \
      Adding additional text will result in point deductions. \
      Here's an example to guide you:
      Riddle: You might think I am a rather unstable character because I never stay at one place. \
            However, my motion obeys strict rules and I always return to where I started \
            And even if i have to leave that spot again I do it in strict accordance to time. \
            I can be named in electrical and mechanical contexts \
            In all cases, I obey the same mathematical rules. \
            In order to fully analyse me you would think about a stiffness or force constant restoring force and angular frequency. \

      Answer: oscillator

      Read the riddle below and provide the correct answer.

      Riddle: {riddle}
      Answer:
      [/INST] </s>

  """

  # The prompt template and the chain are rebuilt on every call.
  prompt = PromptTemplate(template=template, input_variables=["riddle"])
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  answer = llm_chain.run({"riddle":riddle})
  return answer

In [None]:
import string, re


def remove_articles(text):
    """
      Strip the English articles "a", "an" and "the" out of `text`.

      Only whole-word, case-sensitive occurrences are affected. Each one is
      replaced by a single space, so the whitespace around it is left in place.

      Args:
        text: str

      Returns:
        `text` with every article replaced by a space: str
    """
    article_pattern = r"\b(a|an|the)\b"
    return re.sub(article_pattern, " ", text, flags=re.UNICODE)

def normalize_text(s):
    """
      Lower-case `s`, drop punctuation and standardize whitespace.

      Every character of `string.punctuation` except "/" is removed, then runs
      of whitespace are collapsed to single spaces and the ends are trimmed.

      Args:
        s: (str) string to normalize

      Returns:
        normalized string: str
    """
    # "/" is deliberately kept; all other punctuation characters are deleted.
    punctuation_to_drop = string.punctuation.replace("/", "")
    deletion_table = str.maketrans("", "", punctuation_to_drop)

    lowered = s.lower()
    without_punctuation = lowered.translate(deletion_table)
    return " ".join(without_punctuation.split())

In [None]:
import Levenshtein

def per_clue_based_fm(model, fm_scores, answers, model_answer, threshold=0.70):
    """
    Computes the Fuzzy Match (FM) check for a single model answer.

    A fuzzy match is a substring match in either direction: some ground-truth
    answer is contained in `model_answer`, or `model_answer` is contained in
    some ground-truth answer. On a match `fm_scores[model]` is incremented
    in place.

    Args:
      model: str - model being used (key into `fm_scores`)
      fm_scores: dict[str, int] - running FM counters, updated in place
      answers: list[str] - ground truth answers to riddle; '_' and empty
        entries are placeholders and are ignored
      model_answer: str output of model
      threshold: float - similarity threshold. Currently unused (matching is
        substring based); kept so existing callers keep working.

    Returns:
      bool: True/False
    """
    # Drop placeholder entries. Both '_' and '' have to be excluded, hence a
    # membership test; `!= '_' or != ''` is true for every string.
    candidates = [answer for answer in answers if answer.strip() not in ('_', '')]

    # The empty string is a substring of every string, so a blank model
    # answer must be rejected explicitly or it would always "match".
    if not model_answer.strip():
        return False

    if any(ground_truth in model_answer or model_answer in ground_truth for ground_truth in candidates):
        fm_scores[model] += 1
        return True
    return False


def per_clue_based_em(model, em_scores, answers, model_answer):
  """
    Compute the Exact Match (EM) check for a single model answer.
    Exact Match: model_answer is exactly equal to one of the ground truths.
    On a match `em_scores[model]` is incremented in place.

    Args:
      model: str - model being used (key into `em_scores`)
      em_scores: dict[str, int] - running EM counters, updated in place
      answers: list[str] - ground truth answers to riddle; '_' and empty
        entries are placeholders and are ignored
      model_answer: str output of model

    Returns:
      bool: True/False
  """
  # Drop placeholder entries. Both '_' and '' have to be excluded, hence a
  # membership test; `!= '_' or != ''` is true for every string, which would
  # let a '_' or empty model answer "exactly match" a placeholder.
  candidates = [answer for answer in answers if answer.strip() not in ('_', '')]

  if model_answer in candidates:
    em_scores[model] += 1
    return True
  return False

In [None]:
import pandas as pd

def _extract_year_split(df, year):
    """Return (clues, answers, subjects) lists for the rows of `df` whose 'Year' equals `year`."""
    clue_columns = ["Clue 1", "Clue 2", "Clue 3", "Clue 4", "Clue 5", "Clue 6", "Clue 7", "Clue 8"]
    answer_columns = ["Answer 1", "Answer 2", "Answer 3", "Answer 4"]

    df_year = df[df['Year'] == year]

    # fillna returns a new frame here; filling in place on a frame that was
    # sliced out of another one triggers pandas' SettingWithCopyWarning.
    clues = df_year[clue_columns].fillna("").values.tolist()
    answers = df_year[answer_columns].fillna("_").values.tolist()
    subjects = df_year["Subject"].tolist()

    return clues, answers, subjects


def read_dataset(filepath: str):
    """
    Read the riddles CSV and split it into the 2019 and 2020 contests.

    Args:
      filepath: path of the CSV file *without* the ".csv" extension

    Returns:
      two tuples, one for 2019 and one for 2020, each of the form
      (clues, answers, subjects):
        clues: list of 8-item lists ("Clue 1".."Clue 8"), missing clues as ""
        answers: list of 4-item lists ("Answer 1".."Answer 4"), missing answers as "_"
        subjects: list of the "Subject" values
    """
    # Read the entire dataset from the CSV file
    df = pd.read_csv(f"{filepath}.csv")

    return _extract_year_split(df, 2019), _extract_year_split(df, 2020)


# Load the "test" split and separate it by contest year; each result is the
# (clues, answers, subjects) tuple returned by read_dataset.
filepath = ds_split_map["test"]
eval_dataset_2019, eval_dataset_2020 = read_dataset(filepath)

In [None]:
import os

def _clean_answer(text):
    """Lower-case `text`, strip punctuation and articles, and collapse whitespace."""
    # remove_articles swaps each article for a space, so whitespace has to be
    # collapsed again afterwards; otherwise "law of the x" becomes
    # "law of   x" and can never match "law of x".
    return " ".join(remove_articles(normalize_text(text)).split())


def human_eval(eval_dataset, year):
    """
    Evaluate the model on riddles, revealing the clues one at a time.

    For every riddle the clues are accumulated and the model is queried after
    each new clue. Querying stops at the first exact match (EM). The first
    fuzzy match (FM) is recorded once per riddle. A per-riddle log is written
    to "Results/{model_name}_{year}_human_eval_log.csv" under DATA_DIR.

    Args:
      eval_dataset: tuple (riddles, ground_truth_answers, subjects) of
        parallel lists, as returned by read_dataset
      year: value used in the name of the log file

    Returns:
      tuple: (em_scores, em_scores_per_subject, fm_scores, fm_scores_per_subject)
        em_scores / fm_scores: {model_name: number of riddles matched}
        *_per_subject: {subject: {model_name: number of riddles matched}}
    """
    riddles, ground_truth_answers, subjects = eval_dataset

    # Overall EM and FM counters
    fm_scores = {model_name: 0}
    em_scores = {model_name: 0}

    # Per-subject EM and FM counters. Both dicts are filled from the same set
    # object so that their key order is identical.
    unique_subjects = set(subjects)
    em_scores_per_subject = {subject: {model_name: 0} for subject in unique_subjects}
    fm_scores_per_subject = {subject: {model_name: 0} for subject in unique_subjects}

    # One log entry per riddle
    log_data = []

    for riddle_num, (riddle, answer, subject) in enumerate(zip(riddles, ground_truth_answers, subjects), start=1):
        em_value = False
        fm_value = False
        # Reset per riddle: a riddle without any non-empty clue never enters
        # the loop below and must not reuse values from the previous riddle
        # (or raise NameError on the very first one).
        model_answer = ""
        clue_num = 0

        clues = [clue for clue in riddle if clue.strip() != '']
        # Answers that normalize to nothing become the '_' placeholder
        answers = [_clean_answer(ans) or '_' for ans in answer]

        cur_clues = ''
        for clue_num, clue in enumerate(clues, start=1):
            cur_clues += '\n ' + clue

            raw_answer = answer_with_mistral(cur_clues).lower().replace("answer to the riddle is", '')
            # Keep only the text after the last ':' (e.g. "Answer: xyz")
            model_answer = _clean_answer(raw_answer.split(':')[-1])

            em_check = per_clue_based_em(model_name, em_scores, answers, model_answer)

            # FM is counted at most once per riddle
            if not fm_value:
                fm_value = per_clue_based_fm(model_name, fm_scores, answers, model_answer)

            if em_check:
                em_value = True
                fm_value = True
                break

        # Log data of the current riddle (raw clues/answers, padded with "")
        log_entry = {}
        for idx in range(8):
            log_entry[f"Clue {idx + 1}"] = riddle[idx] if len(riddle) > idx else ""
        for idx in range(4):
            log_entry[f"Answer {idx + 1}"] = answer[idx] if len(answer) > idx else ""
        log_entry.update({
            "Model Answer": model_answer,
            "EM Value": em_value,
            "FM Value": fm_value,
            # Number of the last clue shown to the model (0 if there were none)
            "Riddle Answered On": clue_num,
            "Subject": subject
        })
        log_data.append(log_entry)

        # Update EM and FM scores per subject
        em_scores_per_subject[subject][model_name] += 1 if em_value else 0
        fm_scores_per_subject[subject][model_name] += 1 if fm_value else 0

        # Print a message indicating the completion of logging for the current riddle
        print(f"Logged riddle {riddle_num}")

    # Create a DataFrame from the log data
    log_df = pd.DataFrame(log_data)

    # Save the DataFrame to a CSV file. The directory is created first so a
    # missing "Results" folder cannot discard a finished evaluation run.
    log_filename = os.path.join(DATA_DIR, f"Results/{model_name}_{year}_human_eval_log.csv")
    os.makedirs(os.path.dirname(log_filename), exist_ok=True)
    log_df.to_csv(log_filename, index=False)

    return em_scores, em_scores_per_subject, fm_scores, fm_scores_per_subject


In [None]:
# Get scores for 2019 riddles.
# human_eval returns the tuple
# (em_scores, em_scores_per_subject, fm_scores, fm_scores_per_subject).
scores_2019 = human_eval(eval_dataset_2019, 2019)

# Get scores for 2020 riddles
scores_2020 = human_eval(eval_dataset_2020, 2020)

In [None]:
scores_2019, scores_2020

In [None]:
import pandas as pd

def get_subject_counts(filepath: str):
    """
    Count how many riddles each subject has in the 2019 and 2020 contests.

    Args:
      filepath: path of the CSV file *without* the ".csv" extension

    Returns:
      tuple of two dicts ({subject: count}), first for 2019, then for 2020
    """
    frame = pd.read_csv(f"{filepath}.csv")

    def counts_for(year):
        # Rows of the requested year only, then occurrences of each subject
        is_year = frame['Year'] == year
        return frame.loc[is_year, "Subject"].value_counts().to_dict()

    return counts_for(2019), counts_for(2020)

# Count the riddles per subject in the "test" split. The two dicts are not
# just an example: they are passed to ps_breakdown_csv further down.
filepath = ds_split_map["test"]
subjects_2019_counts, subjects_2020_counts = get_subject_counts(filepath)
print(subjects_2019_counts)
print("================================================")
print(subjects_2020_counts)

In [None]:
import csv

def normalize_scores(em_scores, fm_scores, total_riddles):
  """
    Express raw EM and FM counts as percentages of `total_riddles`.

    Args:
      em_scores: dict {model: EM count}
      fm_scores: dict {model: FM count}
      total_riddles: number the counts are divided by

    Returns:
      tuple (EM percentages, FM percentages), each {model: percentage rounded to 2 decimals}
  """
  def as_percentages(raw_scores):
    percentages = {}
    for model, score in raw_scores.items():
      percentages[model] = round((score / total_riddles) * 100, 2)
    return percentages

  return as_percentages(em_scores), as_percentages(fm_scores)

def save_metrics_to_csv(filename, em_scores, fm_scores, total_riddles):
  """
    Write raw and percentage EM/FM scores to a CSV file, one row per model.

    The file is "Results/{model_name}_eval_em_fm_{filename}_v4.csv" under
    DATA_DIR; the directory is created if it does not exist.

    Args:
      filename: label inserted into the output file name (e.g. the year)
      em_scores: dict {model: EM count}
      fm_scores: dict {model: FM count}; must contain every key of em_scores
      total_riddles: number of riddles the counts are normalized by
  """
  norm_fname = os.path.join(DATA_DIR, f"Results/{model_name}_eval_em_fm_{filename}_v4.csv")
  os.makedirs(os.path.dirname(norm_fname), exist_ok=True)

  # Normalize before opening the file so a failure here (e.g. a zero
  # total_riddles) does not leave a truncated, header-only CSV behind.
  normalized_em_scores, normalized_fm_scores = normalize_scores(em_scores, fm_scores, total_riddles)

  # Open the CSV file for writing
  with open(norm_fname, 'w', newline='', encoding="utf-8") as norm_csv_file:
    writer = csv.writer(norm_csv_file)

    # Write the header row
    writer.writerow(["MODEL", "EM Score", "EM SCORE (%)", "FM Score", "FM SCORE (%)", "NUMBER OF RIDDLES"])

    # Write one row per model
    for model in em_scores.keys():
      writer.writerow([
        model,
        em_scores[model],
        normalized_em_scores[model],
        fm_scores[model],
        normalized_fm_scores[model],
        total_riddles,
      ])

In [None]:
total_riddles = 160 # Replace with total number of riddles if different

# Unpack in the order human_eval returns them:
# (em_scores, em_scores_per_subject, fm_scores, fm_scores_per_subject)
em_scores_19, em_subject_scores_19, fm_scores_19, fm_subject_scores_19 = scores_2019
em_scores_20, em_subject_scores_20, fm_scores_20, fm_subject_scores_20 = scores_2020

# Save metrics to CSV
# NOTE(review): 2019 is normalized by total_riddles-4 (156) - presumably the
# 2019 split has four fewer riddles than 2020; confirm against
# len(eval_dataset_2019[0]) / len(eval_dataset_2020[0]) instead of relying on
# the hard-coded numbers.
save_metrics_to_csv('2019', em_scores_19, fm_scores_19, total_riddles-4)
save_metrics_to_csv('2020', em_scores_20, fm_scores_20, total_riddles)

In [None]:
import csv

def ps_breakdown_csv(ps_em_scores, ps_fm_scores, counts, year):
    """
    Write the per-subject EM/FM breakdown for one contest year to a CSV file.

    The file is "Results/{model_name}_subject_breakdown_{year}.csv" under
    DATA_DIR; the directory is created if it does not exist.

    Args:
      ps_em_scores: dict {subject: {model_name: EM count}}
      ps_fm_scores: dict {subject: {model_name: FM count}}; must contain
        every subject of ps_em_scores
      counts: dict {subject: number of riddles of that subject}
      year: label inserted into the output file name
    """
    csv_filename = os.path.join(DATA_DIR, f"Results/{model_name}_subject_breakdown_{year}.csv")
    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        # Write the header row
        writer.writerow(['Subject', 'EM Score', 'EM Score (%)', 'FM Score', 'FM Score (%)', 'Count'])

        for subject, ps_em_score in ps_em_scores.items():
            # Look the FM entry up by subject rather than pairing the two
            # dicts by position, which silently misaligns rows if their key
            # order ever differs.
            ps_fm_score = ps_fm_scores[subject]
            subject_count = counts[subject]

            em_percentage = round((ps_em_score[model_name] / subject_count) * 100, 2)
            fm_percentage = round((ps_fm_score[model_name] / subject_count) * 100, 2)

            writer.writerow([subject, ps_em_score[model_name], em_percentage, ps_fm_score[model_name], fm_percentage, subject_count])

In [None]:
# Write the per-subject EM/FM breakdown CSV for the 2019 riddles
ps_breakdown_csv(em_subject_scores_19, fm_subject_scores_19, subjects_2019_counts, "2019")

# Write the per-subject EM/FM breakdown CSV for the 2020 riddles
ps_breakdown_csv(em_subject_scores_20, fm_subject_scores_20, subjects_2020_counts, "2020")