# Install

In [None]:
# Setup cell: installs transformers from the GitHub main branch, plus accelerate
# and bitsandbytes (used below for device_map="auto" and 4/8-bit loading).
# NOTE(review): none of these installs are version-pinned, so a fresh run may pick
# up different library versions than the ones the results were produced with.
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q accelerate bitsandbytes
# Optional dependency, only needed for the (disabled) PNG export in scores_df.
# !pip install -q dataframe_image

# Utils

In [2]:
from collections import defaultdict
import glob
from IPython.display import clear_output, display
from numpy import argmax
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import io
import math
import numpy as np
import os
import pandas as pd
import requests
import torch
import torch.nn.functional as F
# import dataframe_image as dfi

# Device string passed to the prediction helpers: 'cuda' when a GPU is visible, else 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def softmax(x):
    """Numerically stable softmax over all elements of ``x``.

    Args:
        x: Array-like of scores (NumPy array, list, tuple, ...).

    Returns:
        NumPy array with the same shape as ``x`` whose entries are positive and
        sum to 1. Normalisation is over every element of ``x`` (not per axis).
    """
    scores = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged but prevents overflow in exp().
    shifted = scores - np.max(scores)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

@torch.no_grad()
def get_logprobs_causal(model, tokenizer, prompt, device):
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    output_ids = inputs["input_ids"][:, 1:].to(device)

    try:
        outputs = model(**inputs, labels=input_ids)
    except Exception as e:
        if "unexpected keyword argument 'token_type_ids'" in str(e):
            inputs.pop("token_type_ids")
            outputs = model(**inputs, labels=input_ids)
        else:
            raise

    logits = outputs.logits.to(torch.double).to(device)
    output_ids = output_ids.to(logits.get_device())  # in case of parallelism
    logprobs = torch.gather(F.log_softmax(logits, dim=2), 2, output_ids.unsqueeze(2))

    return logprobs.mean()

def predict_classification(model, tokenizer, input_text, labels, device):
    """Score each candidate answer by the mean log-probability of prompt + answer.

    Each entry of ``labels`` is appended directly to ``input_text`` (no separator)
    and scored with ``get_logprobs_causal``.

    Returns:
        1-D tensor with one score per label, in the order of ``labels``.
    """
    label_scores = []
    for candidate in labels:
        full_text = input_text + candidate
        label_scores.append(get_logprobs_causal(model, tokenizer, full_text, device))
    return torch.stack(label_scores)

def predict_classification_by_letter(model, tokenizer, input_text, labels, device):
    """Pick an answer letter from the model's next-token logits after the prompt.

    Args:
        model: Causal LM returning an object with ``.logits`` (batch, seq, vocab).
        tokenizer: Tokenizer providing ``encode`` and a ``return_tensors="pt"`` call.
        input_text: Full prompt; the prediction is read at its last position.
        labels: Answer options; only their count is used, to select the first
            ``len(labels)`` letters out of A-E (at most five).
        device: Device the tokenized inputs are moved to before the forward pass.

    Returns:
        ``(conf, pred)``: ``conf`` is a NumPy array with the softmax over the
        candidate letters' logits, ``pred`` is the highest-scoring letter.
    """
    choices = ['A', 'B', 'C', 'D', 'E'][:len(labels)]
    # The last id of each encoding is taken as the letter's token id.
    choice_ids = [tokenizer.encode(choice)[-1] for choice in choices]
    with torch.no_grad():
        encoded = tokenizer(input_text, return_tensors="pt")
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        try:
            outputs = model(**inputs)
        except TypeError as e:
            # An unexpected keyword raises TypeError; some models reject the
            # token_type_ids that their tokenizer emits.
            if "unexpected keyword argument 'token_type_ids'" not in str(e):
                raise
            inputs.pop("token_type_ids")
            outputs = model(**inputs)

        last_token_logits = outputs.logits[:, -1, :]
        # Cast to float32 first: NumPy has no bfloat16/half-on-CPU conversion path
        # for models loaded in reduced precision.
        choice_logits = last_token_logits[:, choice_ids].detach().float().cpu().numpy()

    conf = softmax(choice_logits[0])
    pred = choices[int(np.argmax(choice_logits[0]))]

    return conf, pred

def get_prompt(eval_args):
    """Return the prompt template matching ``eval_args.fewshot``.

    The few-shot logic was added manually because it was not included in the
    GitHub repo; the templates are taken from the original paper
    (https://arxiv.org/pdf/2310.04928).

    The few-shot template carries the placeholders [SUBJECT], [EX1..3], [OPT1..3],
    [K1..3], [INPUT] and [OPTION]; the zero-shot template carries [SUBJECT],
    [LEVEL], [INPUT] and [OPTION].
    """
    if not eval_args.fewshot:
        return ('Ini adalah soal [SUBJECT] untuk [LEVEL]. Pilihlah salah satu jawaban yang dianggap benar!'
                '\n\n[INPUT]\n[OPTION]\n\nJawaban: ')

    # Three worked examples followed by the actual question.
    example_slots = ''.join(
        f'[EX{n}]\n[OPT{n}]\nJawaban: [K{n}]\n\n' for n in (1, 2, 3)
    )
    return 'Ini adalah beberapa contoh soal [SUBJECT]\n\n' + example_slots + '[INPUT]\n[OPTION]\nJawaban: '

def prepare_data(prompt, data, eval_args, seed=111):
    """Build a group-balanced sample of questions and fill the prompt template.

    Args:
        prompt: Template from ``get_prompt`` containing [SUBJECT]/[INPUT]/[OPTION]
            and either [LEVEL] (zero-shot) or [EXn]/[OPTn]/[Kn] (few-shot) slots.
        data: DataFrame with the columns ``id``, ``is_for_fewshot``, ``subject``,
            ``kelas``, ``level``, ``soal``, ``jawaban`` and ``kunci``.
        eval_args: Object exposing ``mmlu_sample`` (total sample size),
            ``group_by`` ('subject' or 'kelas') and ``fewshot`` (bool).
        seed: Random state used for every ``DataFrame.sample`` call.

    Returns:
        Tuple of six parallel lists:
        ``(ids, inputs, kunci, subject, kelas, outputs_options)`` where
        ``outputs_options`` holds ``jawaban`` split on newlines.

    Raises:
        ValueError: if ``group_by`` is invalid, there is nothing to group, the
            sample size is not a multiple of the number of groups, or (few-shot)
            a subject has fewer than 3 few-shot examples.

    Note:
        A group with fewer rows than requested contributes all of its rows, so the
        returned lists can be shorter than ``eval_args.mmlu_sample``.
    """
    sample_size = eval_args.mmlu_sample
    group_by = eval_args.group_by

    if group_by not in ['subject', 'kelas']:
        raise ValueError("group_by must be either 'subject' or 'kelas'!")

    main_task_data = data[data['is_for_fewshot'] == 0]
    fewshot_data = data[data['is_for_fewshot'] == 1]

    groups = [group_data for _, group_data in main_task_data.groupby(group_by)]
    num_groups = len(groups)
    if num_groups == 0:
        # Without this guard the modulo below would raise ZeroDivisionError.
        raise ValueError(f"No evaluation questions available to group by '{group_by}'")
    if sample_size % num_groups != 0:
        raise ValueError(f"Sample size must be a multiple of {num_groups} (number of '{group_by}')")
    samples_per_group = sample_size // num_groups

    ### Ensure each group has the same number of questions
    balanced_data = pd.concat([
        group_data.sample(n=min(samples_per_group, len(group_data)), random_state=seed)
        for group_data in groups
    ])

    ids = []
    inputs = []
    kunci = []
    subject = []
    kelas = []
    outputs_options = []

    # Sampling with a fixed seed is deterministic per subject, so draw the few-shot
    # examples once per subject instead of once per question.
    fewshot_cache = {}

    for _, row in balanced_data.iterrows():
        if eval_args.fewshot:
            subject_name = row['subject']
            if subject_name not in fewshot_cache:
                # Few-shot examples come from the same subject as the question.
                subject_fewshot_data = fewshot_data[fewshot_data['subject'] == subject_name]
                if len(subject_fewshot_data) < 3:
                    raise ValueError("Some subjects have <3 few-shot examples!")
                fewshot_cache[subject_name] = subject_fewshot_data.sample(n=3, random_state=seed)

            filled = prompt.replace('[SUBJECT]', subject_name)
            for i, (_, ex) in enumerate(fewshot_cache[subject_name].iterrows(), start=1):
                filled = filled.replace(f'[EX{i}]', ex['soal'])
                filled = filled.replace(f'[OPT{i}]', ex['jawaban'])
                filled = filled.replace(f'[K{i}]', ex['kunci'])

            filled = filled.replace('[INPUT]', row['soal'])
            filled = filled.replace('[OPTION]', row['jawaban'])
        else:
            if row['level'] == 'Seleksi PTN':
                level = 'seleksi masuk universitas'
            else:
                try:
                    # Numeric grades are rendered without a decimal part.
                    level = f"{math.trunc(float(row['kelas']))} {row['level']}"
                except (TypeError, ValueError, OverflowError):
                    # Non-numeric (or NaN/inf) grade: use the raw value.
                    level = f"{row['kelas']} {row['level']}"

            filled = (
                prompt.replace('[SUBJECT]', row['subject'])
                      .replace('[LEVEL]', level)
                      .replace('[INPUT]', row['soal'])
                      .replace('[OPTION]', row['jawaban'])
            )

        inputs.append(filled)
        ids.append(row['id'])
        kunci.append(row['kunci'])
        subject.append(row['subject'])
        kelas.append(row['kelas'])
        outputs_options.append(row['jawaban'].split('\n'))

    return ids, inputs, kunci, subject, kelas, outputs_options

class EvalArgs:
    """Configuration bundle for one evaluation run.

    Args:
        quant: '4bit' or '8bit' to load through bitsandbytes, or 'fp16' for no
            quantization config (``self.quant`` is then ``None``).
        mmlu_sample: Total number of questions to sample.
        group_by: Column used to balance and report results ('subject' or 'kelas').
        fewshot: ``True`` for the 3-shot prompt, ``False`` for zero-shot.
        by_letter: ``True`` to predict from the answer-letter logits, ``False`` to
            score each full answer option.

    Attributes:
        quant: ``BitsAndBytesConfig`` or ``None``.
        quant_str: The ``quant`` string as passed in.
        output_folder: '3-shot' or '0-shot', depending on ``fewshot``.

    Raises:
        ValueError: for any unsupported ``quant`` value, including '1bit'.
    """

    def __init__(self, quant, mmlu_sample, group_by, fewshot: bool, by_letter=True):
        if quant == '4bit':
            self.quant = BitsAndBytesConfig(load_in_4bit=True)
        elif quant == '8bit':
            self.quant = BitsAndBytesConfig(load_in_8bit=True)
        elif quant == 'fp16':
            self.quant = None
        elif quant == '1bit':
            # BitsAndBytesConfig has no 1-bit mode; passing load_in_1bit would not
            # quantize anything while the results would still be labelled '1bit'.
            raise ValueError("""'1bit' quantization is not supported by BitsAndBytesConfig. Must choose between '4bit', '8bit', or 'fp16'""")
        else:
            raise ValueError("""Invalid quantization config. Must choose between '4bit', '8bit', or 'fp16'""")
        self.fewshot = fewshot
        self.quant_str = quant
        self.group_by = group_by
        self.by_letter = by_letter
        self.mmlu_sample = mmlu_sample
        self.output_folder = '3-shot' if self.fewshot else '0-shot'

# Eval and display code

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

def eval(eval_args, model_names: list):
    """Evaluate each model on a balanced IndoMMLU sample and save per-model CSVs.

    Downloads the IndoMMLU CSV, builds the prompts with ``get_prompt`` and
    ``prepare_data``, then for every model name loads the tokenizer/model, predicts
    an answer for each question and writes
    ``<output_folder>/<model> (<n>-shot).csv`` with predictions and correctness.

    NOTE: this function shadows the built-in ``eval``; the name is kept because the
    run cells call it.

    Args:
        eval_args: ``EvalArgs`` instance controlling sampling, prompting,
            quantization and the output folder.
        model_names: Hugging Face model identifiers to evaluate, in order.

    Returns:
        Dict mapping model name to its number of correct answers.

    Raises:
        requests.HTTPError: if the dataset download fails.
    """
    import gc  # local import: only needed to release model memory between runs

    response = requests.get(
        "https://raw.githubusercontent.com/fajri91/IndoMMLU/main/data/indoMMLU.csv",
        timeout=60,
    )
    # Fail loudly instead of parsing an HTML error page as CSV.
    response.raise_for_status()
    indommlu = pd.read_csv(io.StringIO(response.text), encoding='utf-8')
    ### Fix column format
    indommlu['kelas'] = indommlu['kelas'].astype(str).str.strip().apply(lambda x: str(int(float(x))) if x.replace('.', '').isdigit() else x)

    ### Remove ambiguous class categories
    if eval_args.group_by == 'kelas':
        indommlu = indommlu[indommlu['kelas'].isin([str(i) for i in range(1, 13)] + ['PTN'])]

    prompt = get_prompt(eval_args)
    ids, inputs, kunci, subject, kelas, outputs_options = prepare_data(prompt, eval_args=eval_args, data=indommlu)
    # Use the number of questions actually prepared: prepare_data can return fewer
    # than eval_args.mmlu_sample when a group is short on rows.
    total_questions = len(inputs)
    scores = {}

    def _show_scores(trailing_blank=True):
        # Refresh the cell output with the scores collected so far.
        clear_output()
        for name, correct in scores.items():
            print(f"{name}: {correct}/{total_questions}")
        if trailing_blank:
            print('')

    os.makedirs(eval_args.output_folder, exist_ok=True)

    for model_name in model_names:
        SAVE_FILE = f"{eval_args.output_folder}/{model_name.split('/')[-1]} {('(3-shot)' if eval_args.fewshot else '(0-shot)')}.csv"

        _show_scores()

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16,
                                                     quantization_config=eval_args.quant,
                                                     trust_remote_code=True, device_map="auto",
                                                     attn_implementation='eager', use_cache=True
                                                     )

        try:
            model.to(device)
        except (ValueError, RuntimeError):
            # Quantized or accelerate-dispatched models refuse .to(); they have
            # already been placed by device_map="auto".
            pass

        # Clears the model-loading progress output.
        _show_scores()

        model.eval()

        preds = []
        probs = []
        print(model_name)
        for idx in tqdm(range(len(inputs))):
            if not eval_args.by_letter:
                out = predict_classification(model, tokenizer, inputs[idx], outputs_options[idx], device)
                prob = out.cpu().detach().tolist()
                pred = int(argmax(prob))
                preds.append(chr(ord('A') + pred))
                probs.append(prob)
            else:
                conf, pred = predict_classification_by_letter(model, tokenizer, inputs[idx], outputs_options[idx], device)
                probs.append(conf)
                preds.append(pred)

            if device == 'cuda':
                torch.cuda.empty_cache()

        # Drop the model before loading the next one so its memory can be reused.
        del model
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()

        output_df = pd.DataFrame({
            'id': ids,
            'input': inputs,
            'subject': subject,
            'kelas': kelas,
            'kunci': kunci,
            'options': outputs_options,
            'preds': preds,
            'probs': probs,
        })
        is_correct = output_df['kunci'] == output_df['preds']
        output_df['is_correct'] = is_correct.astype(int)
        output_df.to_csv(SAVE_FILE, encoding='utf-8', index=False)

        scores[model_name] = int(is_correct.sum())

    _show_scores(trailing_blank=False)

    return scores

def scores_df(eval_args):
    """
    Display per-group accuracy for every result CSV in the output folder and save
    the table as ``indommlu (<n>-shot).csv`` in the working directory.

    Each ``*.csv`` in ``eval_args.output_folder`` is treated as one model's results
    and must contain the ``eval_args.group_by`` column and ``is_correct``. The table
    has one row per group plus a 'Total' row (mean of the per-group accuracies), one
    column per model plus an 'Average' column, and is sorted by 'Average'
    (descending) with 'Total' pinned on top. Model column names are shortened to
    their first two '-'-separated parts.

    Side effect: changes global pandas display options.

    Raises:
        FileNotFoundError: if the output folder contains no CSV files.
    """
    folder = eval_args.output_folder
    group = eval_args.group_by
    # Sorted so the model column order does not depend on filesystem listing order.
    csv_files = sorted(glob.glob(os.path.join(folder, '*.csv')))
    if not csv_files:
        raise FileNotFoundError(f"No result CSV files found in '{folder}'. Run eval() first.")

    results = {}
    for file in csv_files:
        model_name = os.path.splitext(os.path.basename(file))[0]
        df = pd.read_csv(file)
        # Accuracy in percent per group.
        results[model_name] = df.groupby(group)['is_correct'].agg(lambda x: round((sum(x) / len(x)) * 100, 2))

    result_df = pd.DataFrame(results)

    total_row = pd.DataFrame(result_df.mean().round(2)).T
    total_row.index = ['Total']

    result_df = pd.concat([total_row, result_df])
    result_df['Average'] = result_df.mean(axis=1).round(2)
    # Name the index explicitly so the group column is correct regardless of how
    # concat resolved the index name.
    result_df = result_df.rename_axis(group).reset_index()
    result_df.columns = [group] + [
        '-'.join(col.split('-')[:2]) if '-' in col else col
        for col in result_df.columns[1:]
    ]

    # Column order: group, Average, then the models.
    model_cols = [col for col in result_df.columns if col not in (group, 'Average')]
    result_df = result_df[[group, 'Average'] + model_cols]

    is_total = result_df[group] == 'Total'
    result_df = pd.concat([
        result_df[is_total],
        result_df[~is_total].sort_values('Average', ascending=False)
    ]).reset_index(drop=True)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.float_format', '{:.2f}'.format)
    pd.set_option('display.colheader_justify', 'left')
    print(f'Scores grouped by {group}, sorted by Average (descending).\nAll models are instruction-tuned and in {eval_args.quant_str}.')

    styled_df = (
        result_df.style
        .set_properties(**{'text-align': 'right'}, subset=[group])
        .set_properties(**{'text-align': 'center'}, subset=result_df.columns[1:])
        .format({col: '{:.2f}' for col in result_df.columns if col != group})
    )

    display(styled_df)
    result_df.to_csv(f"indommlu {('(3-shot)' if eval_args.fewshot else '(0-shot)')}.csv", encoding='utf-8', index=False, float_format='%.2f')

    #### PNG export disabled because it did not seem to work on Colab in the author's browser (Mozilla)
    # try: dfi.export(styled_df, f'indommlu-{eval_args.quant_str}.png')
    # except Exception as e:
    #     print(f"Error saving dataframe to image: \n{e}")
    #     pass

def scores_plot(eval_args, scores=None, filepath = None, from_file=False):
    """
    Draw a bar chart of model accuracy (in percent) and save it as
    ``scores_<n>-shot.png`` in the working directory.

    Args:
        eval_args: Object exposing ``mmlu_sample`` and ``fewshot``.
        scores: Dict mapping model name to number of correct answers. Accuracy is
            computed against ``eval_args.mmlu_sample``. Mutually exclusive with
            ``from_file``.
        filepath: Folder (relative to the working directory) holding per-model
            result CSVs with an ``is_correct`` column; required with ``from_file``.
        from_file: If True, read the scores from ``filepath``; accuracy is then
            computed against each file's own row count.

    Raises:
        ValueError: on an inconsistent ``scores``/``filepath``/``from_file``
            combination, or when there are no scores to plot.
    """
    total_questions = eval_args.mmlu_sample

    ### Some from_file logic so this code can be reused in other contexts
    if not from_file and scores is None:
        raise ValueError(f"""'from_file' can't be False while 'scores' is not given!""")
    if from_file and scores is not None:
        raise ValueError(f"""'from_file' can't be True while 'scores' is given!""")
    if from_file and filepath is None:
        raise ValueError(f"""'from_file' can't be True while 'filepath' is not given!""")

    if from_file:
        folder_path = os.path.join(os.getcwd(), filepath)
        scores = {}
        totals = {}

        # Sorted so the bar order does not depend on filesystem listing order.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith('.csv'):
                df = pd.read_csv(os.path.join(folder_path, filename))
                model_name = os.path.splitext(filename)[0]
                scores[model_name] = df['is_correct'].sum()
                # The file knows how many questions were really evaluated.
                totals[model_name] = max(len(df), 1)
    else:
        totals = {model: total_questions for model in scores}

    if not scores:
        raise ValueError("No scores to plot!")

    models = list(scores.keys())
    model_scores = [scores[model] / totals[model] * 100 for model in models]

    names = [model.split('/')[-1] for model in models]
    colors = sns.color_palette("pastel", n_colors=len(models))

    plt.figure(figsize=(8, 6))
    bar_width = 0.8
    bars = plt.bar(range(len(models)), model_scores, color=colors, edgecolor='white',
                   width=bar_width, align='center')

    plt.title(f"Model Comparison - {'3-shot' if eval_args.fewshot else '0-shot'}", fontsize=16)
    plt.xlabel('Models', fontsize=12)
    plt.ylabel('Score', fontsize=12)

    # Leave a little headroom above the tallest bar, capped at 100%.
    max_score = max(model_scores)
    plt.ylim(0, min(100, max_score + 5))
    plt.xlim(-1, len(models))

    plt.xticks(range(len(models)), names, rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)

    # Print each bar's value just above it.
    for i, bar in enumerate(bars):
        height = bar.get_height()
        plt.text(i, height, f'{height:.1f}',
                 ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.savefig(f"scores_{'3-shot' if eval_args.fewshot else '0-shot'}.png", dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

# Run

In [None]:
############ Small model for a quick smoke test of the pipeline ############
test_model = ['unsloth/Qwen2-0.5B-Instruct-bnb-4bit']

############ Pre-quantized 4-bit checkpoints (smaller, faster download) ############
############ All models are instruction-tuned; this is the list the run cells below evaluate
models_4bit = [
    'unsloth/llama-3-8b-Instruct-bnb-4bit',
    'afrizalha/aya-23-8B_4bit',
    'afrizalha/gemma-2-9b-it-4bit',
    'unsloth/Phi-3-mini-4k-instruct-bnb-4bit',
    'unsloth/mistral-7b-instruct-v0.3-bnb-4bit',
    'unsloth/Qwen2-7B-Instruct-bnb-4bit',
    ]

############ Regular fp16 models (not evaluated) ############
############ All models are instruction-tuned; not passed to eval() in this notebook
models_fp16 = [
    'meta-llama/Meta-Llama-3-8B-Instruct',
    'CohereForAI/aya-23-8B',
    'microsoft/Phi-3-mini-4k-instruct',
    'google/gemma-2-9b-it',
    'mistralai/mistral-7b-instruct-v0.3',
    'Qwen/Qwen2-7B-Instruct',
    ]

In [None]:
############ Few-shot (3-shot) evaluation: fewshot=True selects the few-shot prompt
############ and writes the per-model CSVs to the '3-shot' folder
eval_args = EvalArgs(quant='4bit', group_by='subject', by_letter=True,
                     mmlu_sample=1000, fewshot=True)
scores_fewshot = eval(eval_args, models_4bit)
scores_df(eval_args)
scores_plot(eval_args, scores=scores_fewshot)

In [None]:
############ Zero-shot evaluation: fewshot=False selects the zero-shot prompt
############ and writes the per-model CSVs to the '0-shot' folder
eval_args = EvalArgs(quant='4bit', group_by='subject', by_letter=True,
                     mmlu_sample=1000, fewshot=False)
scores_zeroshot = eval(eval_args, models_4bit)
scores_df(eval_args)
scores_plot(eval_args, scores=scores_zeroshot)