# Evaluation

## Load Packages

In [1]:
import os
import json
from random import seed, shuffle
from collections import Counter
import scipy.stats as stats

from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from _api_key import get_openai_api_key

## Basic Functions

In [2]:
def load_json_file(file_path):
    """
    Load a json file and return the decoded object

    :param file_path: path of the UTF-8 encoded json file to read
    :return: the object produced by ``json.load`` (a dict for the files used here)
    """
    # The with-statement already closes the file on exit, so no explicit close() is needed
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json_file(file, file_path, sort_keys:bool=False):
    """
    Save an object as a json file

    :param file: json-serializable object to write
    :param file_path: destination path; the file is overwritten
    :param sort_keys: whether dictionary keys are written in sorted order
    """
    # indent=4 keeps the file readable for manual annotation, ensure_ascii=False keeps
    # non-ASCII characters as they are; the with-statement closes the file on exit
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(file, f, indent=4, ensure_ascii=False, sort_keys=sort_keys)

In [3]:
def splitDataset(file, puntype='hom'):
    """
    Split a dataset into its pun part and its non-pun part

    `file` is either the path of a json dataset or the already loaded dict.
    Only IDs containing `puntype` are kept; an entry counts as a pun when it
    has a truthy 'pun_word' field.
    Returns the tuple (punDataset, nonpunDataset).
    """
    dataset = load_json_file(file) if isinstance(file, str) else file
    punDataset, nonpunDataset = dict(), dict()
    for ID, data in dataset.items():
        # Skip entries that belong to another pun type
        if puntype not in ID:
            continue
        bucket = punDataset if data.get('pun_word', False) else nonpunDataset
        bucket[ID] = data
    return punDataset, nonpunDataset

def ids_sampling(ids:list, sample_size:int=100):
    """
    Draw a reproducible random sample from the IDs

    The global random generator is re-seeded with 2024 on every call and the
    list is shuffled twice in place, so `ids` itself is reordered.
    Returns the first `sample_size` IDs of the shuffled list.
    """
    seed(2024)
    for _ in range(2):
        shuffle(ids)
    return ids[:sample_size]

## Functions for Evaluating Explanations

### Punchline Check

In [4]:
def evaluate_explanation_by_punchline(dataset, explanations, target:str, sample_fn=None,
                                      save:bool=False):
    """
    Build the annotation sheet for the fine-grained manual evaluation of explanations
    Pun: pun_word, alter_word, pun_sense, alter_sense
    Non-pun: rationality, informativeness

    The sheet of `target` is loaded from ./results if it already exists, the entries of
    the (optionally sampled) IDs are written into it with empty scores, and it is saved
    back when `save` is True. Nothing is returned.
    """
    def get_expl(explanation, target):
        # Model explanations are stored under a prompt-setting key, human ones are not
        if target != 'human':
            return explanation[f'{target}_explanation def_true CoT_true examples_true']['biased_to_pun']
        return explanation[f'{target}_explanation']

    target_range = ['human','gpt-3.5-turbo-1106','gpt-4-1106-preview','gemini-pro','claude-3-opus-20240229',
                    'vicuna-7b-v1.5','llama-2-7b-chat','mistral-7b-instruct-v0.2','openchat-3.5-0106']
    assert target in target_range
    path = f'./results/pun_explanation_punchline_check({target}).json'
    record = load_json_file(path) if os.path.exists(path) else dict()
    # Whether to conduct sampling
    IDs = list(dataset.keys())
    if sample_fn is not None:
        IDs = sample_fn(IDs)
    for ID in tqdm(IDs):
        # Get text and explanation
        data = dataset[ID]
        text = data['human_text']
        expl = get_expl(explanations[ID], target)
        if data.get('pun_word',False):
            # Pun: alter_word is pre-filled with -1 for 'hom' IDs, all other scores start empty
            entry = {'text': text,
                     'pun_word': data['pun_word'],
                     'pun_sense': data['pun_sense'],
                     'alter_word': data['alter_word'],
                     'alter_sense': data['alter_sense'],
                     'explanation': expl,
                     'human_eval': {'pun_word': None,
                                    'alter_word': -1 if 'hom' in ID else None,
                                    'pun_sense': None,
                                    'alter_sense': None}}
        else:
            # Non-pun: scores are pre-filled with -1 for the human target, empty otherwise
            blank = -1 if target == 'human' else None
            entry = {'text': text,
                     'explanation': expl,
                     'human_eval': {'rationality': blank, 'informativeness': blank}}
        # Integrate into record
        record[ID] = entry
    if save:
        save_json_file(record,path)

In [5]:
def punchline_check_integration(*file_paths, evaluator:str='human_eval', save:bool=False,
                                path:str='pun_explanation_punchline_check.json'):
    """
    Merge the punchline check results of several targets into one file

    Each input file contributes one entry per ID, stored under the key
    '<evaluator> <target>_explanation'. The merged dict is written to `path`
    (prefixed with ./results/ unless it already contains 'results') when `save` is True.
    Raises AssertionError when one of the input files does not exist.
    """
    out_path = path if 'results' in path else './results/' + path
    merged = dict()
    for file_path in file_paths:
        if not os.path.exists(file_path):
            raise AssertionError(f"{file_path} is not a valid file")
        records = load_json_file(file_path)
        # The target is the text between the last '(' of the path and the next ')'
        target = file_path.split('(')[-1].split(')')[0]
        column = f'{evaluator} {target}_explanation'
        for ID, entry in records.items():
            # Data of the evaluation
            merged.setdefault(ID, dict())[column] = entry[evaluator]
    if save:
        save_json_file(merged, out_path)

In [6]:
def punchline_check_summary(evaluations, save:bool=False, path:str='pun_explanation_punchline_check_metrics.json'):
    """
    Calculate the various indicators of punchline check
    Pun: mean of pun_word, alter_word, pun_sense, alter_sense
    Non-pun: mean of rationality, informativeness, R1I0, R0I1

    :param evaluations: dict mapping ID -> {evaluation key -> score dict}; the evaluation
                        keys of the first ID are used for every ID. The dict is not modified.
    :param save: whether to write the metrics to `path`
    :param path: output file; prefixed with ./results/ unless it already contains 'results'
    :return: None, the metrics are printed as json (and optionally saved)

    A group without any entry (e.g. no homographic puns) is reported as an empty dict.
    """
    def dict_union(*dicts):
        # Add values when dictionary keys are the same
        keys = set() # union of keys
        for d in dicts:
            # Make sure inputs are all dictionary type
            assert isinstance(d, dict)
            keys.update(d.keys())
        if 'pun_word' in keys:
            # Sorting on the reversed key gives pun_word, alter_word, pun_sense, alter_sense
            keys = sorted(keys, key=lambda x:x[::-1])
        else:
            # Reverse order gives rationality, informativeness, R1I0, R0I1
            keys = sorted(keys, reverse=True)
        return {key: sum(d.get(key,0) for d in dicts) for key in keys}

    if 'results' not in path:
        path = './results/' + path
    IDs = list(evaluations.keys())
    # Empty input yields an empty summary instead of an IndexError
    eval_keys = list(evaluations[IDs[0]].keys()) if IDs else []
    summary = dict()
    counts = dict()
    # Summarize the results of punchline check of different models
    for eval_key in eval_keys:
        hom_pun_evals, het_pun_evals, non_pun_evals = [], [], []
        # Single pass instead of testing every entry against the list of pun entries
        for ID in IDs:
            evaluation = evaluations[ID][eval_key]
            is_pun = 'pun_word' in evaluation
            is_hom = is_pun and 'hom' in ID
            is_het = is_pun and 'het' in ID
            if is_hom:
                hom_pun_evals.append(evaluation)
            if is_het:
                het_pun_evals.append(evaluation)
            if not (is_hom or is_het):
                # Add additional indicators of non-puns on a copy, so that the
                # caller's evaluation dict is left untouched
                rationality = evaluation['rationality']
                informativeness = evaluation['informativeness']
                R1I0 = 1 if (rationality == 1 and informativeness == 0) else 0
                R0I1 = 1 if (rationality == 0 and informativeness == 1) else 0
                non_pun_evals.append({**evaluation, 'R1I0':R1I0, 'R0I1':R0I1})
        summary[eval_key] = {'hom_pun': dict_union(*hom_pun_evals),
                             'het_pun': dict_union(*het_pun_evals),
                             'non_pun': dict_union(*non_pun_evals)}
        counts[eval_key] = {'hom_pun': len(hom_pun_evals),
                            'het_pun': len(het_pun_evals),
                            'non_pun': len(non_pun_evals)}
        # For homographic puns, the pun word and alternative word are the same
        # (skipped when there is no homographic pun, where the totals are empty)
        hom_totals = summary[eval_key]['hom_pun']
        if 'pun_word' in hom_totals:
            hom_totals['alter_word'] = hom_totals['pun_word']
    # Normalization
    for eval_key in summary:
        for type_key in summary[eval_key]:
            totals = summary[eval_key][type_key]
            num = counts[eval_key][type_key]
            # A negative total (e.g. from the -1 placeholders written by
            # evaluate_explanation_by_punchline) is reported as None
            summary[eval_key][type_key] = {k:(round(v/num,4) if v>=0 else None) for k,v in totals.items()}
    print(json.dumps(summary, indent=4))
    if save:
        save_json_file(summary, path)

### Pairwise Comparison

In [7]:
def evaluate_explanation_by_comparison(dataset, explanations, target1:str, target2:str, swap:bool=False, sample_fn=None,
                                       eval_model=None, examples:dict=None, batch_size:int=1, save:bool=False):
    """
    Coarse-grained pairwise comparison of the explanations of two targets
    Small batch manual eval to align the performance of model(gpt4) eval

    Without `eval_model`, a blank annotation sheet ('human_eval': {'winner': None}) is built
    for manual comparison. With `eval_model`, the model is prompted batch by batch and the
    parsed choice is stored per ID (1: target1 is better, 2: target2 is better, 0: similar
    quality or unparsable answer). IDs that already hold a result for the same evaluation key
    in the loaded record are skipped.

    :param dataset: dict mapping ID -> pun data ('human_text', 'pun_word', 'pun_sense',
                    'alter_word', 'alter_sense'); every evaluated entry must be a pun
    :param explanations: dict mapping ID -> explanations of the different targets
    :param target1: target whose explanation is shown as "Explanation 1" (unless swapped)
    :param target2: target whose explanation is shown as "Explanation 2" (unless swapped)
    :param swap: present the two explanations in exchanged order; the stored winner is
                 mapped back so that 1 still means target1
    :param sample_fn: optional function that receives the list of IDs and returns the subset to use
    :param eval_model: chat model exposing `model_name` and `batch`; None means manual evaluation
    :param examples: optional few-shot examples, dict mapping ID -> dict with 'text', 'pun_word',
                     'pun_sense', 'alter_word', 'alter_sense', 'explanation1', 'explanation2', 'choice'
    :param batch_size: number of prompts sent to the model per call
    :param save: whether to write the record to ./results (after every batch in model mode)
    :return: None
    """
    import ast  # only used to parse the model output without executing it

    def get_expl(explanation, target):
        # Get corresponding explanation
        if target == 'human':
            return explanation[f'{target}_explanation']
        return explanation[f'{target}_explanation def_true CoT_true examples_true']['biased_to_pun']

    def escape_braces(value):
        # The examples become part of the template string, so literal braces in them
        # must be doubled or they would be read as template variables
        return str(value).replace('{', '{{').replace('}', '}}')

    def parse_output(output:str, swap:bool=False):
        # Parse the output and get the choice
        try:
            # Keep only the span from the first '{' to the first '}'
            output = output[output.index('{'): output.index('}')+1]
        except ValueError:
            # No braces at all: fall back to searching the raw text
            pass
        try:
            # literal_eval only accepts literals; the model output is untrusted text
            # and must never be executed with eval()
            choice_str = ast.literal_eval(output)['Choice']
        except (ValueError, SyntaxError, TypeError, KeyError, IndexError, RecursionError, MemoryError):
            choice_str = output
        # Guard against a non-string "Choice" value before the substring tests
        choice_str = str(choice_str)
        if "Explanation 1 is much better" in choice_str:
            choice = 1
        elif "Explanation 2 is much better" in choice_str:
            choice = 2
        else:
            choice = 0
        # Map the choice back to the original order of the targets
        if swap and choice > 0:
            choice = 3 - choice
        return choice

    target_range = ['human','gpt-3.5-turbo-1106','gpt-4-1106-preview','gemini-pro','claude-3-opus-20240229',
                    'vicuna-7b-v1.5','llama-2-7b-chat','mistral-7b-instruct-v0.2','openchat-3.5-0106']
    assert target1 in target_range
    assert target2 in target_range
    # Path of the output file
    if eval_model is None:
        path = f'./results/pun_explanation_pairwise_comparison({target1}_vs_{target2}).json'
    elif sample_fn is None:
        path = './results/pun_explanation_pairwise_comparison.json'
    else:
        path = './results/pun_explanation_pairwise_comparison_pilot.json'
    record = load_json_file(path) if os.path.exists(path) else dict()
    # Whether to conduct sampling
    IDs = list(dataset.keys())
    if sample_fn is not None:
        IDs = sample_fn(IDs)

    # [A]. Evaluate by human
    if eval_model is None:
        for ID in tqdm(IDs):
            data = dataset[ID]
            assert data.get('pun_word',False) # Make sure it's a pun
            record[ID] = {'text': data['human_text'],
                          'pun_word': data['pun_word'],
                          'pun_sense': data['pun_sense'],
                          'alter_word': data['alter_word'],
                          'alter_sense': data['alter_sense'],
                          'explanation1': get_expl(explanations[ID], target1),
                          'explanation2': get_expl(explanations[ID], target2),
                          'human_eval':{'winner':None}}
        if save:
            save_json_file(record, path)
    # [B]. Evaluate by model (llm)
    else:
        # <B.a>. Construct the prompt
        definition = """<*Definition*>\nPuns are a form of wordplay exploiting different meanings of a word or similar-sounding words.\n\n"""
        instruction = """<*Instruction*>\nBelow is a pun text, double meanings of the pun and two corresponding explanations. Please carefully judge which explanation is of better quality. Any explanation that fails to indicate the correct pun, misses the potential phonetic similarity between pun-alternative word pair, misses a layer of correct meaning in the pun or contains other errors is a worse explanation. Meanwhile, explanations without the above errors are better explanations. To complete the task, you must cautiously choose from one of the three answers: "Explanation 1 is much better", "Explanation 2 is much better", "Explanation 1 and 2 are of similar quality". Additionally, You must output the current status in a parsable JSON format. An example output looks like:\n{{"Choice": "XXX"}}"""
        if examples is not None:
            examples_temp = []
            for ID in examples:
                example = examples[ID]
                examples_temp.append(f"Text: {escape_braces(example['text'])}\nDouble Meanings of the Pun: "
                                     f"1. pun word and its meaning: {escape_braces(example['pun_word'])} <{escape_braces(example['pun_sense'])}>. "
                                     f"2. alternative word and its meaning: {escape_braces(example['alter_word'])} <{escape_braces(example['alter_sense'])}>.\n"
                                     f"Explanation 1: {escape_braces(example['explanation1'])}\n"
                                     f"Explanation 2: {escape_braces(example['explanation2'])}\n"
                                     f"Output:\n{{{{\"Choice\": \"{escape_braces(example['choice'])}\"}}}}")
            examples_string = '\n\n<*Examples*>\n' + '\n\n'.join(examples_temp)
        else:
            examples_string = ''
        testing = "\n\n<*Your Response*>\nText: {text}\nDouble Meanings of the Pun: " \
                  "1. pun word and its meaning: {pun_word} <{pun_sense}>. " \
                  "2. alternative word and its meaning: {alter_word} <{alter_sense}>.\n" \
                  "Explanation 1: {expl1}\nExplanation 2: {expl2}\n" \
                  "Output:"
        prompt_string = definition + instruction + examples_string + testing
        chat_prompt = ChatPromptTemplate.from_template(prompt_string)
        # <B.b>. Call LLM to respond
        model_name = eval_model.model_name
        key_eval = f'{model_name}_eval {target1}_vs_{target2} swap' if swap else f'{model_name}_eval {target1}_vs_{target2}'
        # IDs that already carry a result for this evaluation key
        IDs_loaded = {ID for ID in record if record[ID].get(key_eval, False)}
        batch_starts = set(range(0, len(IDs), batch_size))
        # Iterate over every index so that the progress bar counts single items
        for ind in tqdm(range(len(IDs))):
            if ind not in batch_starts:
                continue
            # Remove the data that has already been evaluated (order of the IDs is kept)
            IDs_batch = [ID for ID in IDs[ind: ind+batch_size] if ID not in IDs_loaded]
            if len(IDs_batch) == 0:
                continue
            _inputs = []
            for ID in IDs_batch:
                # Get text and explanation
                data = dataset[ID]
                assert data.get('pun_word',False) # Make sure it's a pun
                expl1 = get_expl(explanations[ID], target1)
                expl2 = get_expl(explanations[ID], target2)
                # With swap, target2 is shown as "Explanation 1" and target1 as "Explanation 2"
                shown1, shown2 = (expl2, expl1) if swap else (expl1, expl2)
                _inputs.append(chat_prompt.format_messages(text=data['human_text'],
                                                           pun_word=data['pun_word'], pun_sense=data['pun_sense'],
                                                           alter_word=data['alter_word'], alter_sense=data['alter_sense'],
                                                           expl1=shown1, expl2=shown2))
            # call the llm
            _outputs = [out.content for out in eval_model.batch(_inputs)]
            for ID, out in zip(IDs_batch, _outputs):
                # 1 means target1 is better, 2 means target2 is better, 0 means model is not sure which is better,
                winner = parse_output(out, swap=swap)
                record.setdefault(ID, dict())[key_eval] = {'winner':winner}
            # Save after every batch so that finished batches survive an interruption
            if save:
                save_json_file(record, path)

In [8]:
def pairwise_comparison_integration(*file_paths, save:bool=False,
                                    path:str='pun_explanation_pairwise_comparison_pilot.json'):
    """
    Merge the results of pairwise comparison into one file (for manual pilot)

    A path containing both '(' and ')' is treated as a human evaluation file: its
    'human_eval' entries are stored under 'human_eval <text between the parentheses>'.
    Any other file is treated as a model evaluation file: the keys of its first ID are
    copied for every ID. The merged dict is written to `path` (prefixed with ./results/
    unless it already contains 'results') when `save` is True.
    Raises AssertionError when one of the input files does not exist.
    """
    out_path = path if 'results' in path else './results/' + path
    merged = dict()
    for file_path in file_paths:
        if not os.path.exists(file_path):
            raise AssertionError(f"{file_path} is not a valid file")
        records = load_json_file(file_path)
        if '(' in file_path and ')' in file_path:
            # Part of human eval in pilot
            evaluator = 'human_eval'
            targets = file_path.split('(')[-1].split(')')[0]
            column = f'{evaluator} {targets}'
            for ID, entry in records.items():
                merged.setdefault(ID, dict())[column] = entry[evaluator]
        else:
            # Part of model eval in pilot: the first ID decides which keys are copied
            first_ID = list(records.keys())[0]
            for evaluator in list(records[first_ID].keys()):
                for ID in records:
                    merged.setdefault(ID, dict())[evaluator] = records[ID][evaluator]
    if save:
        save_json_file(merged, out_path)

In [9]:
def pairwise_comparison_summary(evaluations, is_pilot:bool=False, save:bool=False):
    """
    Calculate the various indicators of pairwise comparison
    Pilot: consistency between model and human and consistency after exchanging explanations
    Full-scale: average winning rate

    :param evaluations: dict mapping ID -> {evaluation key -> {'winner': 0/1/2}}; the keys of
                        the first ID are used for every ID
    :param is_pilot: True to compute the pilot metrics (needs 'human_eval <target>',
                     '<evaluator> <target>' and '<evaluator> <target> swap' keys),
                     False to compute the winning rates per pun type
    :param save: whether to write the metrics to the corresponding file in ./results
    :return: None, the metrics are printed as json (and optionally saved)

    In full-scale mode a pun type without any ID is reported as an empty dict.
    """
    IDs = list(evaluations.keys())
    summary = dict()
    if is_pilot:
        # Pilot
        path = r'./results/pun_explanation_pairwise_comparison_pilot_metrics.json'
        eval_keys = list(evaluations[IDs[0]].keys())
        evaluators = {eval_key.split(' ')[0] for eval_key in eval_keys} - {'human_eval'}
        targets = {eval_key.split(' ')[1] for eval_key in eval_keys}
        # Pooled choices over all targets and evaluators, used for the correlations
        human_all, model_all = [], []
        for target in targets:
            summary.setdefault(target, dict())
            # NOTE(review): with more than one evaluator, the metrics of a target are
            # overwritten by the evaluator that is processed last
            for evaluator in evaluators:
                human_eval = [evaluations[ID][f'human_eval {target}']['winner'] for ID in IDs]
                model_eval = [evaluations[ID][f'{evaluator} {target}']['winner'] for ID in IDs]
                model_eval_swap = [evaluations[ID][f'{evaluator} {target} swap']['winner'] for ID in IDs]
                # Consistency with human
                CWH = sum([h==m for h,m in zip(human_eval, model_eval)])/len(human_eval)
                summary[target]['consistency_with_human'] = round(CWH, 4)
                # Consistency between swap
                CBS = sum([m1==m2 for m1,m2 in zip(model_eval, model_eval_swap)])/len(model_eval)
                summary[target]['consistency_between_swap'] = round(CBS, 4)
                # Human and model agreement
                human_all.extend(human_eval)
                model_all.extend(model_eval)
        # Various correlation coefficients
        pearson_corr, p_value_pearson = stats.pearsonr(human_all, model_all)
        spearman_corr, p_value_spearman = stats.spearmanr(human_all, model_all)
        kendall_tau, p_value_kendall = stats.kendalltau(human_all, model_all)
        print(f'pearson_corr:{pearson_corr}, p_value_pearson:{p_value_pearson}\n'
              f'spearman_corr:{spearman_corr}, p_value_spearman:{p_value_spearman}\n'
              f'kendall_tau:{kendall_tau}, p_value_kendall:{p_value_kendall}')
    else:
        # Full-scale
        path = r'./results/pun_explanation_pairwise_comparison_metrics.json'
        winner = {0:"tie", 1:"human_win", 2:"model_win", }
        # Empty input yields an empty summary instead of an IndexError
        targets = list(evaluations[IDs[0]].keys()) if IDs else []
        for target in targets:
            summary[target] = dict()
            hom_eval = [evaluations[ID][target]['winner'] for ID in IDs if 'hom' in ID]
            het_eval = [evaluations[ID][target]['winner'] for ID in IDs if 'het' in ID]
            for puntype, outcomes in (('hom', hom_eval), ('het', het_eval)):
                total = len(outcomes)
                counter = Counter(outcomes)
                # A pun type without data gets an empty dict instead of a division by zero
                summary[target][puntype] = ({w:round(counter[i]/total,4) for i,w in winner.items()}
                                            if total else dict())
    print(json.dumps(summary, indent=4))
    if save:
        save_json_file(summary, path)

## Dataset and Examples

In [10]:
# Locations of the explanation results and of the two datasets
explanation_path = r'./results/pun_explanation.json'
hom_path = r'./dataset/hom_dataset.json'
het_path = r'./dataset/het_dataset.json'

# Load the explanations and split each dataset into its pun / non-pun part
pun_explanation = load_json_file(explanation_path)
hom_punDataset, hom_nonpunDataset = splitDataset(hom_path, puntype='hom')
het_punDataset, het_nonpunDataset = splitDataset(het_path, puntype='het')

In [11]:
# Manually chosen homographic examples, one per possible choice.
# Each entry holds the fields that evaluate_explanation_by_comparison reads from its
# `examples` argument: text, pun_word, pun_sense, alter_word, alter_sense,
# explanation1, explanation2 and choice (presumably passed there as few-shot examples).
hom_examples = {
    "hom_1404":{"text":"He had trouble in his new job making tents and got himself into a flap .",
                "pun_word": "flap",
                "pun_sense": "an excited state of agitation",
                "alter_word": "flap",
                "alter_sense": "any broad thin and limber covering attached at one edge; hangs loose or projects freely",
                "explanation1":"To get in a flap means to get upset, or unhappy about something. Flaps can also refer to cloth that is hinged on one side that cover an opening. A tent is a shelter made from fabric, and has flaps. This joke is playing on the word 'flaps' and its different meanings.",
                "explanation2":"The text exploits the double meaning of 'flap,' which can refer to both a problem or commotion and a part of a tent. It plays on the ambiguity of the word in the context of the job, creating a pun.",
                "choice":"Explanation 1 is much better"},
    "hom_1162":{"text":"Old carpenters never die , they just lumber around .",
                "pun_word": "lumber",
                "pun_sense": "move heavily or clumsily",
                "alter_word": "lumber",
                "alter_sense": "cut lumber, as in woods and forests",
                "explanation1":"Carpenters make and repair wooden objects and structures. Lumber is a type of wood. Lumber also means to move in a slow, heavy, awkward way. Carpenters lumber around.",
                "explanation2":"The text is a play on the phrase 'Old [people] never die, they just...' where the second part humorously reinterprets the expected outcome. In this case, 'lumber around' plays on the idea of old carpenters working with lumber while also suggesting a slow or clumsy movement, creating a pun on the expected phrase.",
                "choice":"Explanation 2 is much better"},
    "hom_705":{"text":"Driving on so many turnpikes was taking its toll .",
               "pun_word": "toll",
               "pun_sense": "a fee levied for the use of roads or bridges (used for maintenance)",
               "alter_word": "toll",
               "alter_sense": "value measured by what must be given or done or undergone to obtain something",
               "explanation1":"When someone is 'taking its toll' it means that something is having a bad effect on someone. A toll is a charge payable for permission to use a particular bridge or road. The word 'toll' is being used in two different ways here.",
               "explanation2":"The text is using the word 'toll' in a double entendre. It refers both to the physical tolls paid on turnpikes and to 'taking its toll' as in having a negative effect or cost.",
               "choice":"Explanation 1 and 2 are of similar quality"}
}

# Manually chosen heterographic examples (pun word and alternative word differ),
# one per possible choice; same fields as read from the `examples` argument of
# evaluate_explanation_by_comparison.
het_examples = {
    "het_1453":{"text":"The whistling fisherman was always out of tuna .",
                "pun_word": "tuna",
                "pun_sense": "any very large marine food and game fish of the genus Thunnus; related to mackerel; chiefly of warm waters",
                "alter_word": "tune",
                "alter_sense": "the property of producing accurately a note of a given pitch",
                "explanation1":"The joke is a play on words. The word 'tuna' sounds like 'tune'.  The fisherman who whistles is always 'out of tune' and the word tune was replaced with a word for a type of fish.",
                "explanation2":"The text exploits the double meaning of 'tuna' - it can refer to both the fish and the musical instrument. This creates a play on words.",
                "choice":"Explanation 1 is much better"},
    "het_1774":{"text":"In a billiard hall life can be pretty rough at the wrong end of the queue .",
                "pun_word": "queue",
                "pun_sense": "a line of people or vehicles waiting for something",
                "alter_word": "cue",
                "alter_sense": "sports implement consisting of a tapering rod used to strike a cue ball in pool or billiards",
                "explanation1":"This is a play on words. The word 'queue' means a line of people waiting for something but a 'queue' is also a term for equipment used in billiards.",
                "explanation2":"The text plays on the homophones 'queue' and 'cue', creating a humorous double meaning. 'Queue' refers to the waiting line, while 'cue' is a stick used in billiards. The phrase 'wrong end of the queue' is a common expression, but the play on words adds a comedic twist.",
               "choice":"Explanation 2 is much better"},
    "het_530":{"text":"A tangled bell ringer tolled himself off .",
               "pun_word": "toll",
               "pun_sense": "ring slowly",
               "alter_word": "tell off",
               "alter_sense": "reprimand",
               "explanation1":"The joke is a pun. A bell can be 'tolled' by ringing it.  'Tolled' sounds like 'told' which makes it sound like the bell ringer told himself off, or scolded himself.",
               "explanation2":"The text plays on the homophones 'tolled' and 'told', using the word 'tolled' in the context of a bell ringer (which relates to the ringing or tolling of bells) and 'told' as in scolding oneself (told sb off). This creates a humorous double meaning.",
               "choice":"Explanation 1 and 2 are of similar quality"},
}

## Evaluate Explanation

### Punchline Check

#### gpt3.5

In [9]:
# Build the punchline check sheet of gpt-3.5: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'gpt-3.5-turbo-1106'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### gpt4

In [10]:
# Build the punchline check sheet of gpt-4: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'gpt-4-1106-preview'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### gemini-pro

In [11]:
# Build the punchline check sheet of gemini-pro: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'gemini-pro'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### claude3

In [12]:
# Build the punchline check sheet of claude-3-opus: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'claude-3-opus-20240229'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### vicuna

In [13]:
# Build the punchline check sheet of vicuna: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'vicuna-7b-v1.5'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### llama2

In [14]:
# Build the punchline check sheet of llama-2: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'llama-2-7b-chat'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### mistral

In [15]:
# Build the punchline check sheet of mistral: every subset is sampled with
# ids_sampling and written into the same result file of this target
target = 'mistral-7b-instruct-v0.2'
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target, ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### openchat

In [16]:
target = 'openchat-3.5-0106'
# Punchline check of this target's explanations on every split, in fixed order:
# hom pun, hom non-pun, het pun, het non-pun. IDs are chosen via ids_sampling.
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target,
                                      sample_fn=ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### human

In [17]:
target = 'human'
# Punchline check of the human-written explanations on every split, in fixed order:
# hom pun, hom non-pun, het pun, het non-pun. IDs are chosen via ids_sampling.
for _subset in (hom_punDataset, hom_nonpunDataset, het_punDataset, het_nonpunDataset):
    evaluate_explanation_by_punchline(_subset, pun_explanation, target,
                                      sample_fn=ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


#### *summary

In [12]:
# Per-target punchline-check result files, one JSON per explanation source
# (the source name is the part in parentheses).
punchline_check_human = r'./results/human_eval/pun_explanation_punchline_check(human).json'
punchline_check_gpt4 = r'./results/human_eval/pun_explanation_punchline_check(gpt-4-1106-preview).json'
punchline_check_gpt35 = r'./results/human_eval/pun_explanation_punchline_check(gpt-3.5-turbo-1106).json'
punchline_check_gemini = r'./results/human_eval/pun_explanation_punchline_check(gemini-pro).json'
punchline_check_claude3 = r'./results/human_eval/pun_explanation_punchline_check(claude-3-opus-20240229).json'
punchline_check_vicuna = r'./results/human_eval/pun_explanation_punchline_check(vicuna-7b-v1.5).json'
punchline_check_llama2 = r'./results/human_eval/pun_explanation_punchline_check(llama-2-7b-chat).json'
punchline_check_mistral = r'./results/human_eval/pun_explanation_punchline_check(mistral-7b-instruct-v0.2).json'
punchline_check_openchat = r'./results/human_eval/pun_explanation_punchline_check(openchat-3.5-0106).json'
# File that the summary step below loads.
path=r'./results/pun_explanation_punchline_check.json'

# Integration step, currently disabled. NOTE(review): presumably it merges the per-target
# files above into `path`; re-enable it if `path` is missing or stale -- confirm.
# punchline_check_integration(punchline_check_human, punchline_check_gpt35, punchline_check_gpt4, punchline_check_gemini,
#                             punchline_check_vicuna, punchline_check_llama2, punchline_check_mistral, punchline_check_openchat,
#                             punchline_check_claude3, save=True, path=path)
# Summarize the evaluations stored at `path`; the per-target scores are shown in the cell output.
punchline_check_summary(evaluations=load_json_file(path), save=True)

{
    "human_eval human_explanation": {
        "hom_pun": {
            "pun_word": 0.95,
            "alter_word": 0.95,
            "pun_sense": 0.95,
            "alter_sense": 0.95
        },
        "het_pun": {
            "pun_word": 0.97,
            "alter_word": 0.97,
            "pun_sense": 0.94,
            "alter_sense": 0.93
        },
        "non_pun": {
            "rationality": null,
            "informativeness": null,
            "R1I0": 0.0,
            "R0I1": 0.0
        }
    },
    "human_eval gpt-3.5-turbo-1106_explanation": {
        "hom_pun": {
            "pun_word": 0.88,
            "alter_word": 0.88,
            "pun_sense": 0.81,
            "alter_sense": 0.81
        },
        "het_pun": {
            "pun_word": 0.91,
            "alter_word": 0.55,
            "pun_sense": 0.82,
            "alter_sense": 0.57
        },
        "non_pun": {
            "rationality": 0.51,
            "informativeness": 0.65,
            "R1I0": 0.05,
       

### By comparison

In [12]:
# Build the ChatOpenAI client (gpt-4-0613) that later cells pass as `eval_model`.
gpt4_name = 'gpt-4-0613'
temperature = 0.0
# Supply your own key through _api_key.get_openai_api_key.
openai_api_key = get_openai_api_key()
gpt4 = ChatOpenAI(
    model_name=gpt4_name,
    temperature=temperature,
    openai_api_key=openai_api_key,
    request_timeout=120,
)

#### pilot (small batch)

In [12]:
# Pilot pairwise comparison evaluated by human (no eval_model is passed):
# human explanations vs. each of the two GPT targets, on the hom and het pun
# subsets, with IDs chosen via ids_sampling.
target1 = 'human'
for target2 in ('gpt-3.5-turbo-1106', 'gpt-4-1106-preview'):
    for _subset in (hom_punDataset, het_punDataset):
        evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                           sample_fn=ids_sampling, save=True)

100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


In [11]:
# Pilot pairwise comparison evaluated by the model (gpt4).
# In our early experiments the evaluator performed almost the same under 0-shot and
# 3-shot conditions, while 0-shot saves about half the tokens.
target1 = 'human'
for target2 in ('gpt-3.5-turbo-1106', 'gpt-4-1106-preview'):
    for _subset in (hom_punDataset, het_punDataset):
        # Each subset is evaluated twice: first without `swap`, then with swap=True.
        evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                           sample_fn=ids_sampling,
                                           eval_model=gpt4, batch_size=10, save=True)
        evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                           swap=True, sample_fn=ids_sampling,
                                           eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 100/100 [00:19<00:00,  5.21it/s]
100%|██████████| 100/100 [00:18<00:00,  5.37it/s]
100%|██████████| 100/100 [00:18<00:00,  5.35it/s]
100%|██████████| 100/100 [00:18<00:00,  5.38it/s]
100%|██████████| 100/100 [00:20<00:00,  4.78it/s]
100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
100%|██████████| 100/100 [00:18<00:00,  5.41it/s]
100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


In [10]:
# Pilot summary: summarize the pilot pairwise comparisons and compute consistency
# (correlation statistics and consistency scores are shown in the cell output).
# Human-evaluated pilot files, one per compared model.
pairwise_comparison_human1 = r'./results/human_eval/pun_explanation_pairwise_comparison(human_vs_gpt-3.5-turbo-1106).json'
pairwise_comparison_human2 = r'./results/human_eval/pun_explanation_pairwise_comparison(human_vs_gpt-4-1106-preview).json'
# NOTE(review): same filename as `path` below but under human_eval/ -- confirm this is
# really where the gpt4-evaluated pilot results are stored.
pairwise_comparison_gpt4 = r'./results/human_eval/pun_explanation_pairwise_comparison_pilot.json'
# File that the summary step below loads.
path=r'./results/pun_explanation_pairwise_comparison_pilot.json'

# Integration step, currently disabled. NOTE(review): presumably it merges the three
# files above into `path`; re-enable it if `path` is missing or stale -- confirm.
# pairwise_comparison_integration(pairwise_comparison_human1,pairwise_comparison_human2,pairwise_comparison_gpt4,
#                                 save=True, path=path)
pairwise_comparison_summary(evaluations=load_json_file(path), is_pilot=True, save=True)

pearson_corr:0.6874723368340706, p_value_pearson:2.925280119706293e-57
spearman_corr:0.7179691699112483, p_value_spearman:1.3233771965771166e-64
kendall_tau:0.7072809840255064, p_value_kendall:4.6518082528801396e-52
{
    "human_vs_gpt-4-1106-preview": {
        "consistency_with_human": 0.87,
        "consistency_between_swap": 0.855
    },
    "human_vs_gpt-3.5-turbo-1106": {
        "consistency_with_human": 0.895,
        "consistency_between_swap": 0.825
    }
}


#### human vs gpt3.5

In [18]:
# Pairwise comparison, human vs. gpt-3.5 explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'gpt-3.5-turbo-1106'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 50437.01it/s]
100%|██████████| 647/647 [00:00<00:00, 40531.03it/s]


#### human vs gpt4

In [19]:
# Pairwise comparison, human vs. gpt-4-1106-preview explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'gpt-4-1106-preview'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 51356.50it/s]
100%|██████████| 647/647 [00:00<?, ?it/s]


#### human vs gemini-pro

In [20]:
# Pairwise comparison, human vs. gemini-pro explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'gemini-pro'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 51897.02it/s]
100%|██████████| 647/647 [00:00<00:00, 46334.43it/s]


#### human vs claude3

In [20]:
# Pairwise comparison, human vs. claude-3-opus explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'claude-3-opus-20240229'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [02:44<00:00,  4.93it/s]
100%|██████████| 647/647 [02:05<00:00,  5.16it/s]


#### human vs vicuna

In [21]:
# Pairwise comparison, human vs. vicuna explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'vicuna-7b-v1.5'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 90092.45it/s]
100%|██████████| 647/647 [00:00<00:00, 41036.06it/s]


#### human vs llama2

In [22]:
# Pairwise comparison, human vs. llama-2 explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'llama-2-7b-chat'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 51750.77it/s]
100%|██████████| 647/647 [00:00<00:00, 85806.45it/s]


#### human vs mistral

In [23]:
# Pairwise comparison, human vs. mistral explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'mistral-7b-instruct-v0.2'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 296016.92it/s]
100%|██████████| 647/647 [00:00<?, ?it/s]


#### human vs openchat

In [24]:
# Pairwise comparison, human vs. openchat explanations, with gpt4 as eval_model.
# No sample_fn is passed here; the hom subset runs first, then het.
target1, target2 = 'human', 'openchat-3.5-0106'
for _subset in (hom_punDataset, het_punDataset):
    evaluate_explanation_by_comparison(_subset, pun_explanation, target1, target2,
                                       eval_model=gpt4, batch_size=10, save=True)

100%|██████████| 810/810 [00:00<00:00, 39236.22it/s]
100%|██████████| 647/647 [00:00<?, ?it/s]


#### *summary

In [12]:
# Load the pairwise-comparison evaluations from `path` and summarize them; the per-pairing
# tie / human_win / model_win rates for hom and het are shown in the cell output.
path = r'./results/pun_explanation_pairwise_comparison.json'
pairwise_comparison_summary(load_json_file(path), save=True)

{
    "gpt-4-0613_eval human_vs_gpt-3.5-turbo-1106": {
        "hom": {
            "tie": 0.6679,
            "human_win": 0.1716,
            "model_win": 0.1605
        },
        "het": {
            "tie": 0.4451,
            "human_win": 0.442,
            "model_win": 0.1128
        }
    },
    "gpt-4-0613_eval human_vs_gpt-4-1106-preview": {
        "hom": {
            "tie": 0.7691,
            "human_win": 0.0173,
            "model_win": 0.2136
        },
        "het": {
            "tie": 0.6847,
            "human_win": 0.0866,
            "model_win": 0.2287
        }
    },
    "gpt-4-0613_eval human_vs_gemini-pro": {
        "hom": {
            "tie": 0.6531,
            "human_win": 0.1654,
            "model_win": 0.1815
        },
        "het": {
            "tie": 0.4173,
            "human_win": 0.4606,
            "model_win": 0.1221
        }
    },
    "gpt-4-0613_eval human_vs_vicuna-7b-v1.5": {
        "hom": {
            "tie": 0.3506,
            "huma