# Evaluation

## Load Packages

In [1]:
import os
import json
from sklearn.metrics import cohen_kappa_score
import scipy.stats as stats

## Basic Functions

In [2]:
def load_json_file(file_path):
    """
    Load json file

    Args:
        file_path: path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON content (whatever `json.load` produces).
    """
    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json_file(file, file_path):
    """
    Save json file

    Args:
        file: JSON-serializable object to write.
        file_path: destination path; an existing file is overwritten.

    The output is UTF-8 encoded, indented by 4 spaces, and non-ASCII
    characters are written as-is (ensure_ascii=False).
    """
    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(file, f, indent=4, ensure_ascii=False)

## Function for Evaluating Recognition

In [3]:
def _recognition_rate(hits, total):
    """
    Return hits / total rounded to 4 decimals, or None when total is 0
    """
    return round(hits / total, 4) if total else None


def _tpr_tnr(gold, pred):
    """
    Return {'TPR': ..., 'TNR': ...} of pred against gold (1 = pun, 0 = non-pun)
    """
    size_pun = sum(gold)
    size_nonpun = len(gold) - size_pun
    true_pos = sum(1 for g, p in zip(gold, pred) if g == 1 and g == p)
    true_neg = sum(1 for g, p in zip(gold, pred) if g == 0 and g == p)
    return {'TPR': _recognition_rate(true_pos, size_pun),
            'TNR': _recognition_rate(true_neg, size_nonpun)}


def evaluate_recognition(recognitions, limit_to:str=None, save:bool=False, path='pun_recognition_metrics.json'):
    """
    Calculate the metrics of pun recognition

    For every judge key (each key of an entry other than 'human_judge') and
    each pun type ('hom', 'het'), compute the true-positive rate (TPR) and
    true-negative rate (TNR) of the 'biased_to_pun' and 'biased_to_non-pun'
    predictions against the 'human_judge' label, plus Cohen's kappa between
    the two predictions. The metrics are printed as JSON and, if requested,
    merged into a JSON file under './results/'.

    Args:
        recognitions: dict mapping an example ID to a dict that holds the gold
            label under 'human_judge' and, per judge key, a dict with the keys
            'biased_to_pun' and 'biased_to_non-pun'. Labels are compared with
            1 (pun) and 0 (non-pun). An ID belongs to a pun type when the type
            name is a substring of the ID. The judge keys are taken from the
            first entry only.
        limit_to: if given, only judge keys containing this substring are
            evaluated.
        save: if True, merge the metrics into './results/' + path, keeping
            entries of other judge keys already stored in that file.
        path: file name (relative to './results/') used when save is True.

    Returns:
        None. A rate whose denominator is zero (no pun / no non-pun examples)
        is reported as None (JSON null); a pun type without any matching ID
        is omitted from the record.
    """
    path = './results/' + path
    record = dict()
    puntype = ['hom', 'het']
    IDs = list(recognitions.keys())
    gold_key = 'human_judge'
    # Empty input has no first entry to read the judge keys from.
    eval_keys = [key for key in recognitions[IDs[0]].keys() if key != gold_key] if IDs else []
    if limit_to is not None:
        eval_keys = [eval_key for eval_key in eval_keys if limit_to in eval_key]
    # The per-type ID lists and gold labels do not depend on the judge key.
    IDs_by_type = {pt: [ID for ID in IDs if pt in ID] for pt in puntype}
    gold_by_type = {pt: [recognitions[ID][gold_key] for ID in IDs_by_type[pt]] for pt in puntype}
    # Metrics: TPR, TNR, Kappa
    for eval_key in eval_keys:
        for pt in puntype:
            IDs_pt = IDs_by_type[pt]
            if not IDs_pt:
                # Nothing to score for this pun type.
                continue
            gold = gold_by_type[pt]
            pred1 = [recognitions[ID][eval_key]['biased_to_pun'] for ID in IDs_pt]
            pred2 = [recognitions[ID][eval_key]['biased_to_non-pun'] for ID in IDs_pt]
            sub_rec = dict()
            sub_rec['biased_to_pun'] = _tpr_tnr(gold, pred1)
            sub_rec['biased_to_non-pun'] = _tpr_tnr(gold, pred2)
            # Kappa measures agreement between the two prompt biases, not with gold.
            sub_rec['Kappa'] = round(cohen_kappa_score(pred1, pred2), 4)
            record.setdefault(eval_key, dict())[pt] = sub_rec
    print(json.dumps(record, indent=4))
    if save:
        # Make sure the output directory exists before reading / writing.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if os.path.exists(path):
            total_record = load_json_file(path)
        else:
            total_record = dict()
        total_record.update(record)
        save_json_file(total_record, path)

## Dataset

In [4]:
# Load the pun-recognition records that evaluate_recognition() scores in the cells below.
recognition_path = r'./results/pun_recognition.json'
pun_recognition = load_json_file(recognition_path)

## Evaluate recognition

### gpt3.5

In [5]:
# Score only the judge keys containing 'gpt-3.5'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='gpt-3.5', save=True)

{
    "gpt-3.5-turbo-1106_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9901,
                "TNR": 0.2243
            },
            "biased_to_non-pun": {
                "TPR": 0.8531,
                "TNR": 0.7346
            },
            "Kappa": 0.2915
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.9768,
                "TNR": 0.2625
            },
            "biased_to_non-pun": {
                "TPR": 0.8284,
                "TNR": 0.7295
            },
            "Kappa": 0.342
        }
    },
    "gpt-3.5-turbo-1106_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9741,
                "TNR": 0.4882
            },
            "biased_to_non-pun": {
                "TPR": 0.837,
                "TNR": 0.8483
            },
            "Kappa": 0.5108
        },
        "het": {
            "biase

### gpt4

In [6]:
# Score only the judge keys containing 'gpt-4'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='gpt-4', save=True)

{
    "gpt-4-1106-preview_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9877,
                "TNR": 0.6303
            },
            "biased_to_non-pun": {
                "TPR": 0.9852,
                "TNR": 0.684
            },
            "Kappa": 0.8936
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.9598,
                "TNR": 0.6212
            },
            "biased_to_non-pun": {
                "TPR": 0.9397,
                "TNR": 0.6693
            },
            "Kappa": 0.8837
        }
    },
    "gpt-4-1106-preview_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9716,
                "TNR": 0.8404
            },
            "biased_to_non-pun": {
                "TPR": 0.9753,
                "TNR": 0.8373
            },
            "Kappa": 0.9487
        },
        "het": {
            "bias

### gemini(pro)

In [7]:
# Score only the judge keys containing 'gemini-pro'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='gemini-pro', save=True)

{
    "gemini-pro_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9975,
                "TNR": 0.1659
            },
            "biased_to_non-pun": {
                "TPR": 0.9494,
                "TNR": 0.6714
            },
            "Kappa": 0.2875
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.983,
                "TNR": 0.1924
            },
            "biased_to_non-pun": {
                "TPR": 0.8501,
                "TNR": 0.6593
            },
            "Kappa": 0.2965
        }
    },
    "gemini-pro_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9951,
                "TNR": 0.3207
            },
            "biased_to_non-pun": {
                "TPR": 0.8494,
                "TNR": 0.8863
            },
            "Kappa": 0.3142
        },
        "het": {
            "biased_to_pun": {
  

### claude3

In [8]:
# Score only the judge keys containing 'claude-3-opus'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='claude-3-opus', save=True)

{
    "claude-3-opus-20240229_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9889,
                "TNR": 0.624
            },
            "biased_to_non-pun": {
                "TPR": 0.9778,
                "TNR": 0.733
            },
            "Kappa": 0.8669
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.9691,
                "TNR": 0.6132
            },
            "biased_to_non-pun": {
                "TPR": 0.932,
                "TNR": 0.7094
            },
            "Kappa": 0.8392
        }
    },
    "claude-3-opus-20240229_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.984,
                "TNR": 0.7172
            },
            "biased_to_non-pun": {
                "TPR": 0.9679,
                "TNR": 0.8341
            },
            "Kappa": 0.8656
        },
        "het": {
            

### vicuna

In [9]:
# Score only the judge keys containing 'vicuna'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='vicuna', save=True)

{
    "vicuna-7b-v1.5_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.984,
                "TNR": 0.0284
            },
            "biased_to_non-pun": {
                "TPR": 0.6852,
                "TNR": 0.4044
            },
            "Kappa": 0.077
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.9969,
                "TNR": 0.024
            },
            "biased_to_non-pun": {
                "TPR": 0.8022,
                "TNR": 0.4429
            },
            "Kappa": 0.0549
        }
    },
    "vicuna-7b-v1.5_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9988,
                "TNR": 0.0063
            },
            "biased_to_non-pun": {
                "TPR": 0.9963,
                "TNR": 0.0348
            },
            "Kappa": 0.3187
        },
        "het": {
            "biased_to_pun"

### llama2

In [10]:
# Score only the judge keys containing 'llama-2'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='llama-2', save=True)

{
    "llama-2-7b-chat_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9926,
                "TNR": 0.049
            },
            "biased_to_non-pun": {
                "TPR": 0.8642,
                "TNR": 0.3428
            },
            "Kappa": 0.1481
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.9845,
                "TNR": 0.0421
            },
            "biased_to_non-pun": {
                "TPR": 0.9011,
                "TNR": 0.3647
            },
            "Kappa": 0.1734
        }
    },
    "llama-2-7b-chat_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9938,
                "TNR": 0.06
            },
            "biased_to_non-pun": {
                "TPR": 0.7802,
                "TNR": 0.4265
            },
            "Kappa": 0.1305
        },
        "het": {
            "biased_to_pu

### mistral

In [11]:
# Score only the judge keys containing 'mistral'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='mistral', save=True)

{
    "mistral-7b-instruct-v0.2_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.8667,
                "TNR": 0.2085
            },
            "biased_to_non-pun": {
                "TPR": 0.3333,
                "TNR": 0.7488
            },
            "Kappa": 0.1557
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.8733,
                "TNR": 0.2024
            },
            "biased_to_non-pun": {
                "TPR": 0.4312,
                "TNR": 0.7876
            },
            "Kappa": 0.1754
        }
    },
    "mistral-7b-instruct-v0.2_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.542,
                "TNR": 0.722
            },
            "biased_to_non-pun": {
                "TPR": 0.3691,
                "TNR": 0.8468
            },
            "Kappa": 0.657
        },
        "het": {
       

### openchat

In [12]:
# Score only the judge keys containing 'openchat'; save=True merges the metrics
# into ./results/pun_recognition_metrics.json.
evaluate_recognition(recognitions=pun_recognition, limit_to='openchat', save=True)

{
    "openchat-3.5-0106_judge def_false CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.9481,
                "TNR": 0.3681
            },
            "biased_to_non-pun": {
                "TPR": 0.8753,
                "TNR": 0.4882
            },
            "Kappa": 0.722
        },
        "het": {
            "biased_to_pun": {
                "TPR": 0.9304,
                "TNR": 0.3788
            },
            "biased_to_non-pun": {
                "TPR": 0.8624,
                "TNR": 0.499
            },
            "Kappa": 0.7419
        }
    },
    "openchat-3.5-0106_judge def_true CoT_false examples_false": {
        "hom": {
            "biased_to_pun": {
                "TPR": 0.8679,
                "TNR": 0.6556
            },
            "biased_to_non-pun": {
                "TPR": 0.8198,
                "TNR": 0.6588
            },
            "Kappa": 0.8864
        },
        "het": {
            "biased_