# Recognition & Explanation Task

## Load Package

In [1]:
import os
import json
from tqdm import tqdm
from random import shuffle
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import google.generativeai as genai
from langchain_anthropic import ChatAnthropic
from _api_key import get_openai_api_key, get_google_api_key, get_claude_api_key

## Basic Function

In [2]:
def load_json_file(file_path):
    """
    Read the UTF-8 encoded JSON file at `file_path` and return the parsed content
    """
    # The context manager closes the handle when the block is left
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = json.load(handle)
    return content

def save_json_file(file, file_path):
    """
    Write `file` to `file_path` as UTF-8 JSON (4-space indent, non-ASCII characters kept as-is)
    """
    # The context manager flushes and closes the handle when the block is left
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(file, handle, ensure_ascii=False, indent=4)

In [3]:
def add_human_performance(whole_dataset, save:bool=False, path_rec='pun_recognition.json',
                         path_expl='pun_explanation.json'):
    """
    Add human performance (gold of recognition & explanation)

    For every ID in `whole_dataset` the gold label is stored under 'human_judge'
    (1 when the record has a truthy 'pun_word', otherwise 0) and the gold explanation
    under 'human_explanation' (the record's own 'human_explanation' for puns, the
    string 'None' for non-puns). Existing result files under ./results/ are loaded
    and extended; they are only written back when `save` is True.
    """
    path_rec = './results/' + path_rec
    path_expl = './results/' + path_expl
    # Continue from previously stored results when the files are already there
    pun_recognition = load_json_file(path_rec) if os.path.exists(path_rec) else dict()
    pun_explanation = load_json_file(path_expl) if os.path.exists(path_expl) else dict()
    for ID, data in whole_dataset.items():
        # A truthy 'pun_word' marks the record as a pun
        if data.get('pun_word', False):
            judge, explanation = 1, data['human_explanation']
        else:
            judge, explanation = 0, 'None'
        # Create the per-ID entry on first sight, otherwise extend the existing one
        pun_recognition.setdefault(ID, {}).update({'human_judge': judge})
        pun_explanation.setdefault(ID, {}).update({'human_explanation': explanation})
    if save:
        save_json_file(pun_recognition, path_rec)
        save_json_file(pun_explanation, path_expl)

## Function of recognition


In [4]:
def call_llm_to_recognize(model, dataset, model_name:str=None, add_def:bool=False, add_CoT:bool=False, add_examples:dict=None,
                          save:bool=False, path_rec='pun_recognition1.json', path_expl='pun_explanation1.json',
                          batch_size:int=1):
    """
    Pun recognition task, with a focus on evaluating accuracy (TPR, TNR) and consistency (kappa).

    Every text is queried twice, once with the instruction phrased towards "pun" and once
    towards "non-pun"; both judgements are stored under 'biased_to_pun' / 'biased_to_non-pun'.
    Possible parts of the prompt: definition, instruction, examples, testing.
    With CoT enabled, the 'Reason' part of each answer is collected as the explanation
    of the text (pun/non-pun explanation).

    Parameters:
        model: chat model. When 'gemini' occurs in the model name it is called through
            `model.generate_content(prompt).text`, one prompt at a time; otherwise through
            `model.batch(list_of_messages)` and the `.content` of each reply is used.
        dataset: dict mapping ID -> record; the text is read from record['human_text'].
        model_name: name used in the result keys. When None it is taken from
            `model.model_name` (or `model.model`), keeping only the part after the last '/'.
        add_def: prepend the pun/non-pun definition to the prompt.
        add_CoT: ask for a reason before the decision and also record the explanations.
        add_examples: optional dict of few-shot examples, each with 'text', 'label' and
            (used only with add_CoT) 'reason'.
        save: write the records back to disk after every processed batch.
        path_rec, path_expl: file names (inside ./results/) of the recognition and
            explanation records. Existing files are loaded, and IDs that already hold a
            result for the current setting are skipped.
        batch_size: number of texts sent per batch; must be >= 1.

    Judgements are 1 (pun), 0 (non-pun) or -1 (no decision could be parsed).
    Returns None; results live in the record files.
    """
    import ast  # local import: only needed by the literal-parsing fallback below

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    failed_expl = "No correctly parsed result."

    def parse_output(ID, output:str, with_expl:bool):
        # Parse one model reply into a judgement (and the explanation when with_expl).
        # Keep only the first {...} span when there is one; otherwise use the reply as is.
        try:
            output = output[output.index('{'): output.index('}')+1]
        except Exception:
            pass
        # The reply is untrusted model output, so it is never passed to eval().
        # Strict JSON is tried first, then a Python literal (e.g. a single-quoted dict).
        parsed = None
        for loader in (json.loads, ast.literal_eval):
            try:
                candidate = loader(output)
            except Exception:
                continue
            if isinstance(candidate, dict):
                parsed = candidate
                break
        if parsed is not None:
            # A missing 'Reason' must not throw away a usable 'Choice' (and vice versa)
            choice = parsed.get('Choice', 'No-result')
            expl = parsed.get('Reason', failed_expl)
        else:
            # Free-text fallback, e.g. when unescaped quotes inside the reason break the JSON
            try:
                choice = output.split('The given text is a')[-1]
            except Exception:
                choice = 'No-result'
            try:
                expl = output.split('Reason')[1].split('Choice')[0]
            except Exception:
                expl = failed_expl
        # str() guards against a non-string 'Choice' value
        choice = str(choice).lower()
        # 'non-pun' has to be tested first because it contains 'pun'
        if 'non-pun' in choice:
            judge = 0
        elif 'pun' in choice:
            judge = 1
        else:
            judge = -1
        if with_expl:
            return judge, expl
        return judge

    path_rec = './results/' + path_rec
    path_expl = './results/' + path_expl
    # Continue from previously stored records when they exist
    record_rec = load_json_file(path_rec) if os.path.exists(path_rec) else dict()  # pun recognition
    if add_CoT:  # pun explanation
        record_expl = load_json_file(path_expl) if os.path.exists(path_expl) else dict()
    # [A]. Construct the prompt
    # 1. Give a definition or not
    if add_def:
        definition = """<*Definition*>\nPuns are a form of wordplay exploiting different meanings of a word or similar-sounding words, while non-puns are jokes or statements that don't rely on such linguistic ambiguities.\n\n"""
    else:
        definition = ''
    # 2. CoT or not
    if add_CoT:
        instruction = """<*Instruction*>\nDetermine whether the given Text is a {side}. Give your reasons first, then make your final decision clearly. You should either say "The given text is a pun" or say "The given text is a non-pun". You must output the current status in a parsable JSON format. An example output looks like:\n{{"Reason": "XXX", "Choice": "The given text is a XXX"}}"""
    else:
        instruction = """<*Instruction*>\nDetermine whether the given Text is a {side}. You should either say "The given text is a pun" or say "The given text is a non-pun". You must output the current status in a parsable JSON format. An example output looks like:\n{{"Choice": "The given text is a XXX"}}"""
    # 3. Add examples or not (zero/6-shot)
    if add_examples is not None:
        examples_temp = []
        for example in add_examples.values():
            # Braces are doubled twice: once for the f-string, once for the prompt template
            if add_CoT:
                examples_temp.append(f"Text: {example['text']}\nOutput:\n"
                                     f"{{{{\"Reason\": \"{example['reason']}\", "
                                     f"\"Choice\": \"{example['label']}\"}}}}")
            else:
                examples_temp.append(f"Text: {example['text']}\nOutput:\n"
                                     f"{{{{\"Choice\": \"{example['label']}\"}}}}")
        examples_string = '\n\n<*Examples*>\n' + '\n\n'.join(examples_temp)
    else:
        examples_string = ''
    # 4. Test data (same wording with and without examples)
    testing = "\n\n<*Your Response*>\nText: {text}\nOutput:"
    # 5. Combine all parts together
    prompt_string = definition + instruction + examples_string + testing
    chat_prompt = ChatPromptTemplate.from_template(prompt_string)
    # [B]. Call LLM to respond
    if model_name is None:
        model_name = model.model_name if hasattr(model,'model_name') else model.model
        model_name = model_name.split('/')[-1]
    setting = f"def_{str(add_def).lower()} " \
              f"CoT_{str(add_CoT).lower()} examples_{str(add_examples is not None).lower()}"
    key_rec = f"{model_name}_judge {setting}"
    key_expl = f"{model_name}_explanation {setting}"
    IDs = list(dataset.keys())
    shuffle(IDs)
    # IDs that already hold a result for this setting; built once, O(1) membership tests
    IDs_loaded = {ID for ID, stored in record_rec.items() if stored.get(key_rec, False)}
    biased_to = {1:'pun',2:'non-pun'}
    use_gemini = 'gemini' in model_name
    # The bar advances once per text; work only happens on the first index of each batch
    for ind in tqdm(range(len(IDs))):
        if ind % batch_size != 0:
            continue
        # Remove the data that has already been run
        IDs_batch = [ID for ID in IDs[ind: ind+batch_size] if ID not in IDs_loaded]
        if not IDs_batch:
            continue
        _inputs1, _inputs2 = [], []
        for ID in IDs_batch:
            text = dataset[ID]['human_text']
            _inputs1.append(chat_prompt.format_messages(text=text, side=biased_to[1]))
            _inputs2.append(chat_prompt.format_messages(text=text, side=biased_to[2]))
        # Gemini's native SDK does not support batch
        if use_gemini:
            _outputs1 = [model.generate_content(_input[0].content).text for _input in _inputs1]
            _outputs2 = [model.generate_content(_input[0].content).text for _input in _inputs2]
        # Other models can use batch
        else:
            _outputs1 = [o1.content for o1 in model.batch(_inputs1)]
            _outputs2 = [o2.content for o2 in model.batch(_inputs2)]
        for ID,o1,o2 in zip(IDs_batch, _outputs1, _outputs2):
            if add_CoT:
                rec1, expl1 = parse_output(ID, o1, with_expl=True)
                rec2, expl2 = parse_output(ID, o2, with_expl=True)
                explanation = {f'biased_to_{biased_to[1]}':expl1, f'biased_to_{biased_to[2]}':expl2}
                record_expl.setdefault(ID, {})[key_expl] = explanation
            else:
                rec1 = parse_output(ID, o1, with_expl=False)
                rec2 = parse_output(ID, o2, with_expl=False)
            recognition = {f'biased_to_{biased_to[1]}':rec1, f'biased_to_{biased_to[2]}':rec2}
            record_rec.setdefault(ID, {})[key_rec] = recognition
        # Saving after every batch keeps finished work if a later request fails
        if save:
            save_json_file(record_rec, path_rec)
            if add_CoT:
                save_json_file(record_expl, path_expl)

In [5]:
def call_llm_to_give_pun_definition(model, model_name:str=None, save:bool=False, path='pun_definition.json'):
    """
    Check if the LLM knows the difference between pun and non-pun

    The model is asked once for the difference; the answer is printed and stored under
    '<model_name>_definition' in the record loaded from ./results/<path> (or in a new
    record when that file does not exist). The record is written back only when `save`
    is True.
    """
    path = './results/' + path
    record = load_json_file(path) if os.path.exists(path) else dict()
    if model_name is None:
        # Keep only the part after the last '/' of the model's own name
        raw_name = model.model_name if hasattr(model, 'model_name') else model.model
        model_name = raw_name.split('/')[-1]
    # Call LLM to respond
    question = "Tell me the difference between puns and non-puns, using no more than 60 words."
    if 'gemini' not in model_name:
        answer = model.invoke(question).content
    else:
        # Models with 'gemini' in their name are called through generate_content
        answer = model.generate_content(question).text
    print(answer)
    record[f'{model_name}_definition'] = answer
    if save:
        save_json_file(record, path)

## Dataset and Examples

In [6]:
# Locations of the two dataset files
hom_path = r'./dataset/hom_dataset.json'
het_path = r'./dataset/het_dataset.json'
# Load both datasets (hom first, then het)
hom_dataset, het_dataset = (load_json_file(path) for path in (hom_path, het_path))

# Disabled one-off step: store the human gold labels/explanations for both datasets
# add_human_performance(dict(**hom_dataset,**het_dataset), save=True)

In [7]:
# Manually chosen few-shot examples for the hom dataset, keyed by ID.
# Puns and non-puns alternate (three of each). Every entry holds the 'text',
# a reference 'reason' (only put into the prompt for CoT runs) and the 'label' sentence.
hom_examples = {
    "hom_705":{"text":"Driving on so many turnpikes was taking its toll .",
               "reason":"The text is using the word 'toll' in a double entendre. It refers both to the physical tolls paid on turnpikes and to 'taking its toll' as in having a negative effect or cost.",
               "label":"The given text is a pun"},
    "hom_533":{"text":"Don ' t kill the goose that lays the golden eggs .",
               "reason":"The text is a proverb warning against destructive greed and does not exploit different meanings of a word or similar-sounding words for humorous effect.",
               "label":"The given text is a non-pun"},
    "hom_488":{"text":"A carpenter sat on his drill and was bored to tears .",
               "reason":"The text plays on the double meaning of 'bored'. A carpenter using a drill creates a bore or hole, while 'bored to tears' is an expression used when someone is extremely bored. Thus, it exploits the different meanings of the word 'bored'.",
               "label":"The given text is a pun"},
    "hom_639":{"text":"When all is said and done , more is said than done .",
               "reason":"The text plays on the juxtaposition of the concepts of speaking and doing to highlight a common human behavior of talking more than taking action. It does not rely on the different meanings of a single word or similar sounding words.",
               "label":"The given text is a non-pun"},
    "hom_1556":{"text":"One leftover said to another ' foiled again . '",
                "reason":"The joke is based on the double meaning of the word 'foiled.' One meaning is to be thwarted or defeated, and the other refers to being wrapped in foil, which is what often happens to leftovers.",
                "label":"The given text is a pun"},
    "hom_1167":{"text":"Nothing ventured , nothing gained .",
                "reason":"The given text is a proverb that expresses a general truth or piece of advice and does not exploit different meanings of a word or similar-sounding words.",
                "label":"The given text is a non-pun"}
}

# Manually chosen few-shot examples for the het dataset, keyed by ID.
# Puns and non-puns alternate (three of each). Every entry holds the 'text',
# a reference 'reason' (only put into the prompt for CoT runs) and the 'label' sentence.
het_examples = {
    "het_621":{"text":"When the waiter told me they were out of corn I said , ' That really shucks . '",
               "reason":"The text plays on the double meaning of the word 'shucks'. 'Shucks' refers to both the act of removing the husk from corn and is a homophone for 'sucks', which is used colloquially to express disappointment.",
               "label":"The given text is a pun"},
    "het_41":{"text":"Desperate times call for desperate measures .",
              "reason":"The text is an idiomatic expression meaning that one may need to take drastic actions in difficult situations. It does not exploit different meanings of a word or similar-sounding words.",
              "label":"The given text is a non-pun"},
    "het_530":{"text":"A tangled bell ringer tolled himself off .",
             "reason":"The text plays on the homophones 'tolled' and 'told', using the word 'tolled' in the context of a bell ringer (which relates to the ringing or tolling of bells) and 'told' as in scolding oneself (told sb off). This creates a humorous double meaning.",
             "label":"The given text is a pun"},
    "het_225":{"text":"Don ' t bite the hand that feeds you .",
               "reason":"The text is an idiomatic expression meaning one should not act ungratefully towards those who provide for them. It does not rely on a play on words or different meanings of the same word.",
                "label":"The given text is a non-pun"},
    "het_325":{"text":"An illiterate fisherman was lost at c .",
               "reason":"The text exploits the homophonic nature of the letter 'C' and the word 'sea', playing on the expectation that 'lost at sea' is a common expression, but humorously substituting 'sea' for 'C' to suggest that the fisherman, being illiterate, is lost at the letter.",
               "label":"The given text is a pun"},
    "het_563":{"text":"Better go about than fall into the ditch .",
              "reason":"The text is an idiomatic expression that suggests it's better to be cautious than to get into trouble. It does not rely on the ambiguity of words or similar-sounding words for a humorous effect.",
              "label":"The given text is a non-pun"}
}

## Recognition

### gpt3.5

In [8]:
# Chat client for gpt-3.5-turbo-1106 (temperature 0.0, request_timeout 120)
gpt35_name = 'gpt-3.5-turbo-1106'
temperature = 0.0
openai_api_key = get_openai_api_key()  # use your api key
gpt35 = ChatOpenAI(
    model_name=gpt35_name,
    temperature=temperature,
    openai_api_key=openai_api_key,
    request_timeout=120,
)

In [10]:
# Ask gpt-3.5 for its own pun/non-pun definition; the answer is printed and saved
call_llm_to_give_pun_definition(model=gpt35, save=True)

Puns are a form of wordplay that relies on the multiple meanings of a word or the similarity in sound between different words to create humor. Non-puns, on the other hand, do not rely on wordplay for their humor. They may use other comedic devices such as irony, exaggeration, or situational humor to elicit laughter from the audience.


#### bare (0-shot)

In [9]:
# Zero-shot run without definition, CoT or examples, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=gpt35, dataset=dataset, save=True)

 11%|█         | 159/1443 [17:35<2:51:59,  8.04s/it] Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
 61%|██████    | 878/1443 [1:30:57<45:44,  4.86s/it]  Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
 77%|███████▋  | 1113/1443 [1:54:20<24:56,  4.53s/it] Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|██████████| 1443/1443 [2:25:23<00:00,  6.05s/it]  
100%|██████████| 1146/1146 [1:25:00<00:00,  4.45s/it]


#### +def (0-shot)

In [10]:
# Zero-shot run with the definition prepended, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=gpt35, dataset=dataset, save=True, add_def=True)

 51%|█████     | 732/1443 [54:03<1:01:28,  5.19s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|██████████| 1443/1443 [1:45:17<00:00,  4.38s/it]
100%|██████████| 1146/1146 [1:08:24<00:00,  3.58s/it]


#### +def (6-shot)

In [14]:
# 6-shot run with the definition; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=gpt35, dataset=dataset, save=True,
                          add_def=True, add_examples=examples)

 99%|█████████▉| 1435/1443 [1:56:38<00:37,  4.73s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|██████████| 1443/1443 [1:59:17<00:00,  4.96s/it]
 22%|██▏       | 255/1146 [24:18<1:09:46,  4.70s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
 68%|██████▊   | 777/1146 [1:13:50<33:25,  5.43s/it] Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
100%|██████████| 1146/1146 [1:47:35<00:00,  5.63s/it] 


#### +def&CoT (6-shot)

In [18]:
# 6-shot run with definition and CoT; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=gpt35, dataset=dataset, save=True,
                          add_def=True, add_CoT=True, add_examples=examples)

  0%|          | 5/1443 [00:46<3:31:55,  8.84s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
 83%|████████▎ | 1193/1443 [2:33:53<32:11,  7.72s/it] Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
100%|██████████| 1443/1443 [3:06:49<00:00,  7.77s/it]  
 14%|█▎        | 155/1146 [17:45<1:45:45,  6.40s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
100%|██████████| 1146/1146 [2:00:50

### gpt4

In [11]:
# Chat client for gpt-4-1106-preview (temperature 0.0, request_timeout 120)
gpt4_name = 'gpt-4-1106-preview'
temperature = 0.0
openai_api_key = get_openai_api_key()  # use your api key
gpt4 = ChatOpenAI(
    model_name=gpt4_name,
    temperature=temperature,
    openai_api_key=openai_api_key,
    request_timeout=120,
)

In [13]:
# Ask gpt-4 for its own pun/non-pun definition; the answer is printed and saved
call_llm_to_give_pun_definition(model=gpt4, save=True)

Puns are a form of wordplay that exploit multiple meanings of a term or similar-sounding words for an intended humorous or rhetorical effect. Non-puns are straightforward language without such double meanings or sound-based humor.


#### bare (0-shot)

In [7]:
# Zero-shot run without definition, CoT or examples, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=gpt4, dataset=dataset, save=True)

 57%|█████▋    | 821/1443 [1:01:06<38:54,  3.75s/it]  Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Timed out generating response. Please try again with a shorter prompt or with `max_tokens` set to a lower value. {
    "error": {
        "message": "Timed out generating response. Please try again with a shorter prompt or with `max_tokens` set to a lower value.",
        "type": "internal_error",
        "param": null,
        "code": "request_timeout"
    }
}
 500 {'error': {'message': 'Timed out generating response. Please try again with a shorter prompt or with `max_tokens` set to a lower value.', 'type': 'internal_error', 'param': None, 'code': 'request_timeout'}} {'Date': 'Mon, 22 Jan 2024 16:21:11 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '251', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-ratelimit-limit-requests': '10000', 'x-ratelimit-limit-tokens'

#### +def (0-shot)

In [8]:
# Zero-shot run with the definition prepended, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=gpt4, dataset=dataset, save=True, add_def=True)

 17%|█▋        | 251/1443 [16:34<1:03:40,  3.20s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120.0).
 33%|███▎      | 469/1443 [33:22<1:09:26,  4.28s/it] Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|██████████| 1443/1443 [1:34:33<00:00,  3.93s/it] 
 13%|█▎        | 153/1146 [09:06<53:05,  3.21s/it]  Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
100%|██████████| 1146/1146 [1:07:45<00:00,  3.55s/it]


#### +def (6-shot)

In [10]:
# 6-shot run with the definition; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=gpt4, dataset=dataset, save=True,
                          add_def=True, add_examples=examples)

100%|██████████| 1443/1443 [1:19:47<00:00,  3.32s/it]
100%|██████████| 1146/1146 [1:04:20<00:00,  3.37s/it]


#### +def&CoT (6-shot)

In [8]:
# 6-shot run with definition and CoT; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=gpt4, dataset=dataset, save=True,
                          add_def=True, add_CoT=True, add_examples=examples)

100%|██████████| 1443/1443 [4:14:14<00:00, 10.57s/it]  
 41%|████▏     | 473/1146 [1:37:43<2:33:09, 13.66s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by ProxyError('Cannot connect to proxy.', ConnectionAbortedError(10053, '你的主机中的软件中止了一个已建立的连接。', None, 10053, None))).
100%|██████████| 1146/1146 [4:21:46<00:00, 13.71s/it] 


### gemini-pro

In [14]:
# Native Google SDK client for gemini-pro (temperature 0.0, REST transport)
gemini_name = 'gemini-pro'
temperature = 0.0
google_api_key = get_google_api_key()  # use your api key
genai.configure(api_key=google_api_key, transport='rest')
# Every listed harm category gets the BLOCK_NONE threshold
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
generation_config = {"temperature": temperature}
gemini = genai.GenerativeModel(
    model_name=gemini_name,
    safety_settings=safety_settings,
    generation_config=generation_config,
)

In [18]:
# Ask gemini-pro for its own pun/non-pun definition; the answer is printed and saved
call_llm_to_give_pun_definition(model=gemini, save=True)

Puns are a form of wordplay that exploits multiple meanings of a term, or of similar-sounding words, for humorous or rhetorical effect. Non-puns, on the other hand, are statements or expressions that do not rely on wordplay or ambiguity for their meaning or humor.


#### bare (0-shot)

In [9]:
# Zero-shot run without definition, CoT or examples, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=gemini, dataset=dataset, save=True)

100%|██████████| 1443/1443 [1:28:50<00:00,  3.69s/it]
100%|██████████| 1146/1146 [1:07:15<00:00,  3.52s/it]


#### +def (0-shot)

In [10]:
# Zero-shot run with the definition prepended, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=gemini, dataset=dataset, save=True, add_def=True)

100%|██████████| 1443/1443 [1:27:24<00:00,  3.63s/it]
100%|██████████| 1146/1146 [1:39:35<00:00,  5.21s/it] 


#### +def (6-shot)

In [22]:
# 6-shot run with the definition; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=gemini, dataset=dataset, save=True,
                          add_def=True, add_examples=examples)

100%|██████████| 1443/1443 [25:37<00:00,  1.07s/it] 
100%|██████████| 1146/1146 [1:23:27<00:00,  4.37s/it]


#### +def&CoT (6-shot)

In [9]:
# 6-shot run with definition and CoT; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=gemini, dataset=dataset, save=True,
                          add_def=True, add_CoT=True, add_examples=examples)

 78%|███████▊  | 1124/1443 [1:58:55<30:23,  5.72s/it]  

hom_1115 {"Reason": "The text is a humorous play on the literal meaning of "Insert disk # 3" and the physical limitation of only being able to fit 2 disks. It exploits the different meanings of "Insert disk # 3" to create a humorous situation.", "Choice": "The given text is a pun"}


100%|██████████| 1443/1443 [2:29:25<00:00,  6.21s/it]
100%|██████████| 1146/1146 [1:56:08<00:00,  6.08s/it] 


### claude3

In [8]:
# Chat client for claude-3-opus-20240229 (temperature 0.0)
claude3_name = 'claude-3-opus-20240229'
temperature = 0.0
claude_api_key = get_claude_api_key()  # use your api key
claude3 = ChatAnthropic(
    model_name=claude3_name,
    temperature=temperature,
    anthropic_api_key=claude_api_key,
)

In [9]:
# Ask claude-3 for its own pun/non-pun definition; the answer is printed and saved
call_llm_to_give_pun_definition(model=claude3, save=True)

Puns are a form of wordplay that exploit the multiple meanings of a word or the similarity in sound between different words for humorous effect. Non-puns are straightforward statements that do not involve any wordplay or intentional ambiguity. Puns are often used to create jokes, while non-puns are used to convey information clearly and directly.


#### bare (0-shot)

In [10]:
# Zero-shot run without definition, CoT or examples, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=claude3, dataset=dataset, save=True)

100%|██████████| 1443/1443 [00:00<00:00, 30783.22it/s]
100%|██████████| 1146/1146 [00:00<00:00, 35361.12it/s]


#### +def (0-shot)

In [11]:
# Zero-shot run with the definition prepended, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=claude3, dataset=dataset, save=True, add_def=True)

100%|██████████| 1443/1443 [58:18<00:00,  2.42s/it]  
100%|██████████| 1146/1146 [45:49<00:00,  2.40s/it] 


#### +def (6-shot)

In [12]:
# 6-shot run with the definition, sent in batches of 4; each dataset uses its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=claude3, dataset=dataset, batch_size=4, save=True,
                          add_def=True, add_examples=examples)

100%|██████████| 1443/1443 [00:00<00:00, 38829.17it/s]
100%|██████████| 1146/1146 [19:07<00:00,  1.00s/it]


#### +def&CoT (6-shot)

In [11]:
# 6-shot run with definition and CoT; each dataset is paired with its own example set
for dataset, examples in ((hom_dataset, hom_examples), (het_dataset, het_examples)):
    call_llm_to_recognize(model=claude3, dataset=dataset, save=True,
                          add_def=True, add_CoT=True, add_examples=examples)

100%|██████████| 1443/1443 [00:00<00:00, 103026.26it/s]
100%|██████████| 1146/1146 [1:36:12<00:00,  5.04s/it]


### vicuna

In [8]:
# Start a local FastChat server first (run these in a terminal):
# python3 -m fastchat.serve.controller
# python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path lmsys/vicuna-7b-v1.5
# python3 -m fastchat.serve.openai_api_server --host localhost --port 8000

# Connect vicuna-7b-v1.5 through the local OpenAI-compatible endpoint
os.environ.update({
    'OPENAI_API_BASE': 'http://localhost:8000/v1',
    'OPENAI_API_KEY': 'EMPTY',
})

vicuna_name = 'vicuna-7b-v1.5'
temperature = 0
max_tokens = 200
# The worker above registers the local model under the name "gpt-3.5-turbo",
# so that is the name requested here; vicuna_name is what gets recorded in the results
vicuna = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
)

In [10]:
# Ask vicuna for its own pun/non-pun definition; model_name is passed explicitly so the
# answer is stored under the vicuna name rather than the client's "gpt-3.5-turbo" alias
call_llm_to_give_pun_definition(model=vicuna, model_name=vicuna_name, save=True)

Puns are a type of word play that exploit multiple meanings of a term, or of similar-sounding words, these meanings being at least somewhat related to the referents for which the terms stand. Non-puns, on the other hand, do not rely on multiple meanings or similar-sounding words to create their effect. They may use word play for comedic effect, but they do not rely on the dual meaning of a term or similar-sounding words.


#### bare (0-shot)

In [9]:
# Zero-shot run without definition, CoT or examples, in batches of 10, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=vicuna, model_name=vicuna_name, dataset=dataset,
                          batch_size=10, save=True)

100%|██████████| 1443/1443 [13:22<00:00,  1.80it/s]
100%|██████████| 1146/1146 [11:31<00:00,  1.66it/s]


#### +def (0-shot)

In [10]:
# Zero-shot run with the definition prepended, in batches of 10, on each dataset in turn
for dataset in (hom_dataset, het_dataset):
    call_llm_to_recognize(model=vicuna, model_name=vicuna_name, dataset=dataset,
                          batch_size=10, save=True, add_def=True)

100%|██████████| 1443/1443 [14:15<00:00,  1.69it/s]
100%|██████████| 1146/1146 [11:24<00:00,  1.67it/s]


#### +def (6-shot)

In [16]:
# +def (6-shot): add_def=True plus the example set matching each dataset.
call_llm_to_recognize(
    model=vicuna,
    model_name=vicuna_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=vicuna,
    model_name=vicuna_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [16:57<00:00,  1.42it/s]
100%|██████████| 1146/1146 [13:45<00:00,  1.39it/s]


#### +def&CoT (6-shot)

In [18]:
# +def&CoT (6-shot): add_def=True and add_CoT=True plus the example set
# matching each dataset.
call_llm_to_recognize(
    model=vicuna,
    model_name=vicuna_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=vicuna,
    model_name=vicuna_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [1:25:22<00:00,  3.55s/it]
100%|██████████| 1146/1146 [1:14:37<00:00,  3.91s/it]


### llama2

In [8]:
# Serve the model locally through FastChat's OpenAI-compatible API server
# before running this cell (terminal commands):
#   python3 -m fastchat.serve.controller
#   python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path meta-llama/Llama-2-7b-chat-hf
#   python3 -m fastchat.serve.openai_api_server --host localhost --port 8000

# Point the OpenAI client at the local server; the key is only a placeholder.
os.environ.update({
    'OPENAI_API_BASE': 'http://localhost:8000/v1',
    'OPENAI_API_KEY': 'EMPTY',
})

# llama2_name labels this model in later calls. The request itself uses the
# "gpt-3.5-turbo" alias that the model worker above registers for Llama-2-7b-chat-hf.
llama2_name = 'llama-2-7b-chat'
temperature, max_tokens = 0, 200
llama2 = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
)

In [13]:
# Ask llama2 for its own pun vs. non-pun definition (printed below the cell).
call_llm_to_give_pun_definition(
    model=llama2,
    model_name=llama2_name,
    save=True,
)

 Puns rely on wordplay, often using multiple meanings or sounds of words to create humor. Non-puns, on the other hand, use more straightforward language and rely on other forms of humor, such as irony or sarcasm.


#### bare (0-shot)

In [13]:
# bare (0-shot): no add_def / add_CoT / add_examples arguments are passed.
# hom_dataset runs first, then het_dataset.
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
)
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
)

100%|██████████| 1443/1443 [17:50<00:00,  1.35it/s]
100%|██████████| 1146/1146 [54:12<00:00,  2.84s/it]


#### +def (0-shot)

In [14]:
# +def (0-shot): add_def=True, no examples.
# hom_dataset runs first, then het_dataset.
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
)
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
)

100%|██████████| 1443/1443 [00:00<00:00, 159659.72it/s]
100%|██████████| 1146/1146 [32:47<00:00,  1.72s/it]


#### +def (6-shot)

In [12]:
# +def (6-shot): add_def=True plus the example set matching each dataset.
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [1:55:23<00:00,  4.80s/it] 
100%|██████████| 1146/1146 [1:18:37<00:00,  4.12s/it]


#### +def&CoT (6-shot)

In [18]:
# +def&CoT (6-shot): add_def=True and add_CoT=True plus the example set
# matching each dataset.
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=llama2,
    model_name=llama2_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [2:14:40<00:00,  5.60s/it] 
100%|██████████| 1146/1146 [1:48:12<00:00,  5.67s/it]


### mistral

In [8]:
# Serve the model locally through FastChat's OpenAI-compatible API server
# before running this cell (terminal commands):
#   python3 -m fastchat.serve.controller
#   python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path mistralai/Mistral-7B-Instruct-v0.2
#   python3 -m fastchat.serve.openai_api_server --host localhost --port 8000

# Point the OpenAI client at the local server; the key is only a placeholder.
os.environ.update({
    'OPENAI_API_BASE': 'http://localhost:8000/v1',
    'OPENAI_API_KEY': 'EMPTY',
})

# mistral_name labels this model in later calls. The request itself uses the
# "gpt-3.5-turbo" alias that the model worker above registers for Mistral-7B-Instruct-v0.2.
mistral_name = 'mistral-7b-instruct-v0.2'
temperature, max_tokens = 0, 200
mistral = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
)

In [11]:
# Ask mistral for its own pun vs. non-pun definition (printed below the cell).
call_llm_to_give_pun_definition(
    model=mistral,
    model_name=mistral_name,
    save=True,
)

Puns are words, phrases, or sentences that exploit multiple meanings of a term, or of similar-sounding words, for an intended humorous or rhetorical effect. Non-puns lack this deliberate play on words or sounds. Puns rely on context and language nuances, while non-puns communicate straightforward meanings.


#### bare (0-shot)

In [11]:
# bare (0-shot): no add_def / add_CoT / add_examples arguments are passed.
# hom_dataset runs first, then het_dataset.
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
)
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
)

100%|██████████| 1443/1443 [51:15<00:00,  2.13s/it]
100%|██████████| 1146/1146 [43:27<00:00,  2.28s/it]


#### +def (0-shot)

In [9]:
# +def (0-shot): add_def=True, no examples.
# hom_dataset runs first, then het_dataset.
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
)
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
)

100%|██████████| 1443/1443 [22:00<00:00,  1.09it/s]
100%|██████████| 1146/1146 [35:56<00:00,  1.88s/it]


#### +def (6-shot)

In [10]:
# +def (6-shot): add_def=True plus the example set matching each dataset.
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [27:08<00:00,  1.13s/it]
100%|██████████| 1146/1146 [22:00<00:00,  1.15s/it]


#### +def&CoT (6-shot)

In [12]:
# +def&CoT (6-shot): add_def=True and add_CoT=True plus the example set
# matching each dataset.
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=mistral,
    model_name=mistral_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [1:22:29<00:00,  3.43s/it]
100%|██████████| 1146/1146 [1:05:29<00:00,  3.43s/it]


### openchat

In [8]:
# Serve the model locally through FastChat's OpenAI-compatible API server
# before running this cell (terminal commands):
#   python3 -m fastchat.serve.controller
#   python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path openchat/openchat-3.5-0106
#   python3 -m fastchat.serve.openai_api_server --host localhost --port 8000

# Point the OpenAI client at the local server; the key is only a placeholder.
os.environ.update({
    'OPENAI_API_BASE': 'http://localhost:8000/v1',
    'OPENAI_API_KEY': 'EMPTY',
})

# openchat_name labels this model in later calls. The request itself uses the
# "gpt-3.5-turbo" alias that the model worker above registers for openchat-3.5-0106.
openchat_name = 'openchat-3.5-0106'
temperature, max_tokens = 0, 200
openchat = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
)

In [10]:
# Ask openchat for its own pun vs. non-pun definition (printed below the cell).
call_llm_to_give_pun_definition(
    model=openchat,
    model_name=openchat_name,
    save=True,
)

Puns are words or phrases that have multiple meanings, often creating humor by exploiting the different meanings. Non-puns are words or phrases with only one clear meaning. Puns rely on wordplay, while non-puns do not. Puns can be clever and witty, while non-puns are straightforward and literal.


#### bare (0-shot)

In [9]:
# bare (0-shot): no add_def / add_CoT / add_examples arguments are passed.
# hom_dataset runs first, then het_dataset.
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
)
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
)

100%|██████████| 1443/1443 [14:07<00:00,  1.70it/s]
100%|██████████| 1146/1146 [11:15<00:00,  1.70it/s]


#### +def (0-shot)

In [10]:
# +def (0-shot): add_def=True, no examples.
# hom_dataset runs first, then het_dataset.
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
)
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
)

100%|██████████| 1443/1443 [14:53<00:00,  1.62it/s]
100%|██████████| 1146/1146 [11:49<00:00,  1.62it/s]


#### +def (6-shot)

In [9]:
# +def (6-shot): add_def=True plus the example set matching each dataset.
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [02:47<00:00,  8.62it/s]
100%|██████████| 1146/1146 [13:06<00:00,  1.46it/s]


#### +def&CoT (6-shot)

In [11]:
# +def&CoT (6-shot): add_def=True and add_CoT=True plus the example set
# matching each dataset.
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=hom_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=hom_examples,
)
call_llm_to_recognize(
    model=openchat,
    model_name=openchat_name,
    dataset=het_dataset,
    batch_size=10,
    save=True,
    add_def=True,
    add_CoT=True,
    add_examples=het_examples,
)

100%|██████████| 1443/1443 [1:14:52<00:00,  3.11s/it]
100%|██████████| 1146/1146 [1:01:59<00:00,  3.25s/it]
