Loads the libraries necessary for the script and also loads the environment variables in the .env file.

In [None]:
%reload_ext dotenv
%dotenv

import openai
import json
import groq
import time
import ollama

from pathlib import Path
from textwrap import dedent

from tqdm.autonotebook import tqdm

Setup for utilizing the OpenAI and Ollama APIs. Also loads the dataset.

In [None]:
# API clients shared by every cell below.
openai_client = openai.OpenAI()
# Ollama requests are given a 60 second timeout.
ollama_client = ollama.Client(timeout=60)

# Read the dataset explicitly as UTF-8: without an explicit encoding, open()
# falls back to the platform default, which is not guaranteed to be UTF-8
# and can fail or garble non-ASCII characters.
with open("data/dataset.json", encoding="utf-8") as file:
    dataset = json.load(file)

Defines helper methods for asking questions to GPT and Ollama. The Ollama version contains a retry loop because, especially on Windows, Ollama sometimes fails to respond.

In [None]:
def ask_chatgpt(prompts, model):
    """Send a chat conversation to an OpenAI model and return the reply text.

    Args:
        prompts: List of chat messages (dicts with "role" and "content").
        model: Name of the OpenAI model to query.

    Returns:
        The content of the first choice of the completion. The request asks
        for a JSON object response with temperature 0.
    """
    request = dict(
        model=model,
        messages=prompts,
        response_format={"type": "json_object"},
        temperature=0,
    )
    completion = openai_client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content
        
def ask_ollama(prompts, model, *, max_retries=5, retry_delay=5):
    """Send a chat conversation to an Ollama model, retrying on failure.

    Args:
        prompts: List of chat messages (dicts with "role" and "content").
        model: Name of the Ollama model to query.
        max_retries: Number of additional attempts after the first failure.
            The default of 5 gives six attempts in total.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The content of the model's reply, or None when every attempt failed.
    """
    options = {
        "temperature": 0,
        "num_ctx": 8192,
        "num_predict": -1
    }
    for attempt in range(max_retries + 1):
        try:
            response = ollama_client.chat(model=model, messages=prompts, format="json", stream=False,
                                          options=options)
            return response['message']['content']
        except Exception as e:
            # Best-effort by design: any failure of the call is retried, and the
            # caller records a missing answer when None is returned.
            if attempt >= max_retries:
                # Last attempt: report and stop without a pointless sleep.
                print(f"Failed with {e}. Giving up after {attempt + 1} attempts.")
                return None
            print(f"Failed with {e}. Retrying...")
            time.sleep(retry_delay)
    # Only reached when max_retries is negative, i.e. no attempt was made.
    return None

This cell defines helper methods that generate the prompt depending on the structure utilized by the model (OpenAI and Ollama can differ) and the prompt engineering technique utilized.

In [None]:
def get_system_prompt():
    """Build the system-role chat message with the classification instructions.

    Returns:
        A dict with "role" set to "system" and "content" set to the dedented
        instruction text.
    """
    # NOTE: the literal keeps its deep indentation on purpose. The backslash
    # continuations join the following line *including* its leading spaces,
    # so re-indenting the text would change the prompt that is sent.
    instructions = dedent(f"""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {{"sentiment": "positive"}}, {{"sentiment": "negative"}} OR {{"sentiment": "neutral"}}.
            """)
    return {"role": "system", "content": instructions}

def get_system_prompt_with_message(message):
    """Build a single user-role chat message with instructions and the message.

    The instruction text is dedented *before* the message is appended. When
    the message was interpolated first, any message containing a line break
    followed by unindented text removed the common margin and left the whole
    prompt indented. For single-line messages the result is unchanged.

    Args:
        message: The pull request message to classify.

    Returns:
        A dict with "role" set to "user" and "content" set to the instructions
        followed by the quoted message.
    """
    # NOTE: the literal keeps its deep indentation on purpose. The backslash
    # continuations join the following line *including* its leading spaces,
    # so re-indenting the text would change the prompt that is sent.
    instructions = dedent("""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {"sentiment": "positive"}, {"sentiment": "negative"} OR {"sentiment": "neutral"}.
            """)
    return {"role": "user",
            "content": f'{instructions}\nMessage: "{message}"\n'}

def get_system_prompt_with_message_and_examples(message, num_examples, cot=False):
    """Build a single user-role chat message with instructions, examples and the message.

    The instruction text is dedented *before* the examples and the message
    are appended. The example block produced by get_example_strings starts
    its lines at column 0, so interpolating it first removed the common
    margin and left the prompt indented whenever examples were present.

    Args:
        message: The pull request message to classify.
        num_examples: Number of examples to include per sentiment class
            (forwarded to get_example_strings).
        cot: When True, each example is followed by its reasoning text.

    Returns:
        A dict with "role" set to "user" and "content" set to the instructions,
        the example block and the quoted message.
    """
    # NOTE: the literal keeps its deep indentation on purpose. The backslash
    # continuations join the following line *including* its leading spaces,
    # so re-indenting the text would change the prompt that is sent.
    instructions = dedent("""
                You are a bot that classifies messages from Github pull requests. Classify the message as one of three types of sentiment: positive, neutral, and negative.

                For classification purposes, we consider love and joy (and related emotions) positive, \
                anger, sadness, and fear, negative, surprise can be positive or negative depending on the context, and \
                neutral is considered the absense of any emotions.

                Return the result one of the following JSONs: {"sentiment": "positive"}, {"sentiment": "negative"} OR {"sentiment": "neutral"}.

                Here are some examples to help you get started:
            """)
    examples = get_example_strings(num_examples, cot)
    return {"role": "user",
            "content": f'{instructions}{examples}\n\nMessage: "{message}"\n'}

def get_examples(num_positive, num_negative, num_neutral):
    """Return the first N hand-picked examples of each sentiment class.

    Each example is a dict with a "message" and a "reasoning" entry. Three
    examples are available per class.

    Args:
        num_positive: How many positive examples to return.
        num_negative: How many negative examples to return.
        num_neutral: How many neutral examples to return.

    Returns:
        A tuple ``(positive, neutral, negative)`` of lists. Note that this
        order differs from the parameter order (positive, negative, neutral).
    """
    positive_pool = [
        {
            "message": "Thinking out loud I'm not sure whether a subclass would be a good way to go or not, but happy to discuss further. Thanks!",
            "reasoning": "Analyzing the tone of the message, we notice that the author is satisfied with what was implemented, and the use of 'thinking out loud' suggests a collaborative approach by openly exploring ideas; the phrase 'I'm not sure' shows openness to suggestions, and 'happy to discuss further' reveals a positive disposition for dialogue and adjustment, indicating that the author values feedback; finally, 'Thanks!' conveys gratitude and reinforces the tone of appreciation and cooperation."
        },
        {
            "message": "Thanks, Your contribution is now merged in with a polish commit.",
            "reasoning": "Observing the structure of the message, we notice that 'Thanks' establishes a tone of gratitude, and the expression 'Your contribution' emphasizes the value of the recipient’s participation, creating a sense of recognition; the word 'merged' suggests approval, since the work was integrated into the main project,. These elements together reinforce a tone of approval and appreciation for the recipient's effort."
        },
        {
            "message": "I did polish it a bit more, thanks Eddu!",
            "reasoning": "The 'thanks Eddu!' suggesting camaraderie and gratitude, probably for feedback that was given in the pull request. By recognizing the importance of Eddu's feedback in a friendly way and acting on it, the author conveys a tone of cooperation and friendship."
        },
    ]

    neutral_pool = [
        {
            "message": "Quick check using python awscli.",
            "reasoning": "Analyzing the phrase 'Quick check,' we see that it describes a brief verification without any quality or emotional judgment, suggesting an objective communication; 'Using python awscli' describes the tool and language used, keeping the focus on functionality; the tone is descriptive and informative, without emotional inclination, and the language is purely technical and direct, intended to share a detail without expecting a reaction."
        },
        {
            "message": "Should this be `AsyncComponent` maybe?",
            "reasoning": "The author of the message is providing, in a purely technical manner, feedback for a change that he believes should be done in the pull request. "
        },
        {
            "message": "Can you please give a short update when do you think the next Guice release will be cut? Are there any plans?",
            "reasoning": "Looking at the excerpt 'Can you please give a short update,' we see it is a polite and straightforward request, without emotional tone; the phrase 'when do you think the next Guice release will be cut?' seeks information without implying urgency or frustration, and 'Are there any plans?' reinforces the tone of curiosity and inquiry, keeping the focus on information. The choice of words indicates interest but without any emotional inclination."
        },
    ]

    negative_pool = [
        {
            "message": "Sorry, test case was mistaken.",
            "reasoning": "Observing the use of 'Sorry,' we notice an expression of regret that suggests the author feels responsible for the error."
        },
        {
            "message": "Oh, you know what? I'm stupid. I'm actually using `http-server`. Please, accept my apologies ",
            "reasoning": "In this message, the author utilizes a self deprecating expression (I'm stupid) and then apologizes, likely to the reviewer."
        },
        {
            "message": "Hmm, ok that is unfortunate. I have split support into a separate pull request (#2071). If you want you can close this pull request here then, if you don't think there is any chance this will be integrated.",
            "reasoning": "Analyzing 'unfortunate,' we notice an expression of disappointment with the situation; the phrase 'If you want you can close this pull request' suggests resignation, indicating that the author has low expectations about the integration of the original request; the phrase 'if you don't think there is any chance' emphasizes a pessimistic, almost resigned tone regarding the pull request's outcome."
        },
    ]

    # Take a leading slice of each pool; the return order is positive, neutral, negative.
    chosen_positive = positive_pool[:num_positive]
    chosen_neutral = neutral_pool[:num_neutral]
    chosen_negative = negative_pool[:num_negative]
    return chosen_positive, chosen_neutral, chosen_negative

def get_example_shots(num):
    """Build few-shot examples as alternating user/assistant chat messages.

    Args:
        num: Number of examples to take from each sentiment class.

    Returns:
        A flat list of chat messages: for every example, a user message with
        the example text followed by an assistant message with the expected
        JSON answer. Classes appear in the order positive, neutral, negative.
    """
    positive, neutral, negative = get_examples(num, num, num)
    labelled_groups = (
        ("positive", positive),
        ("neutral", neutral),
        ("negative", negative),
    )

    shots = []
    for label, examples in labelled_groups:
        expected_answer = f'{{"sentiment": "{label}"}}'
        for example in examples:
            shots.extend([
                {"role": "user", "content": example["message"]},
                {"role": "assistant", "content": expected_answer},
            ])
    return shots
        
def get_example_strings(num, cot=False):
    """Render the few-shot examples as one plain-text block for a prompt.

    Args:
        num: Number of examples to take from each sentiment class.
        cot: When True, each example is followed by its reasoning line.

    Returns:
        A string with, per example, an "Example i" line and a
        "Response for Example i" line (plus a "Reasoning for Example i" line
        when ``cot`` is set). Examples are numbered from 1 across the classes
        positive, neutral, negative. Empty when no examples are selected.
    """
    positive, neutral, negative = get_examples(num, num, num)
    labelled_groups = (
        ("positive", positive),
        ("neutral", neutral),
        ("negative", negative),
    )

    lines = []
    i = 1
    for key, value in labelled_groups:
        for msg in value:
            # Double quotes for the subscripts: reusing the enclosing quote
            # character inside an f-string is a SyntaxError before Python 3.12.
            lines.append(f'Example {i}: {msg["message"]}\n')
            lines.append(f'Response for Example {i}: {{"sentiment": "{key}"}}\n')
            if cot:
                lines.append(f'Reasoning for Example {i}: {msg["reasoning"]}\n')
            i += 1

    return "".join(lines)

def get_final_prompt(message):
    """Wrap the message to classify in a user-role chat message.

    Args:
        message: The text to use as the message content.

    Returns:
        A dict with "role" set to "user" and "content" set to ``message``.
    """
    return dict(role="user", content=message)

# Data collection for the first research question (RQ1)

Collects the results from GPT-4o

In [None]:
# Zero-shot run of GPT-4o over the whole dataset; one JSON file per model.
GPT_MODELS = ["gpt-4o-2024-05-13"]

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the API calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    run_name = model + "_0shot" + ".json"
    if (output_dir / run_name).exists():
        # This run was already collected; do not query the model again.
        continue

    # Fresh buffer per run so an unparsable answer stays None instead of
    # keeping a value left over from another run.
    results = [None] * len(dataset)
    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        # Zero-shot: only the instructions and the message, no examples.
        # (The previous get_examples(number=0, cot=False) call raised a
        # TypeError because get_examples has no such parameters.)
        prompt = [get_system_prompt()] + [get_final_prompt(message["raw_message"])]
        model_response = ask_chatgpt(prompt, model)
        try:
            results[index] = json.loads(model_response)
        except (json.JSONDecodeError, TypeError):
            # Invalid JSON or no content at all: leave this entry as None.
            continue

    with open(output_dir / run_name, "w") as jsonfile:
        json.dump(results, jsonfile)

# Data collection for the second research question (RQ2)

This first cell collects the results for GPT-4o-mini, without utilizing any prompt engineering techniques.

In [None]:
# Zero-shot run of GPT-4o-mini over the whole dataset; one JSON file per model.
GPT_MODELS = ["gpt-4o-mini-2024-07-18"]

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the API calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    run_name = model + "_0shot" + ".json"
    if (output_dir / run_name).exists():
        # This run was already collected; do not query the model again.
        continue

    # Fresh buffer per run so an unparsable answer stays None instead of
    # keeping a value left over from another run.
    results = [None] * len(dataset)
    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        prompt = [get_system_prompt()] + [get_final_prompt(message["raw_message"])]
        model_response = ask_chatgpt(prompt, model)
        try:
            results[index] = json.loads(model_response)
        except (json.JSONDecodeError, TypeError):
            # Invalid JSON or no content at all: leave this entry as None.
            continue

    # The write must live inside the model loop: placed after the loop it also
    # ran when the run was skipped, overwriting the existing file with a list
    # of None values.
    with open(output_dir / run_name, "w") as jsonfile:
        json.dump(results, jsonfile)

This second cell collects the data from the ollama models, without providing any examples (prompt engineering techniques).

In [None]:
# Zero-shot run of every Ollama model over the whole dataset; one JSON file per model.
OLLAMA_MODELS = ["mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "llama3.1:70b"]

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the model calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in OLLAMA_MODELS:
    run_name = model + "_0shot" + ".json"
    if (output_dir / run_name).exists():
        # This run was already collected; do not query the model again.
        continue

    # Fresh buffer per model: with one shared list, an entry whose answer
    # could not be parsed silently kept the previous model's answer.
    results = [None] * len(dataset)
    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        prompt = [get_system_prompt_with_message(message["raw_message"])]
        model_response = ask_ollama(prompt, model)
        try:
            results[index] = json.loads(model_response)
        except (json.JSONDecodeError, TypeError):
            # Invalid JSON, or None because ask_ollama gave up: leave as None.
            continue

    with open(output_dir / run_name, "w") as jsonfile:
        json.dump(results, jsonfile)

# Data collection for the third research question (RQ3)

This first cell collects the data for the OpenAI models, for the one-shot and few-shot cases.

In [None]:
# One-shot and few-shot runs of the OpenAI models; one JSON file per (model, shots).
GPT_MODELS = ["gpt-4o-mini-2024-07-18", "gpt-4o-2024-05-13"]

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the API calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    for number in [1, 3]:
        run_name = f"{model}_{number}shot.json"
        if (output_dir / run_name).exists():
            # This run was already collected; do not query the model again.
            continue

        # Fresh buffer per run: with one shared list, an entry whose answer
        # could not be parsed silently kept the answer of an earlier run.
        results = [None] * len(dataset)
        # The example messages do not depend on the dataset entry.
        example_shots = get_example_shots(number)
        for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
            prompt = [get_system_prompt()] + example_shots + [get_final_prompt(message["raw_message"])]
            model_response = ask_chatgpt(prompt, model)
            try:
                results[index] = json.loads(model_response)
            except (json.JSONDecodeError, TypeError):
                # Invalid JSON or no content at all: leave this entry as None.
                continue

        with open(output_dir / run_name, "w") as jsonfile:
            json.dump(results, jsonfile)

This second cell collects the data for the ollama models, for the one-shot and few-shot cases.

In [None]:
# One-shot and few-shot runs of the Ollama models; one JSON file per (model, shots).
OLLAMA_MODELS = ["mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "llama3.1:70b"]

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the model calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in OLLAMA_MODELS:
    for number in [1, 3]:
        run_name = f"{model}_{number}shot.json"
        if (output_dir / run_name).exists():
            # This run was already collected; do not query the model again.
            continue

        # Fresh buffer per run: with one shared list, an entry whose answer
        # could not be parsed silently kept the answer of an earlier run.
        results = [None] * len(dataset)
        for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
            prompt = [get_system_prompt_with_message_and_examples(message["raw_message"], number)]
            model_response = ask_ollama(prompt, model)
            try:
                results[index] = json.loads(model_response)
            except (json.JSONDecodeError, TypeError):
                # Invalid JSON, or None because ask_ollama gave up: leave as None.
                continue

        with open(output_dir / run_name, "w") as jsonfile:
            json.dump(results, jsonfile)

This third cell collects the data for the OpenAI models, for the chain of thought cases.

In [None]:
# Chain-of-thought runs of the OpenAI models; one JSON file per model.
GPT_MODELS = ["gpt-4o-mini-2024-07-18", "gpt-4o-2024-05-13"]

# Examples per sentiment class. This cell used to read `number`, a variable
# that only exists if a few-shot cell ran earlier in the same session; the
# loop `for number in [1, 3]` in those cells leaves it at 3, so 3 is made
# explicit here. NOTE(review): confirm 3 is the intended value.
COT_EXAMPLES_PER_CLASS = 3

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the API calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in GPT_MODELS:
    run_name = model + "_cotshot" + ".json"
    if (output_dir / run_name).exists():
        # This run was already collected; do not query the model again.
        continue

    # Fresh buffer per model: with one shared list, an entry whose answer
    # could not be parsed silently kept the previous model's answer.
    results = [None] * len(dataset)
    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        prompt = [get_system_prompt_with_message_and_examples(
            message["raw_message"], COT_EXAMPLES_PER_CLASS, cot=True)]
        model_response = ask_chatgpt(prompt, model)
        try:
            results[index] = json.loads(model_response)
        except (json.JSONDecodeError, TypeError):
            # Invalid JSON or no content at all: leave this entry as None.
            continue

    with open(output_dir / run_name, "w") as jsonfile:
        json.dump(results, jsonfile)

This fourth cell collects the data for the ollama models, for the chain of thought cases.

In [None]:
# Chain-of-thought runs of the Ollama models; one JSON file per model.
OLLAMA_MODELS = ["mistral-nemo:12b", "llama3.1:8b", "mistral-small:22b", "gemma2:9b", "gemma2:27b", "llama3.1:70b"]

# Examples per sentiment class. This cell used to read `number`, a variable
# that only exists if a few-shot cell ran earlier in the same session; the
# loop `for number in [1, 3]` in those cells leaves it at 3, so 3 is made
# explicit here. NOTE(review): confirm 3 is the intended value.
COT_EXAMPLES_PER_CLASS = 3

output_dir = Path("output")
# Create the folder up front so a missing directory cannot fail the final
# write after all the model calls have already been made.
output_dir.mkdir(parents=True, exist_ok=True)

for model in OLLAMA_MODELS:
    run_name = model + "_cotshot" + ".json"
    if (output_dir / run_name).exists():
        # This run was already collected; do not query the model again.
        continue

    # Fresh buffer per model: with one shared list, an entry whose answer
    # could not be parsed silently kept the previous model's answer.
    results = [None] * len(dataset)
    for index, message in tqdm(enumerate(dataset), total=len(dataset), desc=run_name):
        prompt = [get_system_prompt_with_message_and_examples(
            message["raw_message"], COT_EXAMPLES_PER_CLASS, cot=True)]
        model_response = ask_ollama(prompt, model)
        try:
            results[index] = json.loads(model_response)
        except (json.JSONDecodeError, TypeError):
            # Invalid JSON, or None because ask_ollama gave up: leave as None.
            continue

    with open(output_dir / run_name, "w") as jsonfile:
        json.dump(results, jsonfile)