In [1]:
import pandas as pd
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables (such as the OPENAI_API_KEY mentioned below) before any client is created
load_dotenv()

True

In [2]:
# Initialize the OpenAI client that query_gpt uses for every request.
# Called without arguments, so make sure your OpenAI API key is set in the
# environment variable OPENAI_API_KEY.
# Alternatively, pass the key directly to the constructor: OpenAI(api_key='your_api_key')

client = OpenAI()

# Core function to query GPT model

In [3]:
def query_gpt(prompt, model='gpt-4o', max_tokens=1000, system_prompt=''):
    """
    Queries an OpenAI chat model with a given prompt and returns the completion.

    The request is sent through the module-level ``client`` and consists of one
    system message followed by one user message. With the default arguments the
    request is identical to the previously hard-coded one (model 'gpt-4o',
    1000 max tokens, empty system message).

    Args:
        prompt (str): The input prompt to send as the user message
        model (str): Name of the chat model to query
        max_tokens (int): Upper bound on the number of generated tokens
        system_prompt (str): Content of the system message

    Returns:
        str: The text of the first choice, or '' when the API returns no text
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': prompt},
        ],
        model=model,
        max_tokens=max_tokens,
    )
    completion = chat_completion.choices[0].message.content
    # The message content may be None (no text returned). Callers run string
    # checks on the result, so always hand back a str.
    return completion if completion is not None else ''

# Function to query other Hugging Face models (Optional)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def query_hf_model(model_name: str, prompt: str, max_new_tokens: int = 100, temperature: float = 0.7) -> str:
    """
    Generate text from a Hugging Face language model.

    The tokenizer and model are loaded from ``model_name`` on every call.

    Args:
        model_name (str): Hugging Face model name (e.g., 'gpt2', 'meta-llama/Llama-2-7b-chat-hf').
        prompt (str): Input prompt text.
        max_new_tokens (int): Maximum number of tokens to generate.
        temperature (float): Sampling temperature. Values <= 0 switch to
            greedy decoding instead of sampling.

    Returns:
        str: Only the newly generated text (prompt excluded), stripped of
        surrounding whitespace.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    # Tokenize input and remember how many tokens belong to the prompt
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    prompt_length = input_ids.shape[-1]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,  # avoids warning for some models
    }
    # Pass the attention mask when the tokenizer provides one, so padding is
    # never inferred from pad_token_id == eos_token_id.
    attention_mask = encoded.get("attention_mask")
    if attention_mask is not None:
        generation_kwargs["attention_mask"] = attention_mask
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Sampling needs a strictly positive temperature; fall back to greedy.
        generation_kwargs["do_sample"] = False

    # Generate output
    with torch.no_grad():
        output_ids = model.generate(input_ids, **generation_kwargs)

    # Drop the prompt by token count. Slicing the decoded string by
    # len(prompt) is unreliable because decoding does not always reproduce
    # the prompt text character for character.
    new_token_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


# Example usage: generate up to 50 new tokens from the "gpt2" model as a quick check.
# NOTE(review): presumably this fetches the model files on first run — confirm before using Run All.
output = query_hf_model("gpt2", "Once upon a time", max_new_tokens=50)
print("Generated:", output)

# Study 1 Prompts

In [3]:
def context_prompt(interview_domain, interview_turns):
    """
    Builds the Study 1 prompt asking the model for the next follow-up question.

    The prompt is one f-string written across several source lines with
    backslash continuations. The continuations sit inside the string literal,
    so the leading indentation of each continued line is part of the returned
    text (runs of spaces between sentences).

    Args:
        interview_domain (str): The domain/topic of the interview
        interview_turns (str): The conversation history between interviewer and interviewee

    Returns:
        str: A formatted prompt for generating follow-up questions
    """
    # NOTE(review): the opening sentence describes an agent "generating context summaries"
    # while the task asks for a follow-up question — confirm this wording is intentional.
    prompt = f"You are an AI agent capable of generating context summaries.\
      During a requirements elicitation interview with an interviewee about how the interviewee conducts {interview_domain}, \
      the INTERVIEWEE and INTERVIEWER have had the following conversation: {interview_turns}. \
      Generate a follow-up question that the Interviewer should ask next based on the conversation. \
      Restrict your response to only show the follow-up question without explanation."
    return prompt

In [7]:
# Specify the path to your CSV file containing the interview data.
# To reproduce the results in the paper, keep the default location
# <current working directory>/datasets/study1.csv
notebook_dir = os.getcwd()
# os.path.join builds the path with the platform's separator instead of a hard-coded '/'
df = pd.read_csv(os.path.join(notebook_dir, 'datasets', 'study1.csv'))

In [None]:
# Validate the input dataset: preview the first rows to check that the expected columns were loaded
df.head()

Unnamed: 0,Interview Domain,Interview Turns,Human Follow Up Question,No. Of Relevant Speaker Turns,Type
0,Apartment finding,Interviewer: What kind of things do you look f...,"So, the three things you are looking for are t...",1,Confirmation
1,Apartment finding,"Interviewer: Okay, and you mentioned that pric...",You mentioned that you were looking for an apa...,1,Answer probing
2,Apartment finding,"Interviewer: Okay, so you would rank them as f...","Apart from not knowing the person, what other ...",1,Answer probing
3,Apartment finding,"Interviewer: Right, and you also mentioned the...",Have you had any incident that required settling?,1,Answer probing
4,Apartment finding,Interviewer: What kind of things do you look f...,"Okay, and the last thing that you mentioned th...",3,Answer probing


In [None]:
# Iterate through the dataset and generate one follow-up question per row.
# Rows are read and written by index label, so the value is always stored on
# the row it was generated from, whatever index df has.

print("Generating follow-up questions...")
for turn_number, row_label in enumerate(df.index, start=1):
    # Extract the interview domain and turns for the current row
    interview_domain = df.at[row_label, 'Interview Domain']
    interview_turns = df.at[row_label, 'Interview Turns']

    # Build the prompt and query the GPT model with it
    prompt = context_prompt(interview_domain, interview_turns)
    response = query_gpt(prompt)

    # Store the generated question straight away so finished rows stay in df
    # even if a later request fails
    df.at[row_label, 'LLM Follow Up Question'] = response
    print(f"Generated question for interview turn {turn_number}: {response}")

In [None]:
# Save the results to a new CSV file.
# Only the four comparison columns are kept; the two question columns are
# written with hyphenated "Follow-up" headers.
df2 = df[
    ['Interview Domain', 'Interview Turns', 'Human Follow Up Question', 'LLM Follow Up Question']
].rename(
    columns={
        'Human Follow Up Question': 'Human Follow-up Question',
        'LLM Follow Up Question': 'LLM Follow-up Question',
    }
)
df2.to_csv('study1_out.csv', index=False)

# Study 2 & 3 Prompts

In [4]:
# Evaluation criteria for follow-up questions, one string per criterion.
# The functions below collect criterion1..criterion14 into a list and insert
# each string into the classification / generation prompts.
# The trailing backslashes are line continuations inside the string literal,
# so every criterion is a single line of text at runtime.
# Criteria 3, 4 and 7-13 also spell out how to decide whether the standard is
# met; criteria 1, 2, 5, 6 and 14 only state the standard.

# Criterion 1: elicit tacit assumptions
criterion1 = "A good follow-up question should elicit tacit \
assumptions, i.e. justify or authorize assumptions stakeholders tacitly made without justification."

# Criterion 2: consider alternatives
criterion2 = "A good follow-up question should consider alternatives, \
i.e. look for alternative information or alternatives to existing requirements."

# Criterion 3: clarify unclear statements
criterion3 = "A good follow-up question should clarify when unclear, \
i.e. ask for clarification whenever words INTERVIEWEE said are unclear. \
To classify whether the INTERVIEWER's question meets this standard, first consider \
if the INTERVIEWEE's speech contains unclear statements. If it does not contain anything \
unclear, then the standard is met. Otherwise, look at whether the INTERVIEWER's \
question tries to clarify the unclear. "

# Criterion 4: clarify contradictions
criterion4 = "A good follow-up question should clarify when contradictory, \
i.e. ask for clarification whenever INTERVIEWEE mentioned two conflicting requirements or described \
potential features in conflicting terms. For example, if the INTERVIEWEE said he wanted to have an app \
recommending to him good restaurants based on his age and gender, but he also wanted the app to protect \
his private information, then this may be a case that needs clarification. \
To classify whether the INTERVIEWER's question meets this standard, first consider \
if the INTERVIEWEE mentioned anything contradictory. If it does not contain anything \
contradictory, then the standard is met. Otherwise, look at whether the INTERVIEWER's \
question tries to clarify the contradiction. "

# Criterion 5: elicit tacit knowledge
criterion5 = "A good follow-up question should elicit tacit knowledge, \
i.e. elicit tacit knowledge that are known to INTERVIEWEE but unknown to analysts."

# Criterion 6: stay related to the interview domain
criterion6 = "A good follow-up question should be related to the interview domain, \
and should not be too generic."

# Criterion 7: not too long
criterion7 = "A good follow-up question should not be too long or articulated, \
which would require the interviewee to ask for repeating or rephrase multiple times. \
To classify whether the INTERVIEWER's question meets this standard, first consider \
if the question is too long. If not, then the standard is met. Otherwise, the standard is not met."

# Criterion 8: common vocabulary, no jargon
criterion8 = "A good follow-up question should use common vocabulary, \
it should not contain special words or expressions that are not in the common vocabulary. \
To classify whether the INTERVIEWER's question meets this standard, first consider \
if the question contains any jargon. If not, then the standard is met. Otherwise, the standard is not met."

# Criterion 9: no technical knowledge required
criterion9 = "A good follow-up question should not require technical knowledge in order to answer. \
For example, in a clinic finding interview, the question should not ask about the \
diagnosis criteria or data analysis. \
To classify whether the INTERVIEWER's question meets this standard, first consider \
if the INTERVIEWER's question contained anything that needs technical knowledge in order to understand \
or answer. If it does not contain anything \
technical, then the standard is met. Otherwise, the standard is not met."

# Criterion 10: appropriate to the interviewee's profile
criterion10 = "A good follow-up question should be \
appropriate to interviewee's profile, i.e., ask questions that can be answered \
by the interviewee given the interviewee's profile. For example, when conducting crowd-based interviews, \
it's inappropriate to ask crowd users about industry-specific topics such as software development cycles. \
To classify whether the INTERVIEWER's question meets this standard, \
first consider if the question contains any word inappropriate to crowd users. \
If the question contains no such word, then the standard is met. Otherwise, the standard is not met."

# Criterion 11: do not ask the interviewee for a solution
criterion11 = "A good follow-up question should not ask the interviewee to present a \
solution to satisfy a requirement. For example, it's inappropriate to ask users about how to design a specific \
feature, or what would an ideal user interface look like. To classify whether the INTERVIEWER's question meets this standard, \
first consider if the question is asking the INTERVIEWEE to give a solution. If not, then the standard is met. \
Otherwise, the standard is not met."

# Criterion 12: one kind of requirement per question
criterion12 = "A good follow-up question should not \
involve multiple kinds of requirements, i.e., it should not mix different categories of requirements \
or multiple specific requirements within one category into one single question. \
To classify whether the INTERVIEWER's question meets this standard, first consider if \
the question is asking for more than one requirement. If it contains only one kind of requirement, \
the standard is met. If more than one kind of requirement is involved, then the standard is not met."

# Criterion 13: no multiple interpretations
criterion13 = "A good follow-up question should avoid asking questions \
that lead to multiple interpretations, which are questions that can be interpreted in more than one way. \
To classify whether the INTERVIEWER's question meets this standard, first consider if \
the question can be interpreted in more than one way. If not, the standard is met. \
Otherwise, the standard is not met. "

# Criterion 14: no vague questions
criterion14 = "A good follow-up question should avoid asking vague questions \
that could infer no reasonable meaning, which are questions that do not have enough context or \
clarity for interviewee to answer."

In [5]:
# study 2 classification prompt
def generate_prompt_classify(domain, interviewee_speech, interviewer_response, criterion):
    """
    Builds the Study 2 prompt asking the model for a Yes/No verdict on one criterion.

    The prompt quotes what the interviewee said and the interviewer's follow-up
    question, states the criterion as the "Standard", and instructs the model
    to answer only 'Yes' or 'No'.

    Args:
        domain (str): The domain/topic of the interview
        interviewee_speech (str): What the interviewee said
        interviewer_response (str): The interviewer's follow-up question
        criterion (str): The specific criterion to evaluate against

    Returns:
        str: A single-line classification prompt
    """
    # Each segment keeps its own leading/trailing space so the pieces join
    # into one continuous line of text.
    segments = [
        "You are an AI agent capable of conducting requirements elicitation interviews.",
        f" During a requirements elicitation interview with an interviewee about how the interviewee conducts {domain}, ",
        f"the INTERVIEWEE said '{interviewee_speech}'. Then the INTERVIEWER asked a follow up question ",
        f"by saying '{interviewer_response}'. Standard: {criterion} ",
        "Please classify based solely on whether the INTERVIEWER’s response meets this specific standard, and ",
        "refrain from using any other standards related to follow up questions when you classify. ",
        "If the INTERVIEWER’s response meets this standard, output 'Yes', otherwise output 'No'. ",
        "Restrict your response to output only 'Yes' or 'No' without explanations.",
    ]
    return "".join(segments)

In [6]:
# study 3 question generation prompt
def generate_prompt_question(domain, interviewee_speech, interviewer_response, criterion):
    """
    Builds the Study 3 prompt asking the model to write a follow-up question
    that satisfies one criterion.

    Args:
        domain (str): The domain/topic of the interview
        interviewee_speech (str): What the interviewee said
        interviewer_response (str): The interviewer's original follow-up question.
            Accepted so the signature matches generate_prompt_classify, but it
            is not included in the prompt.
        criterion (str): The specific criterion the new question should meet

    Returns:
        str: A single-line question-generation prompt
    """
    template = (
        "You are an AI agent capable of conducting requirements elicitation interviews."
        " During a requirements elicitation interview with an interviewee about how the interviewee conducts {domain}, "
        "the INTERVIEWEE said '{interviewee_speech}'. Generate a follow-up question that meets the following "
        "criterion based ONLY on what the INTERVIEWEE said, and restrict your response to only show the "
        "follow-up question without explanation. Criterion: {criterion}"
    )
    return template.format(
        domain=domain,
        interviewee_speech=interviewee_speech,
        criterion=criterion,
    )

In [7]:
def _is_no_verdict(response):
    """Returns True when the classifier's answer begins with the word 'No' (any case)."""
    import re  # local import keeps this cell self-contained

    # Skip leading quotes/whitespace/markup, then require the whole word "no".
    # This rejects answers such as "Yes, ... No ..." or "Nothing unclear".
    return re.match(r"\W*no\b", response or "", flags=re.IGNORECASE) is not None


def get_responses_2_step(dataset_file):
    """
    Classifies each interviewer question against every criterion and, where the
    verdict is 'No', replaces that criterion's cell with a newly generated question.

    Two-step process per (row, criterion) pair:
      1. ask the model whether the existing question meets the criterion;
      2. only if the answer is 'No', ask the model for a question that does.
    The Yes/No verdict itself is not stored.

    Args:
        dataset_file (str): Path to the CSV file containing the dataset. It must
            provide the columns 'Interviewee_Speech', 'Interviewer_Response' and
            'Domain'; every column after the third is treated as the output
            column of one criterion, in criterion order.

    Returns:
        pandas.DataFrame: The dataset, with regenerated questions written into
        the criterion columns whose verdict was 'No'; all other cells unchanged
    """
    dataset_df = pd.read_csv(dataset_file, dtype='object')

    # Loop invariants: the criteria and their output columns are the same for every row
    criteria = [criterion1, criterion2, criterion3, criterion4, criterion5, criterion6, criterion7,
                criterion8, criterion9, criterion10, criterion11, criterion12, criterion13, criterion14]
    column_names = list(dataset_df.columns[3:])

    for index, row in dataset_df.iterrows():
        # Extracting the necessary fields from the dataset
        interviewee_speech = row['Interviewee_Speech']
        interviewer_response = row['Interviewer_Response']
        domain = row['Domain']

        # Iterate through each criterion to classify the interviewer's response
        for i, criterion in enumerate(criteria):
            # classification prompt
            prompt_c = generate_prompt_classify(domain, interviewee_speech, interviewer_response, criterion)
            response = query_gpt(prompt_c)

            # If the response does not meet the criterion, generate a new question
            if _is_no_verdict(response):
                # generation prompt
                prompt_q = generate_prompt_question(domain, interviewee_speech, interviewer_response, criterion)
                response_q = query_gpt(prompt_q)
                dataset_df.at[index, column_names[i]] = response_q

    return dataset_df

In [8]:
# Specify the path to your CSV file containing the interview data.
# To reproduce the results in the paper, keep the default location
# <current working directory>/datasets/study2.csv

notebook_dir = os.getcwd()
# os.path.join builds the path with the platform's separator instead of a hard-coded '/'
dataset_df = get_responses_2_step(os.path.join(notebook_dir, "datasets", "study2.csv"))
output_file = "study2_out.csv"
dataset_df.to_csv(output_file)

# Side Study: LLM to avoid all mistakes + self-evaluation

In [None]:
# side study prompt
def generate_prompt_all_cri(domain, interviewee_speech, criteria_lst):
    """
    Builds a prompt asking the model for one follow-up question that meets
    every criterion in ``criteria_lst``.

    The criteria are appended as "Criterion 1: ... Criterion 2: ..." separated
    by single spaces. Any number of criteria is accepted; for a list of 14 the
    text is identical to the previous fixed-index version.

    Args:
        domain (str): The domain/topic of the interview
        interviewee_speech (str): What the interviewee said
        criteria_lst (list): Criteria the question should meet, in numbering order

    Returns:
        str: A single-line prompt for generating a question meeting all criteria
    """
    # Number the criteria from 1 rather than indexing fixed positions 0..13
    numbered_criteria = " ".join(
        f"Criterion {number}: {criterion}"
        for number, criterion in enumerate(criteria_lst, start=1)
    )
    return (
        "You are an AI agent capable of conducting requirements elicitation interviews."
        f" During a requirements elicitation interview with an interviewee about how the interviewee conducts {domain}, "
        f"the INTERVIEWEE said '{interviewee_speech}'. Generate a follow-up question that meets ALL of the following "
        "criteria based ONLY on what the INTERVIEWEE said, and restrict your response to only show the "
        "follow-up question without explanation. "
        f"{numbered_criteria}"
    )

In [None]:
def get_responses_all_cri(dataset_file):
    """
    Generates, for every dataset row, one follow-up question intended to meet
    all fourteen criteria at once.

    Args:
        dataset_file (str): Path to the CSV file containing the dataset. The
            columns 'Interviewee_Speech' and 'Domain' are read.

    Returns:
        pandas.DataFrame: The dataset with 'Interviewer_Response' overwritten
        by the generated question in every row
    """
    dataset_df = pd.read_csv(dataset_file, dtype='object')

    # The full criteria list is the same for every row
    all_criteria = [
        criterion1, criterion2, criterion3, criterion4, criterion5, criterion6, criterion7,
        criterion8, criterion9, criterion10, criterion11, criterion12, criterion13, criterion14,
    ]

    for row_label, record in dataset_df.iterrows():
        speech = record['Interviewee_Speech']
        topic = record['Domain']
        generated_question = query_gpt(generate_prompt_all_cri(topic, speech, all_criteria))
        dataset_df.at[row_label, 'Interviewer_Response'] = generated_question

    return dataset_df

In [None]:
# Specify the path to your CSV file containing the interview data.
# To reproduce the results in the paper, keep the default location
# <current working directory>/datasets/study2.csv — the same file the Study 2 cell reads.

dataset_all_cri = get_responses_all_cri(os.path.join(os.getcwd(), "datasets", "study2.csv"))

In [None]:
# Save the side-study questions; to_csv is called with its default arguments.
# NOTE(review): get_responses_reclassify reads this file back and treats columns[4:] as the
# criterion columns (one more leading column than get_responses_2_step's columns[3:]),
# presumably because this call also writes the index as a first column — confirm.
output_file = "side_study.csv"
dataset_all_cri.to_csv(output_file)

In [None]:
# reclassify prompt
def generate_prompt_classify(domain, interviewee_speech, interviewer_response, criterion):
    """
    Builds the Yes/No classification prompt used to re-evaluate the side-study questions.

    This redefines the function of the same name from the Study 2 section and
    produces the same prompt text.

    Args:
        domain (str): The domain/topic of the interview
        interviewee_speech (str): What the interviewee said
        interviewer_response (str): The follow-up question to evaluate
        criterion (str): The specific criterion to evaluate against

    Returns:
        str: A single-line classification prompt
    """
    # Each segment keeps its own leading/trailing space so the pieces join
    # into one continuous line of text.
    segments = [
        "You are an AI agent capable of conducting requirements elicitation interviews.",
        f" During a requirements elicitation interview with an interviewee about how the interviewee conducts {domain}, ",
        f"the INTERVIEWEE said '{interviewee_speech}'. Then the INTERVIEWER asked a follow up question ",
        f"by saying '{interviewer_response}'. Standard: {criterion} ",
        "Please classify based solely on whether the INTERVIEWER’s response meets this specific standard, and ",
        "refrain from using any other standards related to follow up questions when you classify. ",
        "If the INTERVIEWER’s response meets this standard, output 'Yes', otherwise output 'No'. ",
        "Restrict your response to output only 'Yes' or 'No' without explanations.",
    ]
    return "".join(segments)

In [None]:
def get_responses_reclassify(dataset_file):
    """
    Classifies every row's interviewer question against all fourteen criteria
    and stores the model's raw answers.

    Args:
        dataset_file (str): Path to the CSV file containing the dataset. The
            columns 'Interviewee_Speech', 'Interviewer_Response' and 'Domain'
            are read; every column after the fourth is treated as the output
            column of one criterion, in criterion order.

    Returns:
        pandas.DataFrame: The dataset with each criterion column overwritten by
        the model's answer for that criterion
    """
    dataset_df = pd.read_csv(dataset_file, dtype='object')

    # Criteria and their target columns do not change from row to row
    all_criteria = [
        criterion1, criterion2, criterion3, criterion4, criterion5, criterion6, criterion7,
        criterion8, criterion9, criterion10, criterion11, criterion12, criterion13, criterion14,
    ]
    verdict_columns = list(dataset_df.columns[4:])

    for row_label, record in dataset_df.iterrows():
        speech = record['Interviewee_Speech']
        question = record['Interviewer_Response']
        topic = record['Domain']

        for position, criterion in enumerate(all_criteria):
            # classification prompt -> raw model answer, stored as returned
            verdict = query_gpt(generate_prompt_classify(topic, speech, question, criterion))
            dataset_df.at[row_label, verdict_columns[position]] = verdict

    return dataset_df

In [None]:
# Re-classify the side-study questions and write the verdicts back to the same file.
# index=False keeps the column layout stable. The file already carries an index column
# from the earlier save; writing another one on every run would shift the columns[4:]
# offset that get_responses_reclassify relies on when this cell is re-run.
dataset_df = get_responses_reclassify("side_study.csv")
output_file = "side_study.csv"
dataset_df.to_csv(output_file, index=False)