This notebook was run in Google Colab. To run every cell you need to upload the data set and prompts into the same folder. The code for evaluation was partially taken from others papers and merged together. Boilerplate code was typically generated using Claude 3 Opus and GPT-4 and then adjusted for our specific use case.

# Install dependencies

In [None]:
!pip install sacrebleu
!pip install rouge_score

# METEOR:
#%%capture
#!pip install nltk

Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.1
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=d11ac83beef9ac343585d0189c91bfb15df365838d7acdfad0297c297a9eb553
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be

# Import libraries

In [None]:
# Import libraries
import pathlib
import textwrap
import json
import pickle
import sacrebleu
import numpy as np
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
import nltk
from nltk.translate import meteor
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
nltk.download("wordnet")
nltk.download("punkt")
from rouge_score import rouge_scorer


# Import library to call Google API key
from google.colab import userdata

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Set up API key

In [None]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Define functions

In [None]:
# Function that retrieves the whole dialog chat from a single conversation
def sort_and_track_dialog(dialog):
    seeker_list = []
    supporter_list = []
    first_speaker = None  # Track who starts the conversation

    current_speaker = None
    current_content = ""

    for i, entry in enumerate(dialog):
        speaker = entry['speaker']
        content = entry['content']

        # Determine the first speaker
        if i == 0:
            first_speaker = speaker

        if speaker == current_speaker:
            # Concatenate content if the same speaker continues
            current_content += " " + content
        else:
            # Save the previous speaker's content if any
            if current_content:
                if current_speaker == 'seeker':
                    seeker_list.append(current_content)
                elif current_speaker == 'supporter':
                    supporter_list.append(current_content)

            # Update the current speaker and content with the new entry
            current_speaker = speaker
            current_content = content

    # Add the last speaker's content to the respective list
    if current_content:
        if current_speaker == 'seeker':
            seeker_list.append(current_content)
        elif current_speaker == 'supporter':
            supporter_list.append(current_content)

    return seeker_list, supporter_list, first_speaker

In [None]:
def create_chat_history(prompt, model_start, seeker_list, supporter_list, first_speaker):

    # Initialize chat_history with the prompt and initial model message
    chat_history = [
        {"parts": [{"text": prompt}], "role": "user"},
        {"parts": [{"text": model_start}], "role": "model"}
    ]

    # Adjust initial model message if the first speaker is 'supporter'
    if first_speaker == "supporter" and supporter_list:
        # Add the first message of the supporter_list to the model_start text
        chat_history[1]["parts"][0]["text"] += "\n" + supporter_list[0]
        supporter_start_idx = 1  # Start from the second message for supporter
    else:
        supporter_start_idx = 0  # Start from the first message for supporter

    # Determine the starting point for seeker messages
    seeker_start_idx = 0

    # Calculate the total iterations needed
    max_iterations = max(len(seeker_list) - seeker_start_idx, len(supporter_list) - supporter_start_idx)

    # Alternating between seeker and supporter messages
    for i in range(max_iterations):
        if i + seeker_start_idx < len(seeker_list):
            chat_history.append({
                "parts": [{"text": seeker_list[i + seeker_start_idx]}],
                "role": "user"
            })
        if i + supporter_start_idx < len(supporter_list):
            chat_history.append({
                "parts": [{"text": supporter_list[i + supporter_start_idx]}],
                "role": "model"
            })

    return chat_history

In [None]:
def generate_model_responses(dialog, prompt, model_start):
  # Get seeker_list, supporter_list and who starts the conversation
  seeker_list, supporter_list, first_speaker = sort_and_track_dialog(dialog)

  # Create chat history
  chat_history = create_chat_history(prompt, model_start, seeker_list, supporter_list, first_speaker)

  # Initialise model responses
  model_responses = []
  golden_responses = []

  # Determine max index
  max_index = len(chat_history) - 4

  # Loop to generate model responses
  for i in range(0, max_index, 2):
    try:
      # Current history
      current_history = chat_history[:i+2]

      # Initiate model
      model = genai.GenerativeModel(model_name="gemini-pro", generation_config=generation_config, safety_settings=safety_settings)

      # Clear chat history
      chat = model.start_chat(history=current_history)

      # Send message
      current_message = chat_history[i+2]['parts'][0]['text']

      # Send message and get model response
      model_response = chat.send_message(current_message).text

      # Save model responses in list if no exception occurs
      model_responses.append(model_response)

      # Save golden responses in list
      golden_response = chat_history[i+3]['parts'][0]['text']
      golden_responses.append(golden_response)

      # Print chat history
      #print(f'Golden response: \n {golden_response}')
      #print(f'Model response: \n {model_response}')

    except Exception as e:
      # Log the error, e.g., print or save to a log file
      print(f"An error occurred: {e}")
      # Optionally, break out of the loop or continue depending on the desired behavior
      break  # or use `continue` to skip adding responses for the current iteration and move to the next

  return model_responses, golden_responses

# Gemini model configuration

In [None]:
# Model configuration
generation_config = {
    "temperature":0.5, # temperature parameter
    "top_p":1,
    "top_k":1,
    "max_output_tokens":1000,
}

# Safety Settings
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE"
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE"
    },
]

# Hyperparameters

In [None]:
# File path
# You need to upload some of the following files to run it.
file_path = 'ESConv.json'

# Load the JSON data
with open(file_path, 'r') as file:
    data = json.load(file)

# Get prompt
file_name = '40_example_conversations_prompt'
file_path = f'{file_name}.txt'
with open(file_path, 'r') as file:
    prompt = file.read()

# First model response for chat history
model_start = "I understand that I should provide psychological help and that the previous message provides a suitable guideline. In the following conversation, I will only reply in 1-2 sentences:"

# Save path for model and golden responses
save_golden_responses = f'{file_name}_all_golden_responses.pkl'
save_model_responses = f'{file_name}_all_model_responses.pkl'

FileNotFoundError: [Errno 2] No such file or directory: '40_example_conversations_prompt.txt'

# Generate model and golden responses for a single conversation

In [None]:
# Get dialog of conversation
dialog = data[0]['dialog']

# Get seeker_list, supporter_list and who starts the conversation
seeker_list, supporter_list, first_speaker = sort_and_track_dialog(dialog)

# Create chat history
chat_history = create_chat_history(prompt, model_start, seeker_list, supporter_list, first_speaker)

# Generate model and golden responses
model_responses, golden_responses = generate_model_responses(dialog, prompt, model_start)

In [None]:
model_responses

['Hello there. I am here to provide emotional support. How can I assist you today?',
 "It's understandable to feel anxious about quitting a well-paying job. Let's explore your concerns and identify strategies to manage your anxiety.",
 "It's understandable that dealing with people in hard financial situations can be upsetting. Have you considered any other career options that might be less stressful but still financially rewarding?",
 "It must be heartbreaking to know that you can't help everyone as much as you'd like to.",
 "It's important to prioritize your own well-being. If your job is causing you significant stress and anxiety, it may be time to consider other options.",
 "That is something to consider.  It's important to weigh the pros and cons of staying in a stressful job versus finding a less stressful job with lower pay.",
 'Can you think of any positive aspects of your job that outweigh the negative aspects?',
 "It's okay to wonder if it's for you. Most people do at one poin

# Load the test data set to generate model responses

In [None]:
# Replace 'file_path' with the path to your text file
file_path = 'testdata.txt'

# Initialize a list to store the JSON objects
testdata = []

# Open the file and read line by line
with open(file_path, 'r') as file:
    for line in file:
        # Parse each line as a JSON object and append to the list
        json_line = json.loads(line.strip())
        testdata.append(json_line)

# Renaming the keys as per the requirements
for conversation in testdata:
    for message in conversation['dialog']:
        # Renaming 'text' to 'content'
        message['content'] = message.pop('text')
        # Renaming 'speaker' values
        if message['speaker'] == 'sys':
            message['speaker'] = 'supporter'
        elif message['speaker'] == 'usr':
            message['speaker'] = 'seeker'

In [None]:
len(testdata)

195

# Generate model responses and golden responses for all

In [None]:
# Initialise list of lists for model responses and golden responses
all_model_responses = []
all_golden_responses = []

# Iterate over all test conversations
for i in range(0, len(testdata)):

  # Print statement
  print(f'Starting to generate model responses for test conversation {i} of {len(testdata)}')

  # Get dialog data
  dialog = testdata[i]['dialog']

  # Get seeker_list, supporter_list and who starts the conversation
  seeker_list, supporter_list, first_speaker = sort_and_track_dialog(dialog)

  # Create chat history
  chat_history = create_chat_history(prompt, model_start, seeker_list, supporter_list, first_speaker)

  # Generate model responses
  model_responses, golden_responses = generate_model_responses(dialog, prompt, model_start)

  # Add model and golden responses to list of lists
  all_model_responses.append(model_responses)
  all_golden_responses.append(golden_responses)

Starting to generate model responses for test conversation 0 of 195
Starting to generate model responses for test conversation 1 of 195
Starting to generate model responses for test conversation 2 of 195
Starting to generate model responses for test conversation 3 of 195
Starting to generate model responses for test conversation 4 of 195
Starting to generate model responses for test conversation 5 of 195
Starting to generate model responses for test conversation 6 of 195
Starting to generate model responses for test conversation 7 of 195
Starting to generate model responses for test conversation 8 of 195
Starting to generate model responses for test conversation 9 of 195
Starting to generate model responses for test conversation 10 of 195
Starting to generate model responses for test conversation 11 of 195
Starting to generate model responses for test conversation 12 of 195
Starting to generate model responses for test conversation 13 of 195
Starting to generate model responses for tes

In [None]:
# Save all_model_responses and all_golden_responses to disk
with open(save_model_responses, 'wb') as f:
    pickle.dump(all_model_responses, f)

with open(save_golden_responses, 'wb') as f:
    pickle.dump(all_golden_responses, f)

print('All model and golden responses were saved.')

All model and golden responses were saved.


# Load model responses

In [None]:
# Load all_model_responses from disk
with open(save_model_responses, 'rb') as f:
    all_model_responses = pickle.load(f)

# Load all_golden_responses from disk
with open(save_golden_responses, 'rb') as f:
    all_golden_responses = pickle.load(f)

print('All model and golden responses were loaded.')

All model and golden responses were loaded.


# Flatten responses for evaluation

In [None]:
model_response_flattened = [item for sublist in all_model_responses for item in sublist]
conversation_golden_responses_flattened = [item for sublist in all_golden_responses for item in sublist]

# Evaluation with BLEU

In [None]:
def calculate_bleu_scores(model_responses, golden_responses, is_corpus=False):

    # Ensure that the lengths of model responses and golden responses are the same
    assert len(model_responses) == len(golden_responses), "The lengths of model responses and golden responses should match."

    # Initialize lists to store individual sentence scores for each BLEU n-gram
    bleu_1_scores = []
    bleu_2_scores = []
    bleu_3_scores = []
    bleu_4_scores = []

    # Calculate sentence BLEU scores for each response
    for model_response, golden_response in zip(model_responses, golden_responses):
        reference = [golden_response.split()]  # Tokenize the golden response
        candidate = model_response.split()     # Tokenize the model response

        # Calculate and store BLEU scores for each sentence
        bleu_1_scores.append(sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=SmoothingFunction().method1))
        bleu_2_scores.append(sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=SmoothingFunction().method1))
        bleu_3_scores.append(sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0), smoothing_function=SmoothingFunction().method1))
        bleu_4_scores.append(sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1))

    # Calculate average BLEU scores across all responses
    avg_bleu_1 = sum(bleu_1_scores) / len(bleu_1_scores)
    avg_bleu_2 = sum(bleu_2_scores) / len(bleu_2_scores)
    avg_bleu_3 = sum(bleu_3_scores) / len(bleu_3_scores)
    avg_bleu_4 = sum(bleu_4_scores) / len(bleu_4_scores)

    return avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4

In [None]:
avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4 = calculate_bleu_scores(model_response_flattened, conversation_golden_responses_flattened)

print("Average BLEU-1 score:", avg_bleu_1 * 100)
print("Average BLEU-2 score:", avg_bleu_2 * 100)
print("Average BLEU-3 score:", avg_bleu_3 * 100)
print("Average BLEU-4 score:", avg_bleu_4 * 100)

Average BLEU-1 score: 9.948840934544801
Average BLEU-2 score: 2.9352130299880823
Average BLEU-3 score: 1.50297647761927
Average BLEU-4 score: 0.9452084170030023


# Meteor

In [None]:
def calculate_meteor(candidate, reference):
  '''
  candidate, reference: tokenized list of words in the sentence
  '''
  mt_list = []
  mt_sum = 0
  for c, r in zip(candidate, reference):
    r_tokenized = word_tokenize(r)
    c_tokenized = word_tokenize(c)
    meteor_score = round(meteor([c_tokenized], r_tokenized), 4)
    #print("Model output: ", c)
    #print("Gold reference: ", r)
    #print("Meteor score: ", meteor_score)
    mt_list.append(meteor_score)
    mt_sum += meteor_score
    #print("\n")
  avg_mt = mt_sum / len(mt_list)
  print(f"Total average meteor score: {str(avg_mt)}")
  return meteor_score

In [None]:
calculate_meteor(model_response_flattened, conversation_golden_responses_flattened)

Total average meteor score: 0.10695113373438238


0.1535

# Rouge

In [None]:
# Initialize the scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize sums for each ROUGE score
sum_rougeL_precision, sum_rougeL_recall, sum_rougeL_fmeasure = 0, 0, 0

# Calculate scores for each sentence pair
for m, c in zip(model_response_flattened, conversation_golden_responses_flattened):
    score = scorer.score(m, c)

    # Accumulate the scores
    sum_rougeL_precision += score["rougeL"].precision
    sum_rougeL_recall += score["rougeL"].recall
    sum_rougeL_fmeasure += score["rougeL"].fmeasure

# Calculate the averages
avg_rougeL_precision = sum_rougeL_precision / len(model_response_flattened)
avg_rougeL_recall = sum_rougeL_recall / len(model_response_flattened)
avg_rougeL_fmeasure = sum_rougeL_fmeasure / len(model_response_flattened)

In [None]:
# Print the average scores
print(f'Average ROUGE-L Precision: {avg_rougeL_precision}')
print(f'Average ROUGE-L Recall: {avg_rougeL_recall}')
print(f'Average ROUGE-L F-measure: {avg_rougeL_fmeasure}')

Average ROUGE-L Precision: 0.24231084010987666
Average ROUGE-L Recall: 0.09718937290911356
Average ROUGE-L F-measure: 0.12655826933857547


# Distinct

In [None]:
# Clone the Distinct-N repository
!git clone https://github.com/neural-dialogue-metrics/Distinct-N.git
%cd Distinct-N

from distinct_n.utils import ngrams

def distinct_n_sentence_level(sentence, n):
    """
    Compute distinct-N for a single sentence.
    :param sentence: a list of words.
    :param n: int, ngram.
    :return: float, the metric value.
    """
    if len(sentence) == 0:
        return 0.0  # Prevent a zero division
    distinct_ngrams = set(ngrams(sentence, n))
    return len(distinct_ngrams) / len(sentence)

def distinct_n_corpus_level(sentences, n):
    """
    Compute average distinct-N of a list of sentences (the corpus).
    :param sentences: a list of sentence.
    :param n: int, ngram.
    :return: float, the average value.
    """
    return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences)

Cloning into 'Distinct-N'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 79 (delta 6), reused 12 (delta 6), pack-reused 64[K
Receiving objects: 100% (79/79), 186.03 KiB | 2.21 MiB/s, done.
Resolving deltas: 100% (28/28), done.
/content/Distinct-N


In [None]:
distinct_1_list = []

for response in model_response_flattened:
    d_1 = distinct_n_sentence_level(response, 1)  # or adjust max_length as needed
    distinct_1_list.append(d_1)

# Now, 'perplexities' contains the perplexity for each response.
# You can print them out or analyze them further as needed.
#print(distinct_1_list)

d_1 = sum(distinct_1_list) / len(distinct_1_list)
print(f'Distinct 1: {d_1}')

Distinct 1: 0.12314263432794803


In [None]:
distinct_2_list = []

for response in model_response_flattened:
    d_2 = distinct_n_sentence_level(response, 2)  # or adjust max_length as needed
    distinct_2_list.append(d_2)

# Now, 'perplexities' contains the perplexity for each response.
# You can print them out or analyze them further as needed.
#print(distinct_2_list)

d_2 = sum(distinct_2_list) / len(distinct_2_list)
print(f'Distinct 2: {d_2}')

Distinct 2: 0.537523993167126


In [None]:
traindata[0]['dialog']

[{'speaker': 'seeker', 'content': 'Hello good afternoon.'},
 {'speaker': 'supporter',
  'strategy': 'Question',
  'content': 'Hi, good afternoon.'},
 {'speaker': 'seeker',
  'content': "I'm feeling anxious that I am going to lose my job."},
 {'speaker': 'supporter',
  'strategy': 'Reflection of feelings',
  'content': 'Losing a job is always anxious.'},
 {'speaker': 'seeker', 'content': "I hope I don't."},
 {'speaker': 'supporter',
  'strategy': 'Question',
  'content': 'Why do you think you will lose your job?'},
 {'speaker': 'seeker',
  'content': 'I am on short term disability and I am not ready to go back to work yet but I do not have any job protection.'},
 {'speaker': 'supporter',
  'strategy': 'Restatement or Paraphrasing',
  'content': 'Oh so your job is not protected and your short term disability will end soon? Is that correct?'},
 {'speaker': 'seeker',
  'content': "It's not ending yet, but no my job is not protected. I live in the United States, but I have not been at my jo

In [None]:
# Adjusted function to merge consecutive messages from the same speaker
def format_dialog(dialog):
    formatted_dialog = []
    current_speaker = ""
    current_content = []
    for message in dialog:
        speaker_role = "User" if message["speaker"] == "seeker" else "Model"
        if speaker_role == current_speaker:
            # If the current speaker is the same as the last, append the content
            current_content.append(message['content'].strip())
        else:
            # If the speaker changes, join the current content and start a new block
            if current_content:
                formatted_dialog.append(f"{current_speaker}: {' '.join(current_content)}")
            current_speaker = speaker_role
            current_content = [message['content'].strip()]
    # Don't forget to add the last block of messages
    if current_content:
        formatted_dialog.append(f"{current_speaker}: {' '.join(current_content)}")
    return "\n".join(formatted_dialog)

# Extract and format the specified dialogs
extracted_dialogs = []
for index in range(len(traindata)):
    formatted_dialog = format_dialog(traindata[index]['dialog'])
    extracted_dialogs.append(f"Example conversation #{index + 1}\n{formatted_dialog}")

# Save the extracted dialogs to a .txt file
file_path = "extracted_dialogs.txt"  # Specify your desired file path
with open(file_path, "w") as file:
    file.write("\n\n".join(extracted_dialogs))

file_path  # Return the path of the saved file for reference

'extracted_dialogs.txt'