This notebook was run in Google Colab. To run every cell you need to upload the data set and prompts into the same folder. The code for evaluation was partially taken from others papers and merged together. Boilerplate code was typically generated using Claude 3 Opus and GPT-4 and then adjusted for our specific use case.

In [None]:
!pip install anthropic
!pip install sacrebleu
!pip install rouge_score

Collecting anthropic
  Downloading anthropic-0.21.3-py3-none-any.whl (851 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m851.6/851.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from anthropic)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->anthropic)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->anthropic)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, anthropic
Successfully installed anthropic-

# Import libraries

In [None]:
import anthropic
import pathlib
import textwrap
import json
import time
import pickle
import sacrebleu
import numpy as np
from IPython.display import display
from IPython.display import Markdown
import nltk
from nltk.translate import meteor
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
nltk.download("wordnet")
nltk.download("punkt")
from rouge_score import rouge_scorer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Claude API key

In [None]:
client = anthropic.Anthropic(
    api_key="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",
)

# Functions

In [None]:
# Function that retrieves the whole dialog chat from a single conversation
def sort_and_track_dialog(dialog):
    seeker_list = []
    supporter_list = []
    first_speaker = None  # Track who starts the conversation

    current_speaker = None
    current_content = ""

    for i, entry in enumerate(dialog):
        speaker = entry['speaker']
        content = entry['content']

        # Determine the first speaker
        if i == 0:
            first_speaker = speaker

        if speaker == current_speaker:
            # Concatenate content if the same speaker continues
            current_content += " " + content
        else:
            # Save the previous speaker's content if any
            if current_content:
                if current_speaker == 'seeker':
                    seeker_list.append(current_content)
                elif current_speaker == 'supporter':
                    supporter_list.append(current_content)

            # Update the current speaker and content with the new entry
            current_speaker = speaker
            current_content = content

    # Add the last speaker's content to the respective list
    if current_content:
        if current_speaker == 'seeker':
            seeker_list.append(current_content)
        elif current_speaker == 'supporter':
            supporter_list.append(current_content)

    return seeker_list, supporter_list, first_speaker

In [None]:
def create_chat_history_claude(prompt, model_start, seeker_list, supporter_list, first_speaker):
    # Initialize chat_history with the prompt and initial model message
    chat_history = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": model_start}
    ]

    # Adjust initial model message if the first speaker is 'supporter'
    supporter_start_idx = 0
    if first_speaker == "supporter" and supporter_list:
        # Add the first message of the supporter_list to the model_start content
        chat_history[1]["content"] += "\n" + supporter_list[0]
        supporter_start_idx = 1  # Start from the second message for supporter

    # Determine the starting point for seeker messages
    seeker_start_idx = 0

    # Calculate the total iterations needed
    max_iterations = max(len(seeker_list) - seeker_start_idx, len(supporter_list) - supporter_start_idx)

    # Alternating between seeker and supporter messages
    for i in range(max_iterations):
        if i + seeker_start_idx < len(seeker_list):
            chat_history.append({
                "role": "user",
                "content": seeker_list[i + seeker_start_idx]
            })
        if i + supporter_start_idx < len(supporter_list):
            chat_history.append({
                "role": "assistant",
                "content": supporter_list[i + supporter_start_idx]
            })

    return chat_history

In [None]:
def generate_model_responses_claude(dialog, prompt, model_start):
    # Get seeker_list, supporter_list, and who starts the conversation
    seeker_list, supporter_list, first_speaker = sort_and_track_dialog(dialog)

    # Create chat history for Claude3
    chat_history = create_chat_history_claude(prompt, model_start, seeker_list, supporter_list, first_speaker)

    # Initialise model responses and golden responses
    model_responses = []
    golden_responses = []

    # Determine max index for generating responses
    max_index = len(chat_history) - 4

    for i in range(0, max_index, 2):
        try:
            # Current history
            current_history = chat_history[:i+3]  # Include the next user message for context

            # Send message to Claude3 and get model response
            message = client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=1024,
                messages=current_history
            )
            print('Sleeping after calling API')
            time.sleep(2)

            # Extract model response
            model_response = message.content[0].text
            model_responses.append(model_response)

            #print(current_history)
            #print(model_response)

            # Save golden responses
            golden_response = chat_history[i+3]["content"]
            golden_responses.append(golden_response)

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # or use `continue` depending on the desired behavior

    return model_responses, golden_responses

# Hyperparameters

In [None]:
# File path
file_path = 'ESConv.json'

# Load the JSON data
with open(file_path, 'r') as file:
    data = json.load(file)

# Get prompt
file_name = '40_example_conversations_prompt'
file_path = f'{file_name}.txt'
with open(file_path, 'r') as file:
    prompt = file.read()

# First model response for chat history
model_start = "I understand that I should provide psychological help and that the previous message provides a suitable guideline. In the following conversation, I will only reply in 1-2 sentences:"

# Save path for model and golden responses
save_golden_responses = f'{file_name}_all_golden_responses.pkl'
save_model_responses = f'{file_name}_all_model_responses.pkl'

# Load test data

In [None]:
# Replace 'file_path' with the path to your text file
file_path = 'testdata.txt'

# Initialize a list to store the JSON objects
testdata = []

# Open the file and read line by line
with open(file_path, 'r') as file:
    for line in file:
        # Parse each line as a JSON object and append to the list
        json_line = json.loads(line.strip())
        testdata.append(json_line)

# Renaming the keys as per the requirements
for conversation in testdata:
    for message in conversation['dialog']:
        # Renaming 'text' to 'content'
        message['content'] = message.pop('text')
        # Renaming 'speaker' values
        if message['speaker'] == 'sys':
            message['speaker'] = 'supporter'
        elif message['speaker'] == 'usr':
            message['speaker'] = 'seeker'

In [None]:
len(testdata)

195

# Generate model responses and golden responses for all

In [None]:
# Initialise list of lists for model responses and golden responses
all_model_responses = []
all_golden_responses = []

# Iterate over all test conversations
for i in range(0, len(testdata)):

  # Print statement
  print(f'Starting to generate model responses for test conversation {i} of {len(testdata)}')

  # Get dialog data
  dialog = testdata[i]['dialog']

  # Get seeker_list, supporter_list and who starts the conversation
  seeker_list, supporter_list, first_speaker = sort_and_track_dialog(dialog)

  # Create chat history
  chat_history = create_chat_history_claude(prompt, model_start, seeker_list, supporter_list, first_speaker)

  # Generate model responses
  model_responses, golden_responses = generate_model_responses_claude(dialog, prompt, model_start)

  # Add model and golden responses to list of lists
  all_model_responses.append(model_responses)
  all_golden_responses.append(golden_responses)

Starting to generate model responses for test conversation 0 of 195
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Starting to generate model responses for test conversation 1 of 195
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Starting to generate model responses for test conversation 2 of 195
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Sleeping after calling API
Starting to generate model responses for test conversation 3 of 195
Sleeping after calling API
Sleeping after calling API

# Save model

In [None]:
# Save all_model_responses and all_golden_responses to disk
with open(save_model_responses, 'wb') as f:
    pickle.dump(all_model_responses, f)

with open(save_golden_responses, 'wb') as f:
    pickle.dump(all_golden_responses, f)

print('All model and golden responses were saved.')

All model and golden responses were saved.


# Load model responses

In [None]:
# Load all_model_responses from disk
with open(save_model_responses, 'rb') as f:
    all_model_responses = pickle.load(f)

# Load all_golden_responses from disk
with open(save_golden_responses, 'rb') as f:
    all_golden_responses = pickle.load(f)

print('All model and golden responses were loaded.')

All model and golden responses were loaded.


In [None]:
all_golden_responses = all_golden_responses[:23]
all_model_responses = all_model_responses[:23]

In [None]:
all_model_responses

[["I'm sorry to hear you're feeling sad. Can you tell me more about what's causing those feelings? I'm here to listen without judgment and provide the support you need.",
  'The holidays can be a difficult time for many people. What is causing you the most stress and nervousness regarding the holidays?',
  "I'm so sorry to hear about the tension with your dad. That must be incredibly difficult, especially with having to go home for the holidays. Your feelings are valid. Have you considered reaching out to your school's counseling services for support during this time?",
  "That sounds like an incredibly difficult situation. Being stuck in close quarters with someone you're not speaking to must feel very stressful and anxiety-provoking. I can understand why you're feeling so nervous about going home for the holidays. Have you considered reaching out to any campus resources or counselors to see if they can provide support or guidance during this time? You deserve to feel safe and comfort

# Flatten responses for evaluation

In [None]:
model_response_flattened = [item for sublist in all_model_responses for item in sublist]
conversation_golden_responses_flattened = [item for sublist in all_golden_responses for item in sublist]

# Evaluation with BLEU

In [None]:
def calculate_bleu_scores(model_responses, golden_responses, is_corpus=False):

    # Ensure that the lengths of model responses and golden responses are the same
    assert len(model_responses) == len(golden_responses), "The lengths of model responses and golden responses should match."

    # Initialize lists to store individual sentence scores for each BLEU n-gram
    bleu_1_scores = []
    bleu_2_scores = []
    bleu_3_scores = []
    bleu_4_scores = []

    # Calculate sentence BLEU scores for each response
    for model_response, golden_response in zip(model_responses, golden_responses):
        reference = [golden_response.split()]  # Tokenize the golden response
        candidate = model_response.split()     # Tokenize the model response

        # Calculate and store BLEU scores for each sentence
        bleu_1_scores.append(sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=SmoothingFunction().method1))
        bleu_2_scores.append(sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=SmoothingFunction().method1))
        bleu_3_scores.append(sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0), smoothing_function=SmoothingFunction().method1))
        bleu_4_scores.append(sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1))

    # Calculate average BLEU scores across all responses
    avg_bleu_1 = sum(bleu_1_scores) / len(bleu_1_scores)
    avg_bleu_2 = sum(bleu_2_scores) / len(bleu_2_scores)
    avg_bleu_3 = sum(bleu_3_scores) / len(bleu_3_scores)
    avg_bleu_4 = sum(bleu_4_scores) / len(bleu_4_scores)

    return avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4

In [None]:
avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4 = calculate_bleu_scores(model_response_flattened, conversation_golden_responses_flattened)

print("Average BLEU-1 score:", avg_bleu_1 * 100)
print("Average BLEU-2 score:", avg_bleu_2 * 100)
print("Average BLEU-3 score:", avg_bleu_3 * 100)
print("Average BLEU-4 score:", avg_bleu_4 * 100)

Average BLEU-1 score: 10.494820458858978
Average BLEU-2 score: 3.149349921495902
Average BLEU-3 score: 1.6016417159576957
Average BLEU-4 score: 1.0345540833899667


# Meteor

In [None]:
def calculate_meteor(candidate, reference):
  '''
  candidate, reference: tokenized list of words in the sentence
  '''
  mt_list = []
  mt_sum = 0
  for c, r in zip(candidate, reference):
    r_tokenized = word_tokenize(r)
    c_tokenized = word_tokenize(c)
    meteor_score = round(meteor([c_tokenized], r_tokenized), 4)
    #print("Model output: ", c)
    #print("Gold reference: ", r)
    #print("Meteor score: ", meteor_score)
    mt_list.append(meteor_score)
    mt_sum += meteor_score
    #print("\n")
  avg_mt = mt_sum / len(mt_list)
  print(f"Total average meteor score: {str(avg_mt)}")
  return meteor_score

In [None]:
calculate_meteor(model_response_flattened, conversation_golden_responses_flattened)

Total average meteor score: 0.11746218181818183


0.0596

# Rouge

In [None]:
# Initialize the scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize sums for each ROUGE score
sum_rougeL_precision, sum_rougeL_recall, sum_rougeL_fmeasure = 0, 0, 0

# Calculate scores for each sentence pair
for m, c in zip(model_response_flattened, conversation_golden_responses_flattened):
    score = scorer.score(m, c)

    # Accumulate the scores
    sum_rougeL_precision += score["rougeL"].precision
    sum_rougeL_recall += score["rougeL"].recall
    sum_rougeL_fmeasure += score["rougeL"].fmeasure

# Calculate the averages
avg_rougeL_precision = sum_rougeL_precision / len(model_response_flattened)
avg_rougeL_recall = sum_rougeL_recall / len(model_response_flattened)
avg_rougeL_fmeasure = sum_rougeL_fmeasure / len(model_response_flattened)

In [None]:
# Print the average scores
print(f'Average ROUGE-L Precision: {avg_rougeL_precision}')
print(f'Average ROUGE-L Recall: {avg_rougeL_recall}')
print(f'Average ROUGE-L F-measure: {avg_rougeL_fmeasure}')

Average ROUGE-L Precision: 0.2284092814326488
Average ROUGE-L Recall: 0.10738771780868564
Average ROUGE-L F-measure: 0.13233606430866787


In [None]:
# Clone the Distinct-N repository
!git clone https://github.com/neural-dialogue-metrics/Distinct-N.git
%cd Distinct-N

from distinct_n.utils import ngrams

def distinct_n_sentence_level(sentence, n):
    """
    Compute distinct-N for a single sentence.
    :param sentence: a list of words.
    :param n: int, ngram.
    :return: float, the metric value.
    """
    if len(sentence) == 0:
        return 0.0  # Prevent a zero division
    distinct_ngrams = set(ngrams(sentence, n))
    return len(distinct_ngrams) / len(sentence)

def distinct_n_corpus_level(sentences, n):
    """
    Compute average distinct-N of a list of sentences (the corpus).
    :param sentences: a list of sentence.
    :param n: int, ngram.
    :return: float, the average value.
    """
    return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences)

Cloning into 'Distinct-N'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 79 (delta 6), reused 12 (delta 6), pack-reused 64[K
Receiving objects: 100% (79/79), 186.03 KiB | 4.77 MiB/s, done.
Resolving deltas: 100% (28/28), done.
/content/Distinct-N


In [None]:
distinct_1_list = []

for response in model_response_flattened:
    d_1 = distinct_n_sentence_level(response, 1)  # or adjust max_length as needed
    distinct_1_list.append(d_1)

# Now, 'perplexities' contains the perplexity for each response.
# You can print them out or analyze them further as needed.
#print(distinct_1_list)

d_1 = sum(distinct_1_list) / len(distinct_1_list)
print(f'Distinct 1: {d_1}')

Distinct 1: 0.14610076763182236


In [None]:
distinct_2_list = []

for response in model_response_flattened:
    d_2 = distinct_n_sentence_level(response, 2)  # or adjust max_length as needed
    distinct_2_list.append(d_2)

# Now, 'perplexities' contains the perplexity for each response.
# You can print them out or analyze them further as needed.
#print(distinct_2_list)

d_2 = sum(distinct_2_list) / len(distinct_2_list)
print(f'Distinct 2: {d_2}')

Distinct 2: 0.5771637195011393


# Single examples

In [None]:
chat_history[:4]

[{'role': 'user',
  'content': 'You are a psychologist. You should answer in 1-2 sentences. The support you provide should be world-class.'},
 {'role': 'assistant',
  'content': 'I understand that I should provide psychological help and that the previous message provides a suitable guideline. In the following conversation, I will only reply in 1-2 sentences:'},
 {'role': 'user', 'content': 'Hello\n'},
 {'role': 'assistant', 'content': 'Hello, what would you like to talk about?'}]

In [None]:

message = client.messages.create(
    model="claude-3-haiku-20240307",
    max_tokens=1024,
    messages=chat_history[:7]
)
print(message)

Message(id='msg_01V1nKughnB6oaPKj8tNKzMH', content=[ContentBlock(text='I understand the difficulty in balancing financial stability and mental well-being. Have you considered speaking to a career counselor to explore options that could provide a better work-life balance while still meeting your financial needs? They may have insightful suggestions to ease your transition.', type='text')], model='claude-3-haiku-20240307', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=150, output_tokens=58))


In [None]:
message.content[0].text

"I'm afraid I don't have enough context to fully understand your request. Could you please provide some more details about what you'd like me to explain? I'm happy to try my best to provide an explanation, but need a bit more information about the specific topic or question you have in mind."