# Evaluation

This notebook covers the automatic and human evaluations of our experiment.

In [1]:
# Install all the required packages.
%pip install bert-score
!pip install rouge-score
!pip install scipy
import pandas as pd
from bert_score import BERTScorer
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt
import scipy.stats
import itertools

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [2]:
# Read the test results from a pickle file located one directory above the notebook.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
df_test_results = pd.read_pickle('../test_results.pkl')

## BERTScore

In [None]:
# Initialize the BERTScore scorer object, configured for English (lang="en").
# NOTE(review): constructing it presumably loads a pretrained model, so it is
# created once here rather than per scored turn — confirm.
scorer = BERTScorer(lang="en")

In [None]:
# Compute BERTScore for each turn of each dialogue in the test set.
# Every turn produces a two-item list of score results:
#   first  -> ground truth vs. approach  (turn[1] against turn[2])
#   second -> ground truth vs. baseline  (turn[1] against turn[3])
df_test_results['bert_score'] = df_test_results['results'].map(
    lambda turns: [
        [
            scorer.score([turn[1]], [turn[2]]),
            scorer.score([turn[1]], [turn[3]]),
        ]
        for turn in turns
    ]
)

In [None]:
# Average approach score for one dialogue
def calculate_avg_bert_score_Approach(row):
    """Return the mean of the approach's third score value over one dialogue.

    Args:
        row: sequence of ``(approach_scores, baseline_scores)`` pairs, one per
            turn. Each member must unpack into exactly three items; the third
            is the value that gets averaged (the F1 entry, going by the
            original variable names).

    Returns:
        Sum of the approach's third values divided by the number of turns.

    Raises:
        ZeroDivisionError: if ``row`` contains no turns.
    """
    # Only the approach triple contributes; the baseline triple is unpacked
    # solely so that malformed entries fail the same way as before.
    approach_f1 = [f1 for (_p, _r, f1), (_bp, _br, _bf) in row]
    return sum(approach_f1) / len(approach_f1)

In [None]:
# Average baseline score for one dialogue
def calculate_avg_bert_score_Basline(row):
    """Return the mean of the baseline's third score value over one dialogue.

    Args:
        row: sequence of ``(approach_scores, baseline_scores)`` pairs, one per
            turn. Each member must unpack into exactly three items; the third
            item of the second member is the value that gets averaged (the F1
            entry, going by the original variable names).

    Returns:
        Sum of the baseline's third values divided by the number of turns.

    Raises:
        ZeroDivisionError: if ``row`` contains no turns.
    """
    # Only the baseline triple contributes; the approach triple is unpacked
    # solely so that malformed entries fail the same way as before.
    baseline_f1 = [f1 for (_ap, _ar, _af), (_p, _r, f1) in row]
    return sum(baseline_f1) / len(baseline_f1)

In [None]:
# Per-dialogue average BERTScore for the approach and for the baseline.
# Each average is first computed from the 'bert_score' column, then reduced to
# a plain number by taking element [0] and calling .item() on it
# (presumably the averages are one-element tensors — confirm).
df_test_results['bert_score_avg_Ap'] = (
    df_test_results['bert_score']
    .apply(calculate_avg_bert_score_Approach)
    .apply(lambda avg: avg[0].item())
)
df_test_results['bert_score_avg_BL'] = (
    df_test_results['bert_score']
    .apply(calculate_avg_bert_score_Basline)
    .apply(lambda avg: avg[0].item())
)

## ROUGE Score

In [None]:
# ROUGE scorer configured for ROUGE-1 and ROUGE-L with stemming enabled.
# Only the 'rouge1' result is read below.
rg_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Per-turn scores: index 2 of the 'rouge1' result for ground truth (turn[1])
# against the approach (turn[2]) and against the baseline (turn[3]).
# NOTE(review): index 2 is presumably the F-measure of a
# (precision, recall, fmeasure) tuple — confirm against rouge_score.
df_test_results['rouge_score_Approach'] = df_test_results['results'].map(
    lambda turns: [rg_scorer.score(turn[1], turn[2])['rouge1'][2] for turn in turns]
)
df_test_results['rouge_score_Basline'] = df_test_results['results'].map(
    lambda turns: [rg_scorer.score(turn[1], turn[3])['rouge1'][2] for turn in turns]
)

# Per-dialogue averages of the per-turn scores.
df_test_results['Rouge_Score_Avg_Approach'] = df_test_results['rouge_score_Approach'].map(
    lambda scores: sum(scores) / len(scores)
)
df_test_results['Rouge_Score_Avg_Basline'] = df_test_results['rouge_score_Basline'].map(
    lambda scores: sum(scores) / len(scores)
)

## Summary of Automatic Evaluation

In [None]:
# Descriptive statistics (via describe()) of the four per-dialogue average
# columns: BERTScore and ROUGE, each for the approach and the baseline.
df_test_results[['bert_score_avg_Ap', 'bert_score_avg_BL', 'Rouge_Score_Avg_Approach', 'Rouge_Score_Avg_Basline']].describe()

## Human Evaluation

In [None]:
# Human evaluation data, entered by hand: one list per annotator (U1 to U8).
# Every entry is 1, 2 or 3, and each list appears to hold 90 entries.
# NOTE(review): presumably each consecutive group of three values is one
# annotator's ranking of the three candidate responses for one evaluated
# item — confirm against the annotation sheets.
U1 = [ 
    1, 3, 2, 1, 3, 2, 3, 2, 1, 3, 2, 1, 1, 2, 3, 2, 1, 3, 2, 3, 1, 3, 2, 1, 1,
    3, 2, 2, 1, 3, 2, 1, 3, 3, 1, 2, 1, 3, 2, 1, 2, 3, 1, 3, 2, 2, 3, 1, 1, 2,
    3, 2, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2, 2, 3, 1, 3, 2, 1,
    1, 2, 3, 3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 2
]
U2 = [
    1, 3, 2, 1, 3, 2, 3, 1, 2, 3, 2, 1, 1, 2, 3, 1, 3, 2, 1, 2, 3, 3, 1, 2, 1,
    2, 3, 1, 3, 2, 2, 1, 3, 3, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 2, 3, 1, 1, 2,
    3, 2, 1, 3, 1, 3, 2, 1, 3, 2, 2, 1, 3, 1, 2, 3, 2, 1, 3, 1, 3, 2, 1, 3, 2,
    1, 3, 2, 2, 1, 3, 1, 2, 3, 3, 1, 2, 1, 3, 2
]
U3 = [ 
    1, 3, 2, 1, 2, 3, 3, 2, 1, 3, 2, 1, 2, 3, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1,
    2, 3, 1, 2, 3, 3, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 1, 2, 3, 1, 2, 3, 1, 1, 2,
    3, 1, 3, 2, 3, 1, 2, 1, 2, 3, 2, 1, 3, 3, 1, 2, 3, 1, 2, 2, 3, 1, 1, 3, 2,
    1, 3, 2, 2, 1, 3, 1, 2, 3, 3, 1, 2, 3, 1, 2
]

U4 = [ 
    1, 2, 3, 1, 3, 2, 3, 2, 1, 3, 2, 1, 1, 2, 3, 2, 1, 3, 1, 2, 3, 1, 3, 2, 1,
    3, 2, 3, 1, 2, 3, 2, 1, 3, 1, 2, 2, 1, 3, 3, 1, 2, 1, 2, 3, 2, 3, 1, 1, 2,
    3, 3, 2, 1, 1, 3, 2, 3, 2, 1, 2, 1, 3, 1, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 1,
    1, 2, 3, 3, 2, 1, 3, 2, 1, 2, 1, 3, 3, 1, 2
]
U5 = [ 
    1, 3, 2, 1, 2, 3, 3, 1, 2, 3, 2, 1, 3, 2, 1, 1, 3, 2, 3, 2, 1, 3, 1, 2, 1,
    3, 2, 2, 3, 1, 2, 3, 1, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 3, 2, 2, 3, 1, 1, 2,
    3, 2, 1, 3, 3, 1, 2, 3, 1, 2, 2, 1, 3, 3, 1, 2, 3, 2, 1, 2, 3, 1, 1, 3, 2,
    2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 2, 1, 3, 1, 2
]
U6 = [
    1, 3, 2, 1, 2, 3, 3, 2, 1, 3, 2, 1, 1, 2, 3, 1, 2, 3, 2, 3, 1, 1, 2, 3, 1,
    2, 3, 1, 2, 3, 2, 3, 1, 3, 1, 2, 1, 3, 2, 2, 1, 3, 2, 1, 3, 1, 3, 2, 2, 1,
    3, 1, 2, 3, 1, 3, 2, 1, 2, 3, 2, 1, 3, 1, 2, 3, 3, 1, 2, 2, 3, 1, 1, 3, 2,
    2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 3, 1, 2
]
U7 = [
    1, 2, 3, 1, 2, 3, 3, 1, 2, 3, 2, 1, 1, 2, 3, 2, 3, 1, 1, 2, 3, 3, 2, 1, 1,
    2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 2, 3, 2, 1, 3, 1, 3, 2, 3, 2, 1, 2, 1,
    3, 1, 2, 3, 1, 3, 2, 1, 2, 3, 1, 3, 2, 1, 2, 3, 3, 2, 1, 2, 3, 1, 1, 3, 2,
    1, 2, 3, 2, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3
]
U8 = [
    2, 1, 3, 1, 2, 3, 3, 2, 1, 3, 2, 1, 2, 1, 3, 1, 2, 3, 3, 2, 1, 2, 1, 3, 1,
    2, 3, 1, 2, 3, 3, 2, 1, 2, 1, 3, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3, 2, 1, 1, 2,
    3, 1, 3, 2, 3, 1, 2, 3, 1, 2, 2, 1, 3, 1, 3, 2, 2, 1, 3, 2, 3, 1, 2, 3, 1,
    1, 2, 3, 3, 2, 1, 1, 2, 3, 1, 2, 3, 1, 3, 2
]


In [None]:
# Inter-annotator agreement: Kendall's tau for every unordered pair of
# annotator lists, printed pair by pair, followed by the mean over all pairs.
array_names = ["U1", "U2", "U3", "U4", "U5", "U6", "U7", "U8"]  # labels; not used in this cell
arrays = [U1, U2, U3, U4, U5, U6, U7, U8]

kendall_tau_results = []

# combinations() yields (U1, U2), (U1, U3), ..., (U7, U8): the same order a
# nested "i < j" index loop would visit.
for first, second in itertools.combinations(arrays, 2):
    kendall_tau = scipy.stats.kendalltau(first, second)
    print(kendall_tau.correlation)
    kendall_tau_results.append(kendall_tau.correlation)

average_kendall_tau = sum(kendall_tau_results) / len(kendall_tau_results)
print(f"Average Kendall Tau correlation coefficient for all pairs of arrays: {average_kendall_tau}")

In [None]:
# Summary of the human evaluation, entered by hand.
# Rows are the three systems, columns are the ranks; each cell is the number
# of times that system received that rank.
index_labels = ['Groundtruth', 'Approach', 'Baseline']

# Each list is ordered like index_labels: [Groundtruth (A), Approach (B), Baseline (C)].
human_eval_summary = {
    'Rank 1': [109, 69, 62],
    'Rank 2': [56, 104, 80],
    'Rank 3': [75, 67, 98],
}

# Dict keys become the columns, index_labels become the row index.
df_human_eval_summary = pd.DataFrame(data=human_eval_summary, index=index_labels)

# Show the table as plain text.
print(df_human_eval_summary)


In [None]:
# Grouped bar chart of the human-evaluation summary. The frame is transposed
# so that each rank forms one group on the x-axis, with one bar per system.
fig, ax = plt.subplots(figsize=(10, 7))
df_human_eval_summary.T.plot(kind='bar', ax=ax, colormap='tab20c', rot=0)  # rot=0: horizontal tick labels

# Only the x-axis is labelled; the title and y-axis label are left unset.
ax.set_xlabel('Turn Rank')
ax.set_ylim(0, 125)

# Write each bar's value above it.
for bar_group in ax.containers:
    ax.bar_label(bar_group)

# Display the plot
plt.show()
