In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the evaluation folder on the mounted drive;
# the relative paths used further down (e.g. ./sampled_responses) start from here.
%cd /content/drive/MyDrive/nlp-2021-vda/evaluate_models/
# Print the working directory to confirm the change.
!pwd

To run the evaluation on our sampled responses, download the data from Google Cloud Storage.

In [None]:
import os
# Copy the sampled responses from the bucket only when no local copy exists yet.
if not os.path.isdir('./sampled_responses'):
    !gsutil -m cp -r gs://nlp-lab/evaluate_models/sampled_responses ./

In [None]:
# Copy the trained models from the bucket only when the local folder is missing.
# Relies on `os` having been imported by an earlier cell.
if not os.path.isdir('./empathy_mental_health/trained_models'):
  !gsutil -m cp -r gs://nlp-lab/evaluate_models/empathy_mental_health/trained_models ./empathy_mental_health/

In [None]:
!pip install -U nltk

In [None]:
!pip install transformers

When the NLTK downloader opens, choose 'Download' and enter 'stopwords' as the package. Then choose 'quit'.

In [None]:
import json
import nltk
# Called without a package id, so NLTK opens its interactive downloader and
# waits for input (follow the instructions given above this cell).
nltk.download()
from nltk.translate.meteor_score import meteor_score
import matplotlib.pyplot as plt
import numpy as np
# Fetch the WordNet corpus.
# NOTE(review): presumably needed by meteor_score for synonym matching — confirm.
nltk.download('wordnet')

# Metric calculation 
from data import metrics
from data import parse_sampled_responses
from data import metric_averages_or_ratios

# For plotting
import plotly.express as px
import plotly.graph_objs as go
import pandas as pd
from plotly.subplots import make_subplots

Plot METEOR Score vs Training Step and Returns vs Training Step

In [None]:
# Sample files to evaluate: 'supervised_0.7' followed by run7_1 ... run7_8.
path_list = ['supervised_0.7'] + [f'run7_{k}' for k in range(1, 9)]
# Receives one average METEOR score per entry of path_list.
file_scores = list()
# Gold responses that the policy-1 samples are scored against.
gold_path = 'sampled_responses/policy-1/gold.json'

In [None]:
# Average METEOR score of each file's 'sample0' responses against the gold responses.

# The gold file is the same for every checkpoint, so it is read once up front.
with open(gold_path) as f:
    gold = json.load(f)

# Start from an empty list so that re-running this cell does not append duplicates.
file_scores = []
for path in path_list:
    with open(f"sampled_responses/policy-1/{path}.json") as f:
        data = json.load(f)
    # nltk >= 3.6.6 rejects raw strings and expects pre-tokenized input;
    # whitespace splitting is the tokenization older releases applied internally.
    scores = [
        meteor_score([gold[i]['gold_response'].split()], data[i]['sample0'].split())
        for i in range(len(gold))
    ]
    file_scores.append(sum(scores) / len(scores))

In [None]:
# Training steps used as the x-axis: 0 to 200 in increments of 25.
x = list(range(0, 201, 25))
# Values plotted as 'Returns' against x, one per training step.
returns = [
    0.99, 1.19, 1.28,
    1.81, 2.2, 2.74,
    3.08, 3.49, 3.57,
]

In [None]:
def _plot_against_steps(axis, values, label, color):
    """Draw `values` over the training steps `x` on `axis`, colouring line, y-label and y-ticks alike."""
    axis.set_ylabel(label, color=color)
    axis.plot(x, values, color=color)
    axis.tick_params(axis='y', labelcolor=color)


fig, ax1 = plt.subplots()
ax1.set_xlabel('Training step')
_plot_against_steps(ax1, file_scores, 'METEOR score', 'tab:blue')

# Second y-axis sharing the x-axis of ax1; the x-label is already set on ax1.
ax2 = ax1.twinx()
_plot_against_steps(ax2, returns, 'Returns', 'tab:orange')

# Without tight_layout the right-hand y-label is slightly clipped.
fig.tight_layout()
plt.show()

# Perplexity

# Utterance length

In [None]:
# Sample files evaluated in the sections below and the training step of each one.
path_list = ['supervised_0.7', 'run7_1', 'run7_2', 'run7_3', 'run7_4', 'run7_5', 'run7_6', 'run7_7', 'run7_8']
x = [0, 25, 50, 75, 100, 125, 150, 175, 200]
# All sample files are read from policy-1, so the gold responses must come from
# the same folder (this previously pointed at policy-2).
gold_path = 'sampled_responses/policy-1/gold.json'

In [None]:
def getWordCountMetricDictFromFilename(file_name, metric_dict):
    """Compute metrics for the 'sample0' responses of one policy-1 sample file.

    Args:
        file_name: File stem; resolved to sampled_responses/policy-1/<file_name>.json.
        metric_dict: Metric names, forwarded to getMetricDict as both its
            second and its third argument.

    Returns:
        The result of metric_averages_or_ratios.getMetricDict for the parsed
        conversations.
    """
    sample_file = f"sampled_responses/policy-1/{file_name}.json"
    conversations = parse_sampled_responses.getParsedConversations(sample_file, 'sample0')
    return metric_averages_or_ratios.getMetricDict(conversations, metric_dict, metric_dict)

def getGoldMetricDict(metric_dict):
    """Compute metrics for the 'gold_response' entries of the file at the global gold_path.

    Args:
        metric_dict: Metric names, forwarded to getMetricDict as both its
            second and its third argument.

    Returns:
        The result of metric_averages_or_ratios.getMetricDict for the parsed
        gold conversations.
    """
    return metric_averages_or_ratios.getMetricDict(
        parse_sampled_responses.getParsedConversations(gold_path, 'gold_response'),
        metric_dict,
        metric_dict,
    )

In [None]:
# Utterance-length metric of the gold responses and of every sample file,
# the latter keyed by training step.
utterance_length_metric_names = ['utterance_length']

gold_utterance_length = getGoldMetricDict(utterance_length_metric_names)

utterance_length_dict = {}
for step_index, path in enumerate(path_list):
    file_metrics = getWordCountMetricDictFromFilename(path, utterance_length_metric_names)
    utterance_length_dict[x[step_index]] = file_metrics['utterance_length']

print(utterance_length_dict)

In [None]:
# Utterance length per training step, with the gold value drawn as a horizontal
# reference line and labelled just right of the last step.
gold_length = gold_utterance_length['utterance_length']

fig = px.line(
    x=list(utterance_length_dict.keys()),
    y=list(utterance_length_dict.values()),
    title='Utterance length (y) vs #trainingsteps (x)',  # typo "Utternace" fixed
)
fig.add_shape(go.layout.Shape(
    type="line",
    name="gold",
    x0=0,
    y0=gold_length,
    x1=200,
    y1=gold_length,
    line=dict(color='yellow', width=2),
))
# add_trace replaces the deprecated append_trace; row/col address the single plot cell.
fig.add_trace(go.Scatter(
    showlegend=False,
    x=[210],
    y=[gold_length],
    text=["gold"],
    mode="text",
), row=1, col=1)
fig.show()

# Repetition

In [None]:
# Repetition metrics of the gold responses and of every sample file,
# the latter keyed by training step.
repetition_metric_names = [
    'conversation_repetition',
    'self_repetition',
    'utterance_repetition',
    'word_repetition',
]

gold_repetition_dict = getGoldMetricDict(repetition_metric_names)

repetition_dict = {}
for step_index, path in enumerate(path_list):
    repetition_dict[x[step_index]] = getWordCountMetricDictFromFilename(path, repetition_metric_names)
print(repetition_dict)

In [None]:
# One subplot per repetition metric: the metric over training steps plus a
# horizontal line marking the gold value.
nrows = 4
df = pd.DataFrame(repetition_dict).transpose()

# Titles and gold values come from the gold metrics. Columns of df are taken by
# position, so df is expected to list the metrics in the same order.
plot_names = list(gold_repetition_dict.keys())
gold_values = list(gold_repetition_dict.values())

fig = make_subplots(rows=nrows, cols=1, subplot_titles=plot_names)

fig.update_xaxes(title_text="number of training steps", row=nrows, col=1)
fig.update_yaxes(title_text="average word count", row=2, col=1)

# add_trace replaces the deprecated append_trace throughout this cell.
for i in range(nrows):
    fig.add_trace(go.Scatter(
        x=list(df.index),
        y=df.iloc[:, i],
        name=plot_names[i],
        legendgroup='1',
    ), row=i + 1, col=1)

# Gold reference line and its text label for every subplot.
for i in range(nrows):
    gold_value = gold_values[i]
    fig.add_shape(go.layout.Shape(
        type="line",
        name="gold",
        x0=0,
        y0=gold_value,
        x1=200,
        y1=gold_value,
        line=dict(color='yellow', width=2),
    ), row=i + 1, col=1)
    fig.add_trace(go.Scatter(
        showlegend=False,
        x=[210],
        y=[gold_value],
        text=["gold"],
        mode="text",
    ), row=i + 1, col=1)

fig.update_layout(height=600, width=800, title_text="Word Count metrics vs number of training steps")
fig.show()

# Question

In [None]:
def getQuestionMetricDictFromFile(data_filepath,gold_filepath,response_type):
    """Compute the 'question' metric of a response file plus three question ratios.

    Args:
        data_filepath: Path of the response file to parse.
        gold_filepath: Path of the gold file; only read when response_type is
            not 'gold_response'.
        response_type: Response key handed to getParsedConversations.

    Returns:
        The getMetricDict result for 'question', extended by three
        "ratio_of_sample_is_question_*" entries. For 'gold_response' all three
        ratios are set to 0; otherwise they are computed from the parsed
        samples and the parsed gold conversations.
    """
    conversations = parse_sampled_responses.getParsedConversations(data_filepath, response_type)
    metrics = metric_averages_or_ratios.getMetricDict(conversations, ['question'], ['question'])

    # The gold file is not compared against itself, so its ratios are fixed to 0.
    if response_type == 'gold_response':
        for ratio_key in (
            "ratio_of_sample_is_question_of_all_samples",
            "ratio_of_sample_is_question_if_gold_is_question",
            "ratio_of_sample_is_question_if_gold_is_no_question",
        ):
            metrics[ratio_key] = 0
        return metrics

    gold_conversations = parse_sampled_responses.getParsedConversations(gold_filepath, 'gold_response')
    metrics["ratio_of_sample_is_question_of_all_samples"] = (
        metric_averages_or_ratios.getSampleQuestionOfAllSamplesRatio(conversations)
    )
    metrics["ratio_of_sample_is_question_if_gold_is_question"] = (
        metric_averages_or_ratios.getGoldQuestionVsSampleRatio(gold_conversations, conversations)
    )
    metrics["ratio_of_sample_is_question_if_gold_is_no_question"] = (
        metric_averages_or_ratios.getNoGoldQuestionVsSampleRatio(gold_conversations, conversations)
    )
    return metrics

In [None]:
# Question metrics of the gold responses and of every sample file,
# the latter keyed by training step.
gold_question_dict = getQuestionMetricDictFromFile(gold_path, gold_path, 'gold_response')

question_dict = {}
for step_index, path in enumerate(path_list):
    print(path)
    sample_file = f"sampled_responses/policy-1/{path}.json"
    question_dict[x[step_index]] = getQuestionMetricDictFromFile(sample_file, gold_path, 'sample0')
print(question_dict)

In [None]:
# One subplot per question metric over the training steps.
nrows = 4

df = pd.DataFrame(question_dict).transpose()

# Titles come from the gold metrics. Columns of df are taken by position, so df
# is expected to list the metrics in the same order.
plot_names = list(gold_question_dict.keys())

fig = make_subplots(rows=nrows, cols=1, subplot_titles=plot_names)

fig.update_xaxes(title_text="Training steps", row=nrows, col=1)
fig.update_yaxes(title_text="value", row=2, col=1)

# add_trace replaces the deprecated append_trace throughout this cell.
for i in range(nrows):
    fig.add_trace(go.Scatter(
        x=list(df.index),
        y=df.iloc[:, i],
        name=plot_names[i],
        legendgroup='1',
    ), row=i + 1, col=1)

# Only the first subplot gets a gold reference line: the ratio entries of the
# gold dictionary are fixed to 0 by getQuestionMetricDictFromFile.
gold_value = list(gold_question_dict.values())[0]
fig.add_shape(go.layout.Shape(
    type="line",
    name="gold",
    x0=0,
    y0=gold_value,
    x1=200,
    y1=gold_value,
    line=dict(color='yellow', width=2),
), row=1, col=1)
fig.add_trace(go.Scatter(
    showlegend=False,
    x=[210],
    y=[gold_value],
    text=["gold"],
    mode="text",
), row=1, col=1)

# The title previously read "Word Count metrics", copied from the repetition figure.
fig.update_layout(height=600, width=1000, title_text="Question metrics vs number of training steps")
fig.show()

# Empathy

In [None]:
# Metric name lists handed to getMetricDict as its second and third argument
# by getContentMetricDictFromFile.
content_metric_names = ['empathy']

content_metric_names_separated = [
    f'{aspect}_level'
    for aspect in ('emotional_reaction', 'interpretation', 'exploration')
]

In [None]:
# Upper bound on the number of parsed conversations scored per file; applied as
# a slice in getContentMetricDictFromFile and shown in the empathy figure title.
MAX_SAMPLE = 100

In [None]:
def getContentMetricDictFromFile(data_filepath,gold_filepath,response_type):
    """Compute the content (empathy) metrics for the first MAX_SAMPLE conversations of a file.

    Args:
        data_filepath: Path of the response file to parse.
        gold_filepath: Not used by this function; kept in the signature for
            existing callers.
        response_type: Response key handed to getParsedConversations.

    Returns:
        The getMetricDict result for content_metric_names and
        content_metric_names_separated.
    """
    print("start")
    conversations = parse_sampled_responses.getParsedConversations(data_filepath, response_type)
    # Only the first MAX_SAMPLE conversations are scored.
    limited_conversations = conversations[0:MAX_SAMPLE]
    return metric_averages_or_ratios.getMetricDict(
        limited_conversations,
        content_metric_names,
        content_metric_names_separated,
    )

In [None]:
# Empathy metrics of the gold responses and of every sample file,
# the latter keyed by training step.
gold_content_metrics = getContentMetricDictFromFile(gold_path, gold_path, 'gold_response')

empathy_dict = {}
for step_index, path in enumerate(path_list):
    print(path)
    sample_file = f"sampled_responses/policy-1/{path}.json"
    empathy_dict[x[step_index]] = getContentMetricDictFromFile(sample_file, gold_path, 'sample0')
print(empathy_dict)

In [None]:
# One subplot per empathy metric: the metric over training steps plus a
# horizontal line marking the gold value.
empathy_df = pd.DataFrame(empathy_dict).transpose()

nrows = 3

# Titles and gold values come from the gold metrics. Columns of empathy_df are
# taken by position, so it is expected to list the metrics in the same order.
plot_names = list(gold_content_metrics.keys())
gold_values = list(gold_content_metrics.values())

fig = make_subplots(rows=nrows, cols=1, subplot_titles=plot_names)

# add_trace replaces the deprecated append_trace throughout this cell.
for i in range(nrows):
    fig.add_trace(go.Scatter(
        x=list(empathy_df.index),
        y=empathy_df.iloc[:, i],
        name=plot_names[i],
        legendgroup='1',
    ), row=i + 1, col=1)

fig.update_xaxes(title_text="training steps", row=nrows, col=1)
# The label previously read "average word count", copied from the repetition figure.
fig.update_yaxes(title_text="value", row=2, col=1)

# Gold reference line and its text label for every subplot.
for i in range(nrows):
    gold_value = gold_values[i]
    fig.add_shape(go.layout.Shape(
        type="line",
        x0=0,
        y0=gold_value,
        x1=200,
        y1=gold_value,
        line=dict(color='yellow', width=2),
    ), row=i + 1, col=1)
    fig.add_trace(go.Scatter(
        showlegend=False,
        x=[210],
        y=[gold_value],
        text=["gold"],
        mode="text",
    ), row=i + 1, col=1)

# The title previously read "Word Count metrics vs KL", which matched neither
# the plotted metrics nor the x-axis (training steps).
fig.update_layout(
    height=600,
    width=800,
    title_text="Empathy metrics vs number of training steps, calculated from " + str(MAX_SAMPLE) + " samples",
)
fig.show()