# Installs

In [None]:
%pip install datasets

In [None]:
%pip install rouge_score

In [None]:
%pip install sacrebleu

In [None]:
%pip install -U nltk

In [None]:
%pip install git+https://github.com/google-research/bleurt.git

In [None]:
%pip install bert_score

In [None]:
!git clone "https://github.com/neural-dialogue-metrics/Distinct-N.git"

In [None]:
%pip install ./Distinct-N/

In [None]:
%pip install lexical-diversity

# Load dataset

In [None]:
# Load dataset
from datasets import load_dataset
data = load_dataset("gem", "common_gen")

## Explore dataset structure

In [None]:
data

In [None]:
test_instance = data['test'][0]
test_instance

{'concept_set_id': 0,
 'concepts': ['drill', 'field', 'run', 'team'],
 'gem_id': 'common_gen-test-0',
 'gem_parent_id': 'common_gen-test-0',
 'references': [],
 'target': ''}

In [None]:
validation_instance = data['validation'][0]
validation_instance

{'concept_set_id': 0,
 'concepts': ['field', 'look', 'stand'],
 'gem_id': 'common_gen-validation-0',
 'gem_parent_id': 'common_gen-validation-0',
 'references': ['The player stood in the field looking at the batter.',
  'The coach stands along the field, looking at the goalkeeper.',
  'I stood and looked across the field, peacefully.',
  'Someone stands, looking around the empty field.'],
 'target': 'The player stood in the field looking at the batter.'}

In [None]:
challenge_test_instance = data['challenge_test_scramble'][0]
challenge_test_instance

{'concept_set_id': 1428,
 'concepts': ['ice', 'wall', 'use', 'climb', 'pick'],
 'gem_id': 'common_gen-challenge_test_scramble-0',
 'gem_parent_id': 'common_gen-test-1428',
 'references': [],
 'target': ''}

In [None]:
challenge_validation_instance = data['challenge_validation_sample'][0]
challenge_validation_instance

{'concept_set_id': 844,
 'concepts': ['bench', 'kid', 'line', 'ride', 'sit'],
 'gem_id': 'common_gen-challenge_validation_sample-0',
 'gem_parent_id': 'common_gen-validation-844',
 'references': ['The kid stood in line for the ride while I chose to sit on the bench.',
  'The line for the ride at the fair was too long so the kid went to go sit on the bench nearby.',
  'The parents sat on a bench while the kids waited in line for an amusement park ride.',
  'A kid sits on a bench while the kid waits in line to go on a ride at the fair.',
  'People wait in line for a ride while kids sit on bench.'],
 'target': 'The kid stood in line for the ride while I chose to sit on the bench.'}

In [None]:
concept = challenge_validation_instance.get("target")
concept

'The kid stood in line for the ride while I chose to sit on the bench.'

In [None]:
validation_targets = []
for i in range(len(data['validation'])): 
  instance = data['validation'][i]
  target = instance.get("target")
  validation_targets.append(target)
validation_targets[0]

'The player stood in the field looking at the batter.'

## Extract validation set

In [59]:
validation_data = []
validation_data = data['validation']
validation_data['target']

['The player stood in the field looking at the batter.',
 'The silly kid loves to dance in her room.',
 'A pet cat likes to sleep on a couch.',
 'The mouse climbed the side of the building.',
 'The woman thats talking teaches her students how to climb the wall.',
 'The car drove through the snow.',
 'The boy wants to wear earpods as he talks on his phone.',
 'The team practiced hockey at the outdoor rink.',
 'the ocean is where surfers go to surf',
 'A boy jumps up a flight of stairs taking two at a time.',
 'The school band played their instruments in unison rendering a sweet music.',
 'The students sit in a circle as the teacher talks.',
 'I quickly moved back, while still looking into the eyes of the stranger.',
 'He looked at his watch and then out of the window.',
 'The athlete performs stretches as they prepare to run.',
 'The hiker bought new gear to wear for the walk to the waterfall.',
 'The man on stage is giving a speech.',
 'A man rubs his hands with soap after using the ba

## Clean generated text

In [186]:
# Clean validation greedy sampling generation
data_file = "sample_data/greedy_validation_set.txt"
greedy_generated_data = []

with open(data_file, 'r') as f:
  for i, line in enumerate(f):
    line = line.strip()
    example = line.split()
    new_example = []
    for word in example:
      if word.startswith("[unused") or word.startswith("[UNK]") or word.startswith(".") or word.startswith("\\") or word.startswith("##"):
        del word
      else:
        new_example.append(word)

    complete_sentence = " ".join(new_example) + "."
    greedy_generated_data.append(complete_sentence)

greedy_generated_data[0]

'the two beautiful white black sheep that are standing out and grazing close up together together in a blue green grassy field standing together.'

In [187]:
# Clean validation top-k generation
data_file = "sample_data/sampling_validation_set.txt"
sampling_generated_data = []

with open(data_file, 'r') as f:
  for i, line in enumerate(f):
    line = line.strip()
    example = line.split()
    new_example = []
    for word in example:
      if word.startswith("[unused") or word.startswith("[UNK]") or word.startswith(".") or word.startswith("\\") or word.startswith("##"):
        del word
      else:
        new_example.append(word)

    complete_sentence = " ".join(new_example) + "."
    sampling_generated_data.append(complete_sentence)

sampling_generated_data[0]

'a small herd affiliates of white sheep are standing together in a green field together , with the two large black elephants moving like that they really look metacritic like people.'

In [228]:
# Clean test top-k generation
data_file = "sample_data/sampling_test_set.txt"
test_sampling_generated_data = []

with open(data_file, 'r') as f:
  for i, line in enumerate(f):
    line = line.strip()
    example = line.split()
    new_example = []
    for word in example:
      if word.startswith("[unused") or word.startswith("[UNK]") or word.startswith(".") or word.startswith("\\") or word.startswith("##"):
        del word
      else:
        new_example.append(word)

    complete_sentence = " ".join(new_example) + "."
    test_sampling_generated_data.append(complete_sentence)

test_sampling_generated_data[0]

'young football players gather together to perform in an event a brand new fire drill , before someone running about frantically around city on the huge soccer field.'

# Compute metrics


## Lexical (ROUGE 1/2/L, BLEU, Meteor), 

In [64]:
from datasets import list_metrics, load_metric
import numpy as np

metrics_list = list_metrics()
print(metrics_list)

['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'comet', 'coval', 'cuad', 'f1', 'gleu', 'glue', 'indic_glue', 'matthews_correlation', 'meteor', 'pearsonr', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari', 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'wer', 'xnli']


In [None]:
rouge_scorer = load_metric("rouge")
sacrebleu_scorer = load_metric("sacrebleu")
meteor_scorer = load_metric("meteor")

In [66]:
# ROUGE
greedy_rouge_results = rouge_scorer.compute(
    predictions=greedy_generated_data,
    references=validation_data['target'],
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_agregator=True, use_stemmer=False,
)

print("Greedy rouge1", round(greedy_rouge_results['rouge1'].mid.fmeasure, 4))
print("Greedy rouge2", round(greedy_rouge_results['rouge2'].mid.fmeasure, 4))
print("Greedy rougeL", round(greedy_rouge_results['rougeL'].mid.fmeasure, 4))


sampling_rouge_results = rouge_scorer.compute(
    predictions=sampling_generated_data,
    references=validation_data['target'],
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_agregator=True, use_stemmer=False,
)

print("Sampling rouge1", round(sampling_rouge_results['rouge1'].mid.fmeasure, 4))
print("Sampling rouge2", round(sampling_rouge_results['rouge2'].mid.fmeasure, 4))
print("Sampling rougeL", round(sampling_rouge_results['rougeL'].mid.fmeasure, 4))

Greedy rouge1 0.1376
Greedy rouge2 0.008
Greedy rougeL 0.1093
Sampling rouge1 0.1401
Sampling rouge2 0.0066
Sampling rougeL 0.1065


In [67]:
# BLEU

greedy_bleu_results = []
for i, sentence in enumerate(zip(greedy_generated_data, validation_data)):
  greedy_bleu_result = sacrebleu_scorer.compute(
      predictions=[greedy_generated_data[0]],
      references=[[validation_data['target'][0]]]
  )
  greedy_bleu_results.append(greedy_bleu_result["score"])

sampling_bleu_results = []
for i, sentence in enumerate(zip(greedy_generated_data, validation_data)):
  sampling_bleu_result = sacrebleu_scorer.compute(
      predictions=[sampling_generated_data[0]],
      references=[[validation_data['target'][0]]]
  )
  sampling_bleu_results.append(sampling_bleu_result["score"])

greedy_bleu_mean = np.mean(greedy_bleu_results)
sampling_bleu_mean = np.mean(sampling_bleu_results)

print("Greedy BLEU mean", greedy_bleu_mean)
print("Sampling BLEU mean", sampling_bleu_mean)


Greedy BLEU mean 2.1300733682208985
Sampling BLEU mean 1.6404472625060695


In [68]:
# METEOR
greedy_meteor_results = meteor_scorer.compute(
    predictions=greedy_generated_data,
    references=validation_data['target']
)

sampling_meteor_results = meteor_scorer.compute(
    predictions=sampling_generated_data,
    references=validation_data['target']
)

print("Greedy meteor", round(greedy_meteor_results['meteor'], 4))
print("Sampling meteor", round(sampling_meteor_results['meteor'], 4))

Greedy meteor 0.1229
Sampling meteor 0.1305


 ## Semantic (BERTscore, BLEURT)

In [None]:
import numpy as np

bleurt_scorer = load_metric("bleurt")
bertscore_scorer = load_metric("bertscore")

In [None]:
# BLEURT

greedy_bleurt_results = bleurt_scorer.compute(
    predictions=greedy_generated_data,
    references=validation_data['target']
)
sampling_bleurt_results = bleurt_scorer.compute(
    predictions=sampling_generated_data,
    references=validation_data['target']
)

In [None]:
greedy_bleurt_mean = np.mean(greedy_bleurt_results['scores'])
sampling_bleurt_mean = np.mean(sampling_bleurt_results['scores'])

print("Greedy bleurt mean", greedy_bleurt_mean)
print("Sampling bleurt mean", sampling_bleurt_mean)

Greedy bleurt mean -1.4129384312542363
Sampling bleurt mean -1.4204461984523953


In [None]:
# BERTscore

greedy_bertscore_results = bertscore_scorer.compute(
    predictions=greedy_generated_data,
    references=validation_data['target'],
    lang="en"
)

sampling_bertscore_results = bertscore_scorer.compute(
    predictions=sampling_generated_data,
    references=validation_data['target'],
    lang="en"
)

In [None]:
greedy_bertscore_f1_mean = np.mean(greedy_bertscore_results['f1'])
greedy_bertscore_precision_mean = np.mean(greedy_bertscore_results['precision'])
greedy_bertscore_recall_mean = np.mean(greedy_bertscore_results['recall'])

sampling_bertscore_f1_mean = np.mean(sampling_bertscore_results['f1'])
sampling_bertscore_precision_mean = np.mean(sampling_bertscore_results['precision'])
sampling_bertscore_recall_mean = np.mean(sampling_bertscore_results['recall'])

print("Greedy F1 mean", round(greedy_bertscore_f1_mean, 4))
print("Greedy precision mean", round(greedy_bertscore_precision_mean, 4))
print("Greedy recall mean", round(greedy_bertscore_recall_mean, 4))

print("Sampling F1 mean", round(sampling_bertscore_f1_mean, 4))
print("Sampling precision mean", round(sampling_bertscore_precision_mean, 4))
print("Sampling recall mean", round(sampling_bertscore_recall_mean, 4))


Greedy F1 mean 0.8422
Greedy precision mean 0.8222
Greedy recall mean 0.8638
Sampling F1 mean 0.84
Sampling precision mean 0.8174
Sampling recall mean 0.8644


## Diversity (MSTTR, Distinct 1/2/3, Unique 1/2/3, Entropy 1/2/3)


In [None]:
# Mean segmental Type-Token Ratio (MSTTR)
from lexical_diversity import lex_div as ld

greedy_msttr_results = []
for i, sentence in enumerate(greedy_generated_data):
  greedy_flt = ld.flemmatize(greedy_generated_data[i])
  greedy_msttr_result = ld.msttr(greedy_flt)
  greedy_msttr_results.append(greedy_msttr_result)

sampling_msttr_results = []
for i, sentence in enumerate(sampling_generated_data):
  sampling_flt = ld.flemmatize(sampling_generated_data[i])
  sampling_msttr_result = ld.msttr(sampling_flt)
  sampling_msttr_results.append(sampling_msttr_result)


greedy_msttr_mean = np.mean(greedy_msttr_results)
sampling_msttr_mean = np.mean(sampling_msttr_results)

print("Greedy MSTTR mean", greedy_msttr_mean)
print("Sampling MSTTR ", sampling_msttr_mean)

Greedy MSTTR mean 0.858626420024929
Sampling MSTTR  0.8777432256673694


In [71]:
# Distinct-n
from distinct_n import distinct_n_corpus_level

greedy_distinct_1_results = distinct_n_corpus_level(greedy_generated_data, 1)
sampling_distinct_1_results = distinct_n_corpus_level(sampling_generated_data, 1)

greedy_distinct_2_results = distinct_n_corpus_level(greedy_generated_data, 2)
sampling_distinct_2_results = distinct_n_corpus_level(sampling_generated_data, 2)

greedy_distinct_3_results = distinct_n_corpus_level(greedy_generated_data, 3)
#sampling_distinct_3_results = distinct_n_corpus_level(sampling_generated_data, 3)

print("Greedy distict 1-gram", greedy_distinct_1_results)
print("Greedy distict 2-gram", greedy_distinct_2_results)
print("Greedy distict 3-gram", greedy_distinct_3_results)

print("Sampling distict 1-gram", sampling_distinct_1_results)
print("Sampling distict 2-gram", sampling_distinct_2_results)
#print("Sampling distict 3-gram", sampling_distinct_3_results)

Greedy distict 1-gram 0.1910817796875825
Greedy distict 2-gram 0.5949154954804998
Greedy distict 3-gram 0.8051814770241112
Sampling distict 1-gram 0.16678654703025897
Sampling distict 2-gram 0.5732626861450655


In [190]:
# Unique-n

import nltk

greedy_document = []
for sentence in greedy_generated_data:
  sentence = sentence.split()
  for word in sentence:
    greedy_document.append(word)

sampling_document = []
for sentence in sampling_generated_data:
  sentence = sentence.split()
  for word in sentence:
    sampling_document.append(word)

greedy_unigram_finder = nltk.FreqDist(greedy_document)
greedy_bigram_finder = nltk.FreqDist(nltk.bigrams(greedy_document))
greedy_trigram_finder = nltk.FreqDist(nltk.trigrams(greedy_document))

sampling_unigram_finder = nltk.FreqDist(sampling_document)
sampling_bigram_finder = nltk.FreqDist(nltk.bigrams(sampling_document))
sampling_trigram_finder = nltk.FreqDist(nltk.trigrams(sampling_document))

def unique_finder(finder):
  counter = 0
  for occurence in range(len(finder)):
    item = finder.popitem()
    if item[1] == 1:
      counter += 1
  return counter

greedy_uni_count = unique_finder(greedy_unigram_finder)
greedy_bi_count = unique_finder(greedy_bigram_finder)
greedy_tri_count = unique_finder(greedy_trigram_finder)

sampling_uni_count = unique_finder(sampling_unigram_finder)
sampling_bi_count = unique_finder(sampling_bigram_finder)
sampling_tri_count = unique_finder(sampling_trigram_finder)

print("Greedy unique unigrams", greedy_uni_count)
print("Greedy unique bigrams", greedy_bi_count)
print("Greedy unique trigrams", greedy_tri_count)

print("Sampling unique unigrams", sampling_uni_count)
print("Sampling unique bigrams", sampling_bi_count)
print("Sampling unique trigrams", sampling_tri_count)

Greedy unique unigrams 974
Greedy unique bigrams 13096
Greedy unique trigrams 28102
Sampling unique unigrams 1150
Sampling unique bigrams 17349
Sampling unique trigrams 34908


In [222]:
# Entropy-n
from scipy import stats

greedy_unigram_finder = nltk.FreqDist(greedy_document)
greedy_bigram_finder = nltk.FreqDist(nltk.bigrams(greedy_document))
greedy_trigram_finder = nltk.FreqDist(nltk.trigrams(greedy_document))

sampling_unigram_finder = nltk.FreqDist(sampling_document)
sampling_bigram_finder = nltk.FreqDist(nltk.bigrams(sampling_document))
sampling_trigram_finder = nltk.FreqDist(nltk.trigrams(sampling_document))

greedy_unigram_probabilities = [v for k,v in greedy_unigram_finder.items()]
greedy_bigram_probabilities = [v for k,v in greedy_bigram_finder.items()]
greedy_trigram_probabilities = [v for k,v in greedy_trigram_finder.items()]

sampling_unigram_probabilities = [v for k,v in sampling_unigram_finder.items()]
sampling_bigram_probabilities = [v for k,v in sampling_bigram_finder.items()]
sampling_trigram_probabilities = [v for k,v in sampling_trigram_finder.items()]

greedy_entropy_1 = stats.entropy(greedy_unigram_probabilities)
greedy_entropy_2 = stats.entropy(greedy_bigram_probabilities)
greedy_entropy_2 = stats.entropy(greedy_trigram_probabilities)

sampling_entropy_1 = stats.entropy(sampling_unigram_probabilities)
sampling_entropy_2 = stats.entropy(sampling_bigram_probabilities)
sampling_entropy_2 = stats.entropy(sampling_trigram_probabilities)

print("Greedy entropy-1:", greedy_entropy_1)
print("Greedy entropy-2:", greedy_entropy_2)
print("Greedy entropy-3:", greedy_entropy_3)

print("Sampling entropy-1:", sampling_entropy_1)
print("Sampling entropy-2:", sampling_entropy_2)
print("Sampling entropy-3:", sampling_entropy_3)

Greedy entropy-1: 5.817888123094933
Greedy entropy-2: 10.239960244159507
Greedy entropy-3: 14.77313986316078
Sampling entropy-1: 6.009178492742538
Sampling entropy-2: 10.460680997248135
Sampling entropy-3: 15.091572599054542


In [232]:
valid_formatted = sampling_generated_data
test_formatted = test_sampling_generated_data

In [233]:
submission_dict = {
    "submission_name": "POINTER",
    "param_count": 110104890,
    "description": "Baseline for the task based on POINTER.",
    "tasks": {
      "common_gen_val": {"language": "en", "values": valid_formatted},
      "common_gen_test": {"language": "en", "values": test_formatted},
    }
}

In [234]:
import json
with open('gem_submission.json', 'w') as f:
  f.write(json.dumps(submission_dict, indent=2))