In [1]:
# Install packages
!pip install nltk
!pip install numpy
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu
# !pip install google-cloud-storage
!pip install pandas

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0->ev

In [2]:
# Imports
import evaluate
import numpy as np
import pandas as pd
import rouge_score
import sacrebleu
from sacrebleu import CHRF
import ast

In [3]:
# Metrics
rouge_score = evaluate.load("rouge")
bleu_score = evaluate.load("bleu")
sacrebleu_score = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [4]:
# Load dataset
df = pd.read_csv("mt5-base-pred.csv", sep='\t')
preds = df['preds'].tolist()
# refs = df['refs'].tolist()

df2 = pd.read_csv("test_full.csv", sep='\t')
df2['table_text'] = df2['table_text'].apply(ast.literal_eval)
refs = df2['table_text'].tolist()

In [5]:
# Find the length of the longest list
max_length = max(len(sublist) for sublist in refs)

# Pad each sublist to match the length of the longest list
for sublist in refs:
    while len(sublist) < max_length:
        sublist.append("")

In [6]:
df.iloc[3]['refs']

'A number of the differences between rural and urban areas are common across most of the countries.'

In [7]:
df2.iloc[3]['table_text']

['A number of the differences between rural and urban areas are common across most of the countries.',
 'First, the family formation process in rural areas typically takes place at younger ages than in urban areas.',
 'This is especially evident when examining the median ages at first marriage and birth of first child, which are substantially higher in urban areas in every country.',
 'The pattern is less evident for age at first sex, where there is very little difference between rural and urban areas for three of the eight countries (Benin, Rwanda, and Uganda), a higher median age in rural areas in one country (Nigeria), and a higher median age in urban areas in four countries (Mali, Ethiopia, India, and Nepal).',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [8]:
# BLEU
score = bleu_score.compute(
    predictions=preds,
    references=refs
)
print("BLEU: " + str(score))
print()

BLEU: {'bleu': 0.050921144365139434, 'precisions': [0.18302964959568732, 0.0678330263965623, 0.03230769230769231, 0.016761930684086372], 'brevity_penalty': 1.0, 'length_ratio': 3567.3076923076924, 'translation_length': 46375, 'reference_length': 13}



In [9]:
# ROUGE
scores = rouge_score.compute(
    predictions=preds,
    references=refs,
    rouge_types=["rouge1", "rouge2", "rougeL"]
)

print("ROUGE:")
print(scores)
print()

ROUGE:
{'rouge1': 0.28327782595546724, 'rouge2': 0.11542169043773454, 'rougeL': 0.2443016125472928}



In [10]:
# Test
test_refs = [['Only 18% of women own a house, either alone or jointly, and only 15% own land.', 'In comparison, men are more than twice as likely to own a home alone or jointly (40%).', 'Men are also more than twice as likely to own land alone or jointly (34%).']]
test_preds = ['15% of women between the ages of 15 and 49 own land alone or jointly.']

score = chrf.compute(
    predictions=test_preds,
    references=test_refs,
    lowercase=True
)
print("CHRF: " + str(score))
print()

scores = rouge_score.compute(
    predictions=test_preds,
    references=test_refs,
    rouge_types=["rouge1", "rouge2", "rougeL"]
)

print("ROUGE:")
print(scores)
print()

score = bleu_score.compute(
    predictions=test_preds,
    references=test_refs
)
print("BLEU: " + str(score))
print()

CHRF: {'score': 43.95870591156349, 'char_order': 6, 'word_order': 0, 'beta': 2}

ROUGE:
{'rouge1': 0.5806451612903225, 'rouge2': 0.2857142857142857, 'rougeL': 0.38709677419354843}

BLEU: {'bleu': 0.28648682864686603, 'precisions': [0.6470588235294118, 0.4375, 0.26666666666666666, 0.14285714285714285], 'brevity_penalty': 0.8890097654027757, 'length_ratio': 0.8947368421052632, 'translation_length': 17, 'reference_length': 19}



In [11]:
# CHRF
score = chrf.compute(
    predictions=preds,
    references=refs,
    lowercase=True
)
print("CHRF: " + str(score))
print()

CHRF: {'score': 23.462802233817666, 'char_order': 6, 'word_order': 0, 'beta': 2}

