In [3]:
# Install packages
!pip install nltk
!pip install numpy
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu
# !pip install google-cloud-storage
!pip install pandas

# Imports
import evaluate
import numpy as np
import pandas as pd
import rouge_score
import sacrebleu
from sacrebleu import CHRF
# import bert_score
import ast

# Metrics
# Load the four metric objects, in this order, through `evaluate.load`.
# NOTE: binding `rouge_score` here replaces the `rouge_score` module that the
# import section above bound to the same name.
rouge_score, bleu_score, sacrebleu_score, chrf = (
    evaluate.load(metric_id)
    for metric_id in ("rouge", "bleu", "sacrebleu", "chrf")
)

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [9

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [14]:
# datasets
import pandas as pd

# Read the tab-separated training file into a DataFrame.
df = pd.read_csv("train_translated_blueprints_final.csv", sep="\t")

# Expose each text column as a plain Python list; the scoring cells below
# consume linearized_input_list, blueprint_list and table_text_list.
linearized_input_list, blueprint_list, table_text_list = (
    df[column].tolist()
    for column in ("linearized_input", "blueprint", "table_text")
)

In [15]:
# linearized_input to blueprint
# linearized_input_list is passed as the predictions and blueprint_list as the
# references for both metrics.

# chrF (called with lowercase=True)
score = chrf.compute(predictions=linearized_input_list, references=blueprint_list, lowercase=True)
print(f"CHRF: {score}", end="\n\n")

# BLEU
score = bleu_score.compute(predictions=linearized_input_list, references=blueprint_list)
print(f"BLEU: {score}", end="\n\n")

CHRF: {'score': 25.435962275109812, 'char_order': 6, 'word_order': 0, 'beta': 2}

BLEU: {'bleu': 0.016356141606937158, 'precisions': [0.09467143862501365, 0.02133439963151974, 0.008950853279788203, 0.003958763948718939], 'brevity_penalty': 1.0, 'length_ratio': 2.6011949435159143, 'translation_length': 797157, 'reference_length': 306458}



In [16]:
# blueprint to table_text
# blueprint_list is passed as the predictions and table_text_list as the
# references for both metrics.

# chrF (called with lowercase=True)
score = chrf.compute(predictions=blueprint_list, references=table_text_list, lowercase=True)
print(f"CHRF: {score}", end="\n\n")

# BLEU
score = bleu_score.compute(predictions=blueprint_list, references=table_text_list)
print(f"BLEU: {score}", end="\n\n")

CHRF: {'score': 43.02170583890401, 'char_order': 6, 'word_order': 0, 'beta': 2}

BLEU: {'bleu': 0.12559344023426053, 'precisions': [0.3487786254560168, 0.16296808134217255, 0.08616778495975512, 0.05080079204595546], 'brevity_penalty': 1.0, 'length_ratio': 1.3280032587122887, 'translation_length': 306458, 'reference_length': 230766}



In [17]:
# test outputs
import pandas as pd

# Read the tab-separated predictions file into a DataFrame.
df = pd.read_csv("mt5-small-trans-blueprints-preds-full.csv", sep="\t")

# Inputs, one entry per row.
linearized_input_list = df['linearized_input'].tolist()

# Split every entry of the 'preds' column at the FIRST "Verbalisation:" marker:
# the text before it goes to blueprint_list, the text after it to
# table_text_list. When the marker is absent, str.partition returns
# (text, "", ""), so the whole prediction is kept as the blueprint and the
# verbalisation is the empty string.
# Splitting at the first marker (instead of requiring exactly two pieces) also
# handles predictions that repeat the marker; previously those were treated as
# if the marker were missing and produced an empty verbalisation.
blueprint_list = []
table_text_list = []

# pandas parses empty cells as NaN (a float), which has no string methods, so
# missing predictions are mapped to "" before splitting.
for text in df['preds'].fillna(""):
    blueprint, _marker, verbalisation = text.partition("Verbalisation:")
    blueprint_list.append(blueprint)
    table_text_list.append(verbalisation)

# Result: linearized_input_list, blueprint_list and table_text_list, all with
# one entry per row of the predictions file.

In [18]:
# linearized_input to blueprint
# linearized_input_list is passed as the predictions and blueprint_list as the
# references for both metrics.

# chrF (called with lowercase=True)
score = chrf.compute(predictions=linearized_input_list, references=blueprint_list, lowercase=True)
print(f"CHRF: {score}", end="\n\n")

# BLEU
score = bleu_score.compute(predictions=linearized_input_list, references=blueprint_list)
print(f"BLEU: {score}", end="\n\n")

CHRF: {'score': 23.013242552575033, 'char_order': 6, 'word_order': 0, 'beta': 2}

BLEU: {'bleu': 0.014284166673487137, 'precisions': [0.06035198000956574, 0.017927738331727082, 0.008679884664546238, 0.004432907348242811], 'brevity_penalty': 1.0, 'length_ratio': 5.100009956192752, 'translation_length': 102449, 'reference_length': 20088}



In [20]:
# blueprint to table_text
# blueprint_list is passed as the predictions and table_text_list as the
# references for both metrics.

# chrF (called with lowercase=True)
score = chrf.compute(predictions=blueprint_list, references=table_text_list, lowercase=True)
print(f"CHRF: {score}", end="\n\n")

# BLEU
score = bleu_score.compute(predictions=blueprint_list, references=table_text_list)
print(f"BLEU: {score}", end="\n\n")

CHRF: {'score': 36.174706137721536, 'char_order': 6, 'word_order': 0, 'beta': 2}

BLEU: {'bleu': 0.15907811551291728, 'precisions': [0.33223815213062524, 0.1945666235446313, 0.1286499299644435, 0.09197145907073431], 'brevity_penalty': 0.9565668333995695, 'length_ratio': 0.9574833174451859, 'translation_length': 20088, 'reference_length': 20980}



In [15]:
# dev outputs
import pandas as pd

# Read the tab-separated prediction and target files.
df = pd.read_csv("mt5-small-eng-only-blueprints-dev-preds.csv", sep="\t")
dev_df = pd.read_csv("eng_blueprints_dev.csv", sep="\t")


def _blueprint_section(text):
    """Return the part of ``text`` that precedes the first "Verbalisation:".

    When the marker is absent the whole text is returned unchanged. Cutting at
    the first marker (instead of requiring exactly one marker) keeps the
    verbalisation out of the blueprint even when the marker is repeated.
    """
    return text.partition("Verbalisation:")[0]


# Blueprint part of every prediction. pandas parses empty cells as NaN (a
# float), which has no string methods, so missing predictions become "".
blueprint_pred_list = [_blueprint_section(text) for text in df['preds'].fillna("")]

# Blueprint part of every target, in the row order of dev_df.
blueprint_actual_list = [_blueprint_section(text) for text in dev_df['blueprint_target']]



In [16]:
# predicted to actual blueprint
# blueprint_pred_list is passed as the predictions and blueprint_actual_list
# as the references for both metrics.

# chrF (called with lowercase=True)
score = chrf.compute(predictions=blueprint_pred_list, references=blueprint_actual_list, lowercase=True)
print(f"CHRF: {score}", end="\n\n")

# BLEU
score = bleu_score.compute(predictions=blueprint_pred_list, references=blueprint_actual_list)
print(f"BLEU: {score}", end="\n\n")

CHRF: {'score': 23.087662471085054, 'char_order': 6, 'word_order': 0, 'beta': 2}

BLEU: {'bleu': 0.050647215273258106, 'precisions': [0.5614886731391586, 0.2643874643874644, 0.08454106280193237, 0.047527296082209375], 'brevity_penalty': 0.32408386095944797, 'length_ratio': 0.4702003550595993, 'translation_length': 1854, 'reference_length': 3943}

