In [1]:
# Install

# Packages for transformers
# !pip install transformers
!pip install -U accelerate
!pip install -U transformers
!pip install datasets
!pip install torch
!pip install sentencepiece

!pip install nltk
!pip install numpy
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu

# Google Cloud packages for dataset retrieval
# !pip install google-cloud-storage
!pip install pandas

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.23.0 huggingface-hub-0.18.0
Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m18.

In [2]:
import torch
import random
import evaluate
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from dataclasses import dataclass
from time import perf_counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, disable_progress_bar, Dataset, DatasetDict
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
import rouge_score
import sacrebleu
from sacrebleu import CHRF

In [3]:
# Read the three tab-separated splits, wrap each one in a Hugging Face Dataset
# and collect them in a single DatasetDict keyed by split name.

train_df = pd.read_csv("train_joined_refs.csv", sep='\t')
val_df = pd.read_csv("dev_joined_refs.csv", sep='\t')
test_df = pd.read_csv("test_joined_refs.csv", sep='\t')

train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['example_id', 'title', 'unit_of_measure', 'chart_type', 'was_translated', 'table_data', 'table_text', 'linearized_input'],
        num_rows: 1393
    })
    validation: Dataset({
        features: ['example_id', 'title', 'unit_of_measure', 'chart_type', 'was_translated', 'table_data', 'table_text', 'linearized_input'],
        num_rows: 754
    })
    test: Dataset({
        features: ['example_id', 'title', 'unit_of_measure', 'chart_type', 'was_translated', 'table_data', 'table_text', 'linearized_input'],
        num_rows: 763
    })
})

In [10]:
# Pull one training record (row 106) to inspect the available fields
sample = dataset_dict["train"][106]
sample

{'example_id': 'ATR11-en-10',
 'title': 'Delivery Assistance from a Skilled Provider',
 'unit_of_measure': 'Percent of live births in the past five years assisted by a skilled provider',
 'chart_type': 'Map Chart',
 'was_translated': False,
 'table_data': '[["Kagera", 0.54], ["Kigoma", 0.34], ["Rukwa", 30], ["Mwanza", 0.44], ["Tabora", 0.46], ["Mbeya", 0.43], ["Mara", 0.3], ["Shinyanga", 0.35], ["Singida", 0.49], ["Arusha", 0.47], ["Manyara", 0.39], ["Dodoma", 0.46], ["Iringa", 0.81], ["Ruvuma", 0.83], ["Kilimanjaro", 0.86], ["Tanga", 0.45], ["Morogoro", 0.61], ["Pwani", 0.74], ["Lindi", 0.52], ["Mtwara", 0.6], ["Dar Es Salaam", 0.91], ["Pemba North", 0.25], ["Pemba South", 0.42], ["Unguja North", 0.45], ["Unguja South", 0.71]]',
 'table_text': 'Just 25% of women living in Pemba North received delivery assistance from a skilled provider, compared to 86% of women in Kilimanjaro.',
 'linearized_input': 'Delivery Assistance from a Skilled Provider | Percent of live births in the past five

In [9]:
# Metrics
# Load the four metrics through `evaluate`, in the same order as before.
# NOTE(review): `rouge_score` rebinds the name of the `rouge_score` module
# imported in the import cell; after this cell the name refers to the metric.
rouge_score, bleu_score, sacrebleu_score, chrf = (
    evaluate.load(metric_name)
    for metric_name in ("rouge", "bleu", "sacrebleu", "chrf")
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [10]:
# Test metrics
# Smoke-test the loaded metrics on a toy prediction/reference pair; the two
# strings defined here are reused by the metric cells that follow.
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

# BLEU via the `evaluate` "bleu" metric
bleu_score.compute(
    predictions=[generated_summary],
    references=[reference_summary]
)

{'bleu': 0.6434588841607617,
 'precisions': [0.8571428571428571, 0.6666666666666666, 0.6, 0.5],
 'brevity_penalty': 1.0,
 'length_ratio': 1.1666666666666667,
 'translation_length': 7,
 'reference_length': 6}

In [11]:
# Same toy pair scored with the "sacrebleu" metric
# (the recorded output reports the score on a 0-100 scale, unlike "bleu" above)
sacrebleu_score.compute(
    predictions=[generated_summary],
    references=[reference_summary]
)

{'score': 64.34588841607616,
 'counts': [6, 4, 3, 2],
 'totals': [7, 6, 5, 4],
 'precisions': [85.71428571428571, 66.66666666666667, 60.0, 50.0],
 'bp': 1.0,
 'sys_len': 7,
 'ref_len': 6}

In [12]:
# ROUGE-1 / ROUGE-2 / ROUGE-L for the same toy pair
scores = rouge_score.compute(
    predictions=[generated_summary],
    references=[reference_summary],
    rouge_types=["rouge1", "rouge2", "rougeL"]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923}

In [13]:
# chrF for the same toy pair
chrf.compute(predictions=[generated_summary], references=[reference_summary])

{'score': 89.21375498911478, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [35]:
# Load the pretrained checkpoint named by `model_name` together with its tokenizer
model_name = 'google/mt5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
def batch_tokenize_fn(examples, max_source_length=128, max_target_length=512):
    """
    Generate the input_ids and labels field for huggingface dataset/dataset dict.

    Truncation is enabled where we cap the sentence to the max length. Padding will be done later
    in a data collator, so we pad examples to the longest length within a mini-batch and not
    the whole dataset.

    Parameters
    ----------
    examples : batch of dataset rows
        Must provide the 'linearized_input' (source) and 'table_text' (target) columns.
    max_source_length : int, default 128
        Token limit applied to the source texts.
    max_target_length : int, default 512
        Token limit applied to the target texts.

    Returns
    -------
    The tokenizer output for the sources, with the target token ids stored
    under the "labels" key.
    """
    sources = examples['linearized_input']
    targets = examples['table_text']

    model_inputs = tokenizer(sources, max_length=max_source_length, truncation=True)

    # Tokenize the targets through `text_target`, which replaces the deprecated
    # `as_target_tokenizer()` context manager. Sources and targets are tokenized
    # in separate calls because they use different length limits.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    # huggingface expects the target tokenized ids to be stored in the labels field
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
# Tokenize every split in batches. The original columns are removed so that
# only the fields returned by batch_tokenize_fn remain in the mapped dataset.
dataset_dict_tokenized = dataset_dict.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=dataset_dict["train"].column_names
)
dataset_dict_tokenized

Map:   0%|          | 0/4782 [00:00<?, ? examples/s]



Map:   0%|          | 0/754 [00:00<?, ? examples/s]

Map:   0%|          | 0/763 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4782
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 754
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 763
    })
})

In [17]:
# Collator that performs the per-mini-batch padding deferred by batch_tokenize_fn
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Sanity check: collate the first two tokenized training examples and inspect the batch
features = [dataset_dict_tokenized["train"][i] for i in range(2)]
output = data_collator(features)
output

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  5323,  45142,   6851,  11135,   5373,   6851,  51279,    575,   5340,
          19446,    259,  93479,    831,    556,    670,   1736,    842,   1364,
           7491,   1093,    632,   1240,  12441,    259,  47909,    307,  77929,
          11238,   1105,  63834,    831, 163206,   7139,    575,    259,  32329,
            445,   1021,  17830,   2216,    548,    905, 217492,    307,    274,
         130111,    842,   1985,   1463,   6778,    556,  68812,  45142,    261,
            259,  49544,    261,   5029,   5391,    274, 130111,    842,    259,
          40810,    556,  68812,  45142,    261,    259,  49544,    261,   5654,
          18570,    274, 130111,    842,    556,  35976,   2647,    556,  68812,
          45142,    261,    259,  49544,    261, 131950,  18570,    274, 130111,
            842,   1985,   1463,   6778,    556,  68812,  45142,    261,   2019,
           4225,  74460,   3552,    261,   5006,  13249,    274, 130111,    842,
            25

In [18]:
# Training configuration.
# Effective train batch size is per_device_train_batch_size * gradient_accumulation_steps
# = 64 examples per optimizer step (per device).
args = Seq2SeqTrainingArguments(
    output_dir='./mt5-small-tata-finetuned',
    # Evaluate and checkpoint once per epoch. With a few thousand training rows,
    # 64 examples per optimizer step and 3 epochs, the run has far fewer optimizer
    # steps than the library's default step interval, so a "steps" schedule would
    # never evaluate or save during training and `load_best_model_at_end` would
    # have no checkpoint to pick from.
    evaluation_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    learning_rate=0.001,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # weight_decay=config.weight_decay,
    save_total_limit=2,
    num_train_epochs=3,
    # Run generate() during evaluation so the text metrics can be computed
    predict_with_generate=True,
    load_best_model_at_end=True,
    # "rougeL" is one of the keys returned by compute_metrics; higher is better
    greater_is_better=True,
    metric_for_best_model="rougeL",
    gradient_accumulation_steps=8,
    # do_train=do_train,
    fp16=False
)

In [19]:
def compute_metrics(eval_pred):
    """
    Compute rouge, sacrebleu and chrf metrics for seq2seq model generated prediction.

    tip: we can run trainer.predict on our eval/test dataset to see what a sample
    eval_pred object would look like when implementing custom compute metrics function

    Parameters
    ----------
    eval_pred : pair of (predictions, labels)
        Token-id arrays for the generated sequences and the reference sequences.
        `predictions` may also be a tuple whose first element holds the ids.

    Returns
    -------
    dict
        "rouge1", "rouge2", "rougeL", "sacrebleu" and "chrf", each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back (generated_ids, extra_outputs); only the ids are decodable
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 in predictions and labels as we can't decode them
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode generated summaries and reference summaries from ids into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"]
    )
    score = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    chrf_score = chrf.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result["sacrebleu"] = score["score"]
    result["chrf"] = chrf_score["score"]
    return {k: round(v, 4) for k, v in result.items()}

In [20]:
# Assemble the trainer from the model, the training arguments, the tokenized
# train/validation splits, the padding collator and the metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset_dict_tokenized["train"],
    eval_dataset=dataset_dict_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Early stopping is currently disabled
    # callbacks=[EarlyStoppingCallback()]
)

In [21]:
# Fine-tune the model, then persist it.
# NOTE(review): the recorded run of this cell stopped with an OutOfMemoryError
# raised during train(), so save_model() was not reached in that run.
trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss


OutOfMemoryError: ignored

In [None]:
# Evaluate on the trainer's eval_dataset (the validation split passed at construction)
trainer.evaluate()

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
# !huggingface-cli login

In [None]:
# trainer.push_to_hub('mt5-small-tata-joined-refs')

In [None]:
def generate_verbalisation(model, tokenizer, example, max_new_tokens=100):
    """
    Generate the model's text output for a single input string.

    Parameters
    ----------
    model
        Object exposing `.device` and `.generate(input_ids, max_new_tokens=...)`.
    tokenizer
        Callable returning a mapping with an "input_ids" list for `example`,
        and exposing `.decode(ids, skip_special_tokens=...)`.
    example : str
        Source text to verbalise (e.g. a 'linearized_input' value).
    max_new_tokens : int, default 100
        Upper bound on the number of tokens to generate.

    Returns
    -------
    str
        The decoded prediction with special tokens removed.
    """
    # NOTE(review): the input is not truncated here, whereas batch_tokenize_fn
    # truncates sources at training time — confirm this mismatch is intended.
    token_ids = tokenizer(example)["input_ids"]
    # generate() works on batches, so shape the ids as (1, sequence_length)
    input_ids = torch.tensor(token_ids, dtype=torch.long).view(1, -1).to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=max_new_tokens)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Source text (linearized input) of test example 60
dataset_dict['test'][60]['linearized_input']

In [None]:
# Reference text of the same test example
dataset_dict['test'][60]['table_text']

In [None]:
# Generate a prediction for test example 60 from its linearized input
example = dataset_dict['test'][60]['linearized_input']
generate_verbalisation(model, tokenizer, example)

In [None]:
# test_df['prediction'] = test_df['linearized_input'].apply(lambda x: generate_verbalisation(model, tokenizer, x))

In [None]:
# test_df.to_csv('preds.csv', sep='\t', index=False)
# test_df

In [None]:
# df = pd.read_csv("preds.csv", sep='\t')
# df

In [None]:
# Re-read the training split for the token-length exploration below
# (same file that was loaded into `train_df` in the dataset-loading cell).
df = pd.read_csv("train_joined_refs.csv", sep='\t')

In [46]:
# Take the linearized input of row 4 as a sample string for tokenization
test_str = df.iloc[4].linearized_input
test_str

'Youth Empowerment among Young Women age 15-29 by country | percentage in pooled terciles | (High Empowerment, Mali, 13.2) (Medium Empowerment, Mali, 17.9) (Low Empowerment, Mali, 68.9) (High Empowerment, Ethiopia, 14.7) (Medium Empowerment, Ethiopia, 22.7) (Low Empowerment, Ethiopia, 62.6) (High Empowerment, Malawi, 15.8) (Medium Empowerment, Malawi, 55) (Low Empowerment, Malawi, 29.2) (High Empowerment, Uganda, 25.4) (Medium Empowerment, Uganda, 32.1) (Low Empowerment, Uganda, 42.4) (High Empowerment, Zambia, 27.4) (Medium Empowerment, Zambia, 28.3) (Low Empowerment, Zambia, 44.3) (High Empowerment, Nigeria, 34.2) (Medium Empowerment, Nigeria, 35.5) (Low Empowerment, Nigeria, 30.3) (High Empowerment, Haiti, 42.8) (Medium Empowerment, Haiti, 40.1) (Low Empowerment, Haiti, 17.1) (High Empowerment, Nepal, 43.5) (Medium Empowerment, Nepal, 37) (Low Empowerment, Nepal, 19.5) (High Empowerment, Senegal, 43.9) (Medium Empowerment, Senegal, 21.5) (Low Empowerment, Senegal, 34.6) (High Empowe

In [47]:
# Tokenize the sample string, truncating at 1000 tokens
tokenized = tokenizer(test_str, max_length=1000, truncation=True)
tokenized

{'input_ids': [259, 43235, 415, 220794, 1143, 259, 14925, 19267, 6880, 259, 2015, 670, 26349, 455, 11395, 307, 259, 33733, 281, 8730, 345, 50910, 1838, 307, 274, 41930, 415, 220794, 1143, 261, 26559, 261, 5029, 5391, 274, 108202, 415, 220794, 1143, 261, 26559, 261, 5654, 18570, 274, 101424, 415, 220794, 1143, 261, 26559, 261, 131950, 18570, 274, 41930, 415, 220794, 1143, 261, 259, 43007, 261, 5006, 13249, 274, 108202, 415, 220794, 1143, 261, 259, 43007, 261, 8334, 13249, 274, 101424, 415, 220794, 1143, 261, 259, 43007, 261, 131730, 12084, 274, 41930, 415, 220794, 1143, 261, 259, 57189, 261, 5383, 16375, 274, 108202, 415, 220794, 1143, 261, 259, 57189, 261, 259, 101553, 274, 101424, 415, 220794, 1143, 261, 259, 57189, 261, 10662, 5391, 274, 41930, 415, 220794, 1143, 261, 259, 62036, 261, 9546, 11991, 274, 108202, 415, 220794, 1143, 261, 259, 62036, 261, 58865, 7267, 274, 101424, 415, 220794, 1143, 261, 259, 62036, 261, 47979, 11991, 274, 41930, 415, 220794, 1143, 261, 259, 99532, 261, 9

In [48]:
# Number of token ids produced for the sample string
len(tokenized.input_ids)

347

In [62]:
# Token-length profile of the linearized inputs in `df` (no truncation).
# Lengths are collected once into a list, so the statistics below do not
# re-tokenize rows and the `tokenized` variable from the earlier cell is
# left untouched.
input_lengths = [
    len(tokenizer(text, truncation=False).input_ids)
    for text in df['linearized_input']
]

# Longest input (in tokens) and the index label of the first row reaching it
top = max(input_lengths, default=0)
ind = df.index[input_lengths.index(top)] if top else 0

# The two buckets are disjoint: `above_100` counts rows with 100 < length <= 500,
# `above_500` counts rows with length > 500.
above_500 = sum(length > 500 for length in input_lengths)
above_100 = sum(100 < length <= 500 for length in input_lengths)

print(ind)
print(top)
print(above_100)
print(above_500)

79
2266
3203
147


In [60]:
# Inspect the linearized input of row 46 (rebinds `test_str`)
test_str = df.iloc[46].linearized_input
test_str

'Ní orílẹ̀-èdè kọ̀ọ̀kan, ìpín aláfiwé gíga kan ní a kà sí eyiti ó bọ́sínú àkókò fún ìmúniwọ-ẹgbẹ́ ìbálòpọ̀ ṣíṣe fún yálà ìgbéyàwó tabi ọmọ bíbí. Fún àpẹẹrẹ, ní ilẹ̀ Nàìjíríà, ìdá 62.5 nínú ìdá ọgọ́ọ̀rún àwọn ọkùnrin ni a fí sínú ìṣọ̀rí àwọn tóbọ́ sínú àkókò fún ìbálòpọ̀ àkọ́kọ́, ní àfiwé pẹlu ìdá 34.5 nínú ìdá ọgọ́ọ̀rún àti ìdá 27.1 nínú ìdá ọgọ́ọ̀rún fún ìgbéyàwó àti ìbí ọmọ àkọ́kọ́ ní ṣíṣẹ̀-n-tẹ̀lé. Púpọ̀jù kan ninu àwọn ọkùnrin ní orílẹ̀-èdè Nàìjíríà ni a fí sínú ìṣọ̀rí àwọn tóbọ́ sínú àkókò fún ìmúniwọ-ẹgbẹ́ ti ìbálòpọ̀ àkọ́kọ́, pẹlu àwọn ọwọ́jà láti ìdá 50.3 nínú ìdá ọgọ́ọ̀rún ní ilẹ̀ Bìnì sí ìdá 82.8 nínú ìdá ọgọ́ọ̀rún ní Mali. Ní ìdàkejì, ìpín aláfiwé tí àwọn ọkùnrin tí a fi sínú ìṣọ̀rí àwọn tóbọ́ sínú àkókò fún ìgbéyàwó àkọ́ko