In [61]:
import json
from functools import partial
from pathlib import Path
from string import Template

import pandas as pd
import seqio
import tensorflow as tf
import tensorflow_io as tfio
from seqio.utils import map_over_dataset
from t5.data.preprocessors import parse_tsv
from t5.data.utils import rate_num_examples
from t5.evaluation.metrics import accuracy, sklearn_metrics_wrapper
from transformers import T5Tokenizer

from teva.utils import get_dataset_statistics

In [42]:
# Language codes of the MasakhaNEWS subsets handled in this notebook.
MASAKHANEWS_LANGUAGES = [
    "amh", "eng", "fra", "hau", "ibo", "lin", "lug", "orm",
    "pcm", "run", "sna", "som", "swa", "tir", "xho", "yor",
]

In [43]:
# One-off conversion: rewrite every MasakhaNEWS TSV split as a JSON Lines file
# (one record per line, non-ASCII characters written as-is) next to the TSV.
# NOTE(review): the absolute /home/... path ties this cell to one machine.
for language in MASAKHANEWS_LANGUAGES:
    for split in ["train", "dev", "test"]:
        split_frame = pd.read_csv(
            f"/home/aooladip/projects/AfriTeVa-keji/data/masakhanews/{language}/{split}.tsv",
            sep="\t",
        )
        split_frame.to_json(
            f"/home/aooladip/projects/AfriTeVa-keji/data/masakhanews/{language}/{split}.jsonl",
            orient="records",
            lines=True,
            force_ascii=False,
        )

In [None]:
# Load the project's SentencePiece model through the Hugging Face T5 tokenizer;
# the next cell uses it to inspect how a label string is tokenized.
tokenizer = T5Tokenizer.from_pretrained("../tokenizers/v150000_new/sentencepiece.bpe.model")

In [None]:
# Spot-check one label: the cell result is the token list for "politics".
# NOTE(review): this checks a single label only; a later comment assumes *all* labels are single tokens.
tokenizer.tokenize("politics")      # Luckily for us, many labels are single tokens in our dictionary!

In [31]:
# seqio vocabulary backed by the same SentencePiece model file as `tokenizer`;
# shared by the input and target features defined below.
DEFAULT_VOCAB = seqio.SentencePieceVocabulary("../tokenizers/v150000_new/sentencepiece.bpe.model")

In [32]:
# Feature spec for the task: both sides are tokenized with DEFAULT_VOCAB and
# request an appended EOS; "inputs" is declared with required=False.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=seqio.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True, required=False),
    targets=seqio.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True),
)

In [8]:
# Load dataset statistics from the MasakhaNEWS stats directory.
# NOTE(review): the loop below indexes the result as stats[split][lang] — confirm that layout against teva.utils.
stats = get_dataset_statistics("../data/masakhanews/stats")

In [9]:
# Accumulator for the statistics re-keyed as language -> split (filled by the loop below).
new_stats = {}

In [10]:
# Invert the nesting of `stats`: from stats[split][lang] to new_stats[lang][split].
# The set of languages is taken from the "test" entry.
for lang in stats["test"]:
    new_stats[lang] = dict(
        (split, stats[split][lang]) for split in ("train", "dev", "test")
    )

In [4]:
# Language codes of the MasakhaNEWS subsets handled in this notebook.
# Each code is listed exactly once: the previous version of this cell repeated
# "lug" and "orm", so anything iterating over the list processed them twice.
MASAKHANEWS_LANGUAGES = [
    "amh", "eng", "fra", "hau",
    "ibo", "lin", "lug", "orm",
    "pcm", "run", "sna", "som",
    "swa", "tir", "xho", "yor"
]

In [69]:
# Path template for one JSONL split; fill in with .substitute(language=..., split=...).
# NOTE(review): absolute, machine-specific path — other cells use paths relative to the notebook ("../...").
DATASET_PATH = Template("/home/aooladip/projects/AfriTeVa-keji/data/masakhanews/${language}/${split}.jsonl")

In [18]:
# Path template for a language's labels.txt; fill in with .substitute(language=...).
# NOTE(review): absolute, machine-specific path — other cells use paths relative to the notebook ("../...").
LABELS_PATH = Template("/home/aooladip/projects/AfriTeVa-keji/data/masakhanews/${language}/labels.txt")

In [6]:
# Language code selected for the experiment.
# NOTE(review): no other visible cell reads this value; the data source and label lookup below hard-code "eng".
lang = "yor"

In [70]:
# Line-oriented data source over the JSONL files: each seqio split name is
# mapped to the on-disk split it reads ("validation" is backed by "dev").
# NOTE(review): language is hard-coded to "eng" although `lang` is "yor" — confirm which is intended.
source = seqio.TextLineDataSource(
    split_to_filepattern={
        seqio_split: DATASET_PATH.substitute(split=file_split, language="eng")
        for seqio_split, file_split in (
            ("train", "train"),
            ("validation", "dev"),
            ("test", "test"),
        )
    }
)

In [48]:
# "validation" split straight from `source` (no task preprocessing applied),
# i.e. the lines of the file registered under "validation" above.
ds = source.get_dataset(split="validation")

In [49]:
# Show the first element of the raw dataset (a single JSON line, not a batch).
for batch in ds.take(1):
    print(batch)

tf.Tensor(b'{"category":"business","headline":"Rising costs leave Albrighton independent shops in limbo","text":"Independent shops in a Shropshire village say they are \\"in limbo\\" as energy bills spiral.\\nVillage Butcher, in Albrighton, is the high street\'s first casualty - it closed after its electricity bills more than doubled from \xc2\xa315,000 to \xc2\xa335,000.\\nBusinesses are not covered by an energy price cap, now \xc2\xa33,549 for households, and many face cost pressures.\\nIn Albrighton, business owners told the BBC they were concerned about raising their prices and losing customers.\\n\\"If I put my bills up, am I going to lose clients?,\\" asked Helen Pickering from Nieve Ella\'s hair salon at the top of High Street.\\n\\"A lot of my customers are the older generation and it\'s horrible that they can\'t afford to have their hair done,\\" she said. \\n\\"If I can keep my prices the same, and keep my clients coming in, at least then I will have money coming in to pay my

In [22]:
# TSV row parser bound to the four MasakhaNEWS column names.
# NOTE(review): not referenced by any other visible cell (the task decodes JSON lines instead) — possibly a leftover.
parser = partial(parse_tsv, field_names=["category", "headline", "text", "url"])

In [None]:
def get_labels(labels_file: str):
    """Return the lines of `labels_file` as a list of strings.

    The file is opened through `tf.io.gfile.GFile`, read in full, and split
    with `splitlines()`, so entries carry no line terminators.
    """
    with tf.io.gfile.GFile(labels_file) as handle:
        contents = handle.read()
    return contents.splitlines()

In [None]:
# Schema handed to the JSON decoder: every record field is a scalar string tensor
# named after the field itself.
SPECS = {
    name: tf.TensorSpec(shape=tf.TensorShape([]), dtype=tf.string, name=name)
    for name in ("category", "headline", "text", "url")
}

In [66]:
@map_over_dataset
def jsonline_to_dict(line: str):
    """Decode one JSON line into the fields described by SPECS."""
    decode = tfio.experimental.serialization.decode_json
    return decode(line, specs=SPECS)

In [24]:
# We already confirmed that all labels are single tokens in our tokenizer vocabulary
@map_over_dataset
def create_news_classification_example(
    example,
    config = "text"
):
    """Turn a decoded news record into a text-to-text classification example.

    Args:
        example: mapping with string tensors under "headline", "text" and
            "category".
        config: selects the article fields used as model input:
            "headline_only" -> the headline,
            "text"          -> the body text,
            anything else   -> the body text followed by the headline.

    Returns:
        A dict with "inputs" ("classify:" and the selected text, space-joined)
        and "targets" (the record's category).
    """
    if config == "headline_only":
        article = example["headline"]
    elif config == "text":
        article = example["text"]
    else:
        # Join with a space: plain `+` glued the last word of the text to the
        # first word of the headline.
        article = tf.strings.join(
            [example["text"], example["headline"]], separator=" "
        )

    return {
        "inputs": tf.strings.join(inputs=["classify:", article], separator=" "),
        "targets": example["category"],
    }

In [27]:
# Metric function factory
def weighted_multiclass_f1(num_classes, **metric_fn_kwargs):
    """Build a metric fn for the weighted average of per-class F1, scaled to 0-100.

    Uses scikit-learn's `f1_score` with `average="weighted"` (per-class scores
    averaged with weights proportional to each class's support).

    Args:
        num_classes: number of classes. Used in the reported metric name
            ("weighted_<n>class_f1") and, unless `labels` is supplied via
            `metric_fn_kwargs`, as `labels=range(num_classes)`.
        **metric_fn_kwargs: extra keyword arguments forwarded through
            `sklearn_metrics_wrapper`; may include `labels` to override the
            default integer class ids.

    Returns:
        The metric callable produced by `sklearn_metrics_wrapper`.
    """
    # NOTE(review): the default labels are integer ids 0..num_classes-1; if the
    # task's targets/predictions are label strings they will not match — confirm
    # a label->id postprocessor is in place or pass `labels=` explicitly.
    metric_fn_kwargs.setdefault("labels", range(num_classes))
    # `beta` is deliberately not passed: it is a parameter of `fbeta_score`,
    # not of `f1_score`, and forwarding it makes the metric call fail.
    return sklearn_metrics_wrapper(
        "f1_score",
        metric_dict_str="weighted_%dclass_f1" % num_classes,
        metric_post_process_fn=lambda x: 100 * x,
        average="weighted",
        **metric_fn_kwargs
    )


In [28]:
# Weighted-F1 metric sized by the number of lines in the English labels file.
# NOTE(review): the label file is hard-coded to "eng" — confirm it matches the language the task evaluates.
weighted_f1 = weighted_multiclass_f1(len(get_labels(LABELS_PATH.substitute(language="eng")))) 

In [71]:
# Build the seqio Task directly from the JSONL source and the preprocessing chain.
# NOTE(review): the task is named "yor_news" but `source` is built with language="eng" — confirm which is intended.
# NOTE(review): no postprocess_fn is given, so the metrics presumably receive decoded label strings — verify
# this is compatible with weighted_f1's labels.
task = seqio.Task(
    name="yor_news",
    source=source,
    preprocessors=[
        # JSON line -> dict of record fields
        jsonline_to_dict,
        # record fields -> {"inputs", "targets"} using the article body text
        partial(create_news_classification_example, config="text"),
        seqio.preprocessors.tokenize,
        seqio.preprocessors.append_eos_after_trim
    ],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[accuracy, weighted_f1]
)

In [72]:
# Preprocessed "test" split of the task, requested with sequence lengths 512 (inputs) and 2 (targets).
# NOTE(review): targets=2 presumably leaves room for one label token plus EOS — this relies on every
# label being a single token in the vocabulary; confirm for all languages.
yor_ds = task.get_dataset(split="test", sequence_length={"inputs": 512, "targets": 2})

In [74]:
# Print the first preprocessed example (as numpy values) to eyeball the pipeline output.
for ex in yor_ds.take(1).as_numpy_iterator():
    print(ex)

{'inputs_pretokenized': b'classify: grandparents of I\'m a Celebrity... Get Me Out of Here! finalist Owen Warner have said they were "so proud" of his performance on the show.  \nFormer Lioness Jill Scott was crowned queen of the jungle on Sunday, with the Hollyoaks actor coming second. \nConservative MP Matt Hancock came third in the annual TV show.\nWarner\'s grandparents Anne and David Beck, who watched the final at home in Thurmaston, Leicestershire, said seeing him on the show had been "surreal". \n23-year-old was one of 12 contestants to head to the Australian jungle for the first time since the pandemic. \nHis grandad, who he affectionately calls The General, said he was "very pleased" with how well Warner did. \n"I\'m extremely proud," he said. \n"He\'s shown his true colours and it\'s good to see him settling down.\n"He was in awe initially when he was in there with so many esteemed people, but he\'s got used to them and he\'s relaxed and taken it in his stride."\nHis Grandmot