In [2]:
from datasets import load_from_disk
import datasets

Concatenate datasets

In [3]:
# Folders that each hold one dataset on disk; the first one seeds `data`
# and every further folder is appended to it in list order.
data_folders = ["./annotations/annotations_ru_20"]
data = load_from_disk(data_folders[0])
for extra_dataset in map(load_from_disk, data_folders[1:]):
    data = datasets.concatenate_datasets([data, extra_dataset])


In [4]:
# Quick sanity check of the loaded columns and row count.
print(f'columns: {data.column_names}', f'nrows: {data.num_rows}', sep='\n')

columns: ['doc_id', 'seg_id', 'translation', 'context']
nrows: 20


In [5]:
# Inspect the first row to see how 'translation' and 'context' are nested.
print(data[0])

{'doc_id': 1695, 'seg_id': 205, 'translation': {'en': 'What is it about you?', 'nl': 'Wat heb jij?'}, 'context': {'en': ['CA: How cool is that?  Elon, how have you done this?', "These projects are so -- Paypal, SolarCity, Tesla, SpaceX, they're so spectacularly different, they're such ambitious projects at scale.", 'How on Earth has one person been able to innovate in this way?'], 'nl': ['CA: Hoe gaaf is data  Elon, hoe heb je dit gedaan?', 'Deze projecten zijn zo -- PayPal, SolarCity, Tesla, SpaceX, ze zijn zo spectaculair anders, het zijn zeer grote, ambitieuze projecten.', 'Hoe is het in hemelsnaam mogelijk, dat één persoon zó kan innoveren?'], 'ru': ['КА: Насколько это здорово? Элон, как вы это сделали?', 'Эти проекты такие — Paypal, SolarCity, Tesla, SpaceX, они настолько впечатляюще разные, они такие амбициозные проекты по масштабу.', 'Как это возможно чтобы один человек смог внедрить инновации таким образом?']}}


The context is stored as a list of sentences; concatenate them into a single long string.

In [6]:
def concatenate_context(row, languages=None):
    """Add a 'context_concatenated' entry holding one joined string per language.

    Args:
        row: Mapping whose 'context' entry maps each language code to a list
            of sentence strings.
        languages: Language codes to process. When None, the codes
            'en', 'nl' and 'ru' are used.

    Returns:
        The same ``row`` object, with ``row['context_concatenated']`` set to a
        dict mapping each requested language code to its sentences joined by
        a single space.

    Raises:
        KeyError: If a requested language is not present in ``row['context']``.
    """
    selected = ['en', 'nl', 'ru'] if languages is None else languages
    joined = {}
    for code in selected:
        # Sentences are glued with one space; an empty list yields ''.
        joined[code] = ' '.join(row['context'][code])
    row['context_concatenated'] = joined
    return row

In [7]:
# Run concatenate_context over the dataset so each row gains 'context_concatenated'.
data = data.map(concatenate_context)

In [8]:
# Print, per row: the joined context in en/nl/ru, then the en/nl translations,
# followed by a blank separator line. The 'ru' translation is not printed.
for row in data:
    for lang in ('en', 'nl', 'ru'):
        print(row['context_concatenated'][lang])
    for lang in ('en', 'nl'):
        print(row['translation'][lang])
    print()

CA: How cool is that?  Elon, how have you done this? These projects are so -- Paypal, SolarCity, Tesla, SpaceX, they're so spectacularly different, they're such ambitious projects at scale. How on Earth has one person been able to innovate in this way?
CA: Hoe gaaf is data  Elon, hoe heb je dit gedaan? Deze projecten zijn zo -- PayPal, SolarCity, Tesla, SpaceX, ze zijn zo spectaculair anders, het zijn zeer grote, ambitieuze projecten. Hoe is het in hemelsnaam mogelijk, dat één persoon zó kan innoveren?
КА: Насколько это здорово? Элон, как вы это сделали? Эти проекты такие — Paypal, SolarCity, Tesla, SpaceX, они настолько впечатляюще разные, они такие амбициозные проекты по масштабу. Как это возможно чтобы один человек смог внедрить инновации таким образом?
What is it about you?
Wat heb jij?

And the week before I showed up, the CEO of this big software company went to that group, 200 engineers, and canceled the project. And I stood there in front of 200 of the most depressed people I'v

In [9]:
# inseq seems to only work when doing 'pip install -r requirements.txt'
import inseq
from inseq.commands.attribute_context.attribute_context import attribute_context_with_model, AttributeContextArgs

Load the mBART-large-50 one-to-many model in inseq, configured for English → Russian

In [14]:
# Load the mBART-50 one-to-many checkpoint with the "saliency" attribution method;
# the tokenizer is configured with English as source and Russian as target language.
inseq_model = inseq.load_model(
    "facebook/mbart-large-50-one-to-many-mmt",
    "saliency",
    tokenizer_kwargs={'src_lang': 'en_XX', 'tgt_lang': 'ru_RU'},
)

In [15]:
def get_pecore_args(
        input_context_text,
        input_current_text,
        output_context_text,
        output_current_text=None,
        sample_identifier=None,
        output_folder='output'
):
    """Build the AttributeContextArgs for one English -> Russian sample.

    Args:
        input_context_text: Passed through as ``input_context_text``.
        input_current_text: Passed through as ``input_current_text``.
        output_context_text: Passed through as ``output_context_text``.
        output_current_text: Accepted for interface compatibility but
            currently NOT forwarded to AttributeContextArgs.
        sample_identifier: Base name for the result files. When left as None
            the file names contain the literal text "None".
        output_folder: Folder prefix used for ``save_path`` and ``viz_path``.

    Returns:
        An AttributeContextArgs instance with fixed model, attribution and
        template settings plus the per-sample texts and file paths.
    """
    settings = dict(
        model_name_or_path="facebook/mbart-large-50-one-to-many-mmt",
        attribution_method="saliency",
        attributed_fn="contrast_prob_diff",
        context_sensitivity_metric="kl_divergence",
        context_sensitivity_std_threshold=0,
        attribution_std_threshold=2,
        attribution_topk=5,
        input_context_text=input_context_text,
        input_current_text=input_current_text,
        output_context_text=output_context_text,
        # output_current_text is intentionally left out (see docstring).
        contextless_input_current_text="{current}",
        input_template="{context} {current}",
        contextless_output_current_text="{current}",
        output_template="{context} {current}",
        # JSON result and HTML visualisation share the same base name.
        save_path=f"{output_folder}/{sample_identifier}.json",
        viz_path=f"{output_folder}/{sample_identifier}.html",
        tokenizer_kwargs={'src_lang': 'en_XX', 'tgt_lang': 'ru_RU'},
    )
    return AttributeContextArgs(**settings)

Run a small test

In [16]:
# Smoke-test arguments built from short hand-written strings.
# NOTE(review): the output context here is Dutch ("Dit is context") while
# get_pecore_args sets tgt_lang to 'ru_RU' — confirm whether a Russian string was intended.
pec_args = get_pecore_args("This is context", "for translating this text", "Dit is context", sample_identifier="sample")

In [17]:
# Run the attribution once on the smoke-test arguments, reusing the already loaded model.
out = attribute_context_with_model(pec_args, inseq_model)

Now for use with the dataset

In [18]:
def get_pecore_args_for_row(row, sample_identifier=None, output_folder='output'):
    """Build the attribution arguments for a single dataset row.

    Args:
        row: Mapping with 'context_concatenated' (containing 'en' and 'ru'
            strings) and 'translation' (containing an 'en' string).
        sample_identifier: Forwarded unchanged to ``get_pecore_args``.
        output_folder: Forwarded unchanged to ``get_pecore_args``.

    Returns:
        Whatever ``get_pecore_args`` returns for the English context, the
        English current sentence and the Russian context of this row.
    """
    contexts = row['context_concatenated']
    return get_pecore_args(
        input_context_text=contexts['en'],
        input_current_text=row['translation']['en'],
        output_context_text=contexts['ru'],
        sample_identifier=sample_identifier,
        output_folder=output_folder,
    )

In [19]:
# Attribute every row in turn; a ValueError on one row is reported and the
# loop moves on to the next row instead of aborting the whole run.
for i, row in enumerate(data):
    try:
        pecore_args = get_pecore_args_for_row(row, f"{i}_ru")
        out = attribute_context_with_model(pecore_args, inseq_model)
    except ValueError as e:
        # Blank line before and after so the failure stands out in the output.
        print("", f"ERROR for {i}", e, "", sep="\n")
        