# Run exploratory code and print graphs

In [69]:
import re
import pandas as pd
from datasets import load_dataset, load_from_disk, Dataset
from find_formality import add_context_sentences, find_formality_dutch, filter_examples

## Code running on the whole iwslt2017_context en-nl dataset

In [64]:
# dataset.to_csv('annotations/annotations_bjurn_20.csv')

# dataset[0]

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.71ba/s]


30028

In [91]:
# Fetch the train split of the context-augmented IWSLT2017 en-nl corpus and
# add the context fields to it in the same step.
dataset = add_context_sentences(
    load_dataset('gsarti/iwslt2017_context', 'iwslt2017-en-nl', split='train')
)
# print(f'Total amount of rows in train set: {len(dataset)}')

# Words whose presence marks a sentence as a formality example.
formality_words = ['u', 'je', 'jij', 'jou', 'jouw', 'uw', 'jullie']
# formality_words = ['u', 'jij', 'jou', 'jouw', 'uw', 'jullie']  # variant without 'je'
word_boundary = r'\b'
# Single alternation in which every word is wrapped in word boundaries,
# so only whole tokens can match.
formality_regex = '|'.join(
    f'{word_boundary}{word}{word_boundary}' for word in formality_words
)

# Two filtering passes: first keep the formality examples, then remove
# sentences without enough context or with multiple formality words.
dataset = dataset.filter(
    lambda example: find_formality_dutch(formality_regex, example)
).filter(
    lambda example: filter_examples(formality_regex, example)
)

Adding context sentences: 100%|██████████| 237240/237240 [02:50<00:00, 1387.76it/s]


In [92]:
# Bare expression: the notebook displays the dataset summary (features and number of rows).
dataset

Dataset({
    features: ['doc_id', 'seg_id', 'translation', 'context'],
    num_rows: 29478
})

In [93]:
# Tally, per formality word, how often it is the first regex hit in the Dutch
# translation of the examples in `dataset`.
formality_word_count = {formality_word: 0 for formality_word in formality_words}

# Compile once up front instead of passing the raw pattern to `re.search` per row.
formality_pattern = re.compile(formality_regex, re.IGNORECASE)
unmatched_examples = 0

for example in dataset:
    match = formality_pattern.search(example['translation']['nl'])
    if match is None:
        # A search without a hit returns None; calling `.group()` on it would
        # abort the whole count with an AttributeError, so skip and report instead.
        unmatched_examples += 1
        continue
    # Lower-case the hit so that a capitalised match is counted under its
    # lower-case key in `formality_word_count`.
    formality_word_count[match.group().lower()] += 1

print(formality_word_count)
if unmatched_examples:
    print(f'{unmatched_examples} example(s) without a formality word were skipped')

{'u': 1349, 'je': 22886, 'jij': 624, 'jou': 282, 'jouw': 240, 'uw': 118, 'jullie': 3979}


## Code running on the annotated dataset

In [97]:
# load the annotated dataset
dataset_20 = load_from_disk('annotations/annotations_nl')

dataset_20

Dataset({
    features: ['doc_id', 'seg_id', 'translation', 'context'],
    num_rows: 20
})

In [96]:
# Tally, per formality word, how often it is the first regex hit in the Dutch
# translation of the examples in the annotated `dataset_20`.
# NOTE: this rebinds `formality_word_count`, replacing any earlier tally.
formality_word_count = {formality_word: 0 for formality_word in formality_words}

# Compile once up front instead of passing the raw pattern to `re.search` per row.
formality_pattern = re.compile(formality_regex, re.IGNORECASE)
unmatched_examples = 0

# Plain iteration: the row index is not needed for counting.
for example in dataset_20:
    match = formality_pattern.search(example['translation']['nl'])
    if match is None:
        # A search without a hit returns None; calling `.group()` on it would
        # abort the whole count with an AttributeError, so skip and report instead.
        unmatched_examples += 1
        continue
    # Lower-case the hit so that a capitalised match is counted under its
    # lower-case key in `formality_word_count`.
    formality_word_count[match.group().lower()] += 1

print(formality_word_count)
if unmatched_examples:
    print(f'{unmatched_examples} example(s) without a formality word were skipped')

{'u': 5, 'je': 1, 'jij': 3, 'jou': 1, 'jouw': 1, 'uw': 0, 'jullie': 9}
