In [None]:
from datasets import load_dataset, concatenate_datasets
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer ship the old names, so use the new name whenever it exists
# and fall back to the old one on older installations.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

## Load datasets

In [None]:
# Load the 'da' configuration of ScandiQA, forcing a fresh download
da = load_dataset(
    'alexandrainst/scandiqa',
    'da',
    use_auth_token=True,
    download_mode="force_redownload",
)
da

In [None]:
# Load the 'sv' configuration of ScandiQA, forcing a fresh download
sv = load_dataset(
    'alexandrainst/scandiqa',
    'sv',
    use_auth_token=True,
    download_mode="force_redownload",
)
sv

In [None]:
# Load the 'no' configuration of ScandiQA, forcing a fresh download
no = load_dataset(
    'alexandrainst/scandiqa',
    'no',
    use_auth_token=True,
    download_mode="force_redownload",
)
no

## Analyse data

In [None]:
def analyse(language: str):
    """Print summary statistics for a training split and plot its context lengths.

    The dataset is looked up by name in the notebook's global namespace, so a
    global variable called `language` (e.g. `da`) holding the loaded dataset
    must exist before this function is called.

    Args:
        language: Name of the global variable holding the dataset whose
            'train' split is analysed.

    Returns:
        The training split converted to a pandas DataFrame.
    """
    print(f'\n=== Analysing training split of {language} ===')
    df = globals()[language]['train'].to_pandas()

    # Show a sample
    display(df.head(3))

    # Count samples
    num_samples = len(df)
    print(f'There are {num_samples:,} samples.')

    # The first answer text of every sample; an empty string marks "no answer"
    first_answers = df.answers.map(lambda dct: dct['text'][0])

    # Get has-answer ratio. The mean of the boolean mask is the fraction of True
    # values, irrespective of which of the two classes is the more frequent one.
    frac_has_answer = (first_answers != '').mean()
    print(f'{100 * frac_has_answer:.2f}% of the samples have an answer.')

    # Get answer-is-number ratio. `re.match` anchors at the start only, so this
    # counts answers that begin with at least one digit.
    num_is_number = first_answers.map(lambda text: re.match('[0-9]+', text) is not None).sum()
    print(f'{100 * num_is_number / num_samples:.2f}% of the answers are numbers.')

    # Get average/median translated context length
    translated_context_lengths = df.context.str.len()
    avg_translated_context_length = translated_context_lengths.mean()
    median_translated_context_length = translated_context_lengths.median()
    print(f'The average translated context has {avg_translated_context_length:,.0f} characters.')
    print(f'The median translated context has {median_translated_context_length:,.0f} characters.')

    # Get average/median original context length
    original_context_lengths = df.context_en.str.len()
    avg_original_context_length = original_context_lengths.mean()
    median_original_context_length = original_context_lengths.median()
    print(f'The average original context has {avg_original_context_length:,.0f} characters.')
    print(f'The median original context has {median_original_context_length:,.0f} characters.')

    # Get average/median answer length
    answer_lengths = first_answers.map(len)
    avg_answer_length = answer_lengths.mean()
    median_answer_length = answer_lengths.median()
    print(f'The mean answer has {avg_answer_length:,.0f} characters.')
    print(f'The median answer has {median_answer_length:,.0f} characters.')

    # Plot the distribution of translated context lengths on a log-scaled y-axis
    plt.hist(translated_context_lengths.tolist(), alpha=0.7, density=True, log=True)
    plt.title(f'Translated context lengths for {language}', fontsize=16)
    plt.xlabel('Number of characters')
    plt.ylabel('Density (log scale)')
    plt.show()

    return df

In [None]:
# Run the analysis for each of the three loaded datasets in turn
for language in ('da', 'sv', 'no'):
    analyse(language)

## Split data across languages

In [None]:
def get_all_ids(dataset_dict) -> set:
    """Collect the example IDs found in the 'train', 'val' and 'test' splits.

    Args:
        dataset_dict: Mapping from split name to a split whose 'example_id'
            entry is an iterable of IDs.

    Returns:
        The set of all example IDs across the three splits.
    """
    collected = set()
    for split_name in ('train', 'val', 'test'):
        collected.update(dataset_dict[split_name]['example_id'])
    return collected
# Example IDs of every split, keyed by language code
ids = {
    'da': get_all_ids(da),
    'sv': get_all_ids(sv),
    'no': get_all_ids(no),
}

In [None]:
# IDs that occur in the 'da' dataset but in neither 'sv' nor 'no'
unique_da_ids = np.array(list(ids['da'] - (ids['sv'] | ids['no'])))
unique_da_ids.size

In [None]:
# IDs that occur in the 'sv' dataset but in neither 'da' nor 'no'
unique_sv_ids = np.array(list(ids['sv'] - (ids['da'] | ids['no'])))
unique_sv_ids.size

In [None]:
# IDs that occur in the 'no' dataset but in neither 'da' nor 'sv'
unique_no_ids = np.array(list(ids['no'] - (ids['da'] | ids['sv'])))
unique_no_ids.size

In [None]:
# IDs present in all three datasets. They are sorted because set iteration order
# is arbitrary, and this array is later used as input to the random val/test
# split; a fixed order is needed for that split to be repeatable.
ids_in_common = np.array(sorted(ids['da'] & ids['sv'] & ids['no']))
ids_in_common.size

In [None]:
# Merge the three 'da' splits into a single frame indexed by example ID
all_da = (
    concatenate_datasets([da[split_name] for split_name in ('train', 'val', 'test')])
    .to_pandas()
    .set_index('example_id')
)

# A sample counts as answered when its first answer text is non-empty
all_da['has_answer'] = all_da['answers'].map(lambda answer: answer['text'][0] != "")

# Answered vs. unanswered counts among the IDs shared by all languages
all_da.loc[ids_in_common, 'has_answer'].value_counts()

In [None]:
# Merge the three 'sv' splits into a single frame indexed by example ID
all_sv = (
    concatenate_datasets([sv[split_name] for split_name in ('train', 'val', 'test')])
    .to_pandas()
    .set_index('example_id')
)

# A sample counts as answered when its first answer text is non-empty
all_sv['has_answer'] = all_sv['answers'].map(lambda answer: answer['text'][0] != "")

# Answered vs. unanswered counts among the IDs shared by all languages
all_sv.loc[ids_in_common, 'has_answer'].value_counts()

In [None]:
# Merge the three 'no' splits into a single frame indexed by example ID
all_no = (
    concatenate_datasets([no[split_name] for split_name in ('train', 'val', 'test')])
    .to_pandas()
    .set_index('example_id')
)

# A sample counts as answered when its first answer text is non-empty
all_no['has_answer'] = all_no['answers'].map(lambda answer: answer['text'][0] != "")

# Answered vs. unanswered counts among the IDs shared by all languages
all_no.loc[ids_in_common, 'has_answer'].value_counts()

In [None]:
# Fixed seed so that the validation/test selection is the same on every run
SPLIT_SEED = 4242

# Hold out 1,000 of the shared IDs, stratified on the `has_answer` flag of the
# 'da' frame, then divide them into 500 validation IDs and 500 test IDs using
# the same stratification.
_, val_test_idxs = train_test_split(
    ids_in_common,
    test_size=1000,
    stratify=all_da.loc[ids_in_common, 'has_answer'],
    random_state=SPLIT_SEED,
)
val_idxs, test_idxs = train_test_split(
    val_test_idxs,
    test_size=500,
    stratify=all_da.loc[val_test_idxs, 'has_answer'],
    random_state=SPLIT_SEED,
)
val_idxs.size, test_idxs.size

In [None]:
# Validation and test rows are selected directly by their example IDs
da_val = all_da.loc[val_idxs]
da_test = all_da.loc[test_idxs]

# Every ID that was not picked for validation or test belongs to training
da_train_idxs = set(all_da.index.tolist()).difference(val_idxs).difference(test_idxs)

# Select the training rows with a boolean mask: pandas does not accept a set as
# a `.loc` indexer, and the mask keeps the rows in their existing order.
da_train = all_da.loc[all_da.index.isin(da_train_idxs)]
len(da_train), len(da_val), len(da_test)