In [None]:
from datasets import load_dataset
import re
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name when the
# installed version provides it and fall back to the old one otherwise.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

## Load datasets

In [None]:
# Load the 'da' configuration; download_mode asks for a fresh download
# instead of reusing a previously cached copy.
da = load_dataset(
    'alexandrainst/scandiqa',
    'da',
    use_auth_token=True,
    download_mode="force_redownload",
)
da

In [None]:
# Load the 'sv' configuration; download_mode asks for a fresh download
# instead of reusing a previously cached copy.
sv = load_dataset(
    'alexandrainst/scandiqa',
    'sv',
    use_auth_token=True,
    download_mode="force_redownload",
)
sv

In [None]:
# Load the 'no' configuration; download_mode asks for a fresh download
# instead of reusing a previously cached copy.
no = load_dataset(
    'alexandrainst/scandiqa',
    'no',
    use_auth_token=True,
    download_mode="force_redownload",
)
no

## Analyse data

In [None]:
def analyse(language: str):
    """Print summary statistics and plot context lengths for a training split.

    The dataset is looked up by name among the notebook's global variables,
    so a global called `language` must already exist and its 'train' entry
    must offer a `to_pandas()` method. The resulting frame is read through
    its `answer`, `context` and `context_en` columns.

    Args:
        language:
            Name of the global variable holding the loaded dataset,
            e.g. 'da'.

    Returns:
        The training split as a pandas DataFrame. For an empty split only
        the sample preview and the sample count are shown.

    Raises:
        KeyError:
            If no global variable named `language` exists.
    """
    print(f'\n=== Analysing training split of {language} ===')
    df = globals()[language]['train'].to_pandas()

    # Show a sample
    display(df.head(3))

    # Count samples
    num_samples = len(df)
    print(f'There are {num_samples:,} samples.')

    # Every statistic below is undefined for an empty split (the ratios
    # would divide by zero), so stop here.
    if num_samples == 0:
        return df

    # Get has-answer ratio. The mean of the boolean mask is the share of True
    # values; the first entry of value_counts() would instead be the share of
    # whichever outcome is most common, which is wrong whenever samples
    # without an answer are the majority.
    frac_has_answer = df.answer.ne('').mean()
    print(f'{100 * frac_has_answer:.2f}% of the samples have an answer.')

    # Get answer-is-number ratio. fullmatch requires the whole answer to be
    # digits; re.match would also count answers that merely start with a
    # digit, such as '12 apples'. The ratio is taken over all samples.
    is_number = df.answer.map(lambda x: re.fullmatch('[0-9]+', x) is not None)
    num_is_number = int(is_number.sum())
    print(f'{100 * num_is_number / num_samples:.2f}% of the answers are numbers.')

    # Get average/median translated context length (computed once and reused
    # for the histogram below)
    translated_context_lengths = df.context.str.len()
    avg_translated_context_length = translated_context_lengths.mean()
    median_translated_context_length = translated_context_lengths.median()
    print(f'The average translated context has {avg_translated_context_length:,.0f} characters.')
    print(f'The median translated context has {median_translated_context_length:,.0f} characters.')

    # Get average/median original context length
    original_context_lengths = df.context_en.str.len()
    avg_original_context_length = original_context_lengths.mean()
    median_original_context_length = original_context_lengths.median()
    print(f'The average original context has {avg_original_context_length:,.0f} characters.')
    print(f'The median original context has {median_original_context_length:,.0f} characters.')

    # Get average/median answer length
    answer_lengths = df.answer.str.len()
    avg_answer_length = answer_lengths.mean()
    median_answer_length = answer_lengths.median()
    print(f'The mean answer has {avg_answer_length:,.0f} characters.')
    print(f'The median answer has {median_answer_length:,.0f} characters.')

    # Plot the distribution of translated context lengths on its own figure,
    # with a logarithmic count axis
    fig, ax = plt.subplots()
    ax.hist(translated_context_lengths.tolist(), alpha=0.7, density=True, log=True)
    ax.set_title(f'Translated context lengths for {language}', fontsize=16)
    ax.set_xlabel('Context length (characters)')
    ax.set_ylabel('Density (log scale)')
    plt.show()

    return df

In [None]:
# Produce the training-split report for each loaded language in turn; the
# returned frames are not kept.
for language in ('da', 'sv', 'no'):
    analyse(language)