In [None]:
# Change the kernel's working directory to the parent directory.
# Presumably the notebook lives one level below the project root, so that the
# relative 'data/...' path used further down resolves — TODO confirm.
# NOTE(review): re-running this cell moves up one more level each time.
%cd ..

In [None]:
from pathlib import Path
import pandas as pd
import json
import luga
import re

In [None]:
# Relative location of the JSONL translation cache (resolved against the CWD).
path = Path('data', 'processed', 'translation_cache.jsonl')

In [None]:
# Read the JSONL cache: one JSON object per line.
# The encoding is pinned to UTF-8 so non-ASCII text is decoded the same way on
# every platform (the default for open() is locale-dependent), and blank lines
# (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads.
with path.open(encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame.from_records(records)
df.head()

In [None]:
# Row count per target language in the raw data.
df['target_lang'].value_counts()

## Deduplication

In [None]:
# Keep only the first row for every (English context, target language) pair
# and report how many rows were dropped.
num_samples = df.shape[0]
df_deduplicated = df.drop_duplicates(subset=['context_en', 'target_lang'])
num_removed = num_samples - df_deduplicated.shape[0]
print(f"Removed {num_removed:,} samples.")

In [None]:
# Row count per target language after deduplication.
df_deduplicated['target_lang'].value_counts()

## Check languages

In [None]:
# Work on an independent (deep) copy so that adding columns below does not
# touch the deduplicated frame.
df_with_languages = df_deduplicated.copy(deep=True)
def process_context(context: str) -> str:
    """Reduce a context to plain letters and spaces before language detection.

    The context is split on newlines. A line is kept only if it contains a run
    of at least five consecutive allowed characters; every other character is
    then removed from the kept lines. Allowed characters are ASCII letters, the
    Scandinavian letters Æ/Ø/Å/Ä/Ö in either case, and the space. The kept
    lines are joined with single spaces, the result is cut to its first 1000
    characters, and runs of spaces are collapsed to one space.

    The Swedish letters Ä/Ö are part of the allowed set so that Swedish text is
    not mangled (e.g. "för" -> "fr") before it is passed to the detector.

    Args:
        context: Raw, possibly multi-line text.

    Returns:
        The cleaned single-line text; an empty string if no line qualifies.
    """
    # One shared character set keeps the line filter and the character
    # stripping in agreement with each other.
    letters = "A-ZÆØÅÄÖa-zæøåäö "
    lines = [
        re.sub(f"[^{letters}]", "", line)
        for line in context.split('\n')
        if re.search(f"[{letters}]{{5,}}", line)
    ]
    # Truncate before collapsing spaces, as the original did.
    return re.sub(' +', ' ', ' '.join(lines)[:1000])
# Clean every context, then let luga predict one language label per text.
cleaned_contexts = df_with_languages['context'].map(process_context).tolist()
df_with_languages["predicted_language"] = luga.languages(
    cleaned_contexts,
    only_language=True,
    threshold=0.,
)
df_with_languages.head()

In [None]:
# Rows whose predicted language disagrees with the intended target language,
# followed by the number of such rows per target language.
df_wrong_languages = df_with_languages.query('target_lang != predicted_language')
df_wrong_languages['target_lang'].value_counts()

In [None]:
# Total number of rows with a mismatching prediction (summed over languages).
df_wrong_languages['target_lang'].value_counts().sum()

In [None]:
# Which languages were predicted for the mismatching rows with target "da".
df_wrong_languages.query('target_lang == "da"')['predicted_language'].value_counts()

In [None]:
# Which languages were predicted for the mismatching rows with target "sv".
df_wrong_languages.query('target_lang == "sv"')['predicted_language'].value_counts()

In [None]:
# Which languages were predicted for the mismatching rows with target "no".
df_wrong_languages.query('target_lang == "no"')['predicted_language'].value_counts()

In [None]:
# Draw one mismatching row at random (unseeded, so every run shows a different
# example) and print its labels together with the cleaned text.
wrong_sample = df_wrong_languages.sample(n=1)
sample_true_lang = wrong_sample['target_lang'].iloc[0]
sample_predicted_lang = wrong_sample['predicted_language'].iloc[0]
sample_text = wrong_sample['context'].map(process_context).iloc[0]
print(f'True language: {sample_true_lang}')
print(f'Predicted language: {sample_predicted_lang}', end='\n\n')
print(sample_text)