# **Dataset Quality**


## **Libraries**

In [1]:
import os
import re
import warnings

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

import constants


## **Directories**

In [2]:
CORPORA_FOLDER = constants.CORPORA_FOLDER
DATASET_DIR = os.path.join(CORPORA_FOLDER, 'jojajovai_all.csv')

In [3]:
dataset = pd.read_csv(DATASET_DIR, sep=',', encoding='utf-8')


## **Corpus**

In [4]:
dataset.head()

Unnamed: 0,split,source,gn,es,tokens_gn,tokens_es
0,train,abc,"Itaugua omokyre'ÿ ""omopotî"" Congreso","En Itauguá promueven ""limpiar"" el Congreso",Itaugua omokyre'ÿ `` omopotî '' Congreso,En Itauguá promueven `` limpiar '' el Congreso
1,train,abc,"Omopotîvo hikuái tetãme vicio política, ko'ã i...",Con el propósito de limpiar al país de los vic...,"Omopotîvo hikuái tetãme vicio política , ko'ã ...",Con el propósito de limpiar al país de los vic...
2,train,abc,Ko'ã 50 tapicha oñembyaty parroquía Virgen del...,Unas 50 personas se encuentran en la plazoleta...,Ko'ã 50 tapicha oñembyaty parroquía Virgen del...,Unas 50 personas se encuentran en la plazoleta...
3,train,abc,Itaugüeño oipotáva ohechauka ipotîha itáva ha ...,Los itaugüeños quieren demostrar que los parag...,Itaugüeño oipotáva ohechauka ipotîha itáva ha ...,Los itaugüeños quieren demostrar que los parag...
4,train,abc,Opavave tavaygua ojerúre senador Oscar Gonzál...,"Personas de todas las edades, pero en su mayor...",Opavave tavaygua ojerúre senador Oscar Gonzále...,"Personas de todas las edades , pero en su mayo..."


In [5]:
SPLIT_COLUMN = 'split'
SOURCE_COLUMN = 'source'
LANGUAGE_SOURCE_COLUMN = 'gn'
LANGUAGE_TARGET_COLUMN = 'es'

In [6]:
train_dataset = dataset[dataset[SPLIT_COLUMN] == 'train']

In [7]:
len(train_dataset)

20207

## **Quality metrics**

- the minimum and maximum length of segments
- the ratio between the source and target lengths.
- the ratio between alphabet to non-alphabet characters
- the ratio between alphabet to non-alphabet words

Source: https://marian-nmt.github.io/examples/training-overview

In [8]:
def sentence_word_length(sentence: str):
    return len(sentence.split())

def sentence_character_length(sentence: str):
    return len(sentence)

def sentences_word_length_ratio(sentence1: str, sentence2: str):
    return sentence_word_length(sentence1) / sentence_word_length(sentence2)

def sentences_character_length_ratio(sentence1: str, sentence2: str):
    return sentence_character_length(sentence1) / sentence_character_length(sentence2)

def sentence_alphabet_character_ratio(sentence: str):
    non_alphabet_characters = re.sub('[a-zA-Z]+', '', sentence)
    non_alphabet_characters = re.sub(' ( )+', '', non_alphabet_characters)
    non_alphabet_characters = non_alphabet_characters.strip()
    return len(non_alphabet_characters) / len(sentence)

def sentence_alphabet_words_ratio(sentence: str):
    words = sentence.split()
    non_alphabet_words = list(filter(lambda word: not word.isalpha(), words))
    non_alphabet_words = ' '.join(non_alphabet_words)
    non_alphabet_words = re.sub(' ( )+', '', non_alphabet_words)
    non_alphabet_words = non_alphabet_words.strip()
    return len(non_alphabet_words.split()) / len(sentence.split())

In [9]:
def create_quality_metrics_df(df: pd.DataFrame) -> pd.DataFrame:
    warnings.filterwarnings('ignore')
    df['source_sentence_word_length'] = df[LANGUAGE_SOURCE_COLUMN].apply(sentence_word_length)
    df['source_sentence_character_length'] = df[LANGUAGE_SOURCE_COLUMN].apply(sentence_character_length)
    df['source_sentence_alphabet_character_ratio'] = df[LANGUAGE_SOURCE_COLUMN].apply(sentence_alphabet_character_ratio)
    df['source_sentence_alphabet_words_ratio'] = df[LANGUAGE_SOURCE_COLUMN].apply(sentence_alphabet_words_ratio)
    df['target_sentence_word_length'] = df[LANGUAGE_TARGET_COLUMN].apply(sentence_word_length)
    df['target_sentence_character_length'] = df[LANGUAGE_TARGET_COLUMN].apply(sentence_character_length)
    df['target_sentence_alphabet_character_ratio'] = df[LANGUAGE_TARGET_COLUMN].apply(sentence_alphabet_character_ratio)
    df['target_sentence_alphabet_words_ratio'] = df[LANGUAGE_TARGET_COLUMN].apply(sentence_alphabet_words_ratio)
    df['sentence_word_length_ratio'] = df.apply(lambda row: sentences_word_length_ratio(row[LANGUAGE_SOURCE_COLUMN], row[LANGUAGE_TARGET_COLUMN]), axis=1)
    df['sentence_character_length_ratio'] = df.apply(lambda row: sentences_character_length_ratio(row[LANGUAGE_SOURCE_COLUMN], row[LANGUAGE_TARGET_COLUMN]), axis=1)
    warnings.filterwarnings('default')
    return df

In [10]:
quality_metrics_df = create_quality_metrics_df(train_dataset)

In [11]:
quality_metrics_df.tail()

Unnamed: 0,split,source,gn,es,tokens_gn,tokens_es,source_sentence_word_length,source_sentence_character_length,source_sentence_alphabet_character_ratio,source_sentence_alphabet_words_ratio,target_sentence_word_length,target_sentence_character_length,target_sentence_alphabet_character_ratio,target_sentence_alphabet_words_ratio,sentence_word_length_ratio,sentence_character_length_ratio
20202,train,spl,Ko cursillo reieténte ojejapokuaa ha oikóta 18...,El cursillo es totalmente gratuito y se desarr...,Ko cursillo reieténte ojejapokuaa ha oikóta 18...,El cursillo es totalmente gratuito y se desarr...,20,137,0.145985,0.3,28,160,0.0875,0.142857,0.714286,0.85625
20203,train,spl,"Upe rire, oñepyrũta tekombo’e ojeikévo jasyrun...",Las clases iniciarán en la primera semana de a...,"Upe rire , oñepyrũta tekombo'e ojeikévo jasyru...",Las clases iniciarán en la primera semana de a...,18,131,0.251908,0.388889,20,110,0.127273,0.2,0.9,1.190909
20204,train,spl,Reñemomaranduvekuaa ko’ápe (0981) 463-338.,Para mayor información llamar al (0981) 463-338.,Reñemomaranduvekuaa ko'ápe ( 0981 ) 463-338 .,Para mayor información llamar al ( 0981 ) 463-...,4,42,0.47619,0.75,7,48,0.333333,0.285714,0.571429,0.875
20205,train,spl,Ko guarani ñe’ẽme jehekombo’e rupive hi’ã temi...,La carrera en Lengua Guaraní tiene por objetiv...,Ko guarani ñe'ẽme jehekombo'e rupive hi'ã temi...,La carrera en Lengua Guaraní tiene por objetiv...,24,190,0.184211,0.5,33,196,0.035714,0.121212,0.727273,0.969388
20206,train,spl,Umi oikeséva oñemoaradu oiporukuaava’erã vaiva...,Los postulantes deberán demostrar un manejo bá...,Umi oikeséva oñemoaradu oiporukuaava'erã vaiva...,Los postulantes deberán demostrar un manejo bá...,7,61,0.114754,0.285714,9,63,0.063492,0.111111,0.777778,0.968254


In [16]:
quality_metrics_df[quality_metrics_df['sentence_word_length_ratio'] > 1.5][[LANGUAGE_SOURCE_COLUMN, LANGUAGE_TARGET_COLUMN, 'sentence_word_length_ratio']].sort_values(by='sentence_word_length_ratio', ascending=False)

Unnamed: 0,gn,es,sentence_word_length_ratio
19709,"Estigarribia Tavao kotýpe, ypykuéra Lumnanas-p...",Estigarribia.,8.000000
14515,Ñeꞌẽveve og̃uahẽva ojejerureꞌỹ rehe,Spam,4.000000
16673,ñemitỹ ha mymbakuéra rehegua,agropecuario,4.000000
16708,pysyrõ (oñepysyrõ se defiende),defender,4.000000
20084,"Upéicha, opa umi 100 ñe’ẽme oñembohasámava Moz...",299 nuevas traducciones en el último mes repor...,3.000000
...,...,...,...
1440,Temporal ange pyahre oityvyróva heta teda rupi...,El temporal de anoche azotó con furia varias c...,1.583333
10306,Ko cauce hídrico oime 5 kilómetro ko távagui h...,Dicho cauce hídrico se encuentra a unos 5 kiló...,1.583333
15169,"Para unirse a la diversión, solo Registrese pa...","Ejoaju hağua, ehaínte nde rerañemi eñemboaty h...",1.571429
12197,Ko mba’e ikatu jahechakuaa ko Arapokôindy Mara...,En esta última Semana Santa todos los medios d...,1.533333


In [12]:
quality_metrics_df.describe()

Unnamed: 0,source_sentence_word_length,source_sentence_character_length,source_sentence_alphabet_character_ratio,source_sentence_alphabet_words_ratio,target_sentence_word_length,target_sentence_character_length,target_sentence_alphabet_character_ratio,target_sentence_alphabet_words_ratio,sentence_word_length_ratio,sentence_character_length_ratio
count,20207.0,20207.0,20207.0,20207.0,20207.0,20207.0,20207.0,20207.0,20207.0,20207.0
mean,15.337259,118.303657,0.146058,0.279846,22.746029,141.127431,0.058637,0.12605,0.724802,0.892976
std,10.926488,84.373576,0.070045,0.176456,16.214508,101.808352,0.045188,0.111748,0.22436,0.256727
min,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.122449,0.159509
25%,7.0,51.0,0.100335,0.176471,9.0,56.0,0.032258,0.0625,0.6,0.762556
50%,14.0,104.0,0.138889,0.266667,21.0,126.0,0.050388,0.111111,0.690476,0.857143
75%,22.0,169.0,0.183673,0.368421,33.0,205.0,0.075,0.166667,0.814815,0.977142
max,155.0,1172.0,1.0,1.0,214.0,1303.0,0.666667,1.0,8.0,8.75
