# **Data augmentation in Test Set**

In [8]:
import os
import sys
import subprocess
import warnings; warnings.filterwarnings('ignore')

import pandas as pd

utils_path = os.path.join(os.getcwd(), '..', '..', '..')
if utils_path not in sys.path:
    sys.path.insert(0, utils_path)

from notebooks.src.utils import plots, constants

In [9]:
# Folder that is walked below to collect the decoded test-set translations.
TEST_OUTPUTS_DIR = os.path.join(
    constants.INSPECTED_DATA_FOLDER,
    'test_set',
)
# The Google translations sit in a subfolder of the same test-set folder.
GOOGLE_TRANSLATIONS_DIR = os.path.join(
    TEST_OUTPUTS_DIR,
    'google',
)

In [58]:
# Experiment dimensions. Order matters: these lists are searched in order when
# matching filenames, and they fix the key order of the nested results dict.
architectures = [
    'transformer',
    's2s',
]
directions = [
    'gn_es',
    'es_gn',
]
pretraining_options = [
    'pretraining',
    'without_pretraining',
]
hyperparameter_tuning_options = [
    'adjusted',
    'default',
]
corpus_names = [
    'grammar',
    'ancora',
    'bible',
    'merged',
]

In [63]:
# Placeholder corpus key for outputs of models that were not pretrained.
NOT_PRETRAINED_CORPUS = 'not_pretrained'


def _nested_slots(levels, leaf_keys):
    """Build a fresh nested dict with one level per list in `levels`.

    The innermost dicts map every key in `leaf_keys` to None.
    """
    if not levels:
        return {leaf_key: None for leaf_key in leaf_keys}
    current_level, *deeper_levels = levels
    return {key: _nested_slots(deeper_levels, leaf_keys) for key in current_level}


# model_output[architecture][direction][pretraining option][tuning option][corpus] -> filepath,
# with every slot initialised to None until a matching file is found.
model_output = _nested_slots(
    [architectures, directions, pretraining_options, hyperparameter_tuning_options],
    corpus_names + [NOT_PRETRAINED_CORPUS],
)

def get_substring(name: str, substrings: list[str]):
    """Return the first entry of `substrings` that occurs inside `name`.

    Candidates are tried in list order, so earlier entries win when several
    match. Returns None when no candidate is contained in `name`.
    """
    return next((candidate for candidate in substrings if candidate in name), None)

# Walk TEST_OUTPUTS_DIR/<pretraining option>/<tuning option>/<decoded file> and store each
# file path in its `model_output` slot. Entries that do not fit this layout are skipped
# instead of failing with NotADirectoryError or an opaque `KeyError: None`.
for pretraining_options_dirname in os.listdir(TEST_OUTPUTS_DIR):
    # TEST_OUTPUTS_DIR also holds entries that are not model outputs (the Google
    # translations live in its 'google' subfolder); only known options have slots.
    if pretraining_options_dirname not in pretraining_options:
        continue

    pretraining_options_dir = os.path.join(TEST_OUTPUTS_DIR, pretraining_options_dirname)
    if not os.path.isdir(pretraining_options_dir):
        continue

    for hyperparameter_tuning_options_dirname in os.listdir(pretraining_options_dir):
        if hyperparameter_tuning_options_dirname not in hyperparameter_tuning_options:
            continue

        hyperparameter_options_dir = os.path.join(pretraining_options_dir, hyperparameter_tuning_options_dirname)
        if not os.path.isdir(hyperparameter_options_dir):
            continue

        for output_filename in os.listdir(hyperparameter_options_dir):
            output_filepath = os.path.join(hyperparameter_options_dir, output_filename)
            file_architecture = get_substring(output_filename, architectures)
            file_direction = get_substring(output_filename, directions)
            # Outputs under 'without_pretraining' are filed under a placeholder corpus key
            # instead of parsing a corpus name from the filename.
            corpus_name = get_substring(output_filename, corpus_names) if pretraining_options_dirname != 'without_pretraining' else NOT_PRETRAINED_CORPUS

            # A None here means the filename matched none of the known values, so there is
            # no slot to store it in; report it rather than crash on the dict lookup.
            if None in (file_architecture, file_direction, corpus_name):
                print(f'Skipping unrecognized output file: {output_filepath}', end='\n\n')
                continue

            print(file_architecture, file_direction, corpus_name, pretraining_options_dirname, output_filename)
            print(output_filepath, end='\n\n')
            # NOTE(review): two files that map to the same slot silently overwrite each
            # other (last one listed wins) — confirm filenames are unique per slot.
            model_output[file_architecture][file_direction][pretraining_options_dirname][hyperparameter_tuning_options_dirname][corpus_name] = output_filepath


s2s es_gn ancora pretraining decoded_adjusted_ancora_es_gn_s2s.txt
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_ancora_es_gn_s2s.txt

transformer es_gn ancora pretraining decoded_adjusted_ancora_es_gn_transformer.txt
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_ancora_es_gn_transformer.txt

s2s gn_es ancora pretraining decoded_adjusted_ancora_gn_es_s2s.txt
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_ancora_gn_es_s2s.txt

transformer gn_es ancora pretraining decoded_adjusted_ancora_gn_es_transformer.txt
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_ancora_gn_es_transformer.txt

s2s es_gn bible pretraining decoded_adjusted_finetuned_bible_es_gn_s

In [65]:
# Flatten the nested `model_output` dict into one record per file that was found;
# slots still set to None (no file for that combination) are left out.
data_list = [
    {
        'architecture': arch_key,
        'pretraining_corpus': corpus_key,
        'source': direction_key.split('_')[0],
        'target': direction_key.split('_')[1],
        'pretraining_option': pretraining_key,
        'hyperparameter_tuning_option': tuning_key,
        'filepath': path,
    }
    for arch_key, by_direction in model_output.items()
    for direction_key, by_pretraining in by_direction.items()
    for pretraining_key, by_tuning in by_pretraining.items()
    for tuning_key, by_corpus in by_tuning.items()
    for corpus_key, path in by_corpus.items()
    if path is not None
]

df = pd.DataFrame(data_list)
df

Unnamed: 0,architecture,pretraining_corpus,source,target,pretraining_option,hyperparameter_tuning_option,filepath
0,transformer,grammar,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
1,transformer,ancora,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
2,transformer,bible,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
3,transformer,merged,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
4,transformer,grammar,gn,es,pretraining,default,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
5,transformer,ancora,gn,es,pretraining,default,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
6,transformer,bible,gn,es,pretraining,default,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
7,transformer,merged,gn,es,pretraining,default,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
8,transformer,not_pretrained,gn,es,without_pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...
9,transformer,not_pretrained,gn,es,without_pretraining,default,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...


In [72]:
# Path template of the reference translations; `{trg}` is filled with the target language.
real_test_set_template = os.path.join(
    constants.PROJECT_DIR, 'artifacts', 'data', 'test', 'test_{trg}.txt.{trg}'
)
# Script that is invoked as a subprocess to compute a single score.
scorer_script = os.path.join(
    constants.PROJECT_DIR, 'scripts', 'validate', 'score.py'
)
# Metric names passed to the scorer's `--score` option; also used as DataFrame column names.
score_types = [
    'sacrebleu_corpus_bleu',
    'sacrebleu_corpus_chrf',
]
# One independent list of scores per metric.
scores = {score_name: [] for score_name in score_types}

def get_score(filepath: str, target: str, metric: str):
    """Score one translation file against the reference test set.

    Runs `scorer_script` in a subprocess and parses its standard output as a float.

    Args:
        filepath: Path of the translation file to score.
        target: Target language code, used to fill `real_test_set_template`.
        metric: Value passed to the scorer's `--score` option.

    Returns:
        The score printed by the scorer, as a float.

    Raises:
        RuntimeError: If the scorer exits with a non-zero return code.
        ValueError: If the scorer's standard output is not a single number.
    """
    current_reference_file = real_test_set_template.format(trg=target)
    # Use the interpreter running this kernel rather than whatever `python` is on PATH,
    # so the scorer sees the same environment as the notebook.
    res = subprocess.run([sys.executable, scorer_script,
                          '--reference_file', current_reference_file,
                          '--translation_file', filepath,
                          '--score', metric],
                          capture_output=True)
    output = res.stdout.decode('utf-8')
    if res.returncode != 0:
        # Surface the scorer's own error instead of failing later on float('').
        error_output = res.stderr.decode('utf-8', errors='replace').strip()
        raise RuntimeError(
            f'Scorer failed with exit code {res.returncode} for {filepath} ({metric}): {error_output}'
        )
    try:
        return float(output)
    except ValueError as error:
        raise ValueError(
            f'Unexpected scorer output for {filepath} ({metric}): {output!r}'
        ) from error

In [73]:
# Score every collected output file with every metric and add one column per metric to `df`.
for metric in score_types:
    # Build the column from scratch on every run: appending to the existing
    # `scores[metric]` list would make it longer than `df` when this cell is re-run.
    metric_scores = []
    for _, row in df.iterrows():
        filepath = row['filepath']
        output = get_score(filepath, row['target'], metric)
        print(f'{os.path.basename(filepath)} {metric}: {output}')
        metric_scores.append(output)

    scores[metric] = metric_scores
    df[metric] = metric_scores

f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_finetuned_grammar_gn_es_transformer_from3.txt sacrebleu_corpus_bleu: 14.7929
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_ancora_gn_es_transformer.txt sacrebleu_corpus_bleu: 17.238
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_finetuned_bible_gn_es_transformer_from4.txt sacrebleu_corpus_bleu: 14.1128
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\adjusted\decoded_adjusted_pretraining_merged_gn_es_transformer.txt sacrebleu_corpus_bleu: 5.83534
f:\my_year2023\PLN\code\MARIAN\notebooks\src\utils\..\..\..\notebooks\data\inspected\test_set\pretraining\default\decoded_default_grammar__gn_es_transformer.txt sacrebleu_corpus_bleu: 7.372

In [74]:
# Score the Google translations for both directions with the same scorer and metrics.
google_gn_es_file = os.path.join(GOOGLE_TRANSLATIONS_DIR, 'test_translation_gn_es.txt')
google_es_gn_file = os.path.join(GOOGLE_TRANSLATIONS_DIR, 'test_translation_es_gn.txt')

google_gn_es_bleu = get_score(google_gn_es_file, 'es', 'sacrebleu_corpus_bleu')
google_gn_es_chrf = get_score(google_gn_es_file, 'es', 'sacrebleu_corpus_chrf')
google_es_gn_bleu = get_score(google_es_gn_file, 'gn', 'sacrebleu_corpus_bleu')
google_es_gn_chrf = get_score(google_es_gn_file, 'gn', 'sacrebleu_corpus_chrf')

In [75]:
# One row per direction for the Google translations. They have no pretraining corpus or
# filepath entry, so those columns end up as NaN after the concat.
google_df = pd.DataFrame([{'architecture': 'google',
                            'source': 'gn',
                            'target': 'es',
                            'pretraining_option': 'without_pretraining',
                            'hyperparameter_tuning_option': 'default',
                            'sacrebleu_corpus_bleu': google_gn_es_bleu,
                            'sacrebleu_corpus_chrf': google_gn_es_chrf}, 
                          {'architecture': 'google',
                           'source': 'es',
                           'target': 'gn',
                           'pretraining_option': 'without_pretraining',
                           'hyperparameter_tuning_option': 'default',
                           'sacrebleu_corpus_bleu': google_es_gn_bleu,
                           'sacrebleu_corpus_chrf': google_es_gn_chrf}])

# Drop Google rows left over from a previous run of this cell so re-running it does not
# duplicate them, and renumber the index so the appended rows do not reuse labels 0 and 1.
df = pd.concat([df[df['architecture'] != 'google'], google_df], axis=0, ignore_index=True)

In [76]:
# Rank the systems within each source language, best BLEU first (both keys descending).
df.sort_values(['source', 'sacrebleu_corpus_bleu'], ascending=[False, False])

Unnamed: 0,architecture,pretraining_corpus,source,target,pretraining_option,hyperparameter_tuning_option,filepath,sacrebleu_corpus_bleu,sacrebleu_corpus_chrf
0,google,,gn,es,without_pretraining,default,,26.9631,50.9526
21,s2s,ancora,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,26.8243,49.1144
23,s2s,merged,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,26.2075,49.0953
22,s2s,bible,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,25.9919,47.885
20,s2s,grammar,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,25.6581,47.4604
28,s2s,not_pretrained,gn,es,without_pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,25.5384,47.2416
1,transformer,ancora,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,17.238,42.4477
25,s2s,ancora,gn,es,pretraining,default,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,15.2425,37.086
0,transformer,grammar,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,14.7929,39.3789
2,transformer,bible,gn,es,pretraining,adjusted,f:\my_year2023\PLN\code\MARIAN\notebooks\src\u...,14.1128,37.9571


In [77]:
# Row count of the combined table (model outputs plus the two Google rows).
df.shape[0]

42