In [1]:
# Initialization cell: location of the fairseq output file that this notebook
# loads with preprocess.loadFairseqOutput.
# The location can be overridden without editing the notebook by setting the
# FAIRSEQ_OUTPUT_PATH environment variable. When the variable is unset, the
# original machine-specific path is used, so existing runs are unaffected.
import os

path = os.environ.get(
    'FAIRSEQ_OUTPUT_PATH',
    'C:\\Users\\under\\Jupyter-Projects\\My-Research\\On-the-Curse-of-Sentence-Length\\data-bin\\out_balanced_transformer.txt',
)

In [2]:
from utils import plots
from utils import preprocess
from utils import metrics as m
from utils import splits

In [3]:
# Load the fairseq output file found at `path`. The binning cell and the two
# calls below all consume `output`.
output = preprocess.loadFairseqOutput(path)
# Factors derived from `output`. Each entry's first element is its label
# (see the index/label listing printed further down); the plot cells select
# one entry by index.
factors = plots.get_factors(output)
# Metrics computed from the same data; later passed to the plots as
# `metrics_scatter`.
# NOTE(review): the recorded output reports about 4 min for edit distance and
# about 8 min for BLEU, so this cell is slow — presumably that time is spent in
# get_metrics. n_gram=4 is assumed to be the BLEU n-gram order; confirm in
# utils.plots.
metrics = plots.get_metrics(data=output, n_gram=4)

Edit Distance takes: 3 min 50 sec
BLEU score takes: 7 min 46 sec


In [4]:
# bin by source sentence length
bins_src_len = splits.merge_bin(bins=splits.splitDatabySentenceLength(triples=output, tick=10, choice='src'),
                                target_bin=14)
# bin by reference sentence length
bins_ref_len = splits.merge_bin(bins=splits.splitDatabySentenceLength(triples=output, tick=10, choice='ref'),
                                target_bin=14)
# bin by # unks in source
bins_src_num_unk = splits.merge_bin(bins=splits.splitDatabyNumberOfUnknowns(triples=output, tick=5, choice='src'), 
                                    target_bin=6)
# bin by # unks in reference
bins_ref_num_unk = splits.merge_bin(bins=splits.splitDatabyNumberOfUnknowns(triples=output, tick=5, choice='ref'), 
                                    target_bin=7)
# bin by % unks in source
bins_src_frac_unk = splits.merge_bin(bins=splits.splitDatabyFractionOfUnknowns(triples=output, tick=1, choice='src'), 
                                    target_bin=5)
# bin by % unks in reference
bins_ref_frac_unk = splits.merge_bin(bins=splits.splitDatabyFractionOfUnknowns(triples=output, tick=1, choice='ref'), 
                                    target_bin=5)

# Source Length

In [5]:
# Averages over the source-length bins, using the same tick (10) the bins were
# built with.
(NUM_DATA, KEYS,
 BLEU, BLEU_pre, BLEU_bp,
 EDIT, EDIT_N) = plots.calcAverage(
    data=bins_src_len,
    edit_dist=m.edit_distance_by_word,
    tick=10,
)
# Shift every bin key by one tick (10); the result is handed to the following
# plot cell as `factor_line`.
keys = list(map(lambda bin_key: int(bin_key) + 10, KEYS))

In [1]:
# Plot `metrics` against factor 0, which the factor listing printed in this
# notebook labels "Source Length", together with the binned averages.
# The averages and `keys` are notebook globals that every calcAverage cell
# overwrites, so run the source-length calcAverage cell right before this one.
averaged_series = [EDIT, EDIT_N, BLEU_pre, BLEU_bp, BLEU]
plots.plot_scatter_line(
    metrics_scatter=metrics,
    factor_scatter=factors[0],
    metrics_line=averaged_series,
    factor_line=keys,
    xlim=250,
    remove_zero=True,
)

# Reference Length

In [13]:
# Print each factor's position next to its label (the first element of the
# entry), so the indices used in the plot cells can be looked up.
for position, factor in enumerate(factors):
    print(f'{position} - {factor[0]!s}')

0 - Source Length
1 - # Unknowns in Source
2 - % Unknowns in Source
3 - Refenrence Length
4 - # Unknowns in Reference
5 - % Unknowns in Reference


In [7]:
# Averages over the reference-length bins, using the same tick (10) the bins
# were built with.
(NUM_DATA, KEYS,
 BLEU, BLEU_pre, BLEU_bp,
 EDIT, EDIT_N) = plots.calcAverage(
    data=bins_ref_len,
    edit_dist=m.edit_distance_by_word,
    tick=10,
)
# Shift every bin key by one tick (10); the result is handed to the following
# plot cell as `factor_line`.
keys = list(map(lambda bin_key: int(bin_key) + 10, KEYS))

In [2]:
# Plot `metrics` against factor 3, which the factor listing printed in this
# notebook labels "Refenrence Length" [sic], together with the binned averages.
# The averages and `keys` are notebook globals that every calcAverage cell
# overwrites, so run the reference-length calcAverage cell right before this one.
averaged_series = [EDIT, EDIT_N, BLEU_pre, BLEU_bp, BLEU]
plots.plot_scatter_line(
    metrics_scatter=metrics,
    factor_scatter=factors[3],
    metrics_line=averaged_series,
    factor_line=keys,
    xlim=250,
    remove_zero=True,
)

# Number of Unknowns in Source

In [11]:
# Averages over the bins keyed by number of unknowns in the source, using the
# same tick (5) the bins were built with.
(NUM_DATA, KEYS,
 BLEU, BLEU_pre, BLEU_bp,
 EDIT, EDIT_N) = plots.calcAverage(
    data=bins_src_num_unk,
    edit_dist=m.edit_distance_by_word,
    tick=5,
)
# Shift every bin key by one tick (5); the result is handed to the following
# plot cell as `factor_line`.
keys = list(map(lambda bin_key: int(bin_key) + 5, KEYS))

In [3]:
# Plot `metrics` against factor 1, which the factor listing printed in this
# notebook labels "# Unknowns in Source", together with the binned averages.
# The averages and `keys` are notebook globals that every calcAverage cell
# overwrites, so run the matching calcAverage cell right before this one.
averaged_series = [EDIT, EDIT_N, BLEU_pre, BLEU_bp, BLEU]
plots.plot_scatter_line(
    metrics_scatter=metrics,
    factor_scatter=factors[1],
    metrics_line=averaged_series,
    factor_line=keys,
    xlim=100,
    remove_zero=True,
)

# Number of Unknowns in Reference

In [16]:
# Averages over the bins keyed by number of unknowns in the reference, using
# the same tick (5) the bins were built with.
(NUM_DATA, KEYS,
 BLEU, BLEU_pre, BLEU_bp,
 EDIT, EDIT_N) = plots.calcAverage(
    data=bins_ref_num_unk,
    edit_dist=m.edit_distance_by_word,
    tick=5,
)
# Shift every bin key by one tick (5); the result is handed to the following
# plot cell as `factor_line`.
keys = list(map(lambda bin_key: int(bin_key) + 5, KEYS))

In [4]:
# Plot `metrics` against factor 4, which the factor listing printed in this
# notebook labels "# Unknowns in Reference", together with the binned averages.
# The averages and `keys` are notebook globals that every calcAverage cell
# overwrites, so run the matching calcAverage cell right before this one.
averaged_series = [EDIT, EDIT_N, BLEU_pre, BLEU_bp, BLEU]
plots.plot_scatter_line(
    metrics_scatter=metrics,
    factor_scatter=factors[4],
    metrics_line=averaged_series,
    factor_line=keys,
    xlim=100,
    remove_zero=True,
)

# Fraction of Unknowns in Source

In [28]:
# Averages over the bins keyed by fraction of unknowns in the source, using the
# same tick (1) the bins were built with.
(NUM_DATA, KEYS,
 BLEU, BLEU_pre, BLEU_bp,
 EDIT, EDIT_N) = plots.calcAverage(
    data=bins_src_frac_unk,
    edit_dist=m.edit_distance_by_word,
    tick=1,
)
# Map each bin key to key * 0.1 + 0.1, rounded to one decimal; the result is
# handed to the following plot cell as `factor_line`.
keys = list(map(lambda bin_key: round(float(bin_key) * 0.1 + 0.1, 1), KEYS))

In [5]:
# Plot `metrics` against factor 2, which the factor listing printed in this
# notebook labels "% Unknowns in Source", together with the binned averages.
# The averages and `keys` are notebook globals that every calcAverage cell
# overwrites, so run the matching calcAverage cell right before this one.
averaged_series = [EDIT, EDIT_N, BLEU_pre, BLEU_bp, BLEU]
plots.plot_scatter_line(
    metrics_scatter=metrics,
    factor_scatter=factors[2],
    metrics_line=averaged_series,
    factor_line=keys,
    xlim=1.0,
    remove_zero=True,
)

# Fraction of Unknowns in Reference

In [31]:
# Averages over the bins keyed by fraction of unknowns in the reference, using
# the same tick (1) the bins were built with.
(NUM_DATA, KEYS,
 BLEU, BLEU_pre, BLEU_bp,
 EDIT, EDIT_N) = plots.calcAverage(
    data=bins_ref_frac_unk,
    edit_dist=m.edit_distance_by_word,
    tick=1,
)
# Map each bin key to key * 0.1 + 0.1, rounded to one decimal; the result is
# handed to the following plot cell as `factor_line`.
keys = list(map(lambda bin_key: round(float(bin_key) * 0.1 + 0.1, 1), KEYS))

In [6]:
# Plot `metrics` against factor 5, which the factor listing printed in this
# notebook labels "% Unknowns in Reference", together with the binned averages.
# The averages and `keys` are notebook globals that every calcAverage cell
# overwrites, so run the matching calcAverage cell right before this one.
averaged_series = [EDIT, EDIT_N, BLEU_pre, BLEU_bp, BLEU]
plots.plot_scatter_line(
    metrics_scatter=metrics,
    factor_scatter=factors[5],
    metrics_line=averaged_series,
    factor_line=keys,
    xlim=1.0,
    remove_zero=True,
)