In [None]:
#all_slow

In [None]:
from fastai.text.all import *
from reformer_fastai.all import *

In [None]:
#hide
def print_eval(learn, res):
    "Print `learn`'s metric names next to their values taken from `res[1:]` (the first entry of `res` is skipped)."
    formatted = []
    # Pair each metric with its value; zip stops at the shorter of the two sequences.
    for metric, value in zip(learn.metrics, res[1:]):
        formatted.append(f'{metric.name}: {value:.2f}')
    summary = '; '.join(formatted)
    # Header and summary are separated by a newline instead of the default space.
    print('Evaluation results: ', summary, sep='\n')

# Evaluation of saved models

## Evaluation on synthetic task

See https://arampacha.github.io/reformer_fastai/experiment.synthetic-task.html for dataset and training details.

In [None]:
from reformer_fastai.expscript import get_twin_sequence_dataloaders, get_synthetic_learner

In [None]:
import wandb
# Start a wandb run; `run` is reused further down when the enwik8 model artifact is fetched.
run = wandb.init()
# Fetch the saved synthetic-task model artifact and download it under ./models.
artifact = run.use_artifact('fastai_community/reformer-fastai/run-wvb5g2re-model:v0', type='model')
artifact_dir = artifact.download(root='./models')

In [None]:
# Build the synthetic-task dataloaders, an LSHLM model and a learner around them.
dls = get_twin_sequence_dataloaders()
# NOTE(review): use_lsh=False presumably switches LSH attention off for this model — confirm in SyntheticConfig.
config = SyntheticConfig(use_lsh=False)
model = LSHLM.from_config(config)
learn = get_synthetic_learner(dls, model, precision=0)
# Trailing `;` suppresses the cell output of add_cb.
learn.add_cb(MaskTargCallback());

In [None]:
# Test dataloader over a newly constructed DeterministicTwinSequence dataset.
# NOTE(review): the arguments (1024, 1000) are presumably sequence length and number of samples — confirm.
test_dl = learn.dls.test_dl(DeterministicTwinSequence(1024, 1000))

Score of a randomly initialized model, as a sanity check:

In [None]:
# Evaluate the model before any weights are loaded; print_eval skips res[0] and reports the metrics.
res = learn.validate(dl=test_dl)
print_eval(learn, res)

Evaluation results: 
masked_accuracy: 0.01


After loading the pretrained weights, the model achieves perfect accuracy on unseen data, as expected:

In [None]:
# Load the saved weights only (with_opt=False skips the optimizer state), then re-evaluate on the same test_dl.
# NOTE(review): presumably resolves to the file downloaded into ./models by the artifact cell — confirm learn's model dir.
learn.load('run-wvb5g2re-model', with_opt=False);
res = learn.validate(dl=test_dl)
print_eval(learn, res)

Evaluation results: 
masked_accuracy: 1.00


## Evaluation on enwik8 test set

In [None]:
from reformer_fastai.expscript import get_lm_learner

In [None]:
# Download and extract the enwik8 archive with dest='./data'.
# NOTE(review): `path` is not used in the visible cells; the following cell reads the hard-coded 'data/enwik8'.
path = untar_data('http://mattmahoney.net/dc/enwik8.zip', dest='./data')

In [None]:
# One dataframe row per item returned by read_lines for the enwik8 file.
df = pd.DataFrame({'text':read_lines('data/enwik8')})

# Tokenize every row with the byte-level tokenizer (no BOS/EOS tokens added).
btt = ByteTextTokenizer(is_lm=True, add_bos=False, add_eos=False)
df['toks'] = df['text'].apply(btt)
# Per-row token count and its running total; the running total is used to place the train cutoff.
df['lens'] = df['toks'].apply(len)
df['lens_cum_sum'] = df.lens.cumsum()

In [None]:
# Contiguous train / validation / test split by row index: everything whose cumulative
# length stays below the cutoff is a training candidate, the tail is halved into
# validation and test.
train_cutoff = df.lens.sum() - 10_000_000  # 10M characters for val and test
# Exclusive end of the training range: the last row below the cutoff itself is left out of train.
train_end = max(df.loc[df['lens_cum_sum'] < train_cutoff].index.values)
train_idxs = list(range(0, train_end))

remaining_idxs = len(df) - max(train_idxs)
# First test row. This reproduces the previous boundary, so train_idxs and test_idxs
# are unchanged by the fix below.
test_start = max(train_idxs) + int(remaining_idxs/2) - 1
# Validation used to start at max(train_idxs) and end at test_start inclusive, so its
# first row was also the last training row and its last row was also the first test row.
# It now covers only the rows strictly between the two, making the three splits disjoint.
validation_idxs = list(range(train_end, test_start))
test_idxs = list(range(test_start, len(df)))

splits = [train_idxs, validation_idxs]

# Item pipeline: take the `text` attribute of a row, then tokenize it with btt.
tfms = [attrgetter("text"), btt]
dsets = Datasets(df, [tfms], splits=splits, dl_type=LMDataLoader)
# Precomputed token lengths for the train and validation rows, one kwargs dict per split.
dl_kwargs = [{'lens':df['lens'].values[train_idxs]},
             {'val_lens':df['lens'].values[validation_idxs]}]
# Use one dataloader worker per CPU reported by the OS.
n_cpus = multiprocessing.cpu_count()
dls = dsets.dataloaders(bs=8, val_bs=24, seq_len=4096, dl_kwargs=dl_kwargs, shuffle_train=True, n_workers=n_cpus)

In [None]:
# Model config with 2 hashing rounds; the padding index comes from the dataloaders' tokenizer.
config = NHashesConfig(n_hashes=2, pad_idx=dls.byte_text_tokenizer.pad_token_id)
model = LSHLM.from_config(config)
learn = get_lm_learner(dls, model, opt_func=adafactor, precision=2)
# Pad inputs and targets with pad_idx, using the config's bucket_size.
# NOTE(review): presumably pads sequence length to a multiple of bucket_size — confirm in PadBatchCallback.
learn.add_cb(PadBatchCallback(bucket_size=config.bucket_size, val=config.pad_idx, y_val=config.pad_idx));

In [None]:
# Test dataloader over the held-out test rows (positional selection with test_idxs, all columns).
test_dl = learn.dls.test_dl(df.iloc[test_idxs, :])

In [None]:
# Display up to two samples from the test dataloader as a visual check of inputs and targets.
test_dl.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"Indiana Jones, &quot;Obtainer of Rare [[Antiquities]],&quot; is modeled after the strong-jawed heroes of the matinee [[serial]]s and [[pulp magazine]]s that Lucas and Spielberg enjoyed in their childhoods, such as the [[Republic Pictures]] [[serial]]s, and [[Doc Savage]]. The two friends first discussed the project while in [[Hawaii]] during the time of release of the first ''[[Star Wars]]'' film. Spielberg told Lucas how he wanted to direct a [[James Bond]] film. Lucas responded that he had something better than that.\n\nSpielberg wanted Indiana to be a James Bond-like figure that got into difficult situations and worked his way out. Upon requests by Spielberg and Lucas the costume designer was given the task to make the character have a distinctive recognizable silhouette through the style of the hat (much like [[Dick Tracy]]). After examining many hats, the designers chose an urban version of the classic [[Australia]]n [[fedora (hat)|fedora]], the [[Akubra]]. The","ndiana Jones, &quot;Obtainer of Rare [[Antiquities]],&quot; is modeled after the strong-jawed heroes of the matinee [[serial]]s and [[pulp magazine]]s that Lucas and Spielberg enjoyed in their childhoods, such as the [[Republic Pictures]] [[serial]]s, and [[Doc Savage]]. The two friends first discussed the project while in [[Hawaii]] during the time of release of the first ''[[Star Wars]]'' film. Spielberg told Lucas how he wanted to direct a [[James Bond]] film. Lucas responded that he had something better than that.\n\nSpielberg wanted Indiana to be a James Bond-like figure that got into difficult situations and worked his way out. Upon requests by Spielberg and Lucas the costume designer was given the task to make the character have a distinctive recognizable silhouette through the style of the hat (much like [[Dick Tracy]]). After examining many hats, the designers chose an urban version of the classic [[Australia]]n [[fedora (hat)|fedora]], the [[Akubra]]. The"
1,"eaders at the state level continue to emphasize the state's past economic base of manufacturing and farming.\n\n== Military installations ==\nIndiana was formerly home to two major military installations, [[Grissom Air Force Base]] near Peru (reduced to reservist operations in 1994) and [[Fort Benjamin Harrison]] near Indianapolis, now largely reduced to reservist operations, though the [[Department of Defense]] continues to operate a large financial operation there.\n\nCurrent active installations include [[Air National Guard]] fighter units at [[Fort Wayne, Indiana|Fort Wayne]] and [[Terre Haute]] airports (to be consolidated at Fort Wayne under the 2005 BRAC proposal, with the Terre Haute facility remaining open as a non-flying installation), the [[Crane Naval Weapons Center]] in the southwest of the state and the Army's [[Newport Chemical Depot]], which is currently heavily involved in neutralizing dangerous chemical weapons stored there.\n\n== Demographics ==\n{| class=&quot;toccolours&quot; align=&quot;right&quot; cellpadding=&quot;4&quot; cellspacing=&quot;0&quot; style=&quot;margin:0 0 1em 1em; font-size: 95%;&quot;\n|-\n! 
colspan=2 bgcolor=&quot;#ccccff&quot; align=&quot;center&quot;| Historical populations\n|-\n!","aders at the state level continue to emphasize the state's past economic base of manufacturing and farming.\n\n== Military installations ==\nIndiana was formerly home to two major military installations, [[Grissom Air Force Base]] near Peru (reduced to reservist operations in 1994) and [[Fort Benjamin Harrison]] near Indianapolis, now largely reduced to reservist operations, though the [[Department of Defense]] continues to operate a large financial operation there.\n\nCurrent active installations include [[Air National Guard]] fighter units at [[Fort Wayne, Indiana|Fort Wayne]] and [[Terre Haute]] airports (to be consolidated at Fort Wayne under the 2005 BRAC proposal, with the Terre Haute facility remaining open as a non-flying installation), the [[Crane Naval Weapons Center]] in the southwest of the state and the Army's [[Newport Chemical Depot]], which is currently heavily involved in neutralizing dangerous chemical weapons stored there.\n\n== Demographics ==\n{| class=&quot;toccolours&quot; align=&quot;right&quot; cellpadding=&quot;4&quot; cellspacing=&quot;0&quot; style=&quot;margin:0 0 1em 1em; font-size: 95%;&quot;\n|-\n! colspan=2 bgcolor=&quot;#ccccff&quot; align=&quot;center&quot;| Historical populations\n|-\n!"


In [None]:
# Baseline: evaluate on the test set before the pretrained checkpoint is loaded.
res = learn.validate(dl=test_dl)
print_eval(learn, res)

Evaluation results: 
accuracy: 0.05; perplexity: 22361.88; bpc: 14.45


In [None]:
# This cell reuses the wandb `run` created in the synthetic-task section of this notebook.
# Uncomment the two lines below to run this section on its own in a fresh kernel.
# import wandb
# run = wandb.init()
# Fetch the saved enwik8 model artifact and download it under ./models.
artifact = run.use_artifact('fastai_community/reformer-fastai/3tbfvs77:v0', type='model')
artifact_dir = artifact.download(root='./models')

In [None]:
# Load the saved weights only (with_opt=False skips the optimizer state).
# NOTE(review): the checkpoint name says n_hashes-4 while the config in this notebook uses n_hashes=2 — confirm this is intended.
learn.load('n_hashes_n_hashes-4_enwik8_sl-4096_bs-4_n_eps-10_seed-42_grad-accum-8__24_01_2021_15:12', with_opt=False);

In [None]:
# Evaluate on the test set with the pretrained weights loaded.
res = learn.validate(dl=test_dl)
print_eval(learn, res)

Evaluation results: 
accuracy: 0.70; perplexity: 2.76; bpc: 1.46


It's possible to improve performance by increasing the number of hashing rounds. In this case one can observe a small improvement in perplexity and BPC:

In [None]:
# Set the number of hashing rounds to 8 on the already-loaded model (no reload), then re-evaluate.
# This mutates learn.model, so any later evaluation in this kernel also uses n_hashes=8.
learn.model.n_hashes = 8
res = learn.validate(dl=test_dl)
print_eval(learn, res)

Evaluation results: 
accuracy: 0.70; perplexity: 2.73; bpc: 1.45
