// bidirectional_language_model.jsonnet
local NUM_GPUS = 2;
local NUM_GRAD_ACC = 4;
local BATCH_SIZE = 512 / NUM_GPUS / NUM_GRAD_ACC;
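// For reference: with the values above each forward pass sees 512 / 2 / 4 = 64
// instances per GPU, and the effective batch size works out to 512 once
// gradients are accumulated across the GPUs and accumulation steps.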
local BASE_READER = {
"type": "simple_language_modeling",
"tokenizer": {
// The 1 Billion Word Language Model Benchmark dataset is
// pre-tokenized. (Also, if you're running against an untokenized
// dataset, be aware that there are serialization issues with spaCy.
// These come into play in the multiprocess case.)
"type": "just_spaces"
},
"token_indexers": {
"tokens": {
"type": "single_id"
},
"token_characters": {
"type": "elmo_characters"
}
},
"max_sequence_length": 400,
"start_tokens": ["<S>"],
"end_tokens": ["</S>"],
};
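// The loader below uses a bucket sampler, which groups instances of similar
// length into the same batch and so should keep padding (and wasted
// computation) to a minimum. Keeping roughly 100 batches' worth of instances
// in memory gives the sampler a reasonable pool to bucket over without
// loading an entire shard at once.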
local BASE_LOADER = {
"max_instances_in_memory": BATCH_SIZE * 100,
"batch_sampler": {
"type": "bucket",
"batch_size": BATCH_SIZE,
}
};
{
"dataset_reader": {
"type": "sharded",
"base_reader": BASE_READER,
},
// Note: We don't set a validation_data_path because the softmax is only
// sampled during training. Computing the full softmax on the GPUs would
// certainly run out of memory given our large vocabulary. We'll need to
// evaluate against the test set (when we'll want a full softmax) on the CPU.
"train_data_path": std.extVar("BIDIRECTIONAL_LM_TRAIN_PATH"),
"vocabulary": {
// Use a prespecified vocabulary for efficiency.
"type": "from_files",
"directory": std.extVar("BIDIRECTIONAL_LM_VOCAB_PATH"),
// Plausible config for generating the vocabulary.
// "tokens_to_add": {
// "tokens": ["<S>", "</S>"],
// "token_characters": ["<>/S"]
// },
// "min_count": {"tokens": 3}
},
"model": {
"type": "language_model",
"bidirectional": true,
"num_samples": 8192,
// Sparse embeddings don't work with DistributedDataParallel.
"sparse_embeddings": false,
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "empty"
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"num_embeddings": 262,
// Same as the Transformer ELMo in Calypso. Matt reports that
// this matches the original LSTM ELMo as well.
"embedding_dim": 16
},
"encoder": {
"type": "cnn-highway",
"activation": "relu",
"embedding_dim": 16,
"filters": [
[1, 32],
[2, 32],
[3, 64],
[4, 128],
[5, 256],
[6, 512],
[7, 1024]],
"num_highway": 2,
"projection_dim": 512,
"projection_location": "after_highway",
"do_layer_norm": true
}
}
}
},
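// Note: the character CNN's projection_dim (512) is the dimension handed to
// the contextualizer, so it needs to match the contextualizer's input_dim below.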
// TODO(brendanr): Consider the following.
// remove_bos_eos: true,
// Applies to the contextualized embeddings.
"dropout": 0.1,
"contextualizer": {
"type": "bidirectional_language_model_transformer",
"input_dim": 512,
"hidden_dim": 2048,
"num_layers": 6,
"dropout": 0.1,
"input_dropout": 0.1
}
},
"data_loader": BASE_LOADER,
"distributed": {
"cuda_devices": if NUM_GPUS > 1 then std.range(0, NUM_GPUS - 1) else 0,
},
"trainer": {
"num_epochs": 10,
"optimizer": {
// With the standard "adam" optimizer, the gradient accumulators (the running
// mean and stdev) for words not used in the sampled softmax would be decayed
// toward zero on every step.
"type": "dense_sparse_adam"
},
// TODO(brendanr): Needed with transformer too?
// "grad_norm": 10.0,
"learning_rate_scheduler": {
"type": "noam",
// See https://github.com/allenai/calypso/blob/master/calypso/train.py#L401
"model_size": 512,
// See https://github.com/allenai/calypso/blob/master/bin/train_transformer_lm1b.py#L51.
// Adjusted based on our sample size relative to Calypso's.
"warmup_steps": 6000
},
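// For reference, the noam schedule above scales the learning rate as
//   model_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5),
// i.e. a linear warmup followed by inverse-square-root decay.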
"num_gradient_accumulation_steps": NUM_GRAD_ACC,
"use_amp": true
}
}
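// A minimal sketch of how this config might be launched (the data paths and
// serialization directory below are placeholders, not part of this repo):
//
//   export BIDIRECTIONAL_LM_TRAIN_PATH=/path/to/training/shards
//   export BIDIRECTIONAL_LM_VOCAB_PATH=/path/to/vocabulary
//   allennlp train bidirectional_language_model.jsonnet -s /path/to/serialization_dir
//
// AllenNLP exposes environment variables to the config through std.extVar,
// which is how the two paths above are picked up.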