# Clone the repository and enter it.
git clone https://github.com/aqweteddy/ImageCaptioning.git
cd ImageCaptioning
# Install the Python requirements.
# (Recommended: do this inside a fresh virtual environment,
#  e.g. `python -m venv venv && source venv/bin/activate`.)
pip install -r requirements.txt
sh download.sh
Running `sh download.sh` downloads the MS COCO dataset.
- Encoder: ResNet152
- Decoder: GRU or LSTM
- build dictionary
- resize images
# Build the vocabulary dictionary and resize the training images.
# NOTE: a space is required before each trailing backslash — without it the
# shell joins the lines into one token (e.g. `preprocess.py--caption_path`).
python preprocess.py \
  --caption_path data/annotations/captions_train2014.json \
  --dict_path data/vocab.txt \
  --img_input_dir data/train2014 \
  --img_output_dir data/train2014_resize
# Train the captioning model.
# NOTE: a space is required before each trailing backslash — without it the
# shell joins the lines into one token (e.g. `train.py--vocab_path`).
python train.py \
  --vocab_path data/vocab.txt \
  --image_dir data/train2014_resize \
  --caption_path data/annotations/captions_train2014.json \
  --save_ckpt 1000 \
  --embed_size 256 \
  --hidden_size 512 \
  --num_layers 1 \
  --num_epochs 5 \
  --batch_size 64 \
  --learning_rate 0.001 \
  --model_path model/
# Example: generate a caption for a single image with a trained model.
from infer import load_model, eval1
from utils.preprocess_text import Dictionary

# Load the vocabulary built by preprocess.py (data/vocab.txt).
vocab = Dictionary()
vocab.load_dict('path_to_dictionary')

# Restore the trained encoder/decoder checkpoints.
# NOTE: layer_type / embed_size / hidden_size / num_layers must match the
# values used at training time, or the checkpoint will not load correctly.
encoder, decoder = load_model(encoder_path='path_to_encoder',  # fixed typo: was 'path_to_encder'
                              decoder_path='path_to_decoder',
                              vocab_size=len(vocab),
                              layer_type='gru',
                              embed_size=256,
                              hidden_size=512,
                              num_layers=2,
                              )

# eval1 returns a list of tokens; join them into a readable sentence.
result = eval1('path_to_image', vocab, encoder, decoder)
print(' '.join(result))
- LSTM, 4 epochs, 1 layer, dropout 0: 0.162638
- GRU, 4 epochs, 1 layer, dropout 0: 0.157053
- GRU, 4 epochs, 3 layers, dropout 0.4: 0.144375
- GRU, 8 epochs, 3 layers, dropout 0.4: 0.149145
On an RTX 2070 GPU:
- Train: ~30 mins per epoch
- Infer: 0.044 sec / image