Commit 443b595
tests on GPU and minor modifications
ashwinkalyan committed Oct 7, 2016
1 parent 3bcf9b2 commit 443b595
Showing 4 changed files with 37 additions and 45 deletions.
28 changes: 28 additions & 0 deletions README.md
@@ -0,0 +1,28 @@
# Diverse Beam Search

This code implements Diverse Beam Search (DBS), an approximate inference algorithm that generates a set of diverse decodings. This repository demonstrates the method on image captioning using [neuraltalk2][1].

## Requirements
You will need to install [torch](http://torch.ch/) and the following packages:
- `nn`
- `nngraph`
- `image`
- `loadcaffe`
- `hdf5` (optional, depending on how you want to input data)

The easiest way to install torch is via the [torch/distro](https://github.com/torch/distro) repository, which installs most of these requirements for you.

If you are using a GPU, you will also need `cutorch` and `cunn`. If the image-captioning checkpoint was trained with `cudnn`, download `cudnn` from NVIDIA's [website](https://developer.nvidia.com/cudnn) and add it to your `LD_LIBRARY_PATH`.
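For reference, the GPU path in `eval.lua` follows the standard Torch guard; a minimal sketch (not verbatim repo code, assuming the `opt.gpuid` and `opt.backend` flags defined in `eval.lua`):
```
-- minimal sketch of the standard Torch GPU guard; assumes opt.gpuid
-- (-1 = CPU) and opt.backend ('nn'|'cudnn') as defined in eval.lua
if opt.gpuid >= 0 then
  require 'cutorch'
  require 'cunn'
  if opt.backend == 'cudnn' then require 'cudnn' end
  cutorch.manualSeed(opt.seed)
  cutorch.setDevice(opt.gpuid + 1) -- torch GPU ids are 1-indexed
end
```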
Any of the checkpoints distributed by Andrej Karpathy with the [neuraltalk2][1] repository can be used with this code. Alternatively, you can train your own model using [neuraltalk2][1] and use this code to sample diverse sentences from it.

## Generating Diverse Sequences
After installing all the dependencies, you should be able to obtain diverse captions with:
```
$ th eval.lua -model /path/to/model.t7 -num_images 1 -image_folder eval_images -gpuid -1
```
To run a beam search of width 10 with 5 diverse groups and a diversity strength of 0.5 on the same image:
```
$ th eval.lua -model /path/to/model.t7 -B 10 -G 5 -lambda 0.5 -num_images 1 -image_folder eval_images -gpuid -1
```
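Here `-B` is the total number of beams and `-G` the number of diverse groups, so each group keeps `B/G` beams (`eval.lua` computes `bdash = opt.B / opt.G`, so `B` should be divisible by `G`); `-lambda` sets how strongly each group is pushed away from words already chosen by earlier groups. As an illustrative sketch only (the actual penalty is implemented in `dbs/beam_utils.lua`; the function below is hypothetical), a Hamming-style diversity term could be applied like this:
```
-- hypothetical sketch, not repo code: lower each word's score by lambda
-- times the number of times earlier groups already chose that word at
-- the current time step
local function add_diversity(logprobs, chosen_counts, lambda)
  for word_ix, count in pairs(chosen_counts) do
    logprobs[word_ix] = logprobs[word_ix] - lambda * count
  end
  return logprobs
end
```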

## References
[1]: https://github.com/karpathy/neuraltalk2

Binary file removed dbs/.beam_utils.lua.swp
53 changes: 8 additions & 45 deletions eval.lua
@@ -1,20 +1,14 @@
require 'torch'
require 'nn'
require 'nngraph'
require 'debug'
-- exotics
require 'loadcaffe'

-- local imports
utils = require 'misc.utils'
require 'misc.DataLoader'
require 'misc.DataLoaderRaw'
require 'misc.LanguageModel'
net_utils = require 'misc.net_utils'
beam_utils = require 'dbs.beam_utils'
-- div_utils = require 'dbs.div_utils'
-- vis_utils = require 'misc.vis_utils'

local debug = require 'debug'

-------------------------------------------------------------------------------
-- Input arguments and options
@@ -31,35 +25,24 @@ cmd:option('-model','','path to model to evaluate')
-- Basic options
cmd:option('-batch_size', 1, 'if > 0 then overrule, otherwise load from checkpoint.')
cmd:option('-num_images', 1, 'how many images to use when periodically evaluating the loss? (-1 = all)')
-- cmd:option('-language_eval', 0, 'Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.')
cmd:option('-dump_images', 1, 'Dump images into vis/imgs folder for vis? (1=yes,0=no)')
cmd:option('-dump_json', 1, 'Dump json with predictions into vis folder? (1=yes,0=no)')
-- cmd:option('-dump_path', 0, 'Write image paths along with predictions into vis json? (1=yes,0=no)')

-- Sampling options
-- cmd:option('-sample_max', 1, '1 = sample argmax words. 0 = sample from distributions.')
cmd:option('-B', 2, 'used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.')
cmd:option('-M',2,'number of diverse groups')
cmd:option('-B', 2, 'Beam width - number of beams needed overall')
cmd:option('-G',2,'number of diverse groups')
cmd:option('-lambda', 0.5, 'diversity penalty')
-- cmd:option('-divmode', 0, '1 to turn on cumulative_diversity')
-- cmd:option('-temperature', 1.0, 'temperature when sampling from distributions (i.e. when sample_max = 0). Lower = "safer" predictions.')
cmd:option('-primetext', "", 'used as a prompt to "seed" the state of the LSTM using a given sequence, before we sample.')
-- For evaluation on a folder of images:
cmd:option('-image_folder', '', 'If this is nonempty then will predict on the images in this folder path')
cmd:option('-image_root', '', 'In case the image paths have to be preprended with a root path to an image folder')
-- For evaluation on MSCOCO images from some split:
cmd:option('-input_h5','','path to the h5file containing the preprocessed dataset. empty = fetch from model checkpoint.')
cmd:option('-input_json','','path to the json file containing additional info and vocab. empty = fetch from model checkpoint.')
cmd:option('-split', 'test', 'if running on MSCOCO images, which split to use: val|test|train')
cmd:option('-coco_json', '', 'if nonempty then use this file in DataLoaderRaw (see docs there). Used only in MSCOCO test evaluation, where we have a specific json file of only test set images.')

-- misc
cmd:option('-backend', 'cudnn', 'nn|cudnn')
cmd:option('-id', 'evalscript', 'an id identifying this run/job. used only if language_eval = 1 for appending to intermediate files')
cmd:option('-seed', 123, 'random number generator seed to use')
cmd:option('-gpuid', -1, 'which gpu to use. -1 = use CPU')
-- cmd:option('-div_vis_dir', '', 'store information for the div rnn vis in this directory; it should already exist')
-- cmd:option('-baseline',-1,'implements the stanford baseline if >=0 with strength equal to the set value. -1 implements conventional DBS -- use M = 1 and opt.baseline >0 to run stanford baseline')
cmd:text()

-------------------------------------------------------------------------------
@@ -95,8 +78,6 @@ assert(string.len(opt.model) > 0, 'must provide a model')
print(opt.model)
local checkpoint = torch.load(opt.model)
-- override and collect parameters
if string.len(opt.input_h5) == 0 then opt.input_h5 = checkpoint.opt.input_h5 end
if string.len(opt.input_json) == 0 then opt.input_json = checkpoint.opt.input_json end
if opt.batch_size == 0 then opt.batch_size = checkpoint.opt.batch_size end
local fetch = {'rnn_size', 'input_encoding_size', 'drop_prob_lm', 'cnn_proto', 'cnn_model', 'seq_per_img'}
for k,v in pairs(fetch) do
@@ -107,13 +88,7 @@ local inv_vocab = table_invert(checkpoint.vocab)
-------------------------------------------------------------------------------
-- Create the Data Loader instance
-------------------------------------------------------------------------------
local loader
if string.len(opt.image_folder) == 0 then
loader = DataLoader{h5_file = opt.input_h5, json_file = opt.input_json}
else
loader = DataLoaderRaw{folder_path = opt.image_folder, coco_json = opt.coco_json}
end

local loader = DataLoaderRaw{folder_path = opt.image_folder, coco_json = ''}
-------------------------------------------------------------------------------
-- Load the networks from model checkpoint
-------------------------------------------------------------------------------
@@ -123,7 +98,6 @@ protos.crit = nn.LanguageModelCriterion()
protos.lm:createClones() -- reconstruct clones inside the language model
if opt.gpuid >= 0 then for k,v in pairs(protos) do v:cuda() end end


-------------------------------------------------------------------------------
-- Evaluation fun(ction)
-------------------------------------------------------------------------------
@@ -139,24 +113,13 @@ local function eval_split(split, evalopt)
local loss_evals = 0
local final_beams = {}
while true do

-- fetch a batch of data
local data = loader:getBatch{batch_size = opt.batch_size, split = split, seq_per_img = opt.seq_per_img}
data.images = net_utils.prepro(data.images, false, opt.gpuid >= 0) -- preprocess in place, and don't augment
n = n + data.images:size(1)
-- forward the model to get loss
local feats = protos.cnn:forward(data.images)

-- evaluate loss if we have the labels
local loss = 0
if data.labels then
local expanded_feats = protos.expander:forward(feats)
local logprobs = protos.lm:forward{expanded_feats, data.labels}
loss = protos.crit:forward(logprobs, data.labels)
loss_sum = loss_sum + loss
loss_evals = loss_evals + 1
end

local function gen_logprobs(word_ix, state)
local embedding = protos.lm.lookup_table:forward(word_ix)
local inputs = {embedding, unpack(state)}
@@ -167,7 +130,7 @@ local function eval_split(split, evalopt)
local sample_opts = {
T = protos.lm.seq_length,
B = opt.B,
M = opt.M,
M = opt.G,
lambda = opt.lambda,
temperature = opt.temperature,
-- size of a state
@@ -262,12 +225,12 @@ local function print_and_dump_beam(opt,beam_table)
print('----------------------------------------------------')
local function compare_beam(a,b) return a.logp > b.logp end
json_table = {}
bdash = opt.B / opt.M
bdash = opt.B / opt.G
for im_n = 1,#beam_table do
json_table[im_n] = {}
json_table[im_n]['image_id'] = beam_table[im_n]['image_id']
json_table[im_n]['captions'] = {}
for i = 1,opt.M do
for i = 1,opt.G do
for j = 1,bdash do
current_beam_string = table.concat(net_utils.decode_sequence(vocab, torch.reshape(beam_table[im_n]['caption'][i][j].seq, beam_table[im_n]['caption'][i][j].seq:size(1), 1)))
print('beam ' .. (i-1)*bdash+j ..' diverse group: '..i)
@@ -286,7 +249,7 @@ local function print_and_dump_beam(opt,beam_table)
end
if opt.dump_json == 1 then
-- dump the json
utils.write_json('amt_all_images_0.4.json', json_table)
utils.write_json(opt.image_folder .. '/' .. tostring(opt.B) .. '_' .. tostring(opt.G) .. '.json', json_table)
end
return json_table
end
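For intuition about the loop above: with `B = 10` and `G = 5`, each group holds `bdash = B / G = 2` beams, giving the ten captions in `eval_images/10_5.json` below. A standalone sketch of the beam numbering (not repo code):
```
-- standalone sketch: reproduces the beam numbering printed above, with
-- B = 10 total beams split across G = 5 diverse groups
local B, G = 10, 5
local bdash = B / G -- beams per group
for i = 1, G do
  for j = 1, bdash do
    print(('beam %d diverse group: %d'):format((i - 1) * bdash + j, i))
  end
end
```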
1 change: 1 addition & 0 deletions eval_images/10_5.json
@@ -0,0 +1 @@
[{"captions":[{"logp":-5.0367479324341,"sentence":"a dog with a frisbee in its mouth"},{"logp":-6.2141075134277,"sentence":"a small dog with a frisbee in its mouth"},{"logp":-6.6345801353455,"sentence":"a dog is playing with a frisbee in the grass"},{"logp":-7.0890574455261,"sentence":"a black and white dog with a frisbee in its mouth"},{"logp":-7.9146413803101,"sentence":"a dog is playing with a frisbee in the yard"},{"logp":-8.264461517334,"sentence":"a small dog with a frisbee in his mouth"},{"logp":-9.0053958892822,"sentence":"a black and white dog with a frisbee in his mouth"},{"logp":-9.5824060440063,"sentence":"a dog with a frisbee in its mouth running in a field"},{"logp":-9.7516613006592,"sentence":"the dog is running in the grass with a frisbee"},{"logp":-9.9132232666016,"sentence":"the dog is in the grass with a frisbee"}],"image_id":"dog.jpg"}]
