Add all code

akirafukui · Jul 8, 2016 · 14bca96 · 14bca96
1 parent 3a90e1d
commit 14bca96
Show file tree

Hide file tree

Showing 30 changed files with 17,866 additions and 1 deletion.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,26 @@
+COPYRIGHT
+
+Copyright (c) 2016, The Regents of the University of California (Regents)
+All rights reserved.
+
+LICENSE
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met: 
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -1 +1,60 @@
-# vqa-public
+# Multimodal Compact Bilinear Pooling for VQA
+
+This is the code that we wrote to train the state-of-the-art VQA models [described in our paper](https://arxiv.org/abs/1606.01847). Our ensemble of 7 models obtained **66.67%** on real open-ended test-dev and **70.24%** on real multiple-choice test-dev.
+
+## Live Demo
+
+You can upload your own images and ask the model your own questions. [Try the live demo!](http://demo.berkeleyvision.org/)
+
+## Pretrained Model
+
+We are releasing the “MCB + Genome + Att. + GloVe” model from the paper, which achieves **65.38%** on real open-ended test-dev. This is our best individual model.
+
+[Download](https://www.dropbox.com/s/o19k39lvt5cm0bc/multi_att_2_glove_pretrained.zip?dl=0)
+
+You can easily use this model with our evaluation code or with our demo server code.
+
+## Prerequisites
+
+In order to use our pretrained model:
+
+- Compile the `feature/20160617_cb_softattention` branch of [our fork of Caffe](https://github.com/akirafukui/caffe/). This branch contains Yang Gao’s Compact Bilinear layers ([dedicated repo](https://github.com/gy20073/compact_bilinear_pooling), [paper](https://arxiv.org/abs/1511.06062)) released under the [BDD license](https://github.com/gy20073/compact_bilinear_pooling/blob/master/LICENSE), and Ronghang Hu’s Soft Attention layers ([paper](https://arxiv.org/abs/1511.03745)) released under BSD 2-clause.
+- Download the [pre-trained ResNet-152 model](https://github.com/KaimingHe/deep-residual-networks).
+
+If you want to train from scratch, do the above plus:
+
+- Download the [VQA tools](https://github.com/VT-vision-lab/VQA).
+- Download the [VQA real-image dataset](http://visualqa.org/download.html).
+- Optional: Install spaCy and download GloVe vectors. The latest stable release of spaCy has a bug that prevents GloVe vectors from working, so you need to install the HEAD version. See `train/README.md`.
+- Optional: Download [Visual Genome](https://visualgenome.org/) data.
+
+## Data Preprocessing
+
+See `preprocess/README.md`.
+
+## Training
+
+See `train/README.md`.
+
+## Evaluation
+
+To generate an answers JSON file in the format expected by the VQA evaluation code and VQA test server, you can use `eval/ensemble.py`. This code can also ensemble multiple models. Running `python ensemble.py` will print out a help message telling you what arguments to use.
+
+## Demo Server
+
+The code that powers our [live demo](http://demo.berkeleyvision.org/) is in `server/`. To run this, you’ll need to install Flask and change the constants at the top of `server.py`. Then, just do `python server.py`, and the server will bind to `0.0.0.0:5000`.
+
+## License and Citation
+
+This code and the pretrained model are released under the BSD 2-Clause license. See `LICENSE` for more information.
+
+Please cite [our paper](https://arxiv.org/abs/1606.01847) if it helps your research:
+
+```
+@article{fukui16mcb,
+  title={Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding},
+  author={Fukui, Akira and Park, Dong Huk and Yang, Daylen and Rohrbach, Anna and Darrell, Trevor and Rohrbach, Marcus},
+  journal={arXiv:1606.01847},
+  year={2016},
+}
+```
diff --git a/eval/ensemble.py b/eval/ensemble.py
@@ -0,0 +1,298 @@
+"""
+Generates predictions on test-dev or test using an ensemble of nets. The
+ensemble is produced using the average of the pre-softmax output from each net.
+
+Place each model in its own folder. The folder must contain:
+
+- The .caffemodel file
+- proto_test.prototxt
+- adict.json
+- vdict.json
+- aux.json
+
+aux.json should contain the following keys:
+
+- batch_size (value should be integer)
+- data_shape (value should be array of integer)
+- img_feature_prefix (value should be string)
+- spatial_coord (value should be boolean)
+- glove (value should be boolean)
+
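+If the folder also contains a cached prediction pickle for the evaluated split
+("preds_val.pkl", "preds_test_dev.pkl" or "preds_test.pkl") holding at least as
+many predictions as there are questions, evaluation is skipped for that network.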
+
+"""
+
+import caffe
+import numpy as np
+import cPickle
+import argparse, os, glob
+import sys
+import json
+from collections import defaultdict
+import vqa_data_provider_layer
+from vqa_data_provider_layer import LoadVQADataProvider
+
+def verify_all(folder_paths):
+    """
+    Calls verify_one on each folder path. Also checks to make sure all the
+    answer vocabularies are the same.
+    """
+    adict_paths = []
+    for folder_path in folder_paths:
+        paths = verify_one(folder_path)
+        adict_paths.append(paths[2])
+    adicts = []
+    for path in adict_paths:
+        with open(path, 'r') as f:
+            adict = json.load(f)
+            adicts.append(adict)
+    if len(adicts) > 1:
+        for a2 in adicts[1:]:
+            if set(adicts[0].keys()) != set(a2.keys()):
+                print set(adicts[0].keys()) - set(a2.keys())
+                print set(a2.keys()) - set(adicts[0].keys())
+                raise Exception('Answer vocab mismatch')
+    return adicts
+
+def verify_one(folder_path):
+    """
+    Makes sure all the required files exist in the folder. If so, returns the
+    paths to all the files.
+    """
+    model_path = glob.glob(folder_path + '/*.caffemodel')
+    assert len(model_path) == 1, 'one .caffemodel per folder, please'
+    model_path = model_path[0]
+    proto_path = folder_path + '/proto_test.prototxt'
+    adict_path = folder_path + '/adict.json'
+    vdict_path = folder_path + '/vdict.json'
+    aux_path = folder_path + '/aux.json'
+    assert os.path.exists(proto_path), 'proto_test.prototxt missing'
+    assert os.path.exists(adict_path), 'adict.json missing'
+    assert os.path.exists(vdict_path), 'vdict.json missing'
+    assert os.path.exists(aux_path), 'aux.json missing'
+    with open(aux_path, 'r') as f:
+        aux = json.load(f)
+    batch_size = int(aux['batch_size'])
+    data_shape = tuple(map(int, aux['data_shape']))
+    img_feature_prefix = aux['img_feature_prefix']
+    spatial_coord = aux['spatial_coord'] if 'spatial_coord' in aux else False
+    glove = aux['glove'] if 'glove' in aux else False
+    return model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, img_feature_prefix, spatial_coord, glove
+
+def get_pkl_fname(ques_file):
+    if '_val2014_' in ques_file:
+        return '/preds_val.pkl'
+    elif '_test-dev2015_' in ques_file:
+        return '/preds_test_dev.pkl'
+    elif '_test2015_' in ques_file:
+        return '/preds_test.pkl'
+    else:
+        raise NotImplementedError
+
+def eval_one(folder_path, gpuid, ques_file):
+    """
+    Evaluates a single model (in folder_path) on the questions in ques_file.
+    Returns an array of (QID, answer vector) tuples.
+    """
+
+    model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, \
+                    img_feature_prefix, spatial_coord, glove = verify_one(folder_path)
+
+    dp = LoadVQADataProvider(ques_file, img_feature_prefix, vdict_path, \
+        adict_path, mode='test', batchsize=batch_size, data_shape=data_shape)
+    total_questions = len(dp.getQuesIds())
+    print total_questions, 'total questions'
+
+    if os.path.exists(folder_path + get_pkl_fname(ques_file)):
+        print 'Found existing prediction file, trying to load...'
+        with open(folder_path + get_pkl_fname(ques_file), 'r') as f:
+            preds = cPickle.load(f)
+        if len(preds) >= total_questions:
+            print 'Loaded.'
+            return preds
+        else:
+            print 'Number of saved answers does not match number of questions, continuing...'
+
+    caffe.set_device(gpuid)
+    caffe.set_mode_gpu()
+
+    vqa_data_provider_layer.CURRENT_DATA_SHAPE = data_shape # This is a huge hack
+    vqa_data_provider_layer.SPATIAL_COORD = spatial_coord
+    vqa_data_provider_layer.GLOVE = glove
+
+    net = caffe.Net(proto_path, model_path, caffe.TEST)
+
+    print 'Model loaded:', model_path
+    print 'Image feature prefix:', img_feature_prefix
+    sys.stdout.flush()
+
+
+    pred_layers = []
+
+    epoch = 0
+    while epoch == 0:
+        t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, _, epoch = dp.get_batch_vec()
+        net.blobs['data'].data[...] = np.transpose(t_word,(1,0))
+        net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0))
+        net.blobs['img_feature'].data[...] = t_img_feature
+        net.blobs['label'].data[...] = t_answer # dummy
+        if glove:
+            net.blobs['glove'].data[...] = np.transpose(t_glove_matrix, (1,0,2))
+        net.forward()
+        ans_matrix = net.blobs['prediction'].data
+
+        for i in range(len(t_qid_list)):
+            qid = t_qid_list[i]
+            pred_layers.append((qid, np.copy(ans_matrix[i]))) # tricky!
+
+        percent = 100 * float(len(pred_layers)) / total_questions
+        sys.stdout.write('\r' + ('%.2f' % percent) + '%')
+        sys.stdout.flush()
+
+    #print 'Saving predictions...'
+    #with open(folder_path + get_pkl_fname(ques_file), 'w') as f:
+    #   cPickle.dump(pred_layers, f, protocol=-1)
+    #print 'Saved.'
+
+    return pred_layers
+
+def make_rev_adict(adict):
+    """
+    An adict maps text answers to neuron indices. A reverse adict maps neuron
+    indices to text answers.
+    """
+    rev_adict = {}
+    for k,v in adict.items():
+        rev_adict[v] = k
+    return rev_adict
+
+def softmax(arr):
+    e = np.exp(arr)
+    dist = e / np.sum(e)
+    return dist
+
+def get_qid_valid_answer_dict(ques_file, adict):
+    """
+    Returns a dictionary mapping question IDs to valid neuron indices.
+    """
+    print 'Multiple choice mode: making valid answer dictionary...'
+    valid_answer_dict = {}
+    with open(ques_file, 'r') as f:
+        qdata = json.load(f)
+        for q in qdata['questions']:
+            valid_answer_dict[q['question_id']] = q['multiple_choices']
+    for qid in valid_answer_dict:
+        answers = valid_answer_dict[qid]
+        valid_indices = []
+        for answer in answers:
+            if answer in adict:
+                valid_indices.append(adict[answer])
+        if len(valid_indices) == 0:
+            print "we won't be able to answer qid", qid
+        valid_answer_dict[qid] = valid_indices
+    return valid_answer_dict
+
+def dedupe(arr):
+    print 'Deduping arr of len', len(arr)
+    deduped = []
+    seen = set()
+    for qid, pred in arr:
+        if qid not in seen:
+            seen.add(qid)
+            deduped.append((qid, pred))
+    print 'New len', len(deduped)
+    return deduped
+
+def reorder_one(predictions, this_adict, canonical_adict):
+    index_map = {}
+    for idx, word in make_rev_adict(this_adict).iteritems():
+        index_map[int(idx)] = int(canonical_adict[word])
+    index_array = np.zeros(len(index_map), dtype=int)
+    for src_idx, dest_idx in index_map.iteritems():
+        index_array[src_idx] = dest_idx
+    reordered = []
+    for qid, output in predictions:
+        reordered.append((qid, np.copy(output[index_array])))
+    return reordered
+
+def reorder_predictions(predictions, adicts):
+    """
+    Reorders prediction matrices so that the unit order matches that of the
+    first answer dictionary.
+    """
+    if len(adicts) == 1:
+        return predictions
+    need_to_reorder = False
+    for a2 in adicts[1:]:
+        if adicts[0] != a2:
+            need_to_reorder = True
+    print 'Reordering...' if need_to_reorder else 'No need to reorder!'
+    if not need_to_reorder:
+        return predictions
+    reordered = []
+    for i in range(1, len(adicts)):
+        if adicts[0] != adicts[i]:
+            reordered.append(reorder_one(predictions[i], adicts[i], adicts[0]))
+        else:
+            reordered.append(predictions[i])
+    return reordered
+
+def average_outputs(arr_of_arr, rev_adict, qid_valid_answer_dict):
+    """
+    Given a list of lists, where each list contains (QID, answer vector) tuples,
+    returns a single dictionary which maps a question ID to the text answer.
+    """
+    print 'Averaging outputs...'
+    merged = defaultdict(list)
+    for arr in arr_of_arr:
+        for qid, ans_vec in arr:
+            merged[qid].append(ans_vec)
+
+    merged = {qid: softmax(np.vstack(ans_vecs).mean(axis=0)) for qid, ans_vecs in merged.iteritems()}
+    mask_len = len(merged.values()[0])
+
+    # Multiple choice filtering
+    if qid_valid_answer_dict is not None:
+        for qid in merged:
+            valid_indices = qid_valid_answer_dict[qid]
+            mask = np.zeros(mask_len)
+            for idx in valid_indices:
+                mask[idx] = 1
+            merged[qid] *= mask
+
+    merged = {qid: rev_adict[ans_vec.argmax()] for qid, ans_vec in merged.iteritems()}
+
+    return merged
+
+def save_json(qid_ans_dict, fname):
+    tmp = []
+    for qid, ans in qid_ans_dict.iteritems():
+        tmp.append({u'answer': ans, u'question_id': qid})
+    with open(fname, 'w') as f:
+        json.dump(tmp, f)
+    print 'Saved to', fname
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ques_file', required=True)
+    parser.add_argument('--gpu', type=int, required=True)
+    parser.add_argument('--out_file', required=True)
+    parser.add_argument('folders', nargs='*',
+        help='space-separated list of folders containing models')
+    args = parser.parse_args()
+    assert len(args.folders) > 0, 'please specify at least one folder'
+    print 'Folders', args.folders
+
+    adicts = verify_all(args.folders)
+    qid_valid_answer_dict = None
+    if 'MultipleChoice' in args.ques_file:
+        qid_valid_answer_dict = get_qid_valid_answer_dict(args.ques_file, adicts[0])
+
+    arr_of_arr = [eval_one(folder_path, args.gpu, args.ques_file) for folder_path in args.folders]
+    arr_of_arr = [dedupe(x) for x in arr_of_arr]
+    reordered = reorder_predictions(arr_of_arr, adicts)
+    qid_ans_dict = average_outputs(reordered, make_rev_adict(adicts[0]), qid_valid_answer_dict)
+    save_json(qid_ans_dict, args.out_file)
+
+# Run the ensemble evaluation only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()