Skip to content
PyTorch library for Visual-Semantic tasks
Branch: master
Clone or download
Fetching latest commit…
Cannot retrieve the latest commit at this time.
Type Name Latest commit message Commit time
Failed to load latest commit information.
test Fixed .travis.yml with corrected cudatoolkit and Meteor tests Aug 25, 2018
.gitignore ImageField, PairedDataset and Association May 18, 2018
LICENSE Updated license Aug 10, 2018 Updated contributors Mar 7, 2019 Version 0.0.1 Feb 14, 2019

Speaksee Logo

Speaksee is a Python package that provides utilities for working with Visual-Semantic data, developed at AImageLab.


To have a working installation, make sure you have Python 3.5+. You can then install speaksee via pip:

pip install speaksee

From source

You can also install speaksee from source with:

git clone https://github.com/aimagelab/speaksee.git
cd speaksee
pip install -e .

and obtain fresh upgrades without reinstalling it by simply running:

git pull


Pre-processing visual data

from speaksee.data import ImageField, TextField
from speaksee.data.pipeline import EncodeCNN
from speaksee.data.dataset import COCO
from torchvision.models import resnet101
from torchvision.transforms import Compose, Normalize
from torch import nn
import torch
from tqdm import tqdm

device = torch.device('cuda')

# Preprocess with some fancy cnn and transformation
cnn = resnet101(pretrained=True).to(device)
cnn.avgpool.forward = lambda x : x.mean(-1).mean(-1)
cnn.fc = nn.Sequential()

transforms = Compose([
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

prepro_pipeline = EncodeCNN(cnn, transforms)
image_field = ImageField(preprocessing=prepro_pipeline, precomp_path='/nas/houston/lorenzo/fc2k_coco.hdf5')

Pre-processing textual data

# Pipeline for text
text_field = TextField(eos_token='<eos>', lower=True, tokenize='spacy', remove_punctuation=True)

Calling a dataset

# Create the dataset
dataset = COCO(image_field, text_field, '/tmp/coco/images/', '/tmp/coco/annotations/')
train_dataset, val_dataset, test_dataset = dataset.splits
#image_field.precomp(dataset)  # do this once, or to refresh cache (we might change this in the near future)
text_field.build_vocab(train_dataset, val_dataset, min_freq=5)

Training a model

from speaksee.models import FC
model = FC(len(text_field.vocab), 2048, 512, 512, dropout_prob_lm=0).to(device)

from speaksee.data import DataLoader
dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_val = DataLoader(val_dataset, batch_size=16)

from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.nn import NLLLoss
optim = Adam(model.parameters(), lr=5e-4)
scheduler = StepLR(optim, step_size=3, gamma=.8)
loss_fn = NLLLoss(ignore_index=text_field.vocab.stoi['<pad>'])

for e in range(50):
    # Training
    running_loss = .0
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader_train)) as pbar:
        for it, (images, captions) in enumerate(dataloader_train):
            images, captions = images.to(device), captions.to(device)
            out = model(images, captions)
            optim.zero_grad()
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions.view(-1))
            loss.backward()
            optim.step()

            running_loss += loss.item()
            pbar.set_postfix(loss=running_loss / (it+1))
            pbar.update()

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    if e % 3 == 0 and model.ss_prob < .25:
        model.ss_prob += .05

    # Validation
    running_loss = .0
    with tqdm(desc='Epoch %d - val' % e, unit='it', total=len(dataloader_val)) as pbar:
        for it, (images, captions) in enumerate(dataloader_val):
            images, captions = images.to(device), captions.to(device)
            out = model(images, captions)
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions.view(-1))

            running_loss += loss.item()
            pbar.set_postfix(loss=running_loss / (it+1))
            pbar.update()

    # Serialize model
    torch.save({
        'epoch': e,
        'val_loss': running_loss / len(iter(dataloader_val)),
        'state_dict': model.state_dict(),
        'optimizer': optim.state_dict(),
    }, '/nas/houston/lorenzo/fc_epoch_%03d.pth' % e)

Evaluating a model

from speaksee.data import RawField
from speaksee.evaluation import Cider
from speaksee.evaluation import PTBTokenizer
dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField()})
dict_dataloader_val = DataLoader(dict_dataset_val, batch_size=16)
gen = {}
gts = {}
with tqdm(desc='Validation', unit='it', total=len(dict_dataloader_val)) as pbar:
    for it, (images, caps_gt) in enumerate(iter(dict_dataloader_val)):
        images = images.to(device)
        with torch.no_grad():
            out = model.beam_search(images, 50, text_field.vocab.stoi['<eos>'], 2, out_size=1)
        caps_gen = text_field.decode(out)
        for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
            gen['%d_%d' % (it, i)] = [gen_i, ]
            gts['%d_%d' % (it, i)] = gts_i

gts = PTBTokenizer.tokenize(gts)
gen = PTBTokenizer.tokenize(gen)
val_cider, _ = Cider().compute_score(gts, gen)
print("CIDEr is %f" % val_cider)

Model zoo

Model CIDEr Download
FC-2k (beam) 93.8 Download
Bottomup Topdown with sentinel 117.8 Download

The team

Speaksee is currently maintained by Lorenzo Baraldi, Marcella Cornia and Matteo Stefanini.

You can’t perform that action at this time.