TrecQA for MP-CNN #77

Merged
merged 9 commits on Nov 4, 2017
@@ -6,5 +6,6 @@ __pycache__
*idfsim
*.swp
trec_eval.9.0/
trec_eval-9.0.5
*.pt
text/
@@ -0,0 +1,76 @@
import os

import torch
from torchtext.data.dataset import Dataset
from torchtext.data.example import Example
from torchtext.data.field import Field
from torchtext.data.iterator import BucketIterator
from torchtext.vocab import Vectors

from datasets.idf_utils import get_pairwise_word_to_doc_freq, get_pairwise_overlap_features


class TRECQA(Dataset):
    NAME = 'trecqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)  # tokenizer is identity since we already tokenized it to compute external features

@rosequ (Member) commented on Nov 4, 2017:

The better way of doing this is:

TEXT_FIELD = Field(batch_first=True, sequential=False)

Refer to the torchtext docs.

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

You are right in that this is already tokenized, so no tokenization is required.

> sequential: Whether the datatype represents sequential data. If False,
> no tokenization is applied. Default: True.

When I tried to change this to sequential=False, I get a `TypeError: unhashable type: 'list'` on the last line in this block from the torchtext source:

        for data in sources:
            for x in data:
                if not self.sequential:
                    x = [x]
                counter.update(x)

As you can see, if the data is not declared sequential, torchtext wraps each example in another list. I already have a list, so this will create a list of lists, which is incorrect. This is sequential data (a list of tokens), so I think what I did originally makes sense. It's just that the sequence data is already tokenized.
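For example, a quick way to reproduce the error outside torchtext (hypothetical snippet mirroring the wrapping logic above):

```
from collections import Counter

counter = Counter()
tokens = ['what', 'is', 'the', 'capital']  # an example that is already tokenized
# with sequential=False, torchtext wraps the whole example: x = [tokens]
counter.update([tokens])  # raises TypeError: unhashable type: 'list'
```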

@rosequ (Member) commented on Nov 4, 2017:

I'm sorry, you're right. This was not the case with the previous version of torchtext.

    EXT_FEATS_FIELD = Field(tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)

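    # sort by the length of sentence a so BucketIterator batches examples of similar length together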
    @staticmethod
    def sort_key(ex):
        return len(ex.a)

    def __init__(self, path):
        """
        Create a TRECQA dataset instance
        """
        fields = [('id', self.ID_FIELD), ('a', self.TEXT_FIELD), ('b', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD)]

@rosequ (Member) commented on Nov 4, 2017:

Although the files are called a.toks and b.toks, I think we should call these fields questions and answers.

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

They are not necessarily a question and an answer. MP-CNN can be used for semantic text similarity between two sentences in general. I guess I can call them sentence_1 and sentence_2 to be clearer than just a and b.


        examples = []
        f1 = open(os.path.join(path, 'a.toks'), 'r')
        f2 = open(os.path.join(path, 'b.toks'), 'r')
        id_file = open(os.path.join(path, 'id.txt'), 'r')
        label_file = open(os.path.join(path, 'sim.txt'), 'r')

        sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
        sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)

        for pair_id, l1, l2, ext_feats, label in zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file):
            pair_id = pair_id.rstrip('.\n')
            label = label.rstrip('.\n')
            example = Example.fromlist([pair_id, l1, l2, ext_feats, label], fields)
            examples.append(example)

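        # NOTE: map() is lazy in Python 3, so these close() calls never actually run (see review comment below)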
        map(lambda f: f.close(), [f1, f2, label_file])

@rosequ (Member) commented on Nov 4, 2017:

Could you process the files within a `with` block?

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

Sounds good :)
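For example, a sketch of the suggested change (same files and parsing as in the diff above):

```
with open(os.path.join(path, 'a.toks'), 'r') as f1, \
        open(os.path.join(path, 'b.toks'), 'r') as f2, \
        open(os.path.join(path, 'id.txt'), 'r') as id_file, \
        open(os.path.join(path, 'sim.txt'), 'r') as label_file:
    sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
    sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]
    pair_ids = [l.rstrip('\n') for l in id_file]
    labels = [l.rstrip('\n') for l in label_file]
# all four files are closed here, even if an exception was raised
```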


        super(TRECQA, self).__init__(examples, fields)

    @classmethod
    def splits(cls, path, train='train-all', validation='raw-dev', test='raw-test', **kwargs):
        return super(TRECQA, cls).splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file

@rosequ (Member) commented on Nov 4, 2017:

Do you mean to say path to word vector cache file?

        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

@rosequ (Member) commented on Nov 4, 2017:

The vector cache file is generated from the original SM code (https://github.com/aseveryn/deep-qa), and this dependency can be completely discarded. Check this.

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

MP-CNN is independent of the SM model, so I think it can generate its own cache. Doing it this way will also allow me to experiment with custom word vectors, e.g. GloVe vectors of other dimensions, word2vec trained on Google News, fastText, etc.

I'm not quite sure which dependency you are referring to. I think the current approach is very flexible and doesn't tie us to any particular variant of word vectors.

@rosequ (Member) commented on Nov 4, 2017:

I thought the vector cache was a dependency here, no? Or is it generated from one of your scripts?

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

Nope, torchtext is awesome! It generates the cache automatically on the first run if it is not there :) Nothing needs to be downloaded; all we need is torchtext.
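For reference, this is all it takes (a sketch; the vector file name and cache directory here are just example values):

```
from torchtext.vocab import Vectors

# On the first run, torchtext parses the raw vector file and serializes it
# into a .pt cache inside `cache`; later runs load the cache directly.
vectors = Vectors(name='glove.840B.300d.txt', cache='../../data/word-vectors/')
```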

@rosequ (Member) commented on Nov 4, 2017:

oh perfect!


        train, validation, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False, shuffle=shuffle, device=device)
@@ -20,6 +20,8 @@ Directory layout should be like this:
│ └── GloVe/
```

## SICK Dataset

To run MP-CNN on the SICK dataset, use the following command. `--dropout 0` mimics the original paper, although adding dropout can improve performance. If you have any problems running it, check the Troubleshooting section below.

```
@@ -29,7 +31,9 @@ python main.py mpcnn.sick.model.castor --dataset sick --epochs 19 --epsilon 1e-7
| Implementation and config | Pearson's r | Spearman's p |
| -------------------------------- |:-------------:|:-------------:|
| Paper | 0.8686 | 0.8047 |
-| PyTorch using above config | 0.8763 | 0.8215 |
+| PyTorch using above config | 0.8684 | 0.8083 |

## MSRVID Dataset

To run MP-CNN on the MSRVID dataset, use the following command:
```
@@ -39,7 +43,23 @@ python main.py mpcnn.msrvid.model.castor --dataset msrvid --batch-size 16 --epsi
| Implementation and config | Pearson's r |
| -------------------------------- |:-------------:|
| Paper | 0.9090 |
-| PyTorch using above config | 0.9050 |
+| PyTorch using above config | 0.8911 |

## TrecQA Dataset

To run MP-CNN on (Raw) TrecQA, you first need to run the `get_trec_eval.sh` script in `utils` under the repo root.
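For example (with `Castor` as the repo root, matching the error message in the dataset loader below):

```
cd Castor/utils
./get_trec_eval.sh
```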

@rosequ (Member) commented on Nov 4, 2017:

I'd suggest writing out the actual command so that a user can directly copy-paste it into their terminal. Also add commands to uncompress and make trec_eval.

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

Yep, my sentence is also incoherent :)


Then, you can run:
```
python main.py mpcnn.trecqa.model --dataset trecqa --epochs 5 --regularization 0.0005 --dropout 0.5 --eps 0.1
```

| Implementation and config | map | mrr |
| -------------------------------- |:------:|:------:|
| Paper | 0.762 | 0.830 |
| PyTorch using above config | 0.7904 | 0.8223 |

The paper results are reported in [Noise-Contrastive Estimation for Answer Selection with Deep Neural Networks](https://dl.acm.org/citation.cfm?id=2983872).

These are not the optimal hyperparameters, but they are decent. This README will be updated with better hyperparameters and results in the future.

@@ -1,30 +1,14 @@
from collections import defaultdict
from enum import Enum
import math
import os

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.utils.data as data

from datasets.sick import SICK
from datasets.msrvid import MSRVID
from datasets.trecqa import TRECQA

# logging setup
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


-class UnknownWorcVecCache(object):
+class UnknownWordVecCache(object):
"""
Caches the first randomly generated word vector for a certain size to make it is reused.
"""
@@ -47,19 +31,28 @@ class MPCNNDatasetFactory(object):
    def get_dataset(dataset_name, word_vectors_dir, word_vectors_file, batch_size, device):
        if dataset_name == 'sick':
            dataset_root = os.path.join(os.pardir, os.pardir, 'data', 'sick/')
-            train_loader, dev_loader, test_loader = SICK.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWorcVecCache.unk)
+            train_loader, dev_loader, test_loader = SICK.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWordVecCache.unk)
            embedding_dim = SICK.TEXT_FIELD.vocab.vectors.size()
            embedding = nn.Embedding(embedding_dim[0], embedding_dim[1])
            embedding.weight = nn.Parameter(SICK.TEXT_FIELD.vocab.vectors)
            return SICK, embedding, train_loader, test_loader, dev_loader
        elif dataset_name == 'msrvid':
            dataset_root = os.path.join(os.pardir, os.pardir, 'data', 'msrvid/')
            dev_loader = None
-            train_loader, test_loader = MSRVID.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWorcVecCache.unk)
+            train_loader, test_loader = MSRVID.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWordVecCache.unk)
            embedding_dim = MSRVID.TEXT_FIELD.vocab.vectors.size()
            embedding = nn.Embedding(embedding_dim[0], embedding_dim[1])
            embedding.weight = nn.Parameter(MSRVID.TEXT_FIELD.vocab.vectors)
            return MSRVID, embedding, train_loader, test_loader, dev_loader
        elif dataset_name == 'trecqa':
            if not os.path.exists('../utils/trec_eval-9.0.5/trec_eval'):
                raise FileNotFoundError('TrecQA requires the trec_eval tool to run. Please run get_trec_eval.sh inside Castor/utils (as working directory) before continuing.')
            dataset_root = os.path.join(os.pardir, os.pardir, 'data', 'TrecQA/')
            train_loader, dev_loader, test_loader = TRECQA.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWordVecCache.unk)
            embedding_dim = TRECQA.TEXT_FIELD.vocab.vectors.size()
            embedding = nn.Embedding(embedding_dim[0], embedding_dim[1])
            embedding.weight = nn.Parameter(TRECQA.TEXT_FIELD.vocab.vectors)
            return TRECQA, embedding, train_loader, test_loader, dev_loader
        else:
            raise ValueError('{} is not a valid dataset.'.format(dataset_name))

@@ -1,117 +1,29 @@
from scipy.stats import pearsonr, spearmanr
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from mp_cnn.evaluators.sick_evaluator import SICKEvaluator
from mp_cnn.evaluators.msrvid_evaluator import MSRVIDEvaluator
from mp_cnn.evaluators.trecqa_evaluator import TRECQAEvaluator


class MPCNNEvaluatorFactory(object):
    """
    Get the corresponding Evaluator class for a particular dataset.
    """
    evaluator_map = {
        'sick': SICKEvaluator,
        'msrvid': MSRVIDEvaluator,
        'trecqa': TRECQAEvaluator
    }

    @staticmethod
    def get_evaluator(dataset_cls, model, data_loader, batch_size, device):
        if data_loader is None:
            return None

        if hasattr(dataset_cls, 'NAME') and dataset_cls.NAME == 'sick':
            return SICKEvaluator(dataset_cls, model, data_loader, batch_size, device)
        elif hasattr(dataset_cls, 'NAME') and dataset_cls.NAME == 'msrvid':
            return MSRVIDEvaluator(dataset_cls, model, data_loader, batch_size, device)
        else:
            raise ValueError('{} is not a valid dataset.'.format(dataset_cls))


class Evaluator(object):
    """
    Evaluates performance of model on a Dataset, using metrics specific to the Dataset.
    """

    def __init__(self, dataset_cls, model, data_loader, batch_size, device):
        self.dataset_cls = dataset_cls
        self.model = model
        self.data_loader = data_loader
        self.batch_size = batch_size
        self.device = device

    def get_scores(self):
        """
        Get the scores used to evaluate the model.
        Should return ([score1, score2, ..], [score1_name, score2_name, ...]).
        The first score is the primary score used to determine if the model has improved.
        """
        raise NotImplementedError('Evaluator subclass needs to implement get_scores')


class SICKEvaluator(Evaluator):

    def __init__(self, dataset_cls, model, data_loader, batch_size, device):
        super(SICKEvaluator, self).__init__(dataset_cls, model, data_loader, batch_size, device)

    def get_scores(self):
        self.model.eval()
        num_classes = self.dataset_cls.NUM_CLASSES
        predict_classes = torch.arange(1, num_classes + 1).expand(self.batch_size, num_classes)
        test_kl_div_loss = 0
        predictions = []
        true_labels = []

        for batch in self.data_loader:
            output = self.model(batch.a, batch.b, batch.ext_feats)
            test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]
            # handle last batch which might have smaller size
            if len(predict_classes) != len(batch.a):
                predict_classes = torch.arange(1, num_classes + 1).expand(len(batch.a), num_classes)

            if self.data_loader.device != -1:
                with torch.cuda.device(self.device):
                    predict_classes = predict_classes.cuda()

            true_labels.append((predict_classes * batch.label.data).sum(dim=1))
            predictions.append((predict_classes * output.data.exp()).sum(dim=1))

            del output

        predictions = torch.cat(predictions).cpu().numpy()
        true_labels = torch.cat(true_labels).cpu().numpy()
        test_kl_div_loss /= len(batch.dataset.examples)
        pearson_r = pearsonr(predictions, true_labels)[0]
        spearman_r = spearmanr(predictions, true_labels)[0]

        return [pearson_r, spearman_r, test_kl_div_loss], ['pearson_r', 'spearman_r', 'KL-divergence loss']


class MSRVIDEvaluator(Evaluator):

    def __init__(self, dataset_cls, model, data_loader, batch_size, device):
        super(MSRVIDEvaluator, self).__init__(dataset_cls, model, data_loader, batch_size, device)

    def get_scores(self):
        self.model.eval()
        num_classes = self.dataset_cls.NUM_CLASSES
        predict_classes = torch.arange(0, num_classes).expand(self.batch_size, num_classes)
        test_kl_div_loss = 0
        predictions = []
        true_labels = []

        for batch in self.data_loader:
            output = self.model(batch.a, batch.b, batch.ext_feats)
            test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]
            # handle last batch which might have smaller size
            if len(predict_classes) != len(batch.a):
                predict_classes = torch.arange(0, num_classes).expand(len(batch.a), num_classes)

            if self.data_loader.device != -1:
                with torch.cuda.device(self.device):
                    predict_classes = predict_classes.cuda()

            true_labels.append((predict_classes * batch.label.data).sum(dim=1))
            predictions.append((predict_classes * output.data.exp()).sum(dim=1))

            del output
-        predictions = torch.cat(predictions).cpu().numpy()
-        true_labels = torch.cat(true_labels).cpu().numpy()
-        test_kl_div_loss /= len(batch.dataset.examples)
-        pearson_r = pearsonr(predictions, true_labels)[0]
-
-        return [pearson_r, test_kl_div_loss], ['pearson_r', 'KL-divergence loss']
+        if not hasattr(dataset_cls, 'NAME'):
+            raise ValueError('Invalid dataset. Dataset should have NAME attribute.')
+
+        if dataset_cls.NAME not in MPCNNEvaluatorFactory.evaluator_map:
+            raise ValueError('{} is not implemented.'.format(dataset_cls))
+
+        return MPCNNEvaluatorFactory.evaluator_map[dataset_cls.NAME](
+            dataset_cls, model, data_loader, batch_size, device
+        )
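For example, a hypothetical use of the refactored factory (`model` and `test_loader` assumed to exist):

```
# look up the evaluator class by dataset NAME and construct it
evaluator = MPCNNEvaluatorFactory.get_evaluator(TRECQA, model, test_loader, 64, 0)
scores, score_names = evaluator.get_scores()  # primary score first, e.g. MAP for TrecQA
```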
@@ -0,0 +1,19 @@
class Evaluator(object):
    """
    Evaluates performance of model on a Dataset, using metrics specific to the Dataset.
    """

    def __init__(self, dataset_cls, model, data_loader, batch_size, device):
        self.dataset_cls = dataset_cls
        self.model = model
        self.data_loader = data_loader
        self.batch_size = batch_size
        self.device = device

    def get_scores(self):
        """
        Get the scores used to evaluate the model.
        Should return ([score1, score2, ..], [score1_name, score2_name, ...]).
        The first score is the primary score used to determine if the model has improved.
        """
        raise NotImplementedError('Evaluator subclass needs to implement get_scores')
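For illustration, a minimal (hypothetical) subclass satisfying this contract:

```
class ConstantEvaluator(Evaluator):

    def get_scores(self):
        # a real evaluator would compute its metric over self.data_loader
        accuracy = 0.0
        # the first score is the primary one used to decide whether the model improved
        return [accuracy], ['accuracy']
```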
@@ -0,0 +1,42 @@
from scipy.stats import pearsonr
import torch
import torch.nn.functional as F

from mp_cnn.evaluators.evaluator import Evaluator


class MSRVIDEvaluator(Evaluator):

    def __init__(self, dataset_cls, model, data_loader, batch_size, device):
        super(MSRVIDEvaluator, self).__init__(dataset_cls, model, data_loader, batch_size, device)

    def get_scores(self):
        self.model.eval()
        num_classes = self.dataset_cls.NUM_CLASSES
        predict_classes = torch.arange(0, num_classes).expand(self.batch_size, num_classes)
        test_kl_div_loss = 0
        predictions = []
        true_labels = []

        for batch in self.data_loader:
            output = self.model(batch.a, batch.b, batch.ext_feats)
            test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]
            # handle last batch which might have smaller size
            if len(predict_classes) != len(batch.a):
                predict_classes = torch.arange(0, num_classes).expand(len(batch.a), num_classes)

            if self.data_loader.device != -1:
                with torch.cuda.device(self.device):
                    predict_classes = predict_classes.cuda()

            true_labels.append((predict_classes * batch.label.data).sum(dim=1))
            predictions.append((predict_classes * output.data.exp()).sum(dim=1))

            del output

        predictions = torch.cat(predictions).cpu().numpy()
        true_labels = torch.cat(true_labels).cpu().numpy()
        test_kl_div_loss /= len(batch.dataset.examples)
        pearson_r = pearsonr(predictions, true_labels)[0]

        return [pearson_r, test_kl_div_loss], ['pearson_r', 'KL-divergence loss']