TrecQA for MP-CNN #77

Merged: 9 commits, Nov 4, 2017
@@ -6,5 +6,6 @@ __pycache__
*idfsim
*.swp
trec_eval.9.0/
trec_eval-9.0.5
*.pt
text/
@@ -0,0 +1,43 @@
from abc import ABCMeta, abstractmethod
import os

from torchtext.data.dataset import Dataset
from torchtext.data.example import Example
from torchtext.data.field import Field

from datasets.idf_utils import get_pairwise_word_to_doc_freq, get_pairwise_overlap_features


class CastorPairDataset(Dataset, metaclass=ABCMeta):

    # Child classes must define
    NAME = None
    NUM_CLASSES = None
    ID_FIELD = None
    TEXT_FIELD = None
    EXT_FEATS_FIELD = None
    LABEL_FIELD = None

    @abstractmethod
    def __init__(self, path):
        """
        Create a Castor dataset involving pairs of texts
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for pair_id, l1, l2, ext_feats, label in zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                example = Example.fromlist([pair_id, l1, l2, ext_feats, label], fields)
                examples.append(example)

        super(CastorPairDataset, self).__init__(examples, fields)
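
For orientation, here is a minimal sketch of how a concrete subclass is expected to plug into this base class. The field settings mirror the ones used by the real subclasses later in this diff, but the class itself is purely illustrative:

```
import torch
from torchtext.data.field import Field

from datasets.castor_dataset import CastorPairDataset


class ToyPairDataset(CastorPairDataset):
    # illustrative values only; see MSRVID, SICK, and TRECQA below for the real subclasses
    NAME = 'toy'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)  # data on disk is already tokenized
    EXT_FEATS_FIELD = Field(tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)

    def __init__(self, path):
        # the base class reads a.toks, b.toks, id.txt, and sim.txt from `path`
        super(ToyPairDataset, self).__init__(path)
```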
@@ -3,13 +3,13 @@

import numpy as np
import torch
from torchtext.data.dataset import Dataset
from torchtext.data.example import Example
from torchtext.data.field import Field
from torchtext.data.iterator import BucketIterator
from torchtext.data.pipeline import Pipeline
from torchtext.vocab import Vectors

from datasets.castor_dataset import CastorPairDataset
from datasets.idf_utils import get_pairwise_word_to_doc_freq, get_pairwise_overlap_features


@@ -28,7 +28,7 @@ def get_class_probs(sim, *args):
return class_probs


class MSRVID(Dataset):
class MSRVID(CastorPairDataset):
NAME = 'msrvid'
NUM_CLASSES = 6
ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
@@ -38,35 +38,13 @@ class MSRVID(Dataset):

@staticmethod
def sort_key(ex):
return len(ex.a)
return len(ex.sentence_1)

def __init__(self, path):
"""
Create a MSRVID dataset instance
"""
fields = [('id', self.ID_FIELD), ('a', self.TEXT_FIELD), ('b', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD)]

examples = []
f1 = open(os.path.join(path, 'a.txt'), 'r')
f2 = open(os.path.join(path, 'b.txt'), 'r')
id_file = open(os.path.join(path, 'id.txt'), 'r')
label_file = open(os.path.join(path, 'sim.txt'), 'r')

sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)

for pair_id, l1, l2, ext_feats, label in zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file):
pair_id = pair_id.rstrip('.\n')
label = label.rstrip('.\n')
example = Example.fromlist([pair_id, l1, l2, ext_feats, label], fields)
examples.append(example)

map(lambda f: f.close(), [f1, f2, label_file])

super(MSRVID, self).__init__(examples, fields)
super(MSRVID, self).__init__(path)

@classmethod
def splits(cls, path, train='train', test='test', **kwargs):
@@ -3,13 +3,13 @@

import numpy as np
import torch
from torchtext.data.dataset import Dataset
from torchtext.data.example import Example
from torchtext.data.field import Field
from torchtext.data.iterator import BucketIterator
from torchtext.data.pipeline import Pipeline
from torchtext.vocab import Vectors

from datasets.castor_dataset import CastorPairDataset
from datasets.idf_utils import get_pairwise_word_to_doc_freq, get_pairwise_overlap_features


@@ -28,7 +28,7 @@ def get_class_probs(sim, *args):
return class_probs


class SICK(Dataset):
class SICK(CastorPairDataset):
NAME = 'sick'
NUM_CLASSES = 5
ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
@@ -38,35 +38,13 @@ class SICK(Dataset):

@staticmethod
def sort_key(ex):
return len(ex.a)
return len(ex.sentence_1)

def __init__(self, path):
"""
Create a SICK dataset instance
"""
fields = [('id', self.ID_FIELD), ('a', self.TEXT_FIELD), ('b', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD)]

examples = []
f1 = open(os.path.join(path, 'a.txt'), 'r')
f2 = open(os.path.join(path, 'b.txt'), 'r')
id_file = open(os.path.join(path, 'id.txt'), 'r')
label_file = open(os.path.join(path, 'sim.txt'), 'r')

sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)

for pair_id, l1, l2, ext_feats, label in zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file):
pair_id = pair_id.rstrip('.\n')
label = label.rstrip('.\n')
example = Example.fromlist([pair_id, l1, l2, ext_feats, label], fields)
examples.append(example)

map(lambda f: f.close(), [f1, f2, label_file])

super(SICK, self).__init__(examples, fields)
super(SICK, self).__init__(path)

@classmethod
def splits(cls, path, train='train', validation='dev', test='test', **kwargs):
@@ -0,0 +1,54 @@
import os

import torch
from torchtext.data.example import Example
from torchtext.data.field import Field
from torchtext.data.iterator import BucketIterator
from torchtext.vocab import Vectors

from datasets.castor_dataset import CastorPairDataset
from datasets.idf_utils import get_pairwise_word_to_doc_freq, get_pairwise_overlap_features


class TRECQA(CastorPairDataset):
    NAME = 'trecqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)  # tokenizer is identity since we already tokenized it to compute external features

@rosequ (Member) commented on Nov 4, 2017:

The better way of doing this is:

TEXT_FIELD = Field(batch_first=True, sequential=False)

Refer to the torchtext docs.

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

You are right in that this is already tokenized, so no tokenization is required.

sequential: Whether the datatype represents sequential data. If False,
no tokenization is applied. Default: True.

When I tried to change this to sequential=False, I got a TypeError: unhashable type: 'list' on the last line of this block from the torchtext source:

        for data in sources:
            for x in data:
                if not self.sequential:
                    x = [x]
                counter.update(x)

As you can see, if the data is not declared sequential, torchtext wraps each item in another list. I already have a list, so this would create a list of lists, which is incorrect. This is sequential data (a list of tokens), so I think what I did originally makes sense; it's just that the sequence data is already tokenized.
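
A minimal standalone reproduction of that failure mode (the variable names are illustrative; only the behaviour of collections.Counter is being demonstrated):

```
from collections import Counter

counter = Counter()
tokens = ['what', 'is', 'the', 'capital']  # an already-tokenized sentence

# sequential=True (the default): the counter is updated with the tokens directly
counter.update(tokens)  # fine, counts each string token

# sequential=False: torchtext wraps the example in another list (x = [x]),
# so an already-tokenized example becomes a list of lists
counter.update([tokens])  # TypeError: unhashable type: 'list'
```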

@rosequ (Member) commented on Nov 4, 2017:

I'm sorry, you're right. This was not the case with the previous version of torchtext.

    EXT_FEATS_FIELD = Field(tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a TRECQA dataset instance
        """
        super(TRECQA, self).__init__(path)

    @classmethod
    def splits(cls, path, train='train-all', validation='raw-dev', test='raw-test', **kwargs):
        return super(TRECQA, cls).splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

@rosequ (Member) commented on Nov 4, 2017:

The vector cache file is generated from the original SM code (https://github.com/aseveryn/deep-qa), and this dependency can be completely discarded. Check this

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

MP-CNN is independent of the SM model, so I think it can generate its own cache. Doing it this way will also allow me to experiment with custom word vectors, e.g. GloVe vectors of other dimensions, word2vec trained on Google News, fastText, etc.

I'm not quite sure which dependency you are referring to. I think the current approach is very flexible and doesn't tie us to any particular variant of word vectors.

@rosequ (Member) commented on Nov 4, 2017:

I thought the vector cache is a dependency here, no? Or is it generated from one of your scripts?

@tuzhucheng (Author, Member) commented on Nov 4, 2017:

Nope, torchtext is awesome! It generates this automatically on the first run if it is not there :) Nothing needs to be downloaded; all we need is torchtext.

@rosequ (Member) commented on Nov 4, 2017:

oh perfect!
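
As an aside, here is a hedged sketch of what passing custom vectors to iters could look like, assuming the TRECQA.iters signature added in this pull request and the standard torchtext vector classes; the file and directory names are placeholders:

```
import torch
from torchtext.vocab import Vectors, GloVe

from datasets.trecqa import TRECQA

# Option 1: an arbitrary word-vector file; torchtext builds the cache on first use
custom_vectors = Vectors(name='wiki.en.vec', cache='.vector_cache',
                         unk_init=torch.Tensor.zero_)

# Option 2: a predefined torchtext vector class
# custom_vectors = GloVe(name='6B', dim=100)

train_loader, dev_loader, test_loader = TRECQA.iters(
    '../../data/TrecQA/', 'wiki.en.vec', '.vector_cache',
    batch_size=64, device=0, vectors=custom_vectors)
```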


        train, validation, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False, shuffle=shuffle, device=device)
@@ -20,6 +20,8 @@ Directory layout should be like this:
│ └── GloVe/
```

## SICK Dataset

To run MP-CNN on the SICK dataset, use the following command. `--dropout 0` mimics the setup in the original paper, although adding dropout can improve performance. If you have any problems running it, check the Troubleshooting section below.

```
@@ -29,7 +31,9 @@ python main.py mpcnn.sick.model.castor --dataset sick --epochs 19 --epsilon 1e-7
| Implementation and config | Pearson's r | Spearman's ρ |
| -------------------------------- |:-------------:|:-------------:|
| Paper | 0.8686 | 0.8047 |
| PyTorch using above config | 0.8763 | 0.8215 |
| PyTorch using above config | 0.8684 | 0.8083 |

## MSRVID Dataset

To run MP-CNN on the MSRVID dataset, use the following command:
```
@@ -39,7 +43,23 @@ python main.py mpcnn.msrvid.model.castor --dataset msrvid --batch-size 16 --epsi
| Implementation and config | Pearson's r |
| -------------------------------- |:-------------:|
| Paper | 0.9090 |
| PyTorch using above config | 0.9050 |
| PyTorch using above config | 0.8911 |

## TrecQA Dataset

To run MP-CNN on (Raw) TrecQA, you first need to run `./get_trec_eval.sh` from inside the `utils` directory under the repo root. This will download and compile the official `trec_eval` tool used for evaluation.

Then, you can run:
```
python main.py mpcnn.trecqa.model --dataset trecqa --epochs 5 --regularization 0.0005 --dropout 0.5 --eps 0.1
```

| Implementation and config | map | mrr |
| -------------------------------- |:------:|:------:|
| Paper | 0.762 | 0.830 |
| PyTorch using above config | 0.7904 | 0.8223 |

The paper results are reported in [Noise-Contrastive Estimation for Answer Selection with Deep Neural Networks](https://dl.acm.org/citation.cfm?id=2983872).

These are not the optimal hyperparameters, but they are decent. This README will be updated with better hyperparameters and results in the future.

@@ -1,30 +1,14 @@
from collections import defaultdict
from enum import Enum
import math
import os

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.utils.data as data

from datasets.sick import SICK
from datasets.msrvid import MSRVID
from datasets.trecqa import TRECQA

# logging setup
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


class UnknownWorcVecCache(object):
class UnknownWordVecCache(object):
"""
Caches the first randomly generated word vector for a certain size so that it is reused.
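
Only the class header and docstring appear in this hunk. A hypothetical sketch of the caching idea the docstring describes might look like the following; the initialization range and method body are assumptions, not the repository's actual implementation:

```
import torch

class UnknownWordVecCacheSketch(object):
    """Illustrative only: reuse one random vector per embedding size for OOV words."""
    cache = {}

    @classmethod
    def unk(cls, tensor):
        size_key = tuple(tensor.size())
        if size_key not in cls.cache:
            # assumption: a small uniform range is a common choice for unknown-word vectors
            cls.cache[size_key] = torch.Tensor(*size_key).uniform_(-0.25, 0.25)
        tensor.copy_(cls.cache[size_key])
        return tensor
```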
"""
@@ -47,19 +31,28 @@ class MPCNNDatasetFactory(object):
def get_dataset(dataset_name, word_vectors_dir, word_vectors_file, batch_size, device):
if dataset_name == 'sick':
dataset_root = os.path.join(os.pardir, os.pardir, 'data', 'sick/')
train_loader, dev_loader, test_loader = SICK.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWorcVecCache.unk)
train_loader, dev_loader, test_loader = SICK.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWordVecCache.unk)
embedding_dim = SICK.TEXT_FIELD.vocab.vectors.size()
embedding = nn.Embedding(embedding_dim[0], embedding_dim[1])
embedding.weight = nn.Parameter(SICK.TEXT_FIELD.vocab.vectors)
return SICK, embedding, train_loader, test_loader, dev_loader
elif dataset_name == 'msrvid':
dataset_root = os.path.join(os.pardir, os.pardir, 'data', 'msrvid/')
dev_loader = None
train_loader, test_loader = MSRVID.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWorcVecCache.unk)
train_loader, test_loader = MSRVID.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWordVecCache.unk)
embedding_dim = MSRVID.TEXT_FIELD.vocab.vectors.size()
embedding = nn.Embedding(embedding_dim[0], embedding_dim[1])
embedding.weight = nn.Parameter(MSRVID.TEXT_FIELD.vocab.vectors)
return MSRVID, embedding, train_loader, test_loader, dev_loader
elif dataset_name == 'trecqa':
if not os.path.exists('../utils/trec_eval-9.0.5/trec_eval'):
raise FileNotFoundError('TrecQA requires the trec_eval tool to run. Please run get_trec_eval.sh inside Castor/utils (as working directory) before continuing.')
dataset_root = os.path.join(os.pardir, os.pardir, 'data', 'TrecQA/')
train_loader, dev_loader, test_loader = TRECQA.iters(dataset_root, word_vectors_file, word_vectors_dir, batch_size, device=device, unk_init=UnknownWordVecCache.unk)
embedding_dim = TRECQA.TEXT_FIELD.vocab.vectors.size()
embedding = nn.Embedding(embedding_dim[0], embedding_dim[1])
embedding.weight = nn.Parameter(TRECQA.TEXT_FIELD.vocab.vectors)
return TRECQA, embedding, train_loader, test_loader, dev_loader
else:
raise ValueError('{} is not a valid dataset.'.format(dataset_name))
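
A hedged usage sketch for the factory above; the directory and word-vector file names are placeholders, and only the signature shown in this hunk is assumed:

```
# Illustrative call only; the paths and vectors file name are assumptions.
dataset_cls, embedding, train_loader, test_loader, dev_loader = MPCNNDatasetFactory.get_dataset(
    'trecqa',
    word_vectors_dir='../../data/GloVe',
    word_vectors_file='glove.840B.300d.txt',
    batch_size=64,
    device=0)
```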
