PyTorch 1.3 + TorchText 0.4 fixes
sidharthms committed Dec 30, 2019
1 parent 328fe3c commit 3783741
Showing 16 changed files with 39 additions and 66 deletions.
19 changes: 6 additions & 13 deletions .travis.yml
@@ -1,26 +1,19 @@
 matrix:
   include:
-    # - os: linux
-    #   python: 2.7
-    #   env: PYTHON_VERSION=2.7
-    - os: linux
-      python: 3.5
-      env: PYTHON_VERSION=3.5
     - os: linux
       python: 3.6
       env: PYTHON_VERSION=3.6
-    # - os: osx
-    #   language: generic
-    #   env:
-    #     - PYTHON_VERSION=2.7
+    - os: linux
+      python: 3.7
+      env: PYTHON_VERSION=3.7
     - os: osx
       language: generic
       env:
-        - PYTHON_VERSION=3.5
+        - PYTHON_VERSION=3.6
     - os: osx
       language: generic
       env:
-        - PYTHON_VERSION=3.6
+        - PYTHON_VERSION=3.7

 notifications:
   email: false
@@ -39,7 +32,7 @@ before_install:

 install:
   - conda install --yes python=$PYTHON_VERSION pip scikit-learn nose
-  - pip install --process-dependency-links git+https://github.com/anhaidgroup/deepmatcher | cat
+  - pip install -e . | cat
   - python -m nltk.downloader perluniprops nonbreaking_prefixes punkt

 script:
2 changes: 1 addition & 1 deletion deepmatcher/batch.py
@@ -48,7 +48,7 @@ def __new__(cls, *args, **kwargs):
             if 'word_probs' in train_info.metadata:
                 raw_word_probs = train_info.metadata['word_probs'][name]
                 word_probs = torch.Tensor(
-                    [[raw_word_probs[w] for w in b] for b in data.data])
+                    [[raw_word_probs[int(w)] for w in b] for b in data.data])
                 if data.is_cuda:
                     word_probs = word_probs.cuda()
                 pc = None
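Note: the int(w) cast is needed because, in PyTorch 1.x, iterating over a tensor yields 0-dim tensors (which hash by object identity) rather than plain Python numbers, so they cannot index the word-probability dict directly. A minimal sketch; the probs dict and ids tensor are invented for illustration:

import torch

probs = {0: 0.5, 1: 0.25, 2: 0.25}    # hypothetical word-id -> probability map
ids = torch.tensor([[0, 1], [2, 0]])  # hypothetical batch of word ids

# Iterating a tensor yields 0-dim tensors, so cast each element to int
# before using it as a dict key.
looked_up = [[probs[int(w)] for w in row] for row in ids]
print(looked_up)  # [[0.5, 0.25], [0.25, 0.5]]
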
6 changes: 3 additions & 3 deletions deepmatcher/data/dataset.py
@@ -225,15 +225,15 @@ def compute_metadata(self, pca=False):

         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, device='cpu', sort_in_buckets=False)
         counter = defaultdict(Counter)

         # For each attribute, find the number of times each word id occurs in the dataset.
         # Note that word ids here also include ``UNK`` tokens, padding tokens, etc.
         for batch in pyprind.prog_bar(train_iter, title='\nBuilding vocabulary'):
             for name in self.all_text_fields:
                 attr_input = getattr(batch, name)
-                counter[name].update(attr_input.data.data.view(-1))
+                counter[name].update(attr_input.data.data.view(-1).tolist())

         word_probs = {}
         totals = {}
@@ -270,7 +270,7 @@ def compute_metadata(self, pca=False):

         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, device='cpu', sort_in_buckets=False)
         attr_embeddings = defaultdict(list)

         # Run the constructed neural network to compute weighted sequence embeddings
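Note: the .tolist() addition matters because iterating a tensor in PyTorch 1.x yields 0-dim tensors that hash by object identity, so updating a Counter straight from the tensor would record every element as a distinct key. A minimal sketch; the ids tensor is invented for illustration:

import torch
from collections import Counter

ids = torch.tensor([1, 2, 2, 3]).view(-1)  # hypothetical flattened word ids

without_tolist = Counter()
without_tolist.update(ids)            # keys are 0-dim tensor objects
with_tolist = Counter()
with_tolist.update(ids.tolist())      # keys are plain ints

print(len(without_tolist))  # 4 -- each element counted as a separate key
print(with_tolist)          # Counter({2: 2, 1: 1, 3: 1})
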
7 changes: 6 additions & 1 deletion deepmatcher/data/field.py
@@ -10,6 +10,7 @@
 import torch
 from torchtext import data, vocab
 from torchtext.utils import download_from_url
+from urllib.request import urlretrieve

 logger = logging.getLogger(__name__)

@@ -23,6 +24,7 @@ def __init__(self,
         url = url_base + suffix
         base, ext = os.path.splitext(suffix)
         name = suffix if ext == '.vec' else base
+        print('Fasttext url b', url_base)
         super(FastText, self).__init__(name, url=url, **kwargs)


@@ -60,7 +62,10 @@ def cache(self, name, cache, url=None):
         if not os.path.exists(cache):
             os.makedirs(cache)
         if not os.path.isfile(self.destination):
-            download_from_url(url, self.destination)
+            if 'drive.google.com' in url:
+                download_from_url(url, self.destination)
+            else:
+                urlretrieve(url, self.destination)
         logger.info('Extracting vectors into {}'.format(cache))
         ext = os.path.splitext(self.destination)[1][1:]
         if ext == 'zip':
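Note: the new branch appears to exist because torchtext's download_from_url of that era was geared toward Google Drive links (it negotiates Drive's confirmation step), so ordinary HTTP(S) vector URLs go through urllib's urlretrieve instead. A standalone sketch of the same routing; fetch_vectors is an invented helper, not part of deepmatcher or torchtext:

from urllib.request import urlretrieve
from torchtext.utils import download_from_url

def fetch_vectors(url, destination):
    # Google Drive links need torchtext's downloader, which knows how to
    # follow Drive's confirmation flow; anything else is a plain download.
    if 'drive.google.com' in url:
        download_from_url(url, destination)
    else:
        urlretrieve(url, destination)
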
2 changes: 1 addition & 1 deletion deepmatcher/models/core.py
@@ -351,7 +351,7 @@ def initialize(self, train_dataset, init_batch=None):
                 train_dataset,
                 train=False,
                 batch_size=4,
-                device=-1,
+                device='cpu',
                 sort_in_buckets=False)
             init_batch = next(run_iter.__iter__())
             self.forward(init_batch)
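Note: the device=-1 to device='cpu' substitutions here and in dataset.py track torchtext's move from integer device flags to torch.device objects (or device strings). A hedged sketch of the equivalence; to_new_device_arg is an invented helper:

import torch

def to_new_device_arg(legacy):
    # Legacy torchtext used -1 for "run on CPU" and a non-negative integer
    # as a CUDA device index; newer releases take the device directly.
    if legacy == -1:
        return torch.device('cpu')
    return torch.device('cuda', legacy)

print(to_new_device_arg(-1))  # cpu
print(to_new_device_arg(0))   # cuda:0
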
6 changes: 3 additions & 3 deletions deepmatcher/models/modules.py
@@ -721,13 +721,13 @@ def _forward(self, input_with_meta):
             if input_with_meta.lengths is not None:
                 mask = _utils.sequence_mask(input_with_meta.lengths)
                 mask = mask.unsqueeze(2)  # Make it broadcastable.
-                input.data.masked_fill_(1 - mask, -float('inf'))
+                input.data.masked_fill_(~mask, -float('inf'))
             output = input.max(dim=1)[0]
         else:
             if input_with_meta.lengths is not None:
                 mask = _utils.sequence_mask(input_with_meta.lengths)
                 mask = mask.unsqueeze(2)  # Make it broadcastable.
-                input.data.masked_fill_(1 - mask, 0)
+                input.data.masked_fill_(~mask, 0)

             lengths = Variable(input_with_meta.lengths.clamp(min=1).unsqueeze(1).float())
             if self.style == 'avg':
@@ -860,7 +860,7 @@ def _forward(self, transformed, raw):
             res *= math.sqrt(0.5)
             return res
         elif self.style == 'highway':
-            transform_gate = F.sigmoid(self.highway_gate(raw) + self.highway_bias)
+            transform_gate = torch.sigmoid(self.highway_gate(raw) + self.highway_bias)
             carry_gate = 1 - transform_gate
             return transform_gate * transformed + carry_gate * adjusted_raw
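Note: the 1 - mask to ~mask changes in this file and in the word_aggregators, word_comparators, and word_contextualizers diffs below share one cause: since PyTorch 1.2, sequence masks are bool tensors, and subtraction is not defined on them, so padding positions are selected with logical NOT instead (F.sigmoid is likewise deprecated in favor of torch.sigmoid). A minimal sketch, with the mask built inline rather than via _utils.sequence_mask:

import torch

lengths = torch.tensor([3, 1])
# True marks real tokens, False marks padding positions.
mask = torch.arange(4).unsqueeze(0) < lengths.unsqueeze(1)  # bool, shape (2, 4)

scores = torch.zeros(2, 4)
# `1 - mask` raises an error on bool tensors in PyTorch >= 1.2;
# `~mask` flips the mask so padded positions get -inf before a max/softmax.
scores.masked_fill_(~mask, -float('inf'))
print(scores)
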
2 changes: 1 addition & 1 deletion deepmatcher/models/word_aggregators.py
@@ -137,7 +137,7 @@ def _forward(self, input_with_meta, context_with_meta):

         if input_with_meta.lengths is not None:
             mask = _utils.sequence_mask(input_with_meta.lengths)
-            alignment_scores.data.masked_fill_(1 - mask, -float('inf'))
+            alignment_scores.data.masked_fill_(~mask, -float('inf'))

         # Make values along dim 2 sum to 1.
         normalized_scores = self.softmax(alignment_scores)
2 changes: 1 addition & 1 deletion deepmatcher/models/word_comparators.py
@@ -172,7 +172,7 @@ def _forward(self,
         if context_with_meta.lengths is not None:
             mask = _utils.sequence_mask(context_with_meta.lengths)
             mask = mask.unsqueeze(1)  # Make it broadcastable.
-            alignment_scores.data.masked_fill_(1 - mask, -float('inf'))
+            alignment_scores.data.masked_fill_(~mask, -float('inf'))

         # Make values along dim 2 sum to 1.
         normalized_scores = self.softmax(alignment_scores)
2 changes: 1 addition & 1 deletion deepmatcher/models/word_contextualizers.py
@@ -156,7 +156,7 @@ def _forward(self, input_with_meta):
         if input_with_meta.lengths is not None:
             mask = _utils.sequence_mask(input_with_meta.lengths)
             mask = mask.unsqueeze(1)  # Make it broadcastable.
-            alignment_scores.data.masked_fill_(1 - mask, -float('inf'))
+            alignment_scores.data.masked_fill_(~mask, -float('inf'))

         normalized_scores = self.softmax(alignment_scores)

4 changes: 2 additions & 2 deletions deepmatcher/optim.py
@@ -4,7 +4,7 @@
 import torch.nn as nn
 import torch.optim as optim
 from torch.autograd import Variable
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_

 logger = logging.getLogger('deepmatcher.optim')

@@ -143,7 +143,7 @@ def step(self):
         self._step += 1

         if self.max_grad_norm:
-            clip_grad_norm(self.params, self.max_grad_norm)
+            clip_grad_norm_(self.params, self.max_grad_norm)
         self.base_optimizer.step()

     def update_learning_rate(self, acc, epoch):
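Note: clip_grad_norm was renamed clip_grad_norm_ around PyTorch 0.4 (the trailing underscore marks the in-place convention) and the old spelling was eventually removed, hence the import and call changes above. A small self-contained check of what the call does; the parameter and gradient values are made up:

import torch
from torch.nn.utils import clip_grad_norm_

w = torch.nn.Parameter(torch.ones(3))
w.grad = torch.tensor([3.0, 4.0, 0.0])  # gradient with L2 norm 5
clip_grad_norm_([w], max_norm=1.0)      # rescales w.grad in place to norm <= 1
print(w.grad)                           # tensor([0.6000, 0.8000, 0.0000])
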
2 changes: 1 addition & 1 deletion setup.py
@@ -37,6 +37,6 @@ def find_version(*file_paths):
     packages=['deepmatcher', 'deepmatcher.data', 'deepmatcher.models'],
     python_requires='>=3.5',
     install_requires=[
-        'torch==0.3.1', 'tqdm', 'pyprind', 'six', 'Cython', 'torchtext', 'nltk>=3.2.5',
+        'torch>=1.0', 'tqdm', 'pyprind', 'six', 'Cython', 'torchtext', 'nltk>=3.2.5',
         'fasttextmirror', 'pandas'
     ])
9 changes: 2 additions & 7 deletions test/test_dataset.py
@@ -12,13 +12,8 @@
 from deepmatcher.data.field import FastText, MatchingField
 from deepmatcher.data.process import _make_fields, process
 from torchtext.utils import unicode_csv_reader

-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
+from urllib.parse import urljoin
+from urllib.request import pathname2url


 # import nltk
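Note: the same cleanup recurs in test_field.py, test_integration.py, test_iterator.py, and test_process.py below. With Python 2 support dropped, the try/except fallback to urlparse/urllib is unnecessary and the Python 3 imports are used directly; these helpers are the standard way to turn a local path into a file: URL. A minimal sketch with a made-up path:

from urllib.parse import urljoin
from urllib.request import pathname2url

local_vectors = '/tmp/wiki.en.vec'  # hypothetical local vector file
url = urljoin('file:', pathname2url(local_vectors))
print(url)  # a file: URL pointing at the local path
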
15 changes: 5 additions & 10 deletions test/test_field.py
@@ -11,13 +11,8 @@
 from deepmatcher.data.field import (FastText, FastTextBinary, MatchingField,
                                     MatchingVocab, reset_vector_cache)
 from torchtext.vocab import Vectors

-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
+from urllib.parse import urljoin
+from urllib.request import pathname2url


 # import nltk
 # nltk.download('perluniprops')
@@ -226,10 +221,10 @@ def test_extend_vectors_1(self):
         v.unk_init = torch.Tensor.zero_
         tokens = {'hello', 'world'}
         v.extend_vectors(tokens, vec_data)
-        self.assertEqual(len(v.itos), 3)
-        self.assertEqual(v.vectors.size(), torch.Size([3, 300]))
-        self.assertEqual(list(v.vectors[1][0:10]), [0.0] * 10)
+        self.assertEqual(len(v.itos), 4)
+        self.assertEqual(v.vectors.size(), torch.Size([4, 300]))
         self.assertEqual(list(v.vectors[2][0:10]), [0.0] * 10)
+        self.assertEqual(list(v.vectors[3][0:10]), [0.0] * 10)

         if os.path.exists(vectors_cache_dir):
             shutil.rmtree(vectors_cache_dir)
9 changes: 2 additions & 7 deletions test/test_integration.py
@@ -11,13 +11,8 @@
 from deepmatcher.data.field import MatchingField, FastText
 from deepmatcher.data.process import process, process_unlabeled
 from deepmatcher import MatchingModel

-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
+from urllib.parse import urljoin
+from urllib.request import pathname2url

 from test import test_dir_path
9 changes: 2 additions & 7 deletions test/test_iterator.py
@@ -9,13 +9,8 @@
 from deepmatcher.data.iterator import MatchingIterator

 from test import test_dir_path

-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
+from urllib.parse import urljoin
+from urllib.request import pathname2url


 class ClassMatchingIteratorTestCases(unittest.TestCase):
     def test_splits_1(self):
9 changes: 2 additions & 7 deletions test/test_process.py
@@ -10,13 +10,8 @@
 from deepmatcher.data.process import _check_header, _make_fields, process, process_unlabeled
 from torchtext.utils import unicode_csv_reader
 from deepmatcher import MatchingModel

-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
+from urllib.parse import urljoin
+from urllib.request import pathname2url


 class CheckHeaderTestCases(unittest.TestCase):
