
PyTorch 1.3 + TorchText 0.4 fixes

sidharthms committed Dec 30, 2019
1 parent 328fe3c commit 378374138c218abacff05ac43be0169c3c0cd8dc
@@ -1,26 +1,19 @@
 matrix:
   include:
-  # - os: linux
-  #   python: 2.7
-  #   env: PYTHON_VERSION=2.7
-  - os: linux
-    python: 3.5
-    env: PYTHON_VERSION=3.5
   - os: linux
     python: 3.6
     env: PYTHON_VERSION=3.6
-  # - os: osx
-  #   language: generic
-  #   env:
-  #     - PYTHON_VERSION=2.7
+  - os: linux
+    python: 3.7
+    env: PYTHON_VERSION=3.7
   - os: osx
     language: generic
     env:
-      - PYTHON_VERSION=3.5
+      - PYTHON_VERSION=3.6
   - os: osx
     language: generic
     env:
-      - PYTHON_VERSION=3.6
+      - PYTHON_VERSION=3.7
 
 notifications:
   email: false
@@ -39,7 +32,7 @@ before_install:
 
 install:
 - conda install --yes python=$PYTHON_VERSION pip scikit-learn nose
-- pip install --process-dependency-links git+https://github.com/anhaidgroup/deepmatcher | cat
+- pip install -e . | cat
 - python -m nltk.downloader perluniprops nonbreaking_prefixes punkt
 
 script:
@@ -48,7 +48,7 @@ def __new__(cls, *args, **kwargs):
         if 'word_probs' in train_info.metadata:
             raw_word_probs = train_info.metadata['word_probs'][name]
             word_probs = torch.Tensor(
-                [[raw_word_probs[w] for w in b] for b in data.data])
+                [[raw_word_probs[int(w)] for w in b] for b in data.data])
             if data.is_cuda:
                 word_probs = word_probs.cuda()
         pc = None
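A note on the int(w) cast: from PyTorch 0.4 onward, iterating over a tensor yields 0-dim tensors rather than Python ints, and tensors hash by identity, so they never match the int keys of the word-probability dict. A minimal sketch of the failure mode (the dict contents here are made up for illustration):

import torch

raw_word_probs = {0: 0.5, 1: 0.25}   # hypothetical probabilities keyed by int
batch = torch.tensor([[0, 1]])

w = batch[0][0]                      # a 0-dim tensor in PyTorch >= 0.4, not an int
# raw_word_probs[w] raises KeyError: a tensor hashes by identity, not by value
print(raw_word_probs[int(w)])        # 0.5 -- int() recovers a usable key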
@@ -225,15 +225,15 @@ def compute_metadata(self, pca=False):
 
         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, device='cpu', sort_in_buckets=False)
         counter = defaultdict(Counter)
 
         # For each attribute, find the number of times each word id occurs in the dataset.
         # Note that word ids here also include ``UNK`` tokens, padding tokens, etc.
         for batch in pyprind.prog_bar(train_iter, title='\nBuilding vocabulary'):
             for name in self.all_text_fields:
                 attr_input = getattr(batch, name)
-                counter[name].update(attr_input.data.data.view(-1))
+                counter[name].update(attr_input.data.data.view(-1).tolist())
 
         word_probs = {}
         totals = {}
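Two migrations meet in this hunk. First, newer torchtext iterators take a torch.device or the string 'cpu' instead of the old device=-1 integer convention (the same change recurs in the next hunk and in initialize() further down). Second, the .tolist() call matters for the same hashing reason as int(w) above: feeding a tensor straight into Counter.update counts 0-dim tensor elements by identity, so equal word ids never merge. A small sketch:

import torch
from collections import Counter

ids = torch.tensor([2, 2, 5])

c = Counter()
c.update(ids.view(-1))            # elements are 0-dim tensors that hash by
print(len(c))                     # identity: 3 keys, equal ids never merge

c = Counter()
c.update(ids.view(-1).tolist())   # plain ints merge as expected
print(c)                          # Counter({2: 2, 5: 1})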
@@ -270,7 +270,7 @@ def compute_metadata(self, pca=False):
 
         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, device='cpu', sort_in_buckets=False)
         attr_embeddings = defaultdict(list)
 
         # Run the constructed neural network to compute weighted sequence embeddings
@@ -10,6 +10,7 @@
 import torch
 from torchtext import data, vocab
 from torchtext.utils import download_from_url
+from urllib.request import urlretrieve
 
 logger = logging.getLogger(__name__)
 
@@ -23,6 +24,7 @@ def __init__(self,
         url = url_base + suffix
         base, ext = os.path.splitext(suffix)
         name = suffix if ext == '.vec' else base
+        print('Fasttext url b', url_base)
         super(FastText, self).__init__(name, url=url, **kwargs)
 

@@ -60,7 +62,10 @@ def cache(self, name, cache, url=None):
         if not os.path.exists(cache):
             os.makedirs(cache)
         if not os.path.isfile(self.destination):
-            download_from_url(url, self.destination)
+            if 'drive.google.com' in url:
+                download_from_url(url, self.destination)
+            else:
+                urlretrieve(url, self.destination)
         logger.info('Extracting vectors into {}'.format(cache))
         ext = os.path.splitext(self.destination)[1][1:]
         if ext == 'zip':
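The dispatch above appears to work around torchtext's download_from_url at this version being geared toward Google Drive links (it handles the confirmation-token redirect); ordinary HTTP(S) URLs go through the standard library instead. A standalone sketch of the same pattern, with a hypothetical fetch helper:

from urllib.request import urlretrieve
from torchtext.utils import download_from_url

def fetch(url, destination):
    # Route Google Drive links through torchtext, which knows how to follow
    # the Drive confirmation redirect; fetch everything else directly.
    if 'drive.google.com' in url:
        download_from_url(url, destination)
    else:
        urlretrieve(url, destination)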
@@ -351,7 +351,7 @@ def initialize(self, train_dataset, init_batch=None):
             train_dataset,
             train=False,
             batch_size=4,
-            device=-1,
+            device='cpu',
             sort_in_buckets=False)
         init_batch = next(run_iter.__iter__())
         self.forward(init_batch)
@@ -721,13 +721,13 @@ def _forward(self, input_with_meta):
             if input_with_meta.lengths is not None:
                 mask = _utils.sequence_mask(input_with_meta.lengths)
                 mask = mask.unsqueeze(2)  # Make it broadcastable.
-                input.data.masked_fill_(1 - mask, -float('inf'))
+                input.data.masked_fill_(~mask, -float('inf'))
             output = input.max(dim=1)[0]
         else:
             if input_with_meta.lengths is not None:
                 mask = _utils.sequence_mask(input_with_meta.lengths)
                 mask = mask.unsqueeze(2)  # Make it broadcastable.
-                input.data.masked_fill_(1 - mask, 0)
+                input.data.masked_fill_(~mask, 0)
 
             lengths = Variable(input_with_meta.lengths.clamp(min=1).unsqueeze(1).float())
             if self.style == 'avg':
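The ~mask changes (here and in the three attention hunks that follow) are forced by PyTorch 1.2+, where masks produced by comparison ops are torch.bool and the old 1 - mask idiom raises "Subtraction, the `-` operator, with a bool tensor is not supported". Bitwise negation is the supported inversion. A sketch, building a mask roughly the way _utils.sequence_mask would:

import torch

lengths = torch.tensor([2])
mask = torch.arange(3).unsqueeze(0) < lengths.unsqueeze(1)  # tensor([[True, True, False]])

scores = torch.zeros(1, 3)
# scores.masked_fill_(1 - mask, ...) fails on PyTorch >= 1.2: bool tensors
# do not support the `-` operator.
scores.masked_fill_(~mask, -float('inf'))
print(scores)   # tensor([[0., 0., -inf]])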
@@ -860,7 +860,7 @@ def _forward(self, transformed, raw):
             res *= math.sqrt(0.5)
             return res
         elif self.style == 'highway':
-            transform_gate = F.sigmoid(self.highway_gate(raw) + self.highway_bias)
+            transform_gate = torch.sigmoid(self.highway_gate(raw) + self.highway_bias)
             carry_gate = 1 - transform_gate
             return transform_gate * transformed + carry_gate * adjusted_raw

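torch.nn.functional.sigmoid was deprecated around PyTorch 0.4.1 in favor of torch.sigmoid; the two compute the same thing, so this is a warning fix rather than a behavior change. A toy version of the highway gating above (shapes and inputs are made up for illustration):

import torch

raw = torch.randn(2, 4)
transformed = torch.randn(2, 4)
gate = torch.sigmoid(torch.randn(2, 4))      # same math as the removed F.sigmoid
out = gate * transformed + (1 - gate) * raw  # highway-style mix; 1 - gate is
                                             # fine here since gate is float, not bool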
@@ -137,7 +137,7 @@ def _forward(self, input_with_meta, context_with_meta):
 
         if input_with_meta.lengths is not None:
             mask = _utils.sequence_mask(input_with_meta.lengths)
-            alignment_scores.data.masked_fill_(1 - mask, -float('inf'))
+            alignment_scores.data.masked_fill_(~mask, -float('inf'))
 
         # Make values along dim 2 sum to 1.
         normalized_scores = self.softmax(alignment_scores)
@@ -172,7 +172,7 @@ def _forward(self,
         if context_with_meta.lengths is not None:
             mask = _utils.sequence_mask(context_with_meta.lengths)
             mask = mask.unsqueeze(1)  # Make it broadcastable.
-            alignment_scores.data.masked_fill_(1 - mask, -float('inf'))
+            alignment_scores.data.masked_fill_(~mask, -float('inf'))
 
         # Make values along dim 2 sum to 1.
         normalized_scores = self.softmax(alignment_scores)
@@ -156,7 +156,7 @@ def _forward(self, input_with_meta):
         if input_with_meta.lengths is not None:
             mask = _utils.sequence_mask(input_with_meta.lengths)
             mask = mask.unsqueeze(1)  # Make it broadcastable.
-            alignment_scores.data.masked_fill_(1 - mask, -float('inf'))
+            alignment_scores.data.masked_fill_(~mask, -float('inf'))
 
         normalized_scores = self.softmax(alignment_scores)
 
@@ -4,7 +4,7 @@
 import torch.nn as nn
 import torch.optim as optim
 from torch.autograd import Variable
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_
 
 logger = logging.getLogger('deepmatcher.optim')
 
@@ -143,7 +143,7 @@ def step(self):
         self._step += 1
 
         if self.max_grad_norm:
-            clip_grad_norm(self.params, self.max_grad_norm)
+            clip_grad_norm_(self.params, self.max_grad_norm)
         self.base_optimizer.step()
 
     def update_learning_rate(self, acc, epoch):
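clip_grad_norm was renamed to clip_grad_norm_ in PyTorch 1.0; the trailing underscore follows PyTorch's convention for functions that modify their arguments in place (here, each parameter's .grad). A self-contained sketch of the call in a typical training step:

import torch
from torch.nn.utils import clip_grad_norm_

layer = torch.nn.Linear(4, 2)
loss = layer(torch.randn(8, 4)).pow(2).sum()
loss.backward()
# Rescales gradients in place so their combined norm is at most max_norm,
# and returns the pre-clipping total norm.
total_norm = clip_grad_norm_(layer.parameters(), max_norm=1.0)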
@@ -37,6 +37,6 @@ def find_version(*file_paths):
     packages=['deepmatcher', 'deepmatcher.data', 'deepmatcher.models'],
     python_requires='>=3.5',
     install_requires=[
-        'torch==0.3.1', 'tqdm', 'pyprind', 'six', 'Cython', 'torchtext', 'nltk>=3.2.5',
+        'torch>=1.0', 'tqdm', 'pyprind', 'six', 'Cython', 'torchtext', 'nltk>=3.2.5',
         'fasttextmirror', 'pandas'
     ])
@@ -12,13 +12,8 @@
 from deepmatcher.data.field import FastText, MatchingField
 from deepmatcher.data.process import _make_fields, process
 from torchtext.utils import unicode_csv_reader
 
-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
-
+from urllib.parse import urljoin
+from urllib.request import pathname2url
 
 # import nltk
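This try/except shim, repeated in the four test modules below, only existed for Python 2; with python_requires='>=3.5' in setup.py the Python 3 imports can be used directly. (The Python 2 fallback imported path2pathname2url, which looks like a typo for urllib's pathname2url, so it likely never worked anyway.) In tests like these, the pair is presumably used to turn local fixture paths into file:// URLs, along these lines (the path is hypothetical):

from urllib.parse import urljoin
from urllib.request import pathname2url

# Turn a local fixture path into a file:// URL (POSIX-style path shown).
url = urljoin('file:', pathname2url('/tmp/sample.csv'))
print(url)   # file:///tmp/sample.csv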
@@ -11,13 +11,8 @@
 from deepmatcher.data.field import (FastText, FastTextBinary, MatchingField,
                                     MatchingVocab, reset_vector_cache)
 from torchtext.vocab import Vectors
 
-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
-
+from urllib.parse import urljoin
+from urllib.request import pathname2url
 
 # import nltk
 # nltk.download('perluniprops')
@@ -226,10 +221,10 @@ def test_extend_vectors_1(self):
         v.unk_init = torch.Tensor.zero_
         tokens = {'hello', 'world'}
         v.extend_vectors(tokens, vec_data)
-        self.assertEqual(len(v.itos), 3)
-        self.assertEqual(v.vectors.size(), torch.Size([3, 300]))
-        self.assertEqual(list(v.vectors[1][0:10]), [0.0] * 10)
+        self.assertEqual(len(v.itos), 4)
+        self.assertEqual(v.vectors.size(), torch.Size([4, 300]))
+        self.assertEqual(list(v.vectors[2][0:10]), [0.0] * 10)
+        self.assertEqual(list(v.vectors[3][0:10]), [0.0] * 10)
 
         if os.path.exists(vectors_cache_dir):
             shutil.rmtree(vectors_cache_dir)
@@ -11,13 +11,8 @@
 from deepmatcher.data.field import MatchingField, FastText
 from deepmatcher.data.process import process, process_unlabeled
 from deepmatcher import MatchingModel
 
-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
-
+from urllib.parse import urljoin
+from urllib.request import pathname2url
 
 from test import test_dir_path
@@ -9,13 +9,8 @@
 from deepmatcher.data.iterator import MatchingIterator
 
 from test import test_dir_path
 
-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
-
+from urllib.parse import urljoin
+from urllib.request import pathname2url
 
 class ClassMatchingIteratorTestCases(unittest.TestCase):
     def test_splits_1(self):
@@ -10,13 +10,8 @@
 from deepmatcher.data.process import _check_header, _make_fields, process, process_unlabeled
 from torchtext.utils import unicode_csv_reader
 from deepmatcher import MatchingModel
 
-try:
-    from urllib.parse import urljoin
-    from urllib.request import pathname2url
-except ImportError:
-    from urlparse import urljoin
-    from urllib import path2pathname2url
-
+from urllib.parse import urljoin
+from urllib.request import pathname2url
 
 
 class CheckHeaderTestCases(unittest.TestCase):
