Skip to content

Commit

Permalink
Add support to load from GloVe models.
Browse files Browse the repository at this point in the history
  • Loading branch information
alantian committed Mar 20, 2016
1 parent 06f664a commit a5b4c90
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/Embeddings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The Embedding class can read word embeddings from different sources:

- Gensim word2vec objects: (``from_gensim`` method)
- Word2vec binary/text models: (``from_word2vec`` method)
- GloVe models: (``from_glove`` method)
- polyglot pickle files: (``load`` method)

.. code:: python
Expand Down
40 changes: 40 additions & 0 deletions polyglot/mapping/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,46 @@ def from_word2vec(fname, fvocab=None, binary=False):

return Embedding(vocabulary=vocabulary, vectors=vectors)

@staticmethod
def _from_glove(fname):
    """Parse a GloVe text file into a word list and a vector matrix.

    Each line is expected to hold a word followed by its weights, separated
    by whitespace. GloVe files have no header, so the vector size is deduced
    from the first non-empty line. Lines that cannot be decoded, are empty,
    or have a different number of columns are skipped with a warning.

    Args:
      fname: file name handed to ``_open`` (opened in binary mode).

    Returns:
      A ``(words, vectors)`` tuple. ``words`` is the list of kept words in
      file order; ``vectors`` is a float32 numpy array built from the
      matching weight rows.
    """
    words = []
    vectors = []
    layer1_size = None
    with _open(fname, 'rb') as fin:
        for line_no, line in enumerate(fin):
            try:
                parts = unicode(line, encoding="utf-8").strip().split()
            except TypeError:
                # The line is already text, so it needs no decoding.
                parts = line.strip().split()
            except Exception as e:
                logger.warning("We ignored line number {} because of erros in parsing"
                               "\n{}".format(line_no, e))
                continue
            # A blank line carries no word. It must be skipped before the
            # size deduction below, otherwise a leading blank line would set
            # layer1_size to -1 and make parts[0] raise IndexError.
            if not parts:
                logger.warning("We ignored line number {} because it is "
                               "empty".format(line_no))
                continue
            # We deduce layer1_size because GloVe files have no header.
            if layer1_size is None:
                layer1_size = len(parts) - 1
            # Only lines with exactly one word column plus layer1_size weight
            # columns are kept; anything else (e.g. a word containing
            # whitespace) is reported and dropped.
            if len(parts) == layer1_size + 1:
                word, weights = parts[0], list(map(float32, parts[1:]))
            else:
                logger.warning("We ignored line number {} because of unrecognized "
                               "number of columns {}".format(line_no, parts[:-layer1_size]))
                continue
            words.append(word)
            vectors.append(weights)
    vectors = np.asarray(vectors, dtype=np.float32)
    return words, vectors


@staticmethod
def from_glove(fname):
    """Build an ``Embedding`` from a GloVe model file.

    The file is parsed by ``Embedding._from_glove``; the resulting words are
    wrapped in an ``OrderedVocabulary`` and paired with the parsed vectors.

    Args:
      fname: file name of the GloVe model, passed through to ``_from_glove``.

    Returns:
      An ``Embedding`` built from the parsed vocabulary and vectors.
    """
    parsed_words, parsed_vectors = Embedding._from_glove(fname)
    return Embedding(vocabulary=OrderedVocabulary(parsed_words),
                     vectors=parsed_vectors)

@staticmethod
def load(fname):
"""Load an embedding dump generated by `save`"""
Expand Down

0 comments on commit a5b4c90

Please sign in to comment.