In [1]:
import scipy as cp
import sklearn
import spacy
import numpy as np

import sense2vec

In [2]:
from spacy.tokens.doc import Doc

In [3]:
# Coarse-grained tag lookup: every name-like label collapses to 'ENT', while
# the temporal and numeric labels map to themselves.
# NOTE(review): the keys look like spaCy entity labels, but no cell visible
# here reads LABELS -- confirm where it is consumed.
LABELS = {label: 'ENT' for label in (
    'ENT', 'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'LAW',
    'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE',
)}
LABELS.update((label, label) for label in (
    'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
))

In [4]:
# Load the spaCy pipeline registered under the name 'en' once, at notebook
# level. transform_texts() reads this global, so this cell must run first.
en_nlp = spacy.load('en')

In [60]:
def transform_texts(texts):
    """Yield one ``text|TAG`` string per input text.

    Every text is streamed through the notebook-level ``en_nlp`` pipeline.
    Multi-token noun phrases and multi-token named entities are merged into
    single tokens, then each token is rendered as ``text|TAG``, with spaces
    inside a merged token replaced by underscores (e.g.
    ``Donald_Trump|PERSON``).

    Parameters
    ----------
    texts : iterable
        The texts to annotate (e.g. a list of unicode strings); handed
        unchanged to ``en_nlp.pipe``.

    Yields
    ------
    str
        The space-joined ``text|TAG`` tokens of one document. TAG is the
        token's ``ent_type_`` when that is non-empty, otherwise its ``pos_``.
    """
    # Stream texts through the models. We accumulate a buffer and release
    # the GIL around the parser, for efficient multi-threading.
    for doc in en_nlp.pipe(texts, n_threads=4):
        # Snapshot the base NPs (e.g. "all their good ideas") up front, so the
        # merges below never happen while the chunk iterator is still live.
        for chunk in list(doc.noun_chunks):
            # Drop leading tokens until the chunk starts with an 'amod' or
            # 'compound' dependent, e.g. "good ideas".
            while len(chunk) > 1 and chunk[0].dep_ not in ('amod', 'compound'):
                chunk = chunk[1:]
            if len(chunk) > 1:
                # Merge the tokens, e.g. good_ideas
                chunk.merge(chunk.root.tag_, chunk.text, chunk.root.ent_type_)
        # Merge multi-token named entities once per document. This loop used
        # to sit inside the noun-chunk loop, so a document without any noun
        # chunk never had its entities merged.
        for ent in list(doc.ents):
            if len(ent) > 1:
                ent.merge(ent.root.tag_, ent.text, ent.label_)
        yield ' '.join(
            '%s|%s' % (token.text.replace(' ', '_'),
                       token.ent_type_ or token.pos_)
            for token in doc
        )

In [72]:
# Smoke-test the transformer on one sentence. print() with a single argument
# behaves the same on Python 2 and Python 3, unlike the print statement.
print(list(transform_texts([u"I love Donald Trump."])))

[u'I|PRON love|VERB Donald_Trump|PERSON .|PUNCT']


In [15]:
# Inspect what the document's noun-chunk iterator produces; the recorded
# output is a list of integer 3-tuples (presumably start, end, label).
# NOTE(review): `doc` is not defined at notebook level in any visible cell
# (it is only a loop variable inside transform_texts), so this cell relies on
# leftover kernel state and will fail on Restart & Run All -- define `doc`
# explicitly before this cell or remove the cell.
list(doc.noun_chunks_iterator(doc))

[(2, 4, 15729), (5, 7, 15729), (11, 14, 15729)]

In [None]:
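# IMPORTANT! Keys in the sense2vec model have the form `text|TAG`
# (e.g. `friend|NOUN`), so a lookup must use the exact tag spelling.
# The tags listed below are for reference.

# Part-of-speech tags
# ADJ ADP ADV AUX CONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT SCONJ SYM VERB X

# Named-entity tags
# NORP FACILITY ORG GPE LOC PRODUCT EVENT WORK_OF_ART LANGUAGE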

In [64]:
# Load the sense2vec vectors named 'reddit_vectors'. The cells below index
# this model with `text|TAG` keys and query it for similarities.
model_s2v = sense2vec.load('reddit_vectors')

In [None]:
# Look up the entry for one key, then ask the model for the 5 entries whose
# vectors are closest to it. Indexing the model yields a (freq, vector) pair.
freq, query_vector = model_s2v[u'friend|NOUN']
model_s2v.most_similar(query_vector, n=5)

In [70]:
# Compare two word vectors and print their similarity score.
# NOTE(review): the original note described this as a 0-to-1 scale; if the
# score is a cosine similarity it can also be negative -- confirm.
freq1, vector1 = model_s2v[u'Donald_Trump|PERSON']
freq2, vector2 = model_s2v[u'?|PUNCT']
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the print statement.
print(model_s2v.data.similarity(vector1, vector2))

0.424733996391
