In [0]:
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

!gzip -d GoogleNews-vectors-negative300.bin.gz 

--2019-06-07 13:25:19--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.133
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2019-06-07 13:25:37 (86.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
import gensim

model =  gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
words= (list(model.vocab.keys()))
embeddings = model.vectors

import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# POSITIVE_FINANCE = ["bullish","climb","surge","hortation","successful", "excellent", "profit", "beneficial", "improving", "improved", "success", "gains", "positive"]
# NEGATIVE_FINANCE = ["bearish","fall","slump","negligent", "sanction","loss", "volatile", "wrong", "losses", "damages", "bad", "litigation", "failure", "down", "negative"]

In [0]:
POSITIVE_FINANCE = ["successful", "excellent", "profit", "beneficial", "improving", "improved", "success", "gains", "positive"]
NEGATIVE_FINANCE = ["negligent", "loss", "volatile", "wrong", "losses", "damages", "bad", "litigation", "failure", "down", "negative"]

def finance_seeds():
    return POSITIVE_FINANCE, NEGATIVE_FINANCE

In [0]:
import numpy as np
from scipy import sparse
from itertools import chain
from nltk.corpus import wordnet as wn
import multiprocessing

MAX_PROCS=8

"""
Methods for constructing graphs from word embeddings.
"""

def similarity_matrix(embeddings, arccos=False, similarity_power=1, nn=20, **kwargs):
    """
    Constructs a similarity matrix from embeddings.
    nn argument controls the degree.
    """
    def make_knn(vec, nn=nn):
        vec[vec < vec[np.argsort(vec)[-nn]]] = 0
        return vec
    L = embeddings.m.dot(embeddings.m.T)
    if sparse.issparse(L):
        L = L.todense()
    if arccos:
        L = np.arccos(np.clip(-L, -1, 1))/np.pi
    else:
        L += 1
    np.fill_diagonal(L, 0)
    L = np.apply_along_axis(make_knn, 1, L)
    return L ** similarity_power

def transition_matrix(embeddings, word_net=False, first_order=False, sym=False, trans=False, **kwargs):
    """
    Build a probabilistic transition matrix from word embeddings.
    """
    if word_net:
        L =  wordnet_similarity_matrix(embeddings)
    elif not first_order:
        L = similarity_matrix(embeddings, **kwargs)
    else:
        L = embeddings.m.todense().A
    if sym:
        Dinv = np.diag([1. / np.sqrt(L[i].sum()) if L[i].sum() > 0 else 0 for i in range(L.shape[0])])
        return Dinv.dot(L).dot(Dinv)
    else:
        Dinv = np.diag([1. / L[i].sum() for i in range(L.shape[0])])
        L = L.dot(Dinv)
    if trans:
        return L.T
    return L


In [0]:
import heapq

import numpy as np
from sklearn import preprocessing


class Embedding:
    """
    Base class for all embeddings. SGNS can be directly instantiated with it.
    """

    def __init__(self, vecs, vocab, normalize=True, **kwargs):
        self.m = vecs
        self.dim = self.m.shape[1]
        self.iw = vocab
        self.wi = {w:i for i,w in enumerate(self.iw)}
        if normalize:
            self.normalize()

    def __getitem__(self, key):
        if self.oov(key):
            raise KeyError
        else:
            return self.represent(key)

    def __iter__(self):
        return self.iw.__iter__()

    def __contains__(self, key):
        return not self.oov(key)

    @classmethod
    def load(cls, path, normalize=True, add_context=True, **kwargs):
        mat = np.load(path + "-w.npy")
        if add_context:
            mat += np.load(path + "-c.npy")
        iw = load_pickle(path + "-vocab.pkl")
        return cls(mat, iw, normalize) 

    def get_subembed(self, word_list, **kwargs):
        word_list = [word for word in word_list if not self.oov(word)]
        keep_indices = [self.wi[word] for word in word_list]
        return Embedding(self.m[keep_indices, :], word_list, normalize=False)

    def reindex(self, word_list, **kwargs):
        new_mat = np.empty((len(word_list), self.m.shape[1]))
        valid_words = set(self.iw)
        for i, word in enumerate(word_list):
            if word in valid_words:
                new_mat[i, :] = self.represent(word)
            else:
                new_mat[i, :] = 0 
        return Embedding(new_mat, word_list, normalize=False)

    def get_neighbourhood_embed(self, w, n=1000):
        neighbours = self.closest(w, n=n)
        keep_indices = [self.wi[neighbour] for _, neighbour in neighbours] 
        new_mat = self.m[keep_indices, :]
        return Embedding(new_mat, [neighbour for _, neighbour in neighbours]) 

    def normalize(self):
        preprocessing.normalize(self.m, copy=False)

    def oov(self, w):
        return not (w in self.wi)

    def represent(self, w):
        if w in self.wi:
            return self.m[self.wi[w], :]
        else:
            print ("OOV: ", w)
            return np.zeros(self.dim)

    def similarity(self, w1, w2):
        """
        Assumes the vectors have been normalized.
        """
        sim = self.represent(w1).dot(self.represent(w2))
        return sim

    def closest(self, w, n=10):
        """
        Assumes the vectors have been normalized.
        """
        scores = self.m.dot(self.represent(w))
        return heapq.nlargest(n, zip(scores, self.iw))
    
from sklearn.utils.extmath import randomized_svd
class SVDEmbedding(Embedding):
    """
    SVD embeddings.
    Enables controlling the weighted exponent of the eigenvalue matrix (eig).
    Context embeddings can be created with "transpose".
    """
    
    def __init__(self, embed,words,dim=300, normalize=True, eig=0.0, **kwargs):
        ut, s, v = randomized_svd(embed, n_components=dim, n_iter=5)
        self.iw = words
        self.wi = {w:i for i, w in enumerate(self.iw)}
 
        if eig == 0.0:
            self.m = ut
        elif eig == 1.0:
            self.m = s * ut
        else:
            self.m = np.power(s, eig) * ut

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()

class GigaEmbedding(Embedding):
    def __init__(self, embed, words, dim=300, normalize=True, **kwargs):
        self.iw = words
        self.wi = {w:i for i,w in enumerate(self.iw)}
        self.m = np.vstack(w for w in embed)
        if normalize:
            self.normalize()




In [0]:

import random
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations, product
from keras import backend as K
from keras.models import Model,Sequential
from keras.layers.core import Dense, Lambda
from keras.optimizers import Adam, Optimizer
from keras.regularizers import Regularizer
from keras.constraints import Constraint
import theano.tensor as T
from keras.layers import Input, Dense
import keras
from tqdm import tqdm
import tensorflow as tf
"""
Helper methods for learning transformations of word embeddings.
"""

class SimpleSGD(Optimizer):
    def __init__(self, lr=5, momentum=0., decay=0.,
                 nesterov=False, **kwargs):
        super(SimpleSGD, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)

    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        lr = self.lr * 0.99
        self.updates = [(self.iterations, self.iterations + 1.)]

        # momentum
        self.weights = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        for p, g, m in zip(params, grads, self.weights):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            self.updates.append((p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'decay': float(K.get_value(self.decay)),
                  'nesterov': self.nesterov}
        base_config = super(SimpleSGD, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class Orthogonal(Constraint):
    def __call__(self, p):
        print ("here")
        u,s,v = T.nlinalg.svd(p)
        return K.dot(u,K.transpose(v))

class OthogonalRegularizer(Regularizer):
    def __init__(self, strength=0.):
        self.strength = strength

    def set_param(self, p):
        self.p = p

    def __call__(self, loss):
        loss += K.sum(K.square(self.p.dot(self.p.T) - T.identity_like(self.p))) * self.strength
        return loss

    def get_config(self):
        return {"name": self.__class__.__name__,
                "strength": self.strength}


def orthogonalize(Q):
    U, S, V = np.linalg.svd(Q)
    return U.dot(V.T)


class DatasetMinibatchIterator:
    def __init__(self, embeddings, positive_seeds, negative_seeds, batch_size=512, **kwargs):
        self.words, embeddings1, embeddings2, labels = [], [], [], []

        def add_examples(word_pairs, label):
            for w1, w2 in word_pairs:
                embeddings1.append(embeddings[w1])
                embeddings2.append(embeddings[w2])
                labels.append(label)
                self.words.append((w1, w2))

        add_examples(combinations(positive_seeds, 2), 1)
        add_examples(combinations(negative_seeds, 2), 1)
        add_examples(product(positive_seeds, negative_seeds), -1)
        self.e1 = np.vstack(embeddings1)
        self.e2 = np.vstack(embeddings2)
        self.y = np.array(labels)

        self.batch_size = batch_size
        self.n_batches = (self.y.size + self.batch_size - 1) // self.batch_size

    def shuffle(self):
        perm = np.random.permutation(np.arange(self.y.size))
        self.e1, self.e2, self.y, self.words = \
            self.e1[perm], self.e2[perm], self.y[perm], [self.words[i] for i in perm]

    def __iter__(self):
        for i in range(self.n_batches):
            batch = np.arange(i * self.batch_size, min(self.y.size, (i + 1) * self.batch_size))
            yield {
                'embeddings1': self.e1[batch],
                'embeddings2': self.e2[batch],
                'y': self.y[batch][:, np.newaxis]
            }


def get_model(inputdim, outputdim, regularization_strength=0.01, lr=0.000, cosine=False, **kwargs):
  
    
    
   
    inputA = Input(name='embeddings1', shape=(inputdim,))
    inputB = Input(name='embeddings2', shape=(inputdim,))
    couche = Dense(inputdim, init='identity',
                           W_constraint=Orthogonal())
    transformed1 = couche(inputA)
    transformed2 = couche( inputB)
    projected1 = (Lambda(lambda x: x[:, :outputdim]))(transformed1)
    negprojected2 = (Lambda(lambda x: -x[:, :outputdim]))(transformed2)
    
    if cosine:
        w = Lambda(lambda x:  x / K.reshape(K.sqrt(K.sum(x * x, axis=1)),shape= (-1, 1)))(projected1)
       
        z = Lambda(lambda x:  x / K.reshape(K.sqrt(K.sum(x * x, axis=1)), shape = (-1, 1)))(negprojected2)
        w1 = Lambda(lambda x: K.reshape(K.sum(x, axis=1), shape = (-1, 1)))(w)
        
        z1 = Lambda(lambda x: K.reshape(K.sum(x, axis=1),shape =  (-1, 1)))(z)
        
        multiply_layer = keras.layers.Multiply(name='y')
        merged = multiply_layer([z1, w1])
        
    else:
        w = Lambda(lambda x: K.reshape(K.sqrt(K.sum(x * x, axis=1)),shape =  (-1, 1)))(projected1) 
        
        z = Lambda(lambda x: K.reshape(K.sqrt(K.sum(x * x, axis=1)), shape =(-1, 1)))(negprojected2)
                  
        merged = keras.layers.add([z, w], name='y')
        
   
    model = Model(inputs=[inputA, inputB], outputs=merged)
    model.compile(loss={'y' : lambda y, d: K.mean(y * tf.norm(d))}, optimizer=SimpleSGD())
    return model


def apply_embedding_transformation(embeddings, positive_seeds, negative_seeds,
                                   n_epochs=5, n_dim=10, force_orthogonal=False,
                                   plot=True, plot_points=50, plot_seeds=False,
                                   **kwargs):
    print ("Preparing to learn embedding tranformation")
    dataset = DatasetMinibatchIterator(embeddings, positive_seeds, negative_seeds, **kwargs)
    model = get_model(embeddings.m.shape[1], n_dim, **kwargs)

    print ("Learning embedding transformation")

    for epoch in tqdm(range(n_epochs)):
        dataset.shuffle()
        loss = 0
        
        for i, X in enumerate(dataset):
            loss += model.train_on_batch([X[ 'embeddings1'],X[ 'embeddings2']],X['y']) * X['y'].size
           
            Q, b = model.get_weights()
            if force_orthogonal:
                Q = orthogonalize(Q)
            model.set_weights([Q, np.zeros_like(b)])
    print(loss)
    Q, b = model.get_weights()
    new_mat = embeddings.m.dot(Q)[:,0:n_dim]
    
    if plot and n_dim == 2:
#         plot_seeds = True

        plot_words = list(positive_seeds.keys()) + list(negative_seeds.keys()) if plot_seeds else \
            [w for w in embeddings if w not in positive_seeds and w not in negative_seeds]
#         plot_words = set(random.sample(plot_words, plot_points))
        to_plot = {w: embeddings[w].dot(Q)[0:n_dim] for w in embeddings if w in plot_words}
        

        
        plt.figure(figsize=(10, 10))
        for w, e in to_plot.items():
            plt.text(e[0], e[1], w,
                     bbox=dict(facecolor='green' if w in positive_seeds else 'red', alpha=0.1))
        if len(list(to_plot.values()))!=0:

          xmin, ymin = np.min(np.vstack(list(to_plot.values())),axis=0)
          xmax, ymax = np.max(np.vstack(list(to_plot.values())),axis=0)
          plt.xlim(xmin, xmax)
          plt.ylim(ymin, ymax)
          plt.show()
    
    return Embedding(new_mat, embeddings.iw, normalize=n_dim!=1)


Using TensorFlow backend.


In [0]:
import json
import subprocess

import os
import shutil
import sys
import time
import numpy as np

import functools
import numpy as np

from scipy.sparse import csr_matrix
from multiprocessing import Pool
from sklearn.linear_model import LogisticRegression, Ridge

"""
A set of methods for inducing polarity lexicons using word embeddings and seed words.
"""


def densify(embeddings, positive_seeds, negative_seeds, 
        transform_method=apply_embedding_transformation, **kwargs):
    """
    Learns polarity scores via orthogonally-regularized projection to one-dimension
    Adapted from: http://arxiv.org/pdf/1602.07572.pdf
    """
    p_seeds = {word:1.0 for word in positive_seeds}
    n_seeds = {word:1.0 for word in negative_seeds}
    new_embeddings = embeddings
    new_embeddings = apply_embedding_transformation(
            embeddings, p_seeds, n_seeds, n_dim=1,  **kwargs)
    polarities = {w:new_embeddings[w][0] for w in embeddings.iw}
    
    

    return  polarities



def random_walk(embeddings, positive_seeds, negative_seeds, beta=0.9, **kwargs):
    """
    Learns polarity scores via random walks with teleporation to seed sets.
    Main method used in paper. 
    """
    def run_random_walk(M, teleport, beta, **kwargs):
        def update_seeds(r):
            r += (1 - beta) * teleport / np.sum(teleport)
        return run_iterative(M * beta, np.ones(M.shape[1]) / M.shape[1], update_seeds, **kwargs)

    if not type(positive_seeds) is dict:
        positive_seeds = {word:1.0 for word in positive_seeds}
        negative_seeds = {word:1.0 for word in negative_seeds}
    words = embeddings.iw
    M = transition_matrix(embeddings, **kwargs)
    rpos = run_random_walk(M, weighted_teleport_set(words, positive_seeds), beta, **kwargs)
    rneg = run_random_walk(M, weighted_teleport_set(words, negative_seeds), beta, **kwargs)
    return {w: rpos[i] / (rpos[i] + rneg[i]) for i, w in enumerate(words)}


def label_propagate_probabilistic(embeddings, positive_seeds, negative_seeds, **kwargs):
    """
    Learns polarity scores via standard label propagation from seed sets.
    One walk per label. Scores normalized to probabilities. 
    """
    words = embeddings.iw
    M = transition_matrix(embeddings, **kwargs)
    pos, neg = teleport_set(words, positive_seeds), teleport_set(words, negative_seeds)
    def update_seeds(r):
        r[pos] = [1, 0]
        r[neg] = [0, 1]
        r /= np.sum(r, axis=1)[:, np.newaxis]
    r = run_iterative(M, np.random.random((M.shape[0], 2)), update_seeds, **kwargs)
    return {w: r[i][0] / (r[i][0] + r[i][1]) for i, w in enumerate(words)}


def label_propagate_continuous(embeddings, positive_seeds, negative_seeds, **kwargs):
    """
    Learns polarity scores via standard label propagation from seed sets.
    One walk for both labels, continuous non-normalized scores.
    """
    words = embeddings.iw
    M = transition_matrix(embeddings, **kwargs)
    pos, neg = teleport_set(words, positive_seeds), teleport_set(words, negative_seeds)
    def update_seeds(r):
        r[pos] = 1
        r[neg] = -1
    r = run_iterative(M, np.zeros(M.shape[0]), update_seeds, **kwargs)
    return {w: r[i] for i, w in enumerate(words)}

from  keras.utils.generic_utils import Progbar
def logged_loop(iterable, n=None):
    if n is None:
        n = len(iterable)
    step = max(1, n / 1000)
    prog = Progbar(n)
    for i, elem in enumerate(iterable):
        if i % step == 0 or i == n - 1:
            prog.update(i + 1)
        yield elem
        
def graph_propagate(embeddings, positive_seeds, negative_seeds, **kwargs):
    """
    Graph propagation method dapted from Velikovich, Leonid, et al. "The viability of web-derived polarity lexicons."
    http://www.aclweb.org/anthology/N10-1119
    Should be used with arccos=True
    """
    def run_graph_propagate(seeds, alpha_mat, trans_mat, T=1, **kwargs):
        def get_rel_edges(ind_set):
            rel_edges = set([])
            for node in ind_set:
                rel_edges = rel_edges.union(
                        [(node, other) for other in trans_mat[node,:].nonzero()[1]])
            return rel_edges

        for seed in seeds:
            F = set([seed])
            for t in range(T):
                for edge in get_rel_edges(F):
                    alpha_mat[seed, edge[1]] = max(
                            alpha_mat[seed, edge[1]], 
                            alpha_mat[seed, edge[0]] * trans_mat[edge[0], edge[1]])
                    F.add(edge[1])
        return alpha_mat

    M = similarity_matrix(embeddings, **kwargs)
    M = (M + M.T)/2
    print ("Getting positive scores..")
    pos_alpha = M.copy()
    neg_alpha = M.copy()
    M = csr_matrix(M)
    pos_alpha = run_graph_propagate([embeddings.wi[seed] for seed in positive_seeds],
            pos_alpha, M, **kwargs)
    pos_alpha = pos_alpha + pos_alpha.T
    print ("Getting negative scores..")
    neg_alpha = run_graph_propagate([embeddings.wi[seed] for seed in negative_seeds],
            neg_alpha, M, **kwargs)
    neg_alpha = neg_alpha + neg_alpha.T
    print ("Computing final scores...")
    polarities = {}
    index = embeddings.wi
    pos_pols = {w:1.0 for w in positive_seeds}
    for w in negative_seeds:
        pos_pols[w] = 0.0
    neg_pols = {w:1.0 for w in negative_seeds}
    for w in positive_seeds:
        neg_pols[w] = 0.0
    for w in logged_loop(index):
        if w not in positive_seeds and w not in negative_seeds:
            pos_pols[w] = sum(pos_alpha[index[w], index[seed]] for seed in positive_seeds if seed in index) 
            neg_pols[w] = sum(neg_alpha[index[w], index[seed]] for seed in negative_seeds if seed in index)
    beta = np.sum(list(pos_pols.values())) / np.sum(list(neg_pols.values()))
    for w in index:
        polarities[w] = pos_pols[w] - beta * neg_pols[w]
    return polarities


### HELPER METHODS #####

def teleport_set(words, seeds):
    return [i for i, w in enumerate(words) if w in seeds]

def weighted_teleport_set(words, seed_weights):
    return np.array([seed_weights[word] if word in seed_weights else 0.0 for word in words])

def run_iterative(M, r, update_seeds, max_iter=50, epsilon=1e-6, **kwargs):
    for i in range(max_iter):
        last_r = np.array(r)
        r = np.dot(M, r)
        update_seeds(r)
        if np.abs(r - last_r).sum() < epsilon:
            break
    return r

### META METHODS ###

def _bootstrap_func(embeddings, positive_seeds, negative_seeds, boot_size, score_method, seed, **kwargs):
    np.random.seed(seed)
    pos_seeds = np.random.choice(positive_seeds, boot_size)
    neg_seeds = np.random.choice(negative_seeds, boot_size)
    polarities = score_method(embeddings, pos_seeds, neg_seeds, **kwargs)
    return {word:score for word, score in polarities.items() if
            not word in positive_seeds and not word in negative_seeds}

def bootstrap(embeddings, positive_seeds, negative_seeds, num_boots=10, score_method=random_walk,
        boot_size=7, return_all=False, n_procs=15, **kwargs):
    pool = Pool(n_procs)
    map_func = functools.partial(_bootstrap_func, embeddings, positive_seeds, negative_seeds,
            boot_size, score_method, **kwargs)
    polarities_list = pool.map(map_func, range(num_boots))
    if return_all:
        return polarities_list
    else:
        polarities = {}
        for word in polarities_list[0]:
            polarities[word] = np.mean([polarities_list[i][word] for i in range(num_boots)])
        return polarities


In [0]:
def lines(fname):
    with open(fname) as f:
        for line in f:
            yield line

def create_representation(rep_type, path, *args, **kwargs):
    if rep_type == 'Explicit':
        return Explicit.load(path, *args, **kwargs)
    elif rep_type == 'SVD':
        return SVDEmbedding(path, *args, **kwargs)
    elif rep_type == 'GIGA':
        return GigaEmbedding(path, *args, **kwargs)
    else:
        return Embedding.load(path, *args, **kwargs)

common_embed = create_representation("GIGA", embeddings, words)



In [0]:
del embeddings

In [0]:
def run_method(positive_seeds, negative_seeds, embeddings, transform_embeddings=False, post_densify=False,
        method=densify, **kwargs):
    if transform_embeddings:
        print ("Transforming embeddings...")
        embeddings = apply_embedding_transformation(embeddings, positive_seeds, negative_seeds, n_dim=50)
    if post_densify:
        polarities = method(embeddings, positive_seeds, negative_seeds, **kwargs)
        top_pos = [word for word in 
                sorted(polarities, key = lambda w : -polarities[w])[:5]]#150
        top_neg = [word for word in 
                sorted(polarities, key = lambda w : polarities[w])[:5]]#150
        top_pos.extend(positive_seeds)
        top_neg.extend(negative_seeds)
        return densify(embeddings, top_pos, top_neg)
    positive_seeds = [s for s in positive_seeds if s in embeddings]
    negative_seeds = [s for s in negative_seeds if s in embeddings]
    return method(embeddings, positive_seeds, negative_seeds, **kwargs)

In [0]:

positive_seeds, negative_seeds = finance_seeds()

In [0]:
import re
input_str = "Tunisia Industrial Output Shrinks for 7th Straight Month."
input_str = re.sub(r'\d+', '', input_str)

# print(result)


In [0]:
eval_words = ['shrink','fall', 'failure','shock','low','hits','good','best','profitable','best','worst','upwarding']
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
example_sent = "Tunisia Industrial Output Shrinks for 7th Straight Month"

  
stop_words = set(stopwords.words('english')) 
  
word_tokens = word_tokenize(example_sent) 
  
filtered_sentence = [w.lower() for w in word_tokens if not w.lower() in stop_words]
# eval_words = ['shrink']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:

# too much ram
'''
label_propagate_probabilistic
'''
DEFAULT_ARGUMENTS = dict(
        # for iterative graph algorithms
        similarity_power=1,
        arccos=True,
        max_iter=50,
        epsilon=1e-6,
        sym=True,

        # for learning embeddings transformation
        n_epochs=50,
        force_orthogonal=False,
        batch_size=100,
        cosine=False,

        ## bootstrap
        num_boots=1,
        n_procs=1,
)
polarities = run_method(positive_seeds, negative_seeds, 
            common_embed,
            method=label_propagate_continuous, #label_propagate_probabilistic
            #method=bootstrap, 
            beta=0.99, nn=5,
#             transform_embeddings=True,

            **DEFAULT_ARGUMENTS)
from operator import itemgetter
myvalues = itemgetter(*list(set(eval_words) & set(polarities.keys())))(polarities)
myvalues,list(set(eval_words) & set(polarities.keys()))

In [0]:
'''
graph_propagate
'''
DEFAULT_ARGUMENTS = dict(
        # for iterative graph algorithms
        similarity_power=1,
        arccos=True,
        max_iter=50,
        epsilon=1e-6,
        sym=True,

        # for learning embeddings transformation
        n_epochs=100,
        force_orthogonal=True,
        batch_size=100,
        cosine=False,

        ## bootstrap
        num_boots=1,
        n_procs=1,
)
polarities = run_method(positive_seeds, negative_seeds, 
                        common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
#                         score_method=bootstrap,
                        method=graph_propagate,
                        T=10,
                        transform_embeddings=True,
                        **DEFAULT_ARGUMENTS)
from operator import itemgetter
myvalues = itemgetter(*list(set(eval_words) & set(polarities.keys())))(polarities)
dict(zip(list(set(eval_words) & set(polarities.keys())),myvalues))

  0%|          | 0/5 [00:00<?, ?it/s]

Transforming embeddings...
Preparing to learn embedding tranformation
Learning embedding transformation


100%|██████████| 5/5 [00:01<00:00,  3.43it/s]


-959.7970199584961
Getting positive scores..
Getting negative scores..
Computing final scores...


{'best': 2.8911447403992643,
 'failure': -0.8756833698491466,
 'fall': -0.15372589662434777,
 'good': 2.913776825427723,
 'hits': -1.2046365206698262,
 'low': 2.3917099272276605,
 'profitable': 1.777632448114801,
 'shock': -5.100133118168005,
 'shrink': -0.5905424908226706,
 'worst': -2.2927088465439827}

In [0]:
'''
random walk
'''
DEFAULT_ARGUMENTS = dict(
        # for iterative graph algorithms
        similarity_power=1,
        arccos=True,
        max_iter=50,
        epsilon=1e-6,
        sym=True,

        # for learning embeddings transformation
        n_epochs=50,
        force_orthogonal=True,
        batch_size=100,
        cosine=False,

        ## bootstrap
        num_boots=1,
        n_procs=1,
)
nn=10
beta=0.1
polarities = run_method(positive_seeds, negative_seeds, 
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    score_method=bootstrap, 
                    method=random_walk, 
                    nn=nn, beta=beta,
                    transform_embeddings=True,
                    **DEFAULT_ARGUMENTS)

from operator import itemgetter
myvalues = itemgetter(*list(set(eval_words) & set(polarities.keys())))(polarities)
dict(zip(list(set(eval_words) & set(polarities.keys())),myvalues))

  0%|          | 0/5 [00:00<?, ?it/s]

Transforming embeddings...
Preparing to learn embedding tranformation
Learning embedding transformation


100%|██████████| 5/5 [00:02<00:00,  2.03it/s]

-959.796838760376





{'best': 0.8763161580160217,
 'failure': 0.002433381754637584,
 'fall': 0.3797587947562773,
 'good': 0.7879682680414204,
 'hits': 0.25277628664878626,
 'low': 0.7147992797703622,
 'profitable': 0.868881296745753,
 'shock': 0.18137989348575925,
 'shrink': 0.38013812533565233,
 'worst': 0.040143569649967056}

In [0]:
DEFAULT_ARGUMENTS = dict(
        # for iterative graph algorithms
        similarity_power=1,
        arccos=True,
        max_iter=50,
        epsilon=1e-6,
        sym=True,

        # for learning embeddings transformation
        n_epochs=50,
        force_orthogonal=True,
        batch_size=100,
        cosine=True,

        ## bootstrap
        num_boots=1,
        n_procs=1,
)
polarities = run_method(positive_seeds, negative_seeds, 
                          common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                          method=densify, 
                        post_densify = True,
                          lr=0.01, regularization_strength=0.01,

                          **DEFAULT_ARGUMENTS)
from operator import itemgetter
myvalues = itemgetter(*list(set(eval_words) & set(polarities.keys())))(polarities)
dict(zip(list(set(eval_words) & set(polarities.keys())),myvalues))

  0%|          | 0/50 [00:00<?, ?it/s]

Preparing to learn embedding tranformation
Learning embedding transformation


100%|██████████| 50/50 [00:12<00:00,  4.09it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

-73.84199440479279
Preparing to learn embedding tranformation
Learning embedding transformation


100%|██████████| 5/5 [00:08<00:00,  1.74s/it]

-862.3596575260162





{'best': -3.3844862,
 'failure': 0.06777992,
 'fall': -1.218575,
 'good': -4.6377993,
 'hits': -0.7553856,
 'low': -1.9848655,
 'profitable': -5.490222,
 'shock': -0.032394607,
 'shrink': -1.7407255,
 'worst': -1.2267815}

In [0]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
data = np.array(list(polarities.values())).reshape(-1, 1)
scaler = MinMaxScaler()
v = scaler.fit_transform(data)

a = dict(zip(polarities.keys(), v.flatten()))
myvalues = itemgetter(*list(set(eval_words) & set(a.keys())))(a)
dict(zip(list(set(eval_words) & set(a.keys())),myvalues))

{'best': 0.48350433,
 'failure': 0.9802322,
 'fall': 0.7951455,
 'good': 0.30317193,
 'hits': 0.8617913,
 'low': 0.6848881,
 'profitable': 0.18052143,
 'shock': 0.9658186,
 'shrink': 0.7200161,
 'worst': 0.79396474}

In [0]:
d=dict(zip(list(set(eval_words) & set(a.keys())),myvalues))
from toolz.dicttoolz import valmap
valmap(lambda x: 1-x, d)

{'best': 0.21649567484855647,
 'failure': -0.2802321791648865,
 'fall': -0.09514551162719731,
 'good': 0.3968280673027038,
 'hits': -0.1617913126945496,
 'low': 0.015111875534057573,
 'profitable': 0.5194785714149475,
 'shock': -0.26581858396530156,
 'shrink': -0.020016121864318892,
 'worst': -0.09396474361419682}

In [0]:
'''
densify
'''
lr =  [0.01]
reg = [0.001, 0.003, 0.005, 0.007, 0.009]
DEFAULT_ARGUMENTS = dict(
        # for iterative graph algorithms
        similarity_power=1,
        arccos=False,
        max_iter=50,
        epsilon=1e-6,
        sym=True,

        # for learning embeddings transformation
        n_epochs=50,
        force_orthogonal=True,
        batch_size=1,
        cosine=False,

        ## bootstrap
        num_boots=1,
        n_procs=1,
)
for lr_ in lr:
  for reg_ in reg : 
    polarities = run_method(positive_seeds, negative_seeds, 
                          common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                          method=densify, 
                          lr=lr_, regularization_strength=reg_,

                          **DEFAULT_ARGUMENTS)

    print(polarities)

In [0]:
# with more seed words ( bullish, bearish ...)
lr =  [0.001, 0.01, 0.1, 0.5]
reg = [0.001, 0.003, 0.005, 0.007, 0.009, 0.01, 0.1, 0.5]

DEFAULT_ARGUMENTS = dict(
        # for iterative graph algorithms
        similarity_power=1,
        arccos=False,
        max_iter=50,
        epsilon=1e-6,
        sym=True,

        # for learning embeddings transformation
        n_epochs=50,
        force_orthogonal=True,
        batch_size=1,
        cosine=False,

        ## bootstrap
        num_boots=1,
        n_procs=1,
)
for lr_ in lr:
  for reg_ in reg : 
    polarities = run_method(positive_seeds, negative_seeds, 
                          common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                          method=densify, 
                          lr=lr_, regularization_strength=reg_,

                          **DEFAULT_ARGUMENTS)

    print(polarities)








  0%|          | 0/50 [00:00<?, ?it/s][A[A[A[A[A[A[A

Preparing to learn embedding tranformation
Learning embedding transformation









  2%|▏         | 1/50 [00:18<15:08, 18.54s/it][A[A[A[A[A[A[A






  4%|▍         | 2/50 [00:30<13:08, 16.43s/it][A[A[A[A[A[A[A






  6%|▌         | 3/50 [00:41<11:40, 14.89s/it][A[A[A[A[A[A[A






  8%|▊         | 4/50 [00:52<10:35, 13.82s/it][A[A[A[A[A[A[A






 10%|█         | 5/50 [01:03<09:47, 13.06s/it][A[A[A[A[A[A[A






 12%|█▏        | 6/50 [01:15<09:08, 12.47s/it][A[A[A[A[A[A[A






 14%|█▍        | 7/50 [01:26<08:38, 12.06s/it][A[A[A[A[A[A[A






 16%|█▌        | 8/50 [01:37<08:15, 11.80s/it][A[A[A[A[A[A[A






 18%|█▊        | 9/50 [01:48<07:55, 11.60s/it][A[A[A[A[A[A[A






 20%|██        | 10/50 [01:59<07:39, 11.50s/it][A[A[A[A[A[A[A






 22%|██▏       | 11/50 [02:10<07:24, 11.39s/it][A[A[A[A[A[A[A






 24%|██▍       | 12/50 [02:21<07:08, 11.29s/it][A[A[A[A[A[A[A






 26%|██▌       | 13/50 [02:33<06:55, 11.23s/it][A[A[A[A[A[A[A






 28%|██▊       | 14/50 [02:

KeyboardInterrupt: ignored

In [0]:
def apply_embedding_transformation_grid(embeddings, positive_seeds, negative_seeds,
                                   n_epochs=5, n_dim=10, force_orthogonal=False,
                                   plot=True, plot_points=50, plot_seeds=False,
                                   **kwargs):
   
    dataset = DatasetMinibatchIterator(embeddings, positive_seeds, negative_seeds, **kwargs)
    model = get_model(embeddings.m.shape[1], n_dim, **kwargs)

    

    for epoch in tqdm(range(n_epochs)):
        dataset.shuffle()
        loss = 0
        
        for i, X in enumerate(dataset):
            loss += model.train_on_batch([X[ 'embeddings1'],X[ 'embeddings2']],X['y']) * X['y'].size
           
            Q, b = model.get_weights()
            if force_orthogonal:
                Q = orthogonalize(Q)
            model.set_weights([Q, np.zeros_like(b)])
    return loss
  
  
def densify_grid(embeddings, positive_seeds, negative_seeds, 
        transform_method=apply_embedding_transformation, **kwargs):

    p_seeds = {word:1.0 for word in positive_seeds}
    n_seeds = {word:1.0 for word in negative_seeds}
    new_embeddings = embeddings
    return apply_embedding_transformation_grid(
            embeddings, p_seeds, n_seeds, n_dim=2,  **kwargs)
  
def run_method_grid(positive_seeds, negative_seeds, embeddings, transform_embeddings=False,
        method=densify, **kwargs):
    print('new')
    positive_seeds = [s for s in positive_seeds if s in embeddings]
    negative_seeds = [s for s in negative_seeds if s in embeddings]
    return method(embeddings, positive_seeds, negative_seeds, **kwargs)



def f(x,y):
  return - run_method_grid(positive_seeds, negative_seeds, 
                            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                            method=densify_grid, 
                            lr=x, regularization_strength=y,

                            similarity_power=1,
                            arccos=False,
                            max_iter=50,
                            epsilon=1e-6,
                            sym=True,

                            # for learning embeddings transformation
                            n_epochs=50,
                            force_orthogonal=True,
                            batch_size=1,
                            cosine=False,

                            ## bootstrap
                            num_boots=1,
                            n_procs=1)


!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

# Bounded region of parameter space
pbounds = {'x': (0.001, 0.5), 'y': (0.001, 0.5)}

optimizer = BayesianOptimization(
    f=f,
    pbounds=pbounds,
    random_state=1,
)

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/72/0c/173ac467d0a53e33e41b521e4ceba74a8ac7c7873d7b857a8fbdca88302d/bayesian-optimization-1.0.1.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/1d/0d/3b/6b9d4477a34b3905f246ff4e7acf6aafd4cc9b77d473629b77
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.0.1


In [0]:
optimizer.maximize(
    init_points=2,
    n_iter=20,
)

  0%|          | 0/50 [00:00<?, ?it/s]

|   iter    |  target   |     x     |     y     |
-------------------------------------------------
new


100%|██████████| 50/50 [05:25<00:00,  6.29s/it]
  0%|          | 0/50 [00:00<?, ?it/s]

| [0m 1       [0m | [0m 5.819   [0m | [0m 0.2091  [0m | [0m 0.3604  [0m |
new


100%|██████████| 50/50 [05:08<00:00,  6.04s/it]


| [0m 2       [0m | [0m 5.505   [0m | [0m 0.001057[0m | [0m 0.1519  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [05:00<00:00,  5.88s/it]


| [0m 3       [0m | [0m 5.077   [0m | [0m 0.5     [0m | [0m 0.001   [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:58<00:00,  5.88s/it]


| [0m 4       [0m | [0m 3.724   [0m | [0m 0.001   [0m | [0m 0.5     [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:58<00:00,  5.86s/it]


| [95m 5       [0m | [95m 12.67   [0m | [95m 0.02711 [0m | [95m 0.25    [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [05:02<00:00,  5.97s/it]


| [0m 6       [0m | [0m 8.688   [0m | [0m 0.4429  [0m | [0m 0.4099  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:59<00:00,  5.85s/it]


| [0m 7       [0m | [0m 0.6019  [0m | [0m 0.00761 [0m | [0m 0.3799  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:58<00:00,  5.88s/it]


| [0m 8       [0m | [0m 3.743   [0m | [0m 0.4356  [0m | [0m 0.2406  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:59<00:00,  5.91s/it]


| [0m 9       [0m | [0m 3.155   [0m | [0m 0.2091  [0m | [0m 0.3606  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:59<00:00,  5.89s/it]


| [0m 10      [0m | [0m 4.719   [0m | [0m 0.4779  [0m | [0m 0.4186  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [05:00<00:00,  5.97s/it]


| [0m 11      [0m | [0m 6.932   [0m | [0m 0.2288  [0m | [0m 0.4808  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [05:06<00:00,  6.07s/it]


| [0m 12      [0m | [0m 9.404   [0m | [0m 0.006549[0m | [0m 0.377   [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [05:02<00:00,  5.98s/it]


| [0m 13      [0m | [0m 9.954   [0m | [0m 0.02708 [0m | [0m 0.2497  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:55<00:00,  5.79s/it]


| [0m 14      [0m | [0m 9.639   [0m | [0m 0.006675[0m | [0m 0.3773  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:57<00:00,  5.84s/it]


| [0m 15      [0m | [0m 6.342   [0m | [0m 0.4434  [0m | [0m 0.4099  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:56<00:00,  5.79s/it]


| [0m 16      [0m | [0m 1.475   [0m | [0m 0.006587[0m | [0m 0.3766  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:57<00:00,  5.81s/it]


| [0m 17      [0m | [0m-0.7483  [0m | [0m 0.006663[0m | [0m 0.3775  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [05:00<00:00,  5.86s/it]


| [0m 18      [0m | [0m 5.069   [0m | [0m 0.2288  [0m | [0m 0.4805  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:57<00:00,  5.81s/it]


| [0m 19      [0m | [0m 11.84   [0m | [0m 0.2218  [0m | [0m 0.4267  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:58<00:00,  5.88s/it]


| [0m 20      [0m | [0m 5.941   [0m | [0m 0.4431  [0m | [0m 0.4098  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:55<00:00,  5.78s/it]


| [0m 21      [0m | [0m 4.071   [0m | [0m 0.02667 [0m | [0m 0.2504  [0m |


  0%|          | 0/50 [00:00<?, ?it/s]

new


100%|██████████| 50/50 [04:57<00:00,  5.80s/it]

| [0m 22      [0m | [0m 11.86   [0m | [0m 0.443   [0m | [0m 0.4101  [0m |





In [0]:
optimizer.max

{'params': {'x': 0.027113930472524774, 'y': 0.25001579388326517},
 'target': 12.674488961696625}

In [0]:
polarities = run_method(positive_seeds, negative_seeds, 
                          common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                          method=densify, 
                          lr=optimizer.max['params']['x'], regularization_strength=optimizer.max['params']['y'],

                          **DEFAULT_ARGUMENTS)
from operator import itemgetter
myvalues = itemgetter(*list(set(eval_words) & set(polarities.keys())))(polarities)
# d2 = valmap(lambda x: -x, polarities)
myvalues,list(set(eval_words) & set(polarities.keys()))

  0%|          | 0/50 [00:00<?, ?it/s]

Preparing to learn embedding tranformation
Learning embedding transformation


100%|██████████| 50/50 [07:45<00:00,  9.36s/it]

-8.0





#**Word2vec + wordnet**

In [0]:
import numpy as np
from gensim import matutils
from six import string_types
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

In [0]:
def findWordNet( key):
        """
        Parameters
            ----------
            key : str
                the word you want to find in zh_wordnet.
        Returns
            ----------
            list
                a list in wordnet format.
        """
        synonyms = []

        for syn in wn.synsets(key):
            for l in syn.lemmas():
                synonyms.append(syn)
        return(synonyms)


In [0]:
import gensim, logging
import os
class MySentences(object):
    def __init__(self):
      pass
 
    def __iter__(self):
        for fname in drive.ListFile({'q': "'1HrGCbwzvK2WPE559f9gWgzgbcBhTXRjD' in parents"}).GetList():
            fp = open(f['title'])
            for line in fp:
                yield line

In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('~/data')




In [0]:
sentences = MySentences() # a memory-friendly iterator


In [0]:
for i in sentences:
  print(i)

In [0]:
Label_index=positive_seeds + negative_seeds
Label_dict=dict(zip(positive_seeds,[1]*len(positive_seeds)))
Label_dict.update(dict(zip(negative_seeds,[-1]*len(negative_seeds))))

In [0]:
model = gensim.models.Word2Vec(sentences)
model = gensim.models.Word2Vec(sentences, min_count=20)
len_vector=model.trainables.layer1_size

In [0]:
from nltk.tokenize import sent_tokenize, word_tokenize
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []

for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)

In [0]:
Label_vec=np.empty((len(Label_dict),len_vector))
Label_vec_u=np.empty((len(Label_dict),len_vector))
for i in range(len(Label_index)):
    try:
       
        Label_vec[i,:]=model.wv.__getitem__(Label_index[i])
        Label_vec_u[i,:]=matutils.unitvec(model.wv.__getitem__(Label_index[i]))
    except:
        Label_vec[i,:]=np.zeros((1,len_vector)) 
        Label_vec_u[i,:]=np.zeros((1,len_vector))

In [0]:
Label_wn=dict()
for key in Label_dict.keys():
    ll=findWordNet(key)
    Label_wn[key]=list()
    for l in ll:
        Label_wn[key].append(l)

In [0]:
def similarity_label( words, normalization=True):
        """
        you can calculate more than one word at the same time.
        """
        if model==None:
            raise Exception('no model.')
        if isinstance(words, string_types):
            words=[words]
        vectors=np.transpose(model.wv.__getitem__(words))
        if normalization:
            unit_vector=unitvec(vectors,ax=0) 
            dists=np.dot(Label_vec_u, unit_vector)
        else:
            dists=np.dot(Label_vec, vectors)
        return dists
    
    
def topn_similarity_label( words, topn=10, normalization=True):
        if model==None:
            raise Exception('no model.')
        if isinstance(words, string_types):
            words=[words]
    
        vectors=np.transpose(model.wv.__getitem__(words))
        
        if normalization:
            unit_vector=unitvec(vectors,ax=0)
            dists=np.dot(Label_vec_u, unit_vector)
        else:
            dists=np.dot(Label_vec, vectors)
        
        topwords=[]
        topsims=np.empty((topn,len(words)))
        best = np.argsort(dists, axis=0)
        for i in range(topn):
            topword=[]
            for j in range(len(words)):
                topword.append(Label_index[best[-i-1][j]])
                topsims[i][j]=dists[best[-i-1][j]][j]
            topwords.append(topword)
        result=[(topwords[i], topsims[i]) for i in range(topn)]
        return result
        """ print this result by:

            | for iword,isim in result:  |
            |     print(iword, isim)     |
            or
            | for iword, isim in b:                               |
            |     for i in range(len(b[0])):                      |
            |         print("%s:%f\t" %(iword[i],isim[i]),end="") |
            |     print("")                                       |
                
        """
                
def synonym_label( word, calc='all' ,calc_k=5):
        """
        you can only calculate one word for one time.
        """
        ww=list()
        for w in findWordNet(word):
            ww.append(w)
        if (len(ww)==0):
            #return 0
            raise Exception('cannot be found.')
        else:
            similarities=[0]*len(Label_index)
            if calc=='all': 
                for i in range(len(Label_index)):
                    count=0
                    for w in ww:
                        for l in Label_wn[Label_index[i]]:
                            
                            sim=w.path_similarity(l)
                            if(sim!=None):
                                similarities[i]+=sim
                            else:
                                count+=1
                    try:
                        similarities[i]/=(len(ww)*len(Label_wn[Label_index[i]])-count) 
                    except:
                        similarities[i]=0
                        
            elif calc=='calc_k': 
                for i in range(len(Label_index)):
                    count=0
                    simlist=[]
                    for w in ww:
                        for l in Label_wn[Label_index[i]]:
                            sim=w.path_similarity(l)
                            if(sim!=None):
                                simlist.append(sim)
                                count+=1
                    if count<=calc_k:
                        similarities[i]=np.mean(simlist)
                    else:
                        simlist=sorted(simlist,reverse=True)
                        similarities[i]=simlist[:calc_k-1]/calc_k 
        return np.array(similarities)


def topn_synonym_label( word, topn=10, calc='all', calc_k=5):
        ww=list()
        for w in findWordNet(word):
            ww.append(w)
        if (len(ww)==0):
            return 0
        else:
            similarities=[0]*len(Label_index)
            if calc=='all': 
                for i in range(len(Label_index)):
                    count=0
                    for w in ww:
                        for l in Label_wn[Label_index[i]]:
                            sim=w.path_similarity(l)
                            if(sim!=None):
                                similarities[i]+=sim
                            else:
                                count+=1
                    try:
                        similarities[i]/=(len(ww)*len(Label_wn[Label_index[i]])-count) 
                    except:
                        similarities[i]=0
                        
            elif calc=='calc_k':
                for i in range(len(Label_index)):
                    count=0
                    simlist=[]
                    for w in ww:
                        for l in Label_wn[Label_index[i]]:
                            sim=w.path_similarity(l)
                            if(sim!=None):
                                simlist.append(sim)
                                count+=1
                    if count<=calc_k:
                        similarities[i]=np.mean(simlist)
                    else:
                        simlist=sorted(simlist,reverse=True)
                        similarities[i]=simlist[:calc_k-1]/calc_k 
                        
        best=matutils.argsort(similarities, topn = topn, reverse=True)
        result = [(Label_index[sim], float(similarities[sim])) for sim in best]
        return result
      
def nlp_vector( words):
        """
        Parameters
            ----------
            words : list of str/str 
                wordbag
        Returns
            ----------
            ndarray(float)
                the corresponding vectors of words in wordbag.
                a vector contains the similarities calculated by word2vec and wordnet.
        """
        if isinstance(words, string_types):
            synonym=synonym_label(words)
            similarity=similarity_label(words)
        else:
            synonym=np.empty((len(Label_index),len(words)))
            similarity=np.empty((len(Label_index),len(words)))
            for i in range(len(words)):
                try:
                    synonym[:,i]=synonym_label(words[i])
                except:
                    synonym[:,i]=np.zeros((len(Label_index),1))[:,0]
                try:    
                    similarity[:,i]=similarity_label(words[i])[:,0]
                except:
                    similarity[:,i]=np.zeros((len(Label_index),1))[:,0]
        vector=np.concatenate((similarity, synonym.reshape(similarity.shape)))
        return vector
    
    

#---------------------------------math----------------------------------------
     
def unitvec(vector, ax=1):
    v=vector*vector
    if len(vector.shape)==1:
        sqrtv=np.sqrt(np.sum(v))
    elif len(vector.shape)==2:
        sqrtv=np.sqrt([np.sum(v, axis=ax)])
    else:
        raise Exception('It\'s too large.')
    if ax==1:
        result=np.divide(vector,sqrtv.T)
    elif ax==0:
        result=np.divide(vector,sqrtv)
    return result

In [0]:
synonym_label('gain')
topn_similarity_label('gain')
topn_synonym_label('gain')
similarity_label('gain')
nlp_vector(['gain','the']).shape

In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('~/data')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1SooKSw8M4ACbznKjnNrYvJ5wxuqJ-YCk' in parents"}).GetList()

for f in file_list:
  # 3. Create & download by id.
  print('title: %s, id: %s' % (f['title'], f['id']))
  fname = os.path.join(local_download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)


with open(fname, 'r') as f:
  print(f.read())