In [3]:
# Run in terminal or command prompt to download spacy dict
# python3 -m spacy download en
# copyright Felipe Castrollio

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# topics library
from topics import prepare_topics
from topics import print_top_words_per_topic

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

# Keras tools
import keras
from keras.models import Model
from keras.layers import Input, Dense, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras import layers
from keras.engine.input_layer import Input
from keras import backend as K
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
#variables

window_size = 10
epochs = 200000
n_topics = 22 # for lda model
vector_dim = 100
batch_size = 1000

# validation 
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 50  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [6]:
df = pd.read_excel('/home/ccirelli2/Desktop/GSU/Fall_2019/Project_2/M_fund.xlsx')

In [7]:
df.head(5)

Unnamed: 0,accession#,filing_year,principal_strategies
0,0001193125-10-099241,2010,"normally, the portfolio invests at least 80% o..."
1,0001193125-10-099234,2010,"normally, the portfolio invests at least 80% o..."
2,0001193125-10-099244,2010,"normally, the portfolio invests at least 80% o..."
3,0001193125-10-099303,2010,"normally, the portfolio invests at least 80% o..."
4,0001193125-10-099357,2010,"the portfolio employs a “passive management,” ..."


In [13]:
len(df)

14621

In [18]:
print(df['principal_strategies'])

0        normally, the portfolio invests at least 80% o...
1        normally, the portfolio invests at least 80% o...
2        normally, the portfolio invests at least 80% o...
3        normally, the portfolio invests at least 80% o...
4        the portfolio employs a “passive management,” ...
5        investing in the stock, bond and money market ...
6        investing in the stock, bond and money market ...
7        normally, the portfolio invests at least 80% o...
8        normally, the portfolio invests at least 80% o...
9        the portfolio invests only in high quality, sh...
10       normally, the portfolio invests at least 80% o...
11       normally, the portfolio invests at least 80% o...
12       normally, the portfolio will invest at least 8...
13       normally, the portfolio invests at least 80% o...
14       normally, the portfolio invests at least 80% o...
15       the portfolio employs a “passive management,” ...
16       the portfolio invests primarily in the equity .

Refer to https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/ for explanation on data processing steps below.

In [19]:
# Convert to list
data = df.principal_risks.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', str(sent)) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', str(sent)) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", str(sent)) for sent in data]

pprint(data[:1])

  data = [re.sub('\S*@\S*\s?', '', str(sent)) for sent in data]
  data = [re.sub('\s+', ' ', str(sent)) for sent in data]


AttributeError: 'DataFrame' object has no attribute 'principal_risks'

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

#print(data_words[:1])

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load("c:\\programdata\\anaconda3\\lib\\site-packages\\en_core_web_sm\\en_core_web_sm-2.0.0", disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(len(data_lemmatized))
print(data_lemmatized[0])

In [None]:
#tokenize every doc

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_lemmatized)
sequences = tokenizer.texts_to_sequences(data_lemmatized)
n_documents = len(sequences)

dictionary = tokenizer.word_index
dictionary["null"] = 0
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
vocab_size = len(dictionary)

In [None]:
# create dataset: word pairs and doc ids with positive and negative samples

window_size = 2
targets = []
contexts = []
labels = []
couples = []
doc_ids = []

for i in range(0,n_documents):
    if i % 1000 == 0 and i > 0:
        print (i)
    seq = sequences[i]
    sampling_table = sequence.make_sampling_table(vocab_size)
    couple, label = skipgrams(seq, vocab_size, window_size=window_size, sampling_table=sampling_table)
    if not couple:
        next
    try:
        target, context = zip(*couple)
        targets = targets + list(target)
        contexts = contexts + list(context)
        doc_ids = doc_ids + [i]*len(context)
        labels = labels + label
        couples = couples + couple
    except:
        print ("Error on " + str(seq))
    
data_target = np.array(targets, dtype='int32')
data_context = np.array(contexts, dtype='int32')
doc_ids = np.array(doc_ids, dtype='int32')
labels = np.array(labels, dtype='int32')

# split into train and test

from random import sample
training_split = 0.8
l = len(data_target) #length of data 
f = int(l * training_split) #number of elements you need
indices = sample(range(l),f)

train_data_target = data_target[indices]
test_data_target = np.delete(data_target,indices)
train_data_context = data_context[indices]
test_data_context = np.delete(data_context,indices)
train_doc_ids = doc_ids[indices]
test_doc_ids = np.delete(doc_ids,indices)
train_labels = labels[indices]
test_labels = np.delete(labels,indices)

print(couples[:10], labels[:10], doc_ids[:10])

In [None]:
print("size of training data " + str(len(train_data_target)))
print("size of testing data " + str(len(test_data_target)))
print("size of labels " + str(len(labels)))


This is where we start creating the model. The model consists of two parallel flows: word embedding (like word2vec) and topic embedding (like LDA). Please refer to the model image here: https://github.com/cemoody/lda2vec. You can see on the left the word embedding happens, and on the right the topic lda embedding happens. At the bottom the two vectors are added together to form the final context_vector. 

The model will have three training inputs: 
    1) input_context: pivot word
    2) input_target: word that we are trying to predict
    3) input_doc: document id 

And one training output:
    1) label: 0 or 1 which defines if input_context and input_target are similar taking into account input_doc
    
The model predictions are gien by "preds" which will output a similarity score between 0 to 1

In [None]:
# create input placeholder variables
input_target = Input((1,))
input_context = Input((1,))
input_doc = Input((1,), dtype='int32')
labels = Input((1,))

In [None]:
# create word2`vec layers
embedding = layers.Embedding(vocab_size, vector_dim, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
word_context = embedding(input_context)
word_context = Reshape((vector_dim, 1))(word_context)
word_context

In [None]:
# create lda layers

scalar = 1 / np.sqrt(n_documents + n_topics)
all_doc_topics_embedding =(tf.Variable(tf.random_normal([n_documents, n_topics], mean=0, stddev=50*scalar),name="doc_embeddings",trainable=True))  # Gaussian distribution
#all_doc_topics_embedding = keras.layers.Embedding(n_documents, n_topics, embeddings_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1 / np.sqrt(n_documents + n_topics), seed=None))
#doc_topics = all_doc_topics_embedding(input_doc)
def embedding_lookup(x):
    ind = tf.cast(x, tf.int32)
    return tf.nn.embedding_lookup(all_doc_topics_embedding,ind,partition_strategy='mod',name="doc_proportions")

#doc_topics = all_doc_topics_embedding(input_doc)
print(input_doc)
doc_topics = keras.layers.Lambda(embedding_lookup)(input_doc)
doc_topics_norm = keras.layers.Activation(activation="softmax")(doc_topics)
#all_doc_topics_norm = keras.layers.Activation(activation="softmax",name="all_doc_topics_norm")(all_doc_topics)
#doc_topics_norm = keras.layers.Lambda(embedding_lookup)(input_doc)
transform = keras.layers.Dense(vector_dim, activation=None, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, bias_constraint=None)
topic_context = transform(doc_topics_norm)
topic_context = Reshape((vector_dim, 1))(topic_context)

In [None]:
# combine context layers
context = keras.layers.Add()([word_context, topic_context])

# now perform the dot product operation to get a similarity measure between target and context
similarity = layers.dot([target, context], axes=1, normalize=True)
similarity = Reshape((1,))(similarity)

# add the sigmoid output layer
preds = Dense(1, activation='sigmoid', name='similarity')(similarity)

In [None]:
# defnie custom loss functions

# lda loss model
lmbda = 1.0
fraction = 1/100000
alpha = None # defaults to 1/n_topics


def dirichlet_likelihood(weights, alpha=None):
    
    num_topics = n_topics
    
    if alpha is None:
        alpha = 1 / num_topics

    log_proportions = tf.nn.log_softmax(weights)

    loss = (alpha - 1) * log_proportions

    #return -tf.reduce_sum(loss) # log-sum-exp
    return tf.reduce_sum(loss) # log-sum-exp

def loss_lda(y_pred, y_true, topics_layer):
    return lmbda*fraction*dirichlet_likelihood(topics_layer)

def loss_word2vec(y_pred, y_true):
    #return tf.math.add(tf.math.multiply(y_true, (-tf.math.log(y_pred))), 
    #                   tf.math.multiply((1 - y_true),(-tf.math.log(1 - y_pred))))
    return keras.losses.binary_crossentropy(y_true, y_pred)
    
# lda2vec loss
def loss_sum(y_pred, y_true, topics_layer):
    word2vec_loss = loss_word2vec(y_pred, y_true)
    lda_loss = loss_lda(y_pred, y_true, topics_layer)
    sum_loss = word2vec_loss + lda_loss
    return sum_loss

In [None]:
# create evaluation models which are used to print out similar words during training.
# This is not needed for model training, but is used to check model outputs periodically to see if model is working

topic_context = Input(shape=(vector_dim, ))
topic_similarity = layers.dot([topic_context, word_context], axes=0)
topic2words_model = Model(input=[topic_context,input_context], output=topic_similarity)

words_similarity = layers.dot([target, word_context], axes=1, normalize=True)
nearby_words_model = Model(input=[input_target, input_context], output=words_similarity)


In [None]:
find_similar_words =['144a','active','adviser','allocation','country','region','arbitrage', \
      'asset','banking','index','passive','beta','bond', 'debt','brokerage', \
      'call','capitalization','cash','commodity','equity','close','collateral',\
      'company', 'interest','sector','conflict','investment', 'obligations', \
      'diversification','market','counterparty','currency','cybersecurity', \
      'derivative', 'diversification', 'expense','volatility','future','government', \
      'hedge', 'liquidity','insurance','issuer','legal','leverage','target', 'date',  \
      'fund','management','mortgage','over','counter','turnover','estate','settlement', \
      'short','swap','tax','yield']

In [None]:
# Evaluation functions to print similar words given a topic and similar words given another word
# This is not used for training, but for periodic evaluation of the model

class TopicSimilarityCallback:
    def run_sim(self, topics):
        for i in range(n_topics):
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(topics[i])
            nearest = (-sim).argsort()[0:top_k + 1]
            log_str = 'Closest words to topic %d:' % i
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)
    
    @staticmethod
    def _get_sim(topic):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.reshape(topic,(1,-1))
        in_arr2 = np.zeros((1,))
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = topic2words_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
t_sim_cb = TopicSimilarityCallback()


class WordsSimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 10  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[0:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = nearby_words_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
w_sim_cb = WordsSimilarityCallback()

class SpecificWordsSimilarityCallback:
    def run_sim(self, word):
        if word not in dictionary:
            print('Nearest to %s: Word does not exist in dictionary' % word)
            return
        word_index = dictionary[word]
        top_k = 10  # number of nearest neighbors
        sim = self._get_sim(word_index)
        nearest = (-sim).argsort()[0:top_k + 1]
        log_str = 'Nearest to %s:' % word
        for k in range(top_k):
            close_word = reverse_dictionary[nearest[k]]
            log_str = '%s %s,' % (log_str, close_word)
        print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = nearby_words_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
sw_sim_cb = SpecificWordsSimilarityCallback()

In [None]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [None]:
# THIS PART IS NOT BEING USED
# train model using keras (not being used since lda is not training)

def train_with_keras(): 
    
    model = Model(inputs=[input_target, input_context, input_doc], outputs=preds)
    model.compile(loss=loss_lda2vec(all_doc_topics_embedding), metrics=[loss_word2vec, loss_lda], optimizer='rmsprop')
    model.summary()
    
    iterations = 100000
    batch_size = 100
    train_loss_every = 100
    test_loss_every = 10000
    sum_loss = 0
    word2vec_loss = 0
    lda_loss = 0

    for cnt in range(iterations):

        # print out training loss
        if cnt % train_loss_every == 0 and cnt > 0:
            print("Iteration {}, average sum_loss={}, average word2vec_loss={}, average lda_loss={}".format(cnt, sum_loss/train_loss_every, word2vec_loss/train_loss_every, lda_loss/train_loss_every))
            print(all_doc_topics_embedding.get_weights()[0][1])
            print(K.eval(dirichlet_likelihood(all_doc_topics_embedding))*fraction)
            print(softmax(all_doc_topics_embedding.get_weights()[0][0]))

        # print out test loss and similar words
        if cnt % test_loss_every == 0 and cnt > 0:
            t_sim_cb.run_sim(transform.get_weights()[0])
            w_sim_cb.run_sim()
            test_loss = model.evaluate(x=[test_data_target, test_data_context, test_doc_ids], y=test_labels)
            print("Iteration {}, test_loss={}\n".format(cnt, test_loss))

        # training happens here
        idx = np.random.randint(0, len(train_labels)-1, batch_size).tolist()
        loss =  model.fit(x=[train_data_target[idx], train_data_context[idx], train_doc_ids[idx]], y=train_labels[idx],epochs=1)
    

In [None]:
# I couldn't get the model to train in Keras (error in LDA loss), so I trained it with TensorFlow instead. 
# Remember that Keras is running TensorFlow in the back end. This is a bit messy since the model was created 
#in Keras language, but it works well.

import math
batch_size = 150
train_loss_every = 500
test_loss_every = 10000

# define loss functions to compute
loss = loss_sum(preds, labels, all_doc_topics_embedding)
loss_topics = loss_lda(preds, labels, all_doc_topics_embedding)
loss_words = loss_word2vec(preds, labels)

# define gradient descent and initialize variables
train_step = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
init_op = tf.global_variables_initializer()
sess = K.get_session() # get session from keras
sess.run(init_op)

# Run training loop
with sess.as_default():
    for i in range(200000):
        
        idx = np.random.randint(0, len(train_labels)-1, batch_size).tolist()
        # training happens here
        losses = sess.run([train_step, loss,loss_topics, loss_words,preds,labels], feed_dict= {input_target:np.reshape(train_data_target[idx],(-1,1)),
                                   input_context:np.reshape(train_data_context[idx],(-1,1)),
                                   input_doc:np.reshape(train_doc_ids[idx],(-1,1)),
                                   labels: np.reshape(train_labels[idx],(-1,1))})
            
        # print training loss
        if i % train_loss_every == 0:
            print("Iteration {}, average sum_loss={}, average lda_loss={}, average w2v_loss={}".format(i,np.mean(losses[1]),np.mean(losses[2]),np.mean(losses[3])))
        
        # print test\loss and similar words
        if i % test_loss_every == 0:
            test_loss = sess.run([loss], feed_dict= {input_target:np.reshape(test_data_target,(-1,1)),
                                   input_context:np.reshape(test_data_context,(-1,1)),
                                   input_doc:np.reshape(test_doc_ids,(-1,1)),
                                   labels: np.reshape(test_labels,(-1,1))})
            print("\n\n******Iteration {}, test_loss={}****\n\n".format(i, np.mean(test_loss[0])))
            #w_sim_cb.run_sim()
            t_sim_cb.run_sim(transform.get_weights()[0])
            for word in find_similar_words:
                sw_sim_cb.run_sim(word)
            


In [None]:
dictionary['market']