In [34]:
import pandas as pd
import numpy as np

# Data Extraction 

In [4]:
data = pd.read_csv('Finance_BIG__arxiv.csv')

In [5]:
data.shape

(800, 4)

In [6]:
data.head(5)

Unnamed: 0,ID,Topic,Title,Content
0,1,Finance,meanreverting portfolio design budget constraint,paper considers meanreverting portfolio design...
1,2,Finance,visualizing treasury issuance strategy,introduce simple cost risk proxy metric attach...
2,3,Finance,pricing financial derivative subject counterpa...,article present generic model pricing financia...
3,4,Finance,machine learning portfolio allocation,find economically statistically significant ga...
4,5,Finance,brexit bremain evidence bubble analysis,applied johansenledoitsornette jls model detec...


# Cleaning 

In [7]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

# Pre-processing the data for modelling 

In [8]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/richachoudhary/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/richachoudhary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    #tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [11]:
data['tokens']=  data['Content'].apply(lambda x: prepare_text_for_lda(x))

# Bag-Of-words 

In [13]:
from gensim import corpora
dictionary = corpora.Dictionary(data['tokens'])
bow_corpus = data['tokens'].apply(lambda x: dictionary.doc2bow(x))
import pickle
pickle.dump(bow_corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [14]:
bow_corpus.shape

(800,)

In [15]:
import gensim
NUM_TOPICS = 8
ldamodel = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=8)
for topic in topics:
    print(topic)

(0, '0.021*"risk" + 0.021*"market" + 0.011*"model" + 0.009*"financial" + 0.009*"measure" + 0.008*"portfolio" + 0.006*"problem" + 0.006*"strategy"')
(1, '0.030*"default" + 0.021*"model" + 0.011*"pricing" + 0.009*"bond" + 0.009*"option" + 0.008*"volatility" + 0.008*"rate" + 0.007*"value"')
(2, '0.018*"order" + 0.015*"price" + 0.014*"model" + 0.012*"market" + 0.011*"trading" + 0.010*"limit" + 0.008*"strategy" + 0.008*"risk"')
(3, '0.022*"model" + 0.016*"option" + 0.015*"price" + 0.011*"method" + 0.010*"volatility" + 0.009*"risk" + 0.009*"portfolio" + 0.009*"pricing"')
(4, '0.020*"method" + 0.013*"equation" + 0.009*"numerical" + 0.008*"model" + 0.007*"solution" + 0.007*"differential" + 0.006*"option" + 0.006*"jump"')
(5, '0.023*"model" + 0.022*"risk" + 0.011*"market" + 0.008*"measure" + 0.008*"financial" + 0.007*"factor" + 0.006*"data" + 0.006*"price"')
(6, '0.016*"model" + 0.010*"volatility" + 0.009*"function" + 0.006*"stochastic" + 0.006*"study" + 0.006*"show" + 0.006*"price" + 0.006*"va

# TFIDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
#no_features = 5000
vectorizer = TfidfVectorizer()
tfidf= vectorizer.fit_transform(data['tokens'].astype(str))
tfidf_feature_names= vectorizer.get_feature_names()

In [17]:
data.head(3)

Unnamed: 0,ID,Topic,Title,Content,tokens
0,1,Finance,meanreverting portfolio design budget constraint,paper considers meanreverting portfolio design...,"[paper, consider, meanreverting, portfolio, de..."
1,2,Finance,visualizing treasury issuance strategy,introduce simple cost risk proxy metric attach...,"[introduce, simple, cost, risk, proxy, metric,..."
2,3,Finance,pricing financial derivative subject counterpa...,article present generic model pricing financia...,"[article, present, generic, model, pricing, fi..."


In [18]:
tfidf

<800x6158 sparse matrix of type '<class 'numpy.float64'>'
	with 48890 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
tfidf_ldamodel = LatentDirichletAllocation(n_components= NUM_TOPICS, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tfidf)
tfidf_ldamodel_W = tfidf_ldamodel.transform(tfidf)
tfidf_ldamodel_H = tfidf_ldamodel.components_


In [39]:
tfidf_ldamodel_H

array([[0.28309842, 0.26125912, 0.28484486, ..., 0.26007688, 0.24562994,
        0.25501813],
       [0.22950828, 0.24185732, 0.30331123, ..., 0.25177199, 0.27382197,
        0.24926117],
       [0.23989379, 0.25016408, 0.27635932, ..., 0.25785646, 0.23997069,
        0.25132361],
       ...,
       [0.28634036, 0.2480968 , 0.27761797, ..., 0.26525539, 0.25916306,
        0.26166557],
       [0.26673686, 0.26769597, 0.25588556, ..., 0.27605371, 0.31008991,
        0.23894745],
       [0.24926203, 0.26253853, 0.26105726, ..., 0.23545497, 0.25971216,
        0.22879432]])

In [31]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print (documents[doc_index])

In [35]:
no_top_words = 4
no_top_documents = 4
display_topics(tfidf_ldamodel_W, tfidf_ldamodel_H, tfidf_feature_names, data['tokens'], no_top_words, no_top_documents)

Topic 0:
ability abnormality abnormal abandon
['find', 'economically', 'statistically', 'significant', 'gain', 'using', 'machine', 'learning', 'portfolio', 'allocation', 'market', 'index', 'riskfree', 'asset', 'optimal', 'portfolio', 'rule', 'timevarying', 'expect', 'return', 'volatility', 'implement', 'two', 'random', 'forest', 'model', 'one', 'model', 'employ', 'forecasting', 'sign', 'probability', 'excess', 'return', 'payout', 'yield', 'second', 'use', 'construct', 'optimize', 'volatility', 'estimate', 'rewardrisk', 'timing', 'machine', 'learning', 'provide', 'substantial', 'improvement', 'buyandhold', 'utility', 'riskadjusted', 'return', 'maximum', 'drawdowns', 'paper', 'present', 'new', 'theoretical', 'basis', 'unify', 'framework', 'machine', 'learning', 'apply', 'return', 'volatilitytiming']
['develop', 'model', 'price', 'long', 'term', 'loan', 'security', 'lending', 'business', 'longer', 'horizon', 'deal', 'view', 'contract', 'optionality', 'embed', 'price', 'using', 'establish'

['find', 'economically', 'statistically', 'significant', 'gain', 'using', 'machine', 'learning', 'portfolio', 'allocation', 'market', 'index', 'riskfree', 'asset', 'optimal', 'portfolio', 'rule', 'timevarying', 'expect', 'return', 'volatility', 'implement', 'two', 'random', 'forest', 'model', 'one', 'model', 'employ', 'forecasting', 'sign', 'probability', 'excess', 'return', 'payout', 'yield', 'second', 'use', 'construct', 'optimize', 'volatility', 'estimate', 'rewardrisk', 'timing', 'machine', 'learning', 'provide', 'substantial', 'improvement', 'buyandhold', 'utility', 'riskadjusted', 'return', 'maximum', 'drawdowns', 'paper', 'present', 'new', 'theoretical', 'basis', 'unify', 'framework', 'machine', 'learning', 'apply', 'return', 'volatilitytiming']
['diversification', 'represent', 'idea', 'choose', 'variety', 'uniformity', 'within', 'theory', 'choice', 'desirability', 'diversification', 'axiomatized', 'preference', 'convex', 'combination', 'choice', 'equivalently', 'rank', 'corresp

['paper', 'consider', 'meanreverting', 'portfolio', 'design', 'problem', 'arise', 'statistical', 'arbitrage', 'financial', 'market', 'first', 'propose', 'general', 'problem', 'formulation', 'aim', 'finding', 'portfolio', 'underlie', 'component', 'asset', 'optimize', 'meanreversion', 'criterion', 'characterize', 'meanreversion', 'strength', 'taking', 'consideration', 'variance', 'portfolio', 'investment', 'budget', 'constraint', 'several', 'specific', 'problem', 'consider', 'base', 'general', 'formulation', 'efficient', 'algorithm', 'propose', 'numerical', 'result', 'synthetic', 'market', 'data', 'show', 'propose', 'meanreverting', 'portfolio', 'design', 'method', 'generate', 'consistent', 'profit', 'outperform', 'traditional', 'design', 'method', 'benchmark', 'method', 'literature']
['article', 'present', 'generic', 'model', 'pricing', 'financial', 'derivative', 'subject', 'counterparty', 'credit', 'risk', 'unilateral', 'bilateral', 'type', 'credit', 'risk', 'consider', 'study', 'show'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Predicting on test data using TFIDF

In [168]:
#https://stackoverflow.com/questions/40597075/python-sklearn-latent-dirichlet-allocation-transform-v-fittransform?noredirect=1&lq=1
# normalize the distribution (only needed if you want to work with the probabilities)
#doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [169]:
#To find the top ranking topic you can do something like:
#doc_topic_dist.argmax(axis=1)

In [170]:
##x = vectorizer.transform(prepare_text_for_lda('abstraction'))

##y = tfidf_ldamodel.transform(x)

##doc_topic_dist_unnormalized = np.matrix(y)
#https://stackoverflow.com/questions/40597075/python-sklearn-latent-dirichlet-allocation-transform-v-fittransform?noredirect=1&lq=1
# normalize the distribution (only needed if you want to work with the probabilities)
##doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
##doc_topic_dist.argmax(axis=1)

In [21]:
from sklearn.decomposition import NMF

nmf = NMF(n_components= NUM_TOPICS, random_state=42)
nmf.fit(tfidf )

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [41]:
nmf_W = nmf.transform(tfidf)
nmf_H = nmf.components_
display_topics(nmf_W, nmf_H, tfidf_feature_names, data['tokens'], no_top_words, no_top_documents)

Topic 0:
ability abandon abnormal abcem
['develop', 'model', 'price', 'long', 'term', 'loan', 'security', 'lending', 'business', 'longer', 'horizon', 'deal', 'view', 'contract', 'optionality', 'embed', 'price', 'using', 'establish', 'method', 'derivative', 'theory', 'become', 'limited', 'knowledge', 'first', 'application', 'lead', 'greater', 'synergy', 'operation', 'derivative', 'deltaone', 'trading', 'desk', 'perhaps', 'even', 'able', 'combine', 'certain', 'aspect', 'day', 'day', 'operation', 'seemingly', 'disparate', 'entity', 'run', 'numerical', 'simulation', 'demonstrate', 'practical', 'applicability', 'model', 'model', 'part', 'one', 'least', 'explore', 'yet', 'profit', 'laden', 'area', 'modern', 'investment', 'management', 'develop', 'heuristic', 'mitigate', 'loss', 'information', 'set', 'parameter', 'estimate', 'first', 'valuation', 'perform', 'directly', 'calculate', 'valuation', 'using', 'historical', 'time', 'series', 'lead', 'reduce', 'model', 'error', 'greater', 'financial'

['article', 'present', 'generic', 'model', 'pricing', 'financial', 'derivative', 'subject', 'counterparty', 'credit', 'risk', 'unilateral', 'bilateral', 'type', 'credit', 'risk', 'consider', 'study', 'show', 'credit', 'risk', 'model', 'american', 'style', 'option', 'case', 'require', 'backward', 'induction', 'valuation', 'correct', 'common', 'mistake', 'literature', 'emphasize', 'market', 'value', 'defaultable', 'derivative', 'actually', 'risky', 'value', 'rather', 'riskfree', 'value', 'credit', 'value', 'adjustment', 'cva', 'also', 'elaborate', 'practical', 'framework', 'develop', 'pricing', 'defaultable', 'derivative', 'calculate', 'cva', 'portfolio', 'level']
['find', 'economically', 'statistically', 'significant', 'gain', 'using', 'machine', 'learning', 'portfolio', 'allocation', 'market', 'index', 'riskfree', 'asset', 'optimal', 'portfolio', 'rule', 'timevarying', 'expect', 'return', 'volatility', 'implement', 'two', 'random', 'forest', 'model', 'one', 'model', 'employ', 'forecast

['paper', 'consider', 'meanreverting', 'portfolio', 'design', 'problem', 'arise', 'statistical', 'arbitrage', 'financial', 'market', 'first', 'propose', 'general', 'problem', 'formulation', 'aim', 'finding', 'portfolio', 'underlie', 'component', 'asset', 'optimize', 'meanreversion', 'criterion', 'characterize', 'meanreversion', 'strength', 'taking', 'consideration', 'variance', 'portfolio', 'investment', 'budget', 'constraint', 'several', 'specific', 'problem', 'consider', 'base', 'general', 'formulation', 'efficient', 'algorithm', 'propose', 'numerical', 'result', 'synthetic', 'market', 'data', 'show', 'propose', 'meanreverting', 'portfolio', 'design', 'method', 'generate', 'consistent', 'profit', 'outperform', 'traditional', 'design', 'method', 'benchmark', 'method', 'literature']
['develop', 'model', 'price', 'long', 'term', 'loan', 'security', 'lending', 'business', 'longer', 'horizon', 'deal', 'view', 'contract', 'optionality', 'embed', 'price', 'using', 'establish', 'method', 'de

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
display_topics(nmf, tfidf_feature_names, no_top_words)


0
['market', 'trading', 'price', 'agent', 'stock', 'financial', 'equilibrium', 'cost', 'impact', 'strategy']
1
['option', 'method', 'pricing', 'price', 'numerical', 'hedging', 'barrier', 'approximation', 'payoff', 'european']
2
['risk', 'measure', 'portfolio', 'systemic', 'financial', 'distribution', 'capital', 'var', 'set', 'allocation']
3
['problem', 'optimal', 'utility', 'portfolio', 'strategy', 'solution', 'function', 'optimization', 'asset', 'investment']
4
['default', 'credit', 'rate', 'model', 'bond', 'cd', 'interest', 'firm', 'valuation', 'pricing']
5
['volatility', 'model', 'imply', 'stochastic', 'process', 'local', 'heston', 'rough', 'expansion', 'fractional']
6
['alpha', 'factor', 'matrix', 'covariance', 'algorithm', 'return', 'portfolio', 'number', 'weight', 'model']
7
['order', 'limit', 'book', 'price', 'process', 'flow', 'liquidity', 'model', 'dynamic', 'large']


In [23]:
x = vectorizer.transform(prepare_text_for_lda('volatility model imply stochastic process local'))
y = nmf.transform(x)


In [24]:
y

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.36323999, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.08663379,
        0.24009755, 0.03417831, 0.00874053],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.13016862, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.0399035 , 0.        ,
        0.10783308, 0.        , 0.00125366],
       [0.        , 0.00818907, 0.        , 0.02153099, 0.        ,
        0.0708976 , 0.        , 0.07559755],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.08503563, 0.        , 0.        ]])

In [26]:
pd.set_option('display.max_colwidth', -1)
topic_values = nmf.transform(tfidf)
data['Topic'] = topic_values.argmax(axis=1)
data[['Title', 'Topic', 'Content']].head()

Unnamed: 0,Title,Topic,Content
0,meanreverting portfolio design budget constraint,3,paper considers meanreverting portfolio design problem arising statistical arbitrage financial market first propose general problem formulation aimed finding portfolio underlying component asset optimizing meanreversion criterion characterizing meanreversion strength taking consideration variance portfolio investment budget constraint several specific problem considered based general formulation efficient algorithm proposed numerical result synthetic market data show proposed meanreverting portfolio design method generate consistent profit outperform traditional design method benchmark method literature
1,visualizing treasury issuance strategy,2,introduce simple cost risk proxy metric attached treasury issuance strategy complement analysis resulting portfolio weightedaverage maturity wam metric based mapping issuance fraction longterm asymptotic portfolio implication cost risk mechanical debtrolling dynamic resulting mapping enables one visualize tradeoff involved contemplated issuance reallocation identify efficient frontier optimal tenor historical treasury issuance strategy analyzed empirically using cost risk metric illustrate change issuance need strategy translated structural shift cost risk stance treasury issuance
2,pricing financial derivative subject counterparty risk credit value adjustment,4,article present generic model pricing financial derivative subject counterparty credit risk unilateral bilateral type credit risk considered study show credit risk modeled american style option case require backward induction valuation correct common mistake literature emphasize market value defaultable derivative actually risky value rather riskfree value credit value adjustment cva also elaborated practical framework developed pricing defaultable derivative calculating cva portfolio level
3,machine learning portfolio allocation,5,find economically statistically significant gain using machine learning portfolio allocation market index riskfree asset optimal portfolio rule timevarying expected return volatility implemented two random forest model one model employed forecasting sign probability excess return payout yield second used construct optimized volatility estimate rewardrisk timing machine learning provides substantial improvement buyandhold utility riskadjusted return maximum drawdowns paper present new theoretical basis unifying framework machine learning applied return volatilitytiming
4,brexit bremain evidence bubble analysis,4,applied johansenledoitsornette jls model detect possible bubble crash related brexitbremain referendum scheduled june implementation includes enhanced model calibration using genetic algorithm selected historical financial series sensitive brexitbremain scenario representative multiple asset class found equity currency asset class show bubble signal rate credit real estate show superexponential behaviour instability typical bubble regime study suggests jls model equity currency market expect crash sharp rise following referendum result instead rate credit market consider referendum risky event expecting either bremain scenario brexit scenario edulcorated central bank intervention case real estate crash expected relationship referendum result unclear


In [193]:
data['Topic'].value_counts()

4    470
3    378
0    193
7    187
2    169
1    90 
6    89 
5    24 
Name: Topic, dtype: int64