In [1]:
import pandas as pd

# Data Extraction 

In [2]:
data = pd.read_csv('ComputerScience_BIG__arxiv.csv')

In [3]:
data.shape

(1600, 4)

In [4]:
data.head(5)

Unnamed: 0,ID,Topic,Title,Content
0,1,ComputerScience,draft task system item architecture tsia,execution task independent task application ex...
1,2,ComputerScience,compiler operating system third advance applic...,compiler operating system tsias third advance ...
2,3,ComputerScience,using propagation solving complex arithmetic c...,solving system nonlinear inequality important ...
3,4,ComputerScience,fast data moving beyond big data mapreduce,big data may solution many looking latest rise...
4,5,ComputerScience,brittle system analysis,goal paper define analyze system exhibit britt...


# Cleaning 

In [None]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

# Pre-processing the data for modelling 

In [7]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/richachoudhary/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/richachoudhary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    #tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [13]:
data['tokens']=  data['Content'].apply(lambda x: prepare_text_for_lda(x))

In [14]:
data.head(5)

Unnamed: 0,ID,Topic,Title,Content,tokens
0,1,ComputerScience,draft task system item architecture tsia,execution task independent task application ex...,"[execution, task, independent, task, applicati..."
1,2,ComputerScience,compiler operating system third advance applic...,compiler operating system tsias third advance ...,"[compiler, operate, system, tsias, third, adva..."
2,3,ComputerScience,using propagation solving complex arithmetic c...,solving system nonlinear inequality important ...,"[solving, system, nonlinear, inequality, impor..."
3,4,ComputerScience,fast data moving beyond big data mapreduce,big data may solution many looking latest rise...,"[big, data, may, solution, many, looking, late..."
4,5,ComputerScience,brittle system analysis,goal paper define analyze system exhibit britt...,"[goal, paper, define, analyze, system, exhibit..."


# Bag-Of-words 

In [35]:
from gensim import corpora
dictionary = corpora.Dictionary(data['tokens'])
bow_corpus = data['tokens'].apply(lambda x: dictionary.doc2bow(x))
import pickle
pickle.dump(bow_corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [44]:
bow_corpus

0       [(0, 1), (1, 1), (2, 7), (3, 2), (4, 1), (5, 1...
1       [(0, 1), (1, 1), (2, 9), (3, 1), (9, 3), (10, ...
2       [(1, 1), (37, 1), (38, 1), (74, 1), (83, 1), (...
3       [(25, 2), (30, 1), (38, 1), (64, 1), (74, 2), ...
4       [(8, 1), (38, 3), (67, 1), (71, 1), (116, 1), ...
                              ...                        
1595    [(2, 1), (16, 1), (19, 1), (37, 1), (38, 4), (...
1596    [(110, 1), (116, 1), (129, 3), (169, 1), (173,...
1597    [(48, 1), (90, 2), (104, 1), (116, 1), (140, 1...
1598    [(68, 1), (75, 1), (78, 1), (79, 1), (90, 1), ...
1599    [(2, 3), (3, 2), (25, 1), (38, 4), (41, 1), (4...
Name: tokens, Length: 1600, dtype: object

In [164]:
bow_corpus.shape

(1600,)

In [39]:
import gensim
NUM_TOPICS = 8
ldamodel = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=8)
for topic in topics:
    print(topic)

(0, '0.007*"model" + 0.007*"performance" + 0.006*"propose" + 0.006*"show" + 0.006*"use" + 0.005*"number" + 0.005*"network" + 0.005*"hardware"')
(1, '0.011*"algorithm" + 0.009*"problem" + 0.005*"result" + 0.005*"computing" + 0.005*"show" + 0.005*"complexity" + 0.005*"n" + 0.005*"graph"')
(2, '0.017*"system" + 0.017*"memory" + 0.014*"performance" + 0.010*"data" + 0.007*"application" + 0.006*"paper" + 0.006*"use" + 0.005*"architecture"')
(3, '0.009*"algorithm" + 0.008*"problem" + 0.007*"graph" + 0.006*"model" + 0.006*"show" + 0.006*"query" + 0.006*"number" + 0.005*"result"')
(4, '0.011*"system" + 0.008*"user" + 0.007*"data" + 0.005*"paper" + 0.005*"propose" + 0.005*"application" + 0.005*"task" + 0.004*"using"')
(5, '0.010*"model" + 0.008*"system" + 0.006*"research" + 0.005*"problem" + 0.005*"paper" + 0.005*"use" + 0.005*"present" + 0.005*"network"')
(6, '0.016*"algorithm" + 0.010*"system" + 0.009*"time" + 0.008*"task" + 0.008*"data" + 0.008*"scheduling" + 0.008*"problem" + 0.008*"approach

Predicting on test data using Bag-of-words 

In [40]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(90, 1), (435, 1), (436, 1), (808, 1), (897, 1), (7165, 1)]
[(0, 0.017901136), (1, 0.017891614), (2, 0.017903758), (3, 0.017896537), (4, 0.01789893), (5, 0.0178844), (6, 0.017914208), (7, 0.8747094)]


In [41]:
new_doc = 'solving black box problem normative framework explainable artificial intelligence'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(74, 1), (83, 1), (95, 1), (796, 1), (1871, 1), (2039, 1), (2203, 1), (2208, 1), (2216, 1)]
[(0, 0.012520391), (1, 0.012527472), (2, 0.6654342), (3, 0.01253471), (4, 0.012535758), (5, 0.012523551), (6, 0.25939345), (7, 0.01253049)]


# TFIDF

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer
#no_features = 5000
vectorizer = TfidfVectorizer()
tfidf= vectorizer.fit_transform(data['tokens'].astype(str))
tfidf_feature_names= vectorizer.get_feature_names()

In [178]:
data['tokens'].astype(str).head(3)

0    ['execution', 'task', 'independent', 'task', '...
1    ['compiler', 'operate', 'system', 'tsias', 'th...
2    ['solving', 'system', 'nonlinear', 'inequality...
Name: tokens, dtype: object

In [179]:
data.head(3)

Unnamed: 0,ID,Topic,Title,Content,tokens
0,1,ComputerScience,draft task system item architecture tsia,execution task independent task application ex...,"[execution, task, independent, task, applicati..."
1,2,ComputerScience,compiler operating system third advance applic...,compiler operating system tsias third advance ...,"[compiler, operate, system, tsias, third, adva..."
2,3,ComputerScience,using propagation solving complex arithmetic c...,solving system nonlinear inequality important ...,"[solving, system, nonlinear, inequality, impor..."


In [163]:
tfidf

<1600x12513 sparse matrix of type '<class 'numpy.float64'>'
	with 108744 stored elements in Compressed Sparse Row format>

In [154]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
tfidf_ldamodel = LatentDirichletAllocation(n_components= NUM_TOPICS, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tfidf)



In [155]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print (topic_idx)
        print ([" ".join([feature_names[i]])
                        for i in topic.argsort()[:-no_top_words - 1:-1]])

no_top_words = 10
display_topics(tfidf_ldamodel, tfidf_feature_names, no_top_words)

0
['oprema', 'czj', 'kortum', 'relay', 'kaemmerer', 'system', 'gdr', 'selenium', 'heinz', 'wilhelm']
1
['system', 'algorithm', 'performance', 'data', 'model', 'memory', 'problem', 'application', 'paper', 'use']
2
['data', 'memory', 'computation', 'performance', 'compiler', 'concern', 'algorithm', 'point', 'present', 'ontology']
3
['latch', 'invariant', 'web', 'infographics', 'gr', 'eusprig', 'aim', 'manage', 'circuit', 'efft']
4
['futamura', 'staging', 'projection', 'humancomputer', 'data', 'dialog', 'tag', 'need', 'task', 'development']
5
['memory', 'write', 'cache', 'performance', 'proof', 'time', 'abstraction', 'propose', 'attack', 'storage']
6
['edsger', 'wybe', 'dijkstra', 'mathsf', 'proponent', 'ptolemaic', 'philosopher', 'copernican', 'openminded', 'neutral']
7
['gesture', 'latch', 'business', 'gppc', 'issue', 'computing', 'reality', 'knn', 'circuit', 'tsias']


In [165]:
#test_tfidf= vectorizer.fit_transform(['Practical Bayesian Optimization of Machine Learning Algorithms', 'saas', 'maths'])
#tfidf_ldamodel.transform(test_tfidf)

Predicting on test data using TFIDF

In [166]:
#y = tfidf_ldamodel.transform(x)

In [140]:
y

array([[0.06259207, 0.06258221, 0.06261731, 0.06256668, 0.56191697,
        0.06256809, 0.06258385, 0.06257283],
       [0.06259836, 0.06259344, 0.06260903, 0.06256334, 0.06259348,
        0.56180959, 0.06262626, 0.06260649],
       [0.06258893, 0.56178158, 0.06265892, 0.06257623, 0.06258473,
        0.06258699, 0.06260892, 0.0626137 ],
       [0.06257026, 0.06259051, 0.06256751, 0.56186049, 0.06259097,
        0.06258922, 0.0625984 , 0.06263263],
       [0.06261577, 0.06259084, 0.06263428, 0.5618239 , 0.06256778,
        0.06258343, 0.0626068 , 0.06257719],
       [0.56188934, 0.06262404, 0.06255488, 0.06257612, 0.06258879,
        0.06259943, 0.06257093, 0.06259648]])

In [167]:
#doc_topic_dist_unnormalized = np.matrix(y)

In [168]:
#https://stackoverflow.com/questions/40597075/python-sklearn-latent-dirichlet-allocation-transform-v-fittransform?noredirect=1&lq=1
# normalize the distribution (only needed if you want to work with the probabilities)
#doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [169]:
#To find the top ranking topic you can do something like:
#doc_topic_dist.argmax(axis=1)

In [170]:
##x = vectorizer.transform(prepare_text_for_lda('abstraction'))

##y = tfidf_ldamodel.transform(x)

##doc_topic_dist_unnormalized = np.matrix(y)
#https://stackoverflow.com/questions/40597075/python-sklearn-latent-dirichlet-allocation-transform-v-fittransform?noredirect=1&lq=1
# normalize the distribution (only needed if you want to work with the probabilities)
##doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
##doc_topic_dist.argmax(axis=1)

In [171]:
from sklearn.decomposition import NMF

nmf = NMF(n_components= NUM_TOPICS, random_state=42)
nmf.fit(tfidf )

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [173]:
display_topics(nmf, tfidf_feature_names, no_top_words)

0
['system', 'file', 'operate', 'kernel', 'device', 'application', 'hardware', 'virtualization', 'service', 'io']
1
['task', 'scheduling', 'realtime', 'schedulability', 'multiprocessor', 'scheduler', 'processor', 'deadline', 'propose', 'resource']
2
['memory', 'performance', 'data', 'cache', 'access', 'page', 'application', 'nvm', 'dram', 'energy']
3
['algorithm', 'problem', 'polynomial', 'method', 'complexity', 'matrix', 'graph', 'number', 'implementation', 'result']
4
['data', 'language', 'research', 'software', 'user', 'programming', 'model', 'analysis', 'science', 'information']
5
['time', 'round', 'robin', 'algorithm', 'scheduling', 'rr', 'quantum', 'cpu', 'average', 'waiting']
6
['game', 'player', 'agent', 'strategy', 'problem', 'equilibrium', 'graph', 'nash', 'automaton', 'winning']
7
['network', 'neural', 'music', 'model', 'deep', 'feature', 'learning', 'energy', 'architecture', 'node']


In [180]:
prepare_text_for_lda('music deep energy network')

['music', 'deep', 'energy', 'network']

In [177]:
x = vectorizer.transform(prepare_text_for_lda('music deep energy network'))
y = nmf.transform(x)


In [175]:
y

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.08277931],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.07343387],
       [0.        , 0.01997873, 0.03261225, 0.        , 0.        ,
        0.        , 0.        , 0.05632942],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.32630181]])

In [192]:
topic_values = nmf.transform(tfidf)
data['Topic'] = topic_values.argmax(axis=1)
data[['Title', 'Topic', 'Content']].head()

Unnamed: 0,Title,Topic,Content
0,draft task system item architecture tsia,1,execution task independent task application executes term task application definition free detail execution many project demonstrated task system t provide application parallel distributed heterogeneous adaptive dynamic realtime interactive reliable secure execution task consists item thus application defined term item item architecture ia support array routine structure item thus allowing structured application definition taking property many project support extend currying application defined type conditional item stream definition element task system item architecture tsia thus promise unprecedented level support application execution definition
1,compiler operating system third advance application support,0,compiler operating system tsias third advance application support compiler support high level application definition programming language operating system support high level interface resource used application execution task system item architecture tsia provides application transparent reliable distributed heterogeneous adaptive dynamic realtime interactive parallel secure execution addition supporting application execution tsia also support application definition runtime support definition complementary compiletime support compiler example allows language similar fortran c deliver feature promised functional computing many tsias exist previously recognized served particular type application existing tsias project demonstrate tsias feasible application next paradigm application support tsia simplifies unifies existing computing practice research solving many outstanding problem tsia open many many new opportunity computing
2,using propagation solving complex arithmetic constraint,3,solving system nonlinear inequality important problem conventional numerical analysis satisfactory method boxconsistency algorithm one compute cover solution set arbitrarily close approximation difficulty use propagation complex arithmetic expression box consistency computed interval arithmetic paper present theorem support simple modification propagation allows complex arithmetic expression handled efficiently version box consistency obtained way stronger interval arithmetic used
3,fast data moving beyond big data mapreduce,4,big data may solution many looking latest rise big data method system partly due new ability technique provide partly simplicity software design partly buzzword value investor client said popularity measure suitability big data approach might best solution even applicable one many common problem namely time dependent problem whose solution may bound cached manner benefit greatly moving partly stateless flow oriented function data model paper present model substitute traditional mapshufflereduce model
4,brittle system analysis,0,goal paper define analyze system exhibit brittle behavior behavior characterized sudden steep decline performance system approach limit tolerance due input parameter exceed specified input environmental condition exceed specified operating boundary analogy made brittle commmunication system particular material science


In [191]:
pd.set_option('display.max_colwidth', -1)

In [193]:
data['Topic'].value_counts()

4    470
3    378
0    193
7    187
2    169
1    90 
6    89 
5    24 
Name: Topic, dtype: int64