## General Utilities

In [1]:
import numpy as np
import pandas as pd
import csv
import json

In [2]:
def print_first_n_dict_items(d: dict, dict_name: str, n: int):
    print("\nPrinting first", n, "items of dict named", dict_name, "\n========================================================")
    i = 0
    for key, value in d.items():
        print(key, ":", value)
        i += 1
        if i == n:
            break

In [3]:
def print_document_word_scores(document, doc_word_dict, mode):
    if mode == 'BOW':
        for i in range(len(document)):
            print("Word {} (\"{}\") appears {} time.".format(document[i][0], doc_word_dict[document[i][0]], document[i][1]))
    elif mode == 'TF_IDF':
        for i in range(len(document)):
            print("Word {} (\"{}\")'s TF-IDF is {}.".format(document[i][0], doc_word_dict[document[i][0]], document[i][1]))

In [4]:
def print_topics_from_model(model):
    for idx, topic in model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

## Load Data

In [5]:
f = open("./data/titles_lemmatized.json","r")
title_orig_lemmatized_map = json.load(f)
f.close()

In [6]:
type(title_orig_lemmatized_map)
len(title_orig_lemmatized_map)

1952578

In [7]:
f = open("./data/keyword_counts.json","r")
keyword_counts_raw = json.load(f)
f.close()

In [8]:
type(keyword_counts_raw)
len(keyword_counts_raw)

448770

In [9]:
print_first_n_dict_items(keyword_counts_raw, "Keyword-counts", 10)


Printing first 10 items of dict named Keyword-counts 
system : 191799
network : 166093
model : 111842
analysis : 101884
algorithm : 97362
data : 83604
method : 81866
approach : 73121
problem : 66562
application : 66391


In [10]:
print_first_n_dict_items(title_orig_lemmatized_map, "Titles: Original-Lemmatized Map", 10)


Printing first 10 items of dict named Titles: Original-Lemmatized Map 
Parallel Integer Sorting and Simulation Amongst CRCW Models : ['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model']
Pattern Matching in Trees and Nets : ['pattern', 'matching', 'tree', 'net']
NP-complete Problems Simplified on Tree Schemas : ['np-complete', 'problem', 'simplified', 'tree', 'schema']
On the Power of Chain Rules in Context Free Grammars : ['power', 'chain', 'rule', 'context', 'free', 'grammar']
Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2 : ['schnelle', 'multiplikation', 'von', 'polynomen', 'über', 'körpern', 'der', 'charakteristik', '2']
A characterization of rational D0L power series : ['characterization', 'rational', 'd0l', 'power', 'series']
The Derivation of Systolic Implementations of Programs : ['derivation', 'systolic', 'implementation', 'program']
Fifo Nets Without Order Deadlock : ['fifo', 'net', 'order', 'deadlock']
On the Complementation Rule for Mul

In [11]:
title_lemma = list(title_orig_lemmatized_map.values())
title_lemma[:10]

[['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model'],
 ['pattern', 'matching', 'tree', 'net'],
 ['np-complete', 'problem', 'simplified', 'tree', 'schema'],
 ['power', 'chain', 'rule', 'context', 'free', 'grammar'],
 ['schnelle',
  'multiplikation',
  'von',
  'polynomen',
  'über',
  'körpern',
  'der',
  'charakteristik',
  '2'],
 ['characterization', 'rational', 'd0l', 'power', 'series'],
 ['derivation', 'systolic', 'implementation', 'program'],
 ['fifo', 'net', 'order', 'deadlock'],
 ['complementation',
  'rule',
  'multivalued',
  'dependency',
  'database',
  'relation'],
 ['equational', 'weighted', 'tree', 'transformation']]

### Random checks

In [12]:
i = 0
for orig_title, lemmatized_title in title_orig_lemmatized_map.items():
    if 'de' in lemmatized_title:
        print(orig_title)
        i += 1
    if i == 10:
        break

## Bag-of-Words (BOW)

In [13]:
import gensim
from gensim import corpora, models

In [14]:
title_lemma_dict = gensim.corpora.Dictionary(title_lemma)

In [15]:
count = 0
for k, v in title_lemma_dict.iteritems():
    print(k, v)
    count += 1
    if count > 15:
        break

0 crcw
1 integer
2 model
3 parallel
4 simulation
5 sorting
6 matching
7 net
8 pattern
9 tree
10 np-complete
11 problem
12 schema
13 simplified
14 chain
15 context


### Filter extremes - Filter tokens that appear in:
1. fewer than 15 documents (absolute number) or,
2. more than 0.5 documents (fraction of total corpus size, not absolute number).
3. After the above two steps, keep only the first 100000 most frequent tokens.

In [16]:
title_lemma_dict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [17]:
len(title_lemma_dict)

31010

### Doc2BOW

In [18]:
bow_corpus = [title_lemma_dict.doc2bow(lemmatized_title_word) for lemmatized_title_word in title_lemma]

In [19]:
bow_corpus[4310]

[(67, 1),
 (260, 1),
 (282, 1),
 (296, 1),
 (1741, 1),
 (2060, 1),
 (3143, 1),
 (4115, 1)]

In [20]:
len(bow_corpus)

1952578

In [21]:
print_document_word_scores(bow_corpus[4310], title_lemma_dict, mode='BOW')

Word 67 ("algorithm") appears 1 time.
Word 260 ("execution") appears 1 time.
Word 282 ("composite") appears 1 time.
Word 296 ("service") appears 1 time.
Word 1741 ("partitioning") appears 1 time.
Word 2060 ("web") appears 1 time.
Word 3143 ("genetic") appears 1 time.
Word 4115 ("decentralized") appears 1 time.


## TF-IDF

In [22]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [23]:
len(corpus_tfidf)
type(corpus_tfidf)

gensim.interfaces.TransformedCorpus

In [24]:
from pprint import pprint

In [25]:
for doc in corpus_tfidf[:3]:
    pprint(doc)
    #break

[(0, 0.5308074793145152),
 (1, 0.2462960348359242),
 (2, 0.3895247938433522),
 (3, 0.3755981642789065),
 (4, 0.6039686557260081)]
[(5, 0.5067294312228873),
 (6, 0.5737547463930259),
 (7, 0.450967134840957),
 (8, 0.4589764893949818)]
[(8, 0.32738298819751377),
 (9, 0.59199325173433),
 (10, 0.23170406395473353),
 (11, 0.49497688257556743),
 (12, 0.4936349677671899)]


In [26]:
print_document_word_scores(corpus_tfidf[4310], title_lemma_dict, mode='TF_IDF')

Word 67 ("algorithm")'s TF-IDF is 0.1916219769949481.
Word 260 ("execution")'s TF-IDF is 0.42409763898777847.
Word 282 ("composite")'s TF-IDF is 0.4173157286634347.
Word 296 ("service")'s TF-IDF is 0.27425479081388066.
Word 1741 ("partitioning")'s TF-IDF is 0.40872772221798553.
Word 2060 ("web")'s TF-IDF is 0.2978792503904838.
Word 3143 ("genetic")'s TF-IDF is 0.33236993722554525.
Word 4115 ("decentralized")'s TF-IDF is 0.40962657258687246.


## LDA on BOW

In [82]:
lda_model_bow = gensim.models.LdaMulticore(bow_corpus, id2word=title_lemma_dict, num_topics=10, passes=2, workers=2)

In [83]:
print_topics_from_model(lda_model_bow)

Topic: 0 
Words: 0.060*"graph" + 0.022*"set" + 0.021*"recognition" + 0.016*"function" + 0.015*"space" + 0.014*"number" + 0.012*"pattern" + 0.010*"algorithm" + 0.009*"theorem" + 0.009*"group"
Topic: 1 
Words: 0.055*"network" + 0.025*"system" + 0.020*"wireless" + 0.020*"sensor" + 0.017*"scheme" + 0.015*"based" + 0.014*"channel" + 0.014*"communication" + 0.013*"performance" + 0.013*"power"
Topic: 2 
Words: 0.047*"image" + 0.021*"based" + 0.021*"classification" + 0.019*"feature" + 0.016*"selection" + 0.014*"method" + 0.013*"automatic" + 0.012*"point" + 0.011*"3d" + 0.011*"detection"
Topic: 3 
Words: 0.020*"system" + 0.017*"design" + 0.017*"issue" + 0.016*"special" + 0.015*"test" + 0.013*"analysis" + 0.013*"computer" + 0.012*"noise" + 0.012*"measurement" + 0.011*"simulation"
Topic: 4 
Words: 0.062*"system" + 0.038*"control" + 0.021*"nonlinear" + 0.020*"linear" + 0.019*"method" + 0.017*"solution" + 0.017*"class" + 0.013*"equation" + 0.013*"problem" + 0.013*"order"
Topic: 5 
Words: 0.049*"alg

## LDA on TF-IDF

In [44]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, id2word=title_lemma_dict, num_topics=8, passes=2, workers=4)

In [45]:
print_topics_from_model(lda_model_tfidf)

Topic: 0 
Words: 0.014*"network" + 0.011*"wireless" + 0.011*"system" + 0.008*"channel" + 0.007*"communication" + 0.007*"sensor" + 0.007*"performance" + 0.006*"power" + 0.005*"und" + 0.005*"radio"
Topic: 1 
Words: 0.009*"system" + 0.008*"model" + 0.008*"polynomial" + 0.006*"control" + 0.006*"analysis" + 0.005*"time" + 0.005*"nonlinear" + 0.005*"estimation" + 0.005*"algorithm" + 0.005*"delay"
Topic: 2 
Words: 0.013*"image" + 0.006*"segmentation" + 0.005*"based" + 0.005*"feature" + 0.005*"retrieval" + 0.005*"shape" + 0.004*"object" + 0.004*"data" + 0.004*"code" + 0.004*"classification"
Topic: 3 
Words: 0.010*"algorithm" + 0.008*"network" + 0.007*"optimization" + 0.007*"problem" + 0.006*"routing" + 0.005*"system" + 0.005*"based" + 0.005*"control" + 0.004*"vehicle" + 0.004*"swarm"
Topic: 4 
Words: 0.009*"recognition" + 0.008*"robot" + 0.007*"neural" + 0.007*"learning" + 0.006*"human" + 0.006*"special" + 0.006*"system" + 0.006*"detection" + 0.006*"brain" + 0.006*"based"
Topic: 5 
Words: 0.00

## Performance Evaluation

In [49]:
def generate_topic_prediction_scores(document, model):
    for index, score in sorted(model[document], key=lambda tup: -1*tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, model.print_topic(index, 10)))

### 1. LDA(BOW)

In [76]:
random_doc_id = 4310
title_lemma[random_doc_id]
#print_document_word_counts(bow_corpus[4310], title_lemma_dict, mode='BOW')

['partitioning',
 'composite',
 'web',
 'service',
 'decentralized',
 'execution',
 'genetic',
 'algorithm']

In [77]:
lda_model_bow[bow_corpus[random_doc_id]]

[(13, 0.1122162),
 (16, 0.11222506),
 (18, 0.11222527),
 (35, 0.11221849),
 (39, 0.11222506),
 (40, 0.11222008),
 (76, 0.11222123),
 (81, 0.112223156)]

In [78]:
generate_topic_prediction_scores(bow_corpus[random_doc_id], lda_model_bow)


Score: 0.11222528666257858	 
Topic: 0.383*"algorithm" + 0.235*"problem" + 0.121*"efficient" + 0.046*"spatial" + 0.037*"solving" + 0.033*"simple" + 0.030*"minimum" + 0.027*"temporal" + 0.009*"searching" + 0.009*"reflection"

Score: 0.11222506314516068	 
Topic: 0.273*"service" + 0.062*"training" + 0.048*"receiver" + 0.047*"variation" + 0.043*"recovery" + 0.038*"variational" + 0.035*"maintenance" + 0.035*"estimate" + 0.033*"loop" + 0.032*"controlled"

Score: 0.11222506314516068	 
Topic: 0.269*"web" + 0.051*"formula" + 0.051*"site" + 0.050*"attribute" + 0.047*"usability" + 0.046*"price" + 0.036*"limited" + 0.033*"consideration" + 0.024*"company" + 0.024*"application"

Score: 0.11222314089536667	 
Topic: 0.248*"detection" + 0.116*"novel" + 0.097*"complex" + 0.075*"genetic" + 0.070*"interactive" + 0.042*"system" + 0.041*"based" + 0.037*"operator" + 0.034*"convergence" + 0.032*"approach"

Score: 0.11222121864557266	 
Topic: 0.263*"test" + 0.103*"series" + 0.093*"ii" + 0.071*"simultaneous" + 

### 2. LDA (TF-IDF)

In [60]:
print_document_word_counts(corpus_tfidf[4310], title_lemma_dict, mode='TF_IDF')

Word 67 ("algorithm")'s TF-IDF is 0.1916219769949481.
Word 261 ("execution")'s TF-IDF is 0.42409763898777847.
Word 283 ("composite")'s TF-IDF is 0.4173157286634347.
Word 297 ("service")'s TF-IDF is 0.27425479081388066.
Word 1742 ("partitioning")'s TF-IDF is 0.40872772221798553.
Word 2061 ("web")'s TF-IDF is 0.2978792503904838.
Word 3144 ("genetic")'s TF-IDF is 0.33236993722554525.
Word 4116 ("decentralized")'s TF-IDF is 0.40962657258687246.


In [79]:
lda_model_tfidf[corpus_tfidf[random_doc_id]]

[(0, 0.7662858),
 (1, 0.03337222),
 (2, 0.033349838),
 (3, 0.03349388),
 (4, 0.033338945),
 (5, 0.033419743),
 (6, 0.03340059),
 (7, 0.033338983)]

In [80]:
generate_topic_prediction_scores(corpus_tfidf[random_doc_id], lda_model_tfidf)


Score: 0.7662761211395264	 
Topic: 0.014*"network" + 0.011*"wireless" + 0.011*"system" + 0.008*"channel" + 0.007*"communication" + 0.007*"sensor" + 0.007*"performance" + 0.006*"power" + 0.005*"und" + 0.005*"radio"

Score: 0.03350401297211647	 
Topic: 0.010*"algorithm" + 0.008*"network" + 0.007*"optimization" + 0.007*"problem" + 0.006*"routing" + 0.005*"system" + 0.005*"based" + 0.005*"control" + 0.004*"vehicle" + 0.004*"swarm"

Score: 0.03341921046376228	 
Topic: 0.008*"system" + 0.008*"information" + 0.008*"software" + 0.006*"research" + 0.006*"study" + 0.006*"technology" + 0.006*"management" + 0.006*"social" + 0.006*"development" + 0.005*"knowledge"

Score: 0.03340056911110878	 
Topic: 0.018*"graph" + 0.009*"set" + 0.007*"logic" + 0.007*"number" + 0.006*"tree" + 0.005*"algorithm" + 0.005*"theorem" + 0.005*"cycle" + 0.005*"problem" + 0.005*"complexity"

Score: 0.033372242003679276	 
Topic: 0.009*"system" + 0.008*"model" + 0.008*"polynomial" + 0.006*"control" + 0.006*"analysis" + 0.00

## Test Predictions on unseen document

In [97]:
# test_title = 'TensorFlow - A system for large-scale machine learning'
# test_title_lemma = ['TensorFlow', 'system', 'large', 'scale', 'machine', 'learning']

test_title = 'An efficient algorithm to improve CPU cache hit-rate in x86 architectures'
test_title_lemma = ['efficient', 'algorithm', 'improve', 'CPU', 'cache', 'hit-rate', 'x86', 'architecture']

In [98]:
test_title_lemma_dict = gensim.corpora.Dictionary([test_title_lemma])
test_bow_vector = test_title_lemma_dict.doc2bow(test_title_lemma)

### 1. LDA(BOW) Test

In [99]:
lda_model_bow.get_document_topics(test_bow_vector)

[(0, 0.19466658),
 (1, 0.011112118),
 (2, 0.011115115),
 (3, 0.011116207),
 (4, 0.0111123705),
 (5, 0.533563),
 (6, 0.1939754),
 (7, 0.011112384),
 (8, 0.011114642),
 (9, 0.011112221)]

In [100]:
generate_topic_prediction_scores(test_bow_vector, lda_model_bow)


Score: 0.533555805683136	 
Topic: 0.049*"algorithm" + 0.036*"problem" + 0.030*"optimization" + 0.024*"network" + 0.022*"time" + 0.019*"model" + 0.018*"robot" + 0.017*"dynamic" + 0.015*"neural" + 0.012*"approach"

Score: 0.19465483725070953	 
Topic: 0.060*"graph" + 0.022*"set" + 0.021*"recognition" + 0.016*"function" + 0.015*"space" + 0.014*"number" + 0.012*"pattern" + 0.010*"algorithm" + 0.009*"theorem" + 0.009*"group"

Score: 0.19399428367614746	 
Topic: 0.023*"code" + 0.019*"algorithm" + 0.019*"tree" + 0.016*"search" + 0.016*"complexity" + 0.014*"machine" + 0.014*"random" + 0.013*"property" + 0.012*"polynomial" + 0.012*"vector"

Score: 0.011116203851997852	 
Topic: 0.020*"system" + 0.017*"design" + 0.017*"issue" + 0.016*"special" + 0.015*"test" + 0.013*"analysis" + 0.013*"computer" + 0.012*"noise" + 0.012*"measurement" + 0.011*"simulation"

Score: 0.011115114204585552	 
Topic: 0.047*"image" + 0.021*"based" + 0.021*"classification" + 0.019*"feature" + 0.016*"selection" + 0.014*"metho

### 2. LDA(TF-IDF) Test

In [74]:
generate_topic_prediction_scores(test_bow_vector, lda_model_tfidf)


Score: 0.8747457265853882	 
Topic: 0.018*"graph" + 0.009*"set" + 0.007*"logic" + 0.007*"number" + 0.006*"tree" + 0.005*"algorithm" + 0.005*"theorem" + 0.005*"cycle" + 0.005*"problem" + 0.005*"complexity"

Score: 0.0179138220846653	 
Topic: 0.009*"recognition" + 0.008*"robot" + 0.007*"neural" + 0.007*"learning" + 0.006*"human" + 0.006*"special" + 0.006*"system" + 0.006*"detection" + 0.006*"brain" + 0.006*"based"

Score: 0.017906084656715393	 
Topic: 0.010*"algorithm" + 0.008*"network" + 0.007*"optimization" + 0.007*"problem" + 0.006*"routing" + 0.005*"system" + 0.005*"based" + 0.005*"control" + 0.004*"vehicle" + 0.004*"swarm"

Score: 0.017888925969600677	 
Topic: 0.014*"network" + 0.011*"wireless" + 0.011*"system" + 0.008*"channel" + 0.007*"communication" + 0.007*"sensor" + 0.007*"performance" + 0.006*"power" + 0.005*"und" + 0.005*"radio"

Score: 0.017888149246573448	 
Topic: 0.008*"system" + 0.008*"information" + 0.008*"software" + 0.006*"research" + 0.006*"study" + 0.006*"technology"

## Generate topics for top keywords

In [27]:
lda_model_bow = models.LdaModel.load('./model/lda_bow.model', mmap='r')

In [28]:
# Sort as it is a dict() and order need not have been preserved
keyword_counts = sorted(keyword_counts_raw.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
keyword_counts

[('system', 191799),
 ('network', 166093),
 ('model', 111842),
 ('analysis', 101884),
 ('algorithm', 97362),
 ('data', 83604),
 ('method', 81866),
 ('approach', 73121),
 ('problem', 66562),
 ('application', 66391),
 ('control', 65290),
 ('learning', 60695),
 ('design', 60176),
 ('image', 55137),
 ('information', 53651),
 ('dynamic', 45930),
 ('graph', 45692),
 ('detection', 42851),
 ('study', 41767),
 ('new', 41368),
 ('performance', 40169),
 ('optimization', 37933),
 ('wireless', 36651),
 ('neural', 36613),
 ('estimation', 34629),
 ('adaptive', 34213),
 ('efficient', 33874),
 ('fuzzy', 33855),
 ('sensor', 31748),
 ('linear', 31313),
 ('modeling', 31255),
 ('function', 31018),
 ('scheme', 29853),
 ('optimal', 29646),
 ('mobile', 29518),
 ('management', 29274),
 ('distributed', 29161),
 ('time', 29007),
 ('communication', 28883),
 ('channel', 28602),
 ('process', 28520),
 ('equation', 27539),
 ('service', 27432),
 ('framework', 26778),
 ('power', 26750),
 ('classification', 26229),
 ('e

In [29]:
top_keywords = [word_count[0] for word_count in keyword_counts[:100]]
print(top_keywords)

['system', 'network', 'model', 'analysis', 'algorithm', 'data', 'method', 'approach', 'problem', 'application', 'control', 'learning', 'design', 'image', 'information', 'dynamic', 'graph', 'detection', 'study', 'new', 'performance', 'optimization', 'wireless', 'neural', 'estimation', 'adaptive', 'efficient', 'fuzzy', 'sensor', 'linear', 'modeling', 'function', 'scheme', 'optimal', 'mobile', 'management', 'distributed', 'time', 'communication', 'channel', 'process', 'equation', 'service', 'framework', 'power', 'classification', 'evaluation', 'set', 'structure', 'software', 'technique', 'simulation', 'code', 'multiple', 'environment', 'computing', 'recognition', 'solution', 'robust', 'nonlinear', 'effect', 'theory', 'machine', 'feature', 'case', 'technology', 'social', 'parallel', 'development', 'architecture', 'scheduling', 'selection', 'hybrid', 'programming', 'digital', 'decision', 'space', 'signal', 'web', 'search', 'protocol', 'novel', 'prediction', 'online', 'logic', 'knowledge', '

In [30]:
top_keyword_topics = dict()
for top_keyword in top_keywords:
    top_keyword_topics[top_keyword] = sorted(lda_model_bow.get_term_topics(top_keyword), key=lambda tup:tup[1], reverse=True)

In [31]:
top_keyword_topics

{'system': [(9, 0.14086567),
  (0, 0.033819497),
  (7, 0.033084434),
  (10, 0.017416175),
  (2, 0.00021401412),
  (3, 9.423286e-06),
  (6, 4.832772e-06)],
 'network': [(10, 0.09500348), (8, 0.025520677)],
 'model': [(2, 0.030992398),
  (1, 0.030761773),
  (9, 0.013784084),
  (8, 0.009588864),
  (3, 0.008281472),
  (0, 0.008184194),
  (6, 0.006767093),
  (11, 0.0030651824),
  (10, 0.00012178734),
  (12, 2.3604172e-05),
  (5, 6.0191064e-06),
  (7, 1.5290514e-07)],
 'analysis': [(2, 0.019118698),
  (3, 0.016167935),
  (1, 0.014031882),
  (0, 0.00980979),
  (9, 0.00855227),
  (8, 0.007563124),
  (7, 0.007501577),
  (4, 0.0071428004),
  (10, 0.006907436),
  (11, 0.005732793),
  (6, 0.0052868496),
  (12, 0.0025187656),
  (5, 0.00247623)],
 'algorithm': [(6, 0.054443173),
  (5, 0.015100211),
  (8, 0.01159434),
  (0, 0.010894981),
  (10, 0.009656792),
  (3, 1.16432425e-07)],
 'data': [(1, 0.061110325),
  (4, 0.019647451),
  (2, 0.015235735),
  (8, 8.657257e-07)],
 'method': [(6, 0.037847985),


In [32]:
top_keyword_top_topic = {term:-1 if topics == [] else topics[0][0] for term,topics in top_keyword_topics.items()}
top_keyword_top_topic

{'system': 9,
 'network': 10,
 'model': 2,
 'analysis': 2,
 'algorithm': 6,
 'data': 1,
 'method': 6,
 'approach': 2,
 'problem': 6,
 'application': 7,
 'control': 9,
 'learning': 2,
 'design': 7,
 'image': 8,
 'information': 12,
 'dynamic': 9,
 'graph': 5,
 'detection': 8,
 'study': 11,
 'new': 6,
 'performance': 3,
 'optimization': 6,
 'wireless': 10,
 'neural': 8,
 'estimation': 3,
 'adaptive': 9,
 'efficient': 0,
 'fuzzy': 9,
 'sensor': 10,
 'linear': 6,
 'modeling': 2,
 'function': 5,
 'scheme': 0,
 'optimal': 9,
 'mobile': 10,
 'management': 1,
 'distributed': 10,
 'time': 6,
 'communication': 10,
 'channel': 3,
 'process': 1,
 'equation': 6,
 'service': 12,
 'framework': 1,
 'power': 10,
 'classification': 8,
 'evaluation': 7,
 'set': 5,
 'structure': 1,
 'software': 7,
 'technique': 4,
 'simulation': 3,
 'code': 3,
 'multiple': 10,
 'environment': 7,
 'computing': 0,
 'recognition': 8,
 'solution': 6,
 'robust': 9,
 'nonlinear': 6,
 'effect': 3,
 'theory': 11,
 'machine': 2,
 '

## Save Outputs

### 1. Trained Models

In [125]:
lda_model_bow.save('./model/lda_bow.model')

In [37]:
# Just testing if it can be read
test_model_read = models.LdaModel.load('./model/lda_bow.model', mmap='r')
for idx, topic in test_model_read.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [126]:
lda_model_tfidf.save('./model/lda_tfidf.model')

### 2. Keyword-topics 

In [34]:
f = open("./data/top_keyword_top_topic.json","w")
json.dump(top_keyword_top_topic, f)
f.close()

## Clear Data

In [38]:
del test_title
del test_title_lemma
del test_title_lemma_dict


del test_model_read