## General Utilities

In [48]:
import string
import csv
import json

import gensim
from gensim import corpora, models

In [2]:
def print_first_n_dict_items(d: dict, dict_name: str, n: int):
    print("\nPrinting first", n, "items of dict named", dict_name, "\n========================================================")
    i = 0
    for key, value in d.items():
        print(key, ":", value)
        i += 1
        if i == n:
            break

In [60]:
def print_document_word_counts(document, doc_word_dict, mode):
    if mode == 'BOW':
        for i in range(len(document)):
            print("Word {} (\"{}\") appears {} time.".format(document[i][0], doc_word_dict[document[i][0]], document[i][1]))
    elif mode == 'TF_IDF':
        for i in range(len(document)):
            print("Word {} (\"{}\")'s TF-IDF is {}.".format(document[i][0], doc_word_dict[document[i][0]], document[i][1]))

In [4]:
def print_topics_from_model(model):
    for idx, topic in model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

In [37]:
def generate_topic_prediction_scores(document, model):
    for index, score in sorted(model[document], key=lambda tup: -1*tup[1]):
        print("\nScore: {}\t \nTopic: {}\n{}".format(score, index, model.print_topic(index, 10)))

## Load data

In [6]:
f = open("./data/titles_lemmatized.json","r")
title_terms_map = json.load(f)
f.close()

In [7]:
type(title_terms_map)
len(title_terms_map)

1952578

In [8]:
print_first_n_dict_items(title_terms_map, "Titles: Original-Terms Map", 10)


Printing first 10 items of dict named Titles: Original-Terms Map 
Parallel Integer Sorting and Simulation Amongst CRCW Models : ['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model']
Pattern Matching in Trees and Nets : ['pattern', 'matching', 'tree', 'net']
NP-complete Problems Simplified on Tree Schemas : ['np-complete', 'problem', 'simplified', 'tree', 'schema']
On the Power of Chain Rules in Context Free Grammars : ['power', 'chain', 'rule', 'context', 'free', 'grammar']
Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2 : ['schnelle', 'multiplikation', 'von', 'polynomen', 'über', 'körpern', 'der', 'charakteristik', '2']
A characterization of rational D0L power series : ['characterization', 'rational', 'd0l', 'power', 'series']
The Derivation of Systolic Implementations of Programs : ['derivation', 'systolic', 'implementation', 'program']
Fifo Nets Without Order Deadlock : ['fifo', 'net', 'order', 'deadlock']
On the Complementation Rule for Multival

In [47]:
title_terms_map["A characterization of rational D0L power series"]

['characterization', 'rational', 'd0l', 'power', 'series']

## BOW

In [9]:
random_doc_id = 4310 # for testing

In [52]:
title_terms_list = list(title_terms_map.values())
title_terms_list[:10]

[['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model'],
 ['pattern', 'matching', 'tree', 'net'],
 ['np-complete', 'problem', 'simplified', 'tree', 'schema'],
 ['power', 'chain', 'rule', 'context', 'free', 'grammar'],
 ['schnelle',
  'multiplikation',
  'von',
  'polynomen',
  'über',
  'körpern',
  'der',
  'charakteristik',
  '2'],
 ['characterization', 'rational', 'd0l', 'power', 'series'],
 ['derivation', 'systolic', 'implementation', 'program'],
 ['fifo', 'net', 'order', 'deadlock'],
 ['complementation',
  'rule',
  'multivalued',
  'dependency',
  'database',
  'relation'],
 ['equational', 'weighted', 'tree', 'transformation']]

In [53]:
title_terms_dict = gensim.corpora.Dictionary(title_terms_list)

In [54]:
title_terms_dict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [55]:
print(type(title_terms_dict))
print(len(title_terms_dict))

<class 'gensim.corpora.dictionary.Dictionary'>
31010


In [59]:
count = 0
for k, v in title_terms_dict.iteritems():
    print(k, v)
    count += 1
    if count > 15:
        break

0 integer
1 model
2 parallel
3 simulation
4 sorting
5 matching
6 net
7 pattern
8 tree
9 np-complete
10 problem
11 schema
12 simplified
13 chain
14 context
15 free


In [56]:
bow_corpus = [title_terms_dict.doc2bow(title_terms) for title_terms in title_terms_list]

In [63]:
print(type(bow_corpus))
print(len(bow_corpus))
print(title_terms_dict.doc2bow(title_terms_list[0]))
print_document_word_counts(title_terms_dict.doc2bow(title_terms_list[0]), title_terms_dict, mode='BOW')

<class 'list'>
1952578
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
Word 0 ("integer") appears 1 time.
Word 1 ("model") appears 1 time.
Word 2 ("parallel") appears 1 time.
Word 3 ("simulation") appears 1 time.
Word 4 ("sorting") appears 1 time.


In [16]:
print_document_word_counts(bow_corpus[random_doc_id], title_terms_dict, mode='BOW')

Word 67 ("algorithm") appears 1 time.
Word 260 ("execution") appears 1 time.
Word 282 ("composite") appears 1 time.
Word 296 ("service") appears 1 time.
Word 1741 ("partitioning") appears 1 time.
Word 2060 ("web") appears 1 time.
Word 3143 ("genetic") appears 1 time.
Word 4115 ("decentralized") appears 1 time.


## Load and Test model predictions

In [24]:
# Loading model
lda_model_bow = models.LdaModel.load('./model/lda_bow.model', mmap='r')

In [81]:
lda_model_bow = gensim.models.LdaMulticore(bow_corpus, id2word=title_terms_dict, num_topics=13, minimum_probability=0, 
                                           passes=2, workers=2)

In [82]:
for idx, topic in lda_model_bow.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.034*"system" + 0.026*"computing" + 0.024*"parallel" + 0.020*"implementation" + 0.018*"scheme" + 0.015*"noise" + 0.014*"architecture" + 0.014*"security" + 0.013*"survey" + 0.013*"queue"
Topic: 1 Word: 0.061*"data" + 0.041*"process" + 0.031*"model" + 0.030*"knowledge" + 0.016*"und" + 0.015*"management" + 0.014*"analysis" + 0.013*"integration" + 0.013*"approach" + 0.013*"change"
Topic: 2 Word: 0.033*"approach" + 0.031*"model" + 0.030*"learning" + 0.025*"modeling" + 0.019*"analysis" + 0.015*"pattern" + 0.015*"data" + 0.014*"planning" + 0.014*"prediction" + 0.013*"machine"
Topic: 3 Word: 0.030*"measurement" + 0.028*"estimation" + 0.026*"channel" + 0.023*"simulation" + 0.017*"error" + 0.016*"code" + 0.016*"analysis" + 0.015*"cognitive" + 0.014*"performance" + 0.013*"field"
Topic: 4 Word: 0.031*"processing" + 0.025*"large" + 0.024*"frequency" + 0.020*"data" + 0.019*"domain" + 0.015*"für" + 0.015*"signal" + 0.014*"transform" + 0.014*"challenge" + 0.013*"continuous"
Topic: 5 Wo

In [119]:
topic_labels = ['Systems', 'Data', 'ML', 'Signals', 'Signals', 'Algo', 'Algo',
               'Software', 'Vision', 'Control', 'Networks', 'Misc', 'Social']
labels_merged_map = {0:0, 1:1, 2:2, 3:3, 4:3, 5:4, 6:4, 7:5, 8:6, 9:7, 10:8, 11:9, 12:10}
topic_labels_merged = []
for topic_label in topic_labels:
    if topic_label not in topic_labels_merged:
        topic_labels_merged.append(topic_label)
topic_labels_merged

['Systems',
 'Data',
 'ML',
 'Signals',
 'Algo',
 'Software',
 'Vision',
 'Control',
 'Networks',
 'Misc',
 'Social']

In [83]:
lda_model_bow[bow_corpus[random_doc_id]]

[(0, 0.15104198),
 (1, 0.008548858),
 (2, 0.008548858),
 (3, 0.11962713),
 (4, 0.11965558),
 (5, 0.008548885),
 (6, 0.21878205),
 (7, 0.008548858),
 (8, 0.008548877),
 (9, 0.119605064),
 (10, 0.008548916),
 (11, 0.008548858),
 (12, 0.21144612)]

In [84]:
print_document_word_scores(bow_corpus[random_doc_id], title_terms_dict, mode='BOW')

Word 67 ("algorithm") appears 1 time.
Word 260 ("execution") appears 1 time.
Word 282 ("composite") appears 1 time.
Word 296 ("service") appears 1 time.
Word 1741 ("partitioning") appears 1 time.
Word 2060 ("web") appears 1 time.
Word 3143 ("genetic") appears 1 time.
Word 4115 ("decentralized") appears 1 time.


In [85]:
generate_topic_prediction_scores(bow_corpus[random_doc_id], lda_model_bow)


Score: 0.2188308984041214	 
Topic: 6
0.064*"problem" + 0.054*"algorithm" + 0.038*"method" + 0.031*"time" + 0.030*"optimization" + 0.024*"nonlinear" + 0.020*"solution" + 0.019*"class" + 0.018*"linear" + 0.017*"finite"

Score: 0.21153035759925842	 
Topic: 12
0.037*"information" + 0.027*"social" + 0.025*"web" + 0.021*"video" + 0.020*"online" + 0.020*"game" + 0.016*"service" + 0.015*"der" + 0.015*"internet" + 0.013*"semantic"

Score: 0.15090879797935486	 
Topic: 0
0.034*"system" + 0.026*"computing" + 0.024*"parallel" + 0.020*"implementation" + 0.018*"scheme" + 0.015*"noise" + 0.014*"architecture" + 0.014*"security" + 0.013*"survey" + 0.013*"queue"

Score: 0.11965559422969818	 
Topic: 4
0.031*"processing" + 0.025*"large" + 0.024*"frequency" + 0.020*"data" + 0.019*"domain" + 0.015*"für" + 0.015*"signal" + 0.014*"transform" + 0.014*"challenge" + 0.013*"continuous"

Score: 0.11962714791297913	 
Topic: 3
0.030*"measurement" + 0.028*"estimation" + 0.026*"channel" + 0.023*"simulation" + 0.017*"e

In [31]:
test_document_topics = lda_model_bow.get_document_topics(bow_corpus[random_doc_id], minimum_probability=0)

In [32]:
test_document_topics

[(0, 0.012530396),
 (1, 0.13756263),
 (2, 0.13767186),
 (3, 0.13778837),
 (4, 0.012529179),
 (5, 0.13780819),
 (6, 0.012531596),
 (7, 0.2623018),
 (8, 0.012529179),
 (9, 0.13674678)]

In [46]:
sorted(lda_model_bow.get_document_topics(bow_corpus[random_doc_id]), key=lambda tup: -1*tup[1])[0]

(7, 0.2623018)

## Get title topics and author-count

In [102]:
def generate_title_topics_and_author_count(filename, display_count):
    line_count = 0
    dashboard_output_format = []
    with open(filename, encoding="UTF", newline='') as csvfile:
        articles = csv.reader(csvfile, delimiter=',')
        
        for row in articles:
            if row:
                if line_count > 0:
                    article_id = int(row[0])
                    year = 0 if row[-1] == '' else int(row[-1])
                    title = row[-6].translate(str.maketrans('', '', ",.;:'?!/`()[]{}<>\\"))
                    try:
                        #title_bow = bow_corpus[article_id]                        
                        title_bow = title_terms_dict.doc2bow(title_terms_map[title])
                    except:
                        print(title)
                        continue
                    title_top_topic_id = sorted(lda_model_bow.get_document_topics(title_bow), key=lambda tup: -1*tup[1])[0][0]
                    title_top_topic_id = labels_merged_map[title_top_topic_id]
                    dashboard_output_format.append([year, title, title_top_topic_id, row[1].count('|')+1])

                    if line_count < display_count:
                        print(row)
                        print(dashboard_output_format[article_id])
                else:
                    print(row, end='\n----------------------------------------------------------------------------------------\n')
                line_count += 1
    
    print("No. of Lines:", line_count)
    
    return dashboard_output_format

In [103]:
dashboard_output_format = generate_title_topics_and_author_count("./data/dblp_article.csv", 5)

['id', 'author', 'author-aux', 'author-orcid', 'booktitle', 'cdate', 'cdrom', 'cite', 'cite-label', 'crossref', 'editor', 'editor-orcid', 'ee', 'ee-type', 'i', 'journal', 'key', 'mdate', 'month', 'note', 'note-type', 'number', 'pages', 'publisher', 'publtype', 'sub', 'sup', 'title', 'title-bibtex', 'tt', 'url', 'volume', 'year']
----------------------------------------------------------------------------------------
['0', 'Sanjeev Saxena', '', '', '', '', '', '', '', '', '', '', 'https://doi.org/10.1007/BF03036466', '', '', 'Acta Inf.', 'journals/acta/Saxena96', '2017-05-28', '', '', '', '7', '607-619', '', '', '', '', 'Parallel Integer Sorting and Simulation Amongst CRCW Models.', '', '', 'db/journals/acta/acta33.html#Saxena96', '33', '1996']
[1996, 'Parallel Integer Sorting and Simulation Amongst CRCW Models', 0, 1]
['1', 'Hans Ulrich Simon', '', '', '', '', '', '', '', '', '', '', 'https://doi.org/10.1007/BF01257084', '', '', 'Acta Inf.', 'journals/acta/Simon83', '2017-05-28', '', '

In [104]:
print(len(dashboard_output_format))
dashboard_output_format[:7]

2008467


[[1996, 'Parallel Integer Sorting and Simulation Amongst CRCW Models', 0, 1],
 [1983, 'Pattern Matching in Trees and Nets', 6, 1],
 [1983, 'NP-complete Problems Simplified on Tree Schemas', 4, 2],
 [1982, 'On the Power of Chain Rules in Context Free Grammars', 10, 1],
 [1977,
  'Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2',
  10,
  1],
 [2011, 'A characterization of rational D0L power series', 3, 1],
 [1987, 'The Derivation of Systolic Implementations of Programs', 0, 2]]

In [80]:
sorted(dashboard_output_format, key=lambda lst: -1*lst[-1])

[[2015,
  'The IceProd framework Distributed data processing for the IceCube neutrino observatory',
  1,
  287],
 [2013,
  'The IceProd Framework Distributed Data Processing for the IceCube Neutrino Observatory',
  1,
  287],
 [2014, 'A promoter-level mammalian expression atlas', 1, 263],
 [2006,
  'Length Sensing and Control in the Virgo Gravitational Wave Interferometer',
  6,
  119],
 [2018,
  'Machine Learning in High Energy Physics Community White Paper',
  9,
  118],
 [2016,
  'Theano A Python framework for fast computation of mathematical expressions',
  8,
  112],
 [2015,
  'The BioMart community portal an innovative alternative to large centralized data repositories',
  7,
  105],
 [2014,
  'Finding needles in haystacks linking scientific names reference specimens and molecular data for Fungi',
  8,
  101],
 [2015,
  'New vegetation type map of India prepared using satellite remote sensing Comparison with global vegetation maps and utilities',
  0,
  99],
 [2015, 'Integrative 

## Save outputs

In [122]:
lda_model_bow.save('./model/lda_bow.model')

In [109]:
writer = csv.writer(open("./data/title_topics_authors.csv", "w", encoding="UTF", newline=''))
writer.writerow(['Year','Title','Topic','nAuthors'])
writer.writerows(dashboard_output_format)

In [110]:
reader = csv.reader(open("./data/title_topics_authors.csv", "r", encoding="UTF", newline=''))
line_count = 0
last_row = -1
for row in reader:
    if line_count < 10 and line_count > 0:
        if int(row[0]) != dashboard_output_format[line_count][0]:
            print("row[0]:", row[0], "\t var:", dashboard_output_format[line_count][0])
    line_count += 1
    last_row = row
print("Lines read:", line_count)
print("Last row:", last_row)

row[0]: 1996 	 var: 1983
row[0]: 1983 	 var: 1982
row[0]: 1982 	 var: 1977
row[0]: 1977 	 var: 2011
row[0]: 2011 	 var: 1987
row[0]: 1987 	 var: 1988
row[0]: 1988 	 var: 1978
row[0]: 1978 	 var: 2012
Lines read: 2008468
Last row: ['1991', 'Object ADTs with improvements for Value ADTs', '6', '3']


In [120]:
f = open("./data/topic_labels.txt", "w")
f.write("Label\n")
for topic_label in topic_labels:
    f.write(topic_label+"\n")
f.close()

In [121]:
f = open("./data/topic_labels_unique.txt", "w")
f.write("Label\n")
for topic_label in topic_labels_merged:
    f.write(topic_label+"\n")
f.close()