In [12]:
import numpy as np
from numpy import genfromtxt
import pandas as pd

import csv
import json

## 1. Word-cloud

In [13]:
def generate_word_cloud_output_format(term_counts, size=20, save_mode=True,
                                      savefile_name="./data/wordcloud_output_format.json"):
    """Return the first ``size`` (term, count) pairs as ``[term, count]`` lists.

    Parameters
    ----------
    term_counts : sequence of (term, count) pairs
        Assumed to be already sorted by count; this function only slices,
        it does not sort.
    size : int
        Number of leading entries of ``term_counts`` to keep.
    save_mode : bool
        When true, the result is also written as JSON to ``savefile_name``.
    savefile_name : str
        Destination path used when ``save_mode`` is true. Defaults to the
        path that was previously hard-coded in this function.

    Returns
    -------
    list of [term, count] lists
    """
    # Assumes that term_counts is sorted by count
    output = [[entry[0], entry[1]] for entry in term_counts[:size]]

    if save_mode:
        # Context manager closes the file even if json.dump raises.
        with open(savefile_name, "w") as f:
            json.dump(output, f)

    return output

In [14]:
# Load the raw keyword -> count mapping; the context manager closes the
# file even if json.load raises on malformed input.
with open("./data/keyword_counts.json", "r") as f:
    keyword_counts_raw = json.load(f)

In [15]:
# The JSON object is loaded as a dict, so re-establish an explicit ordering:
# highest count first, ties broken by term in reverse alphabetical order.
keyword_counts = list(keyword_counts_raw.items())
keyword_counts.sort(key=lambda item: (item[1], item[0]), reverse=True)
keyword_counts[:100]

[('system', 191799),
 ('network', 166093),
 ('model', 111842),
 ('analysis', 101884),
 ('algorithm', 97362),
 ('data', 83604),
 ('method', 81866),
 ('approach', 73121),
 ('problem', 66562),
 ('application', 66391),
 ('control', 65290),
 ('learning', 60695),
 ('design', 60176),
 ('image', 55137),
 ('information', 53651),
 ('dynamic', 45930),
 ('graph', 45692),
 ('detection', 42851),
 ('study', 41767),
 ('new', 41368),
 ('performance', 40169),
 ('optimization', 37933),
 ('wireless', 36651),
 ('neural', 36613),
 ('estimation', 34629),
 ('adaptive', 34213),
 ('efficient', 33874),
 ('fuzzy', 33855),
 ('sensor', 31748),
 ('linear', 31313),
 ('modeling', 31255),
 ('function', 31018),
 ('scheme', 29853),
 ('optimal', 29646),
 ('mobile', 29518),
 ('management', 29274),
 ('distributed', 29161),
 ('time', 29007),
 ('communication', 28883),
 ('channel', 28602),
 ('process', 28520),
 ('equation', 27539),
 ('service', 27432),
 ('framework', 26778),
 ('power', 26750),
 ('classification', 26229),
 ('e

In [16]:
# Export the 25 leading entries of keyword_counts (sorted by count, descending)
# for the word cloud; save_mode=True also writes them to the JSON file.
generate_word_cloud_output_format(keyword_counts, size=25, save_mode=True)

[['system', 191799],
 ['network', 166093],
 ['model', 111842],
 ['analysis', 101884],
 ['algorithm', 97362],
 ['data', 83604],
 ['method', 81866],
 ['approach', 73121],
 ['problem', 66562],
 ['application', 66391],
 ['control', 65290],
 ['learning', 60695],
 ['design', 60176],
 ['image', 55137],
 ['information', 53651],
 ['dynamic', 45930],
 ['graph', 45692],
 ['detection', 42851],
 ['study', 41767],
 ['new', 41368],
 ['performance', 40169],
 ['optimization', 37933],
 ['wireless', 36651],
 ['neural', 36613],
 ['estimation', 34629]]

## 2. Co-occurrence Matrix

In [17]:
def generate_cooccurrence_matrix_output_format(term_counts, term_topic, term_cooccurrence_matrix, size=20, log_scale=False,
                                               sort_order='count', save_mode=True, savefile_name=None):
    """Build a nodes/links payload for a co-occurrence matrix and return it as JSON.

    Parameters
    ----------
    term_counts : iterable of (term, count) pairs
        Candidate terms; they are sorted according to ``sort_order`` and the
        first ``size`` of them become nodes.
    term_topic : mapping
        Maps each term to the value stored under the node's ``"group"`` key.
    term_cooccurrence_matrix : pandas.DataFrame
        Queried with ``.loc[row_term, col_term]``, so every selected term must
        be present both as a row label and as a column label.
    size : int
        Maximum number of nodes (expected to be non-negative).
    log_scale : bool
        When true, each link value is ``round(log(count), 2)``. A count of 0
        is emitted as 0.0, because log(0) is -inf and would be serialized as
        ``-Infinity``, which is not valid JSON.
    sort_order : {'count', 'name'}
        ``'count'`` orders nodes by ascending count, ``'name'`` by ascending term.
    save_mode : bool
        When true, the JSON text is also written to ``savefile_name``.
    savefile_name : str or None
        Destination path; required when ``save_mode`` is true.

    Returns
    -------
    str
        JSON text of ``{"nodes": [...], "links": [...]}``. Links cover every
        ordered (row, column) pair of nodes, including the diagonal, and refer
        to nodes by their position in the ``nodes`` list.

    Raises
    ------
    ValueError
        If ``sort_order`` is not recognised, or if ``save_mode`` is true and
        ``savefile_name`` is None.
    KeyError
        If a selected term is missing from ``term_topic`` or from the matrix.
    """
    # Validate arguments up front so a bad call fails before any work is done
    # (previously an unknown sort_order surfaced as an unbound-variable error).
    if sort_order == 'count':
        sort_key = lambda kv: kv[1]
    elif sort_order == 'name':
        sort_key = lambda kv: kv[0]
    else:
        raise ValueError("sort_order must be 'count' or 'name', got %r" % (sort_order,))

    if save_mode and savefile_name is None:
        raise ValueError("savefile_name is required when save_mode is True")

    # NOTE(review): 'count' sorts ascending, so when size < len(term_counts) the
    # terms with the SMALLEST counts are kept. Callers in this notebook pre-slice
    # to the top terms, so this is left unchanged -- confirm the intended order.
    selected_terms = sorted(term_counts, key=sort_key)[:size]

    nodes = [{"name": term, "count": count, "group": term_topic[term]}
             for (term, count) in selected_terms]

    links = []
    for i, row_node_info in enumerate(nodes):
        for j, col_node_info in enumerate(nodes):
            cooccurrence_count = int(term_cooccurrence_matrix.loc[row_node_info["name"], col_node_info["name"]])
            if log_scale:
                # Guard against log(0) = -inf, which json would emit as -Infinity.
                value = round(np.log(cooccurrence_count), 2) if cooccurrence_count > 0 else 0.0
            else:
                value = cooccurrence_count
            links.append({"source": i, "target": j, "value": value})

    output = {"nodes": nodes, "links": links}

    # Serialize once and reuse the text for both the file and the return value.
    serialized = json.dumps(output)
    if save_mode:
        with open(savefile_name, "w") as f:
            f.write(serialized)

    return serialized

In [18]:
# Load the term -> topic mapping; the context manager closes the file even
# if json.load raises on malformed input.
with open("./data/top_keyword_top_topic.json", "r") as f:
    top_term_top_topic = json.load(f)
top_term_top_topic

{'system': 9,
 'network': 10,
 'model': 2,
 'analysis': 2,
 'algorithm': 6,
 'data': 1,
 'method': 6,
 'approach': 2,
 'problem': 6,
 'application': 7,
 'control': 9,
 'learning': 2,
 'design': 7,
 'image': 8,
 'information': 12,
 'dynamic': 9,
 'graph': 5,
 'detection': 8,
 'study': 11,
 'new': 6,
 'performance': 3,
 'optimization': 6,
 'wireless': 10,
 'neural': 8,
 'estimation': 3,
 'adaptive': 9,
 'efficient': 0,
 'fuzzy': 9,
 'sensor': 10,
 'linear': 6,
 'modeling': 2,
 'function': 5,
 'scheme': 0,
 'optimal': 9,
 'mobile': 10,
 'management': 1,
 'distributed': 10,
 'time': 6,
 'communication': 10,
 'channel': 3,
 'process': 1,
 'equation': 6,
 'service': 12,
 'framework': 1,
 'power': 10,
 'classification': 8,
 'evaluation': 7,
 'set': 5,
 'structure': 1,
 'software': 7,
 'technique': 4,
 'simulation': 3,
 'code': 3,
 'multiple': 10,
 'environment': 7,
 'computing': 0,
 'recognition': 8,
 'solution': 6,
 'robust': 9,
 'nonlinear': 6,
 'effect': 3,
 'theory': 11,
 'machine': 2,
 '

In [19]:
# Load the term co-occurrence table; index_col=0 makes the first CSV column
# the row labels so cells can be looked up by (row term, column term).
term_cooccurrence_matrix = pd.read_csv("./data/term_cooccurrence_matrix.csv", index_col=0)

In [20]:
# Display the loaded matrix to check that rows and columns are both labelled by term.
term_cooccurrence_matrix

Unnamed: 0,system,network,model,analysis,algorithm,data,method,approach,problem,application,...,video,strategy,computer,pattern,matrix,support,review,stochastic,random,implementation
system,3717,7639,8918,11606,6256,5962,6327,8270,2660,7208,...,1178,1776,2126,1024,951,4405,1499,3123,889,2830
network,7639,4050,8339,10373,9344,7429,3653,6834,3027,4820,...,1827,1913,1138,1229,569,1175,668,2065,1838,1126
model,8918,8339,1740,6362,3483,5616,3313,3598,1924,4392,...,667,823,797,870,489,989,631,2463,1468,912
analysis,11606,10373,6362,853,4365,8615,5359,4075,2039,4288,...,848,765,828,1345,751,784,756,1291,800,862
algorithm,6256,9344,3483,4365,765,3888,2032,1662,10158,3554,...,825,611,519,845,1575,570,374,1146,764,1495
data,5962,7429,5616,8615,3888,2833,3717,4230,888,3532,...,448,765,398,1060,489,976,657,290,531,507
method,6327,3653,3313,5359,2032,3717,824,1123,6722,3514,...,498,374,480,672,1275,556,572,1252,605,730
approach,8270,6834,3598,4075,1662,4230,1123,96,3882,2055,...,531,424,413,791,636,732,505,1053,428,383
problem,2660,3027,1924,2039,10158,888,6722,3882,769,2552,...,29,597,346,257,727,251,254,1449,525,272
application,7208,4820,4392,4288,3554,3532,3514,2055,2552,197,...,677,450,758,713,850,738,802,672,550,692


In [21]:
# Build the matrix payload for the 25 leading keywords, with nodes ordered by
# name and log-scaled link values; the JSON is written to savefile_name and
# also returned as a string (shown below).
generate_cooccurrence_matrix_output_format(keyword_counts[:25], top_term_top_topic, term_cooccurrence_matrix,
                                           size=25, log_scale=True, sort_order='name',
                                           save_mode=True, savefile_name="./data/cooccurrence_matrix_log_output_format.json")

'{"nodes": [{"name": "algorithm", "count": 97362, "group": 6}, {"name": "analysis", "count": 101884, "group": 2}, {"name": "application", "count": 66391, "group": 7}, {"name": "approach", "count": 73121, "group": 2}, {"name": "control", "count": 65290, "group": 9}, {"name": "data", "count": 83604, "group": 1}, {"name": "design", "count": 60176, "group": 7}, {"name": "detection", "count": 42851, "group": 8}, {"name": "dynamic", "count": 45930, "group": 9}, {"name": "estimation", "count": 34629, "group": 3}, {"name": "graph", "count": 45692, "group": 5}, {"name": "image", "count": 55137, "group": 8}, {"name": "information", "count": 53651, "group": 12}, {"name": "learning", "count": 60695, "group": 2}, {"name": "method", "count": 81866, "group": 6}, {"name": "model", "count": 111842, "group": 2}, {"name": "network", "count": 166093, "group": 10}, {"name": "neural", "count": 36613, "group": 8}, {"name": "new", "count": 41368, "group": 6}, {"name": "optimization", "count": 37933, "group": 