## General Utilities

In [1]:
import numpy as np
import pandas as pd
import csv
import json

In [2]:
def print_first_n_dict_items(d: dict, dict_name: str, n: int) -> None:
    """Print a header line followed by at most the first ``n`` items of ``d``.

    Items are taken in the dict's own iteration order and printed one per
    line as ``key : value``.

    Args:
        d: Dictionary to preview.
        dict_name: Label shown in the header.
        n: Maximum number of items to print. Values of zero or less print
            the header only.
    """
    print("\nPrinting first", n, "items of dict named", dict_name, "\n========================================================")
    for position, (key, value) in enumerate(d.items()):
        # Test the limit *before* printing: the previous `i == n` check ran
        # after the print, so n <= 0 never matched and dumped the whole dict.
        if position >= n:
            break
        print(key, ":", value)

## Load Data

In [3]:
# Load the original-title -> lemmatized-token-list mapping.
# The context manager closes the file even if json.load raises, and the
# explicit encoding avoids depending on the platform default (the titles
# contain non-ASCII characters).
with open("./data/titles_lemmatized.json", "r", encoding="utf-8") as f:
    title_orig_lemmatized_map = json.load(f)

In [4]:
# A bare `type(...)` expression that is not the last line of a cell is
# silently discarded, so make the type check explicit instead: later cells
# call .values() on this object, which requires a dict.
assert isinstance(title_orig_lemmatized_map, dict)
len(title_orig_lemmatized_map)

1952578

In [5]:
# Load the keyword -> occurrence-count mapping.
# The context manager closes the file even if json.load raises, and the
# explicit encoding avoids depending on the platform default.
with open("./data/keyword_counts.json", "r", encoding="utf-8") as f:
    keyword_counts_raw = json.load(f)

In [6]:
# A bare `type(...)` expression that is not the last line of a cell is
# silently discarded, so make the type check explicit instead: later cells
# call .items() on this object and index it by keyword, which requires a dict.
assert isinstance(keyword_counts_raw, dict)
len(keyword_counts_raw)

448770

In [7]:
# Preview the first 10 keyword -> count entries of the raw mapping.
print_first_n_dict_items(keyword_counts_raw, "Keyword-counts", 10)


Printing first 10 items of dict named Keyword-counts 
system : 191799
network : 166093
model : 111842
analysis : 101884
algorithm : 97362
data : 83604
method : 81866
approach : 73121
problem : 66562
application : 66391


In [8]:
# Preview the first 10 original-title -> lemmatized-token-list entries.
print_first_n_dict_items(title_orig_lemmatized_map, "Titles: Original-Lemmatized Map", 10)


Printing first 10 items of dict named Titles: Original-Lemmatized Map 
Parallel Integer Sorting and Simulation Amongst CRCW Models : ['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model']
Pattern Matching in Trees and Nets : ['pattern', 'matching', 'tree', 'net']
NP-complete Problems Simplified on Tree Schemas : ['np-complete', 'problem', 'simplified', 'tree', 'schema']
On the Power of Chain Rules in Context Free Grammars : ['power', 'chain', 'rule', 'context', 'free', 'grammar']
Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2 : ['schnelle', 'multiplikation', 'von', 'polynomen', 'über', 'körpern', 'der', 'charakteristik', '2']
A characterization of rational D0L power series : ['characterization', 'rational', 'd0l', 'power', 'series']
The Derivation of Systolic Implementations of Programs : ['derivation', 'systolic', 'implementation', 'program']
Fifo Nets Without Order Deadlock : ['fifo', 'net', 'order', 'deadlock']
On the Complementation Rule for Mul

In [9]:
# Keep only the lemmatized token lists (the dict values); the original
# title strings (the keys) are not needed for co-occurrence counting.
title_lemma = list(title_orig_lemmatized_map.values())
# Show the first 10 token lists as a sanity check.
title_lemma[:10]

[['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model'],
 ['pattern', 'matching', 'tree', 'net'],
 ['np-complete', 'problem', 'simplified', 'tree', 'schema'],
 ['power', 'chain', 'rule', 'context', 'free', 'grammar'],
 ['schnelle',
  'multiplikation',
  'von',
  'polynomen',
  'über',
  'körpern',
  'der',
  'charakteristik',
  '2'],
 ['characterization', 'rational', 'd0l', 'power', 'series'],
 ['derivation', 'systolic', 'implementation', 'program'],
 ['fifo', 'net', 'order', 'deadlock'],
 ['complementation',
  'rule',
  'multivalued',
  'dependency',
  'database',
  'relation'],
 ['equational', 'weighted', 'tree', 'transformation']]

In [10]:
# Do not rely on the order in which the JSON object was stored: rank the
# (keyword, count) pairs explicitly, highest count first, with ties broken
# by keyword in descending order.
keyword_counts = sorted(
    keyword_counts_raw.items(),
    key=lambda pair: (pair[1], pair[0]),
    reverse=True,
)

In [11]:
# Keep just the keyword strings of the 100 highest-ranked (keyword, count) pairs.
top_keywords = [keyword for keyword, _count in keyword_counts[:100]]
print(top_keywords)

['system', 'network', 'model', 'analysis', 'algorithm', 'data', 'method', 'approach', 'problem', 'application', 'control', 'learning', 'design', 'image', 'information', 'dynamic', 'graph', 'detection', 'study', 'new', 'performance', 'optimization', 'wireless', 'neural', 'estimation', 'adaptive', 'efficient', 'fuzzy', 'sensor', 'linear', 'modeling', 'function', 'scheme', 'optimal', 'mobile', 'management', 'distributed', 'time', 'communication', 'channel', 'process', 'equation', 'service', 'framework', 'power', 'classification', 'evaluation', 'set', 'structure', 'software', 'technique', 'simulation', 'code', 'multiple', 'environment', 'computing', 'recognition', 'solution', 'robust', 'nonlinear', 'effect', 'theory', 'machine', 'feature', 'case', 'technology', 'social', 'parallel', 'development', 'architecture', 'scheduling', 'selection', 'hybrid', 'programming', 'digital', 'decision', 'space', 'signal', 'web', 'search', 'protocol', 'novel', 'prediction', 'online', 'logic', 'knowledge', '

In [12]:
# Spot check: pair the keyword at rank index 5 with its count looked up in
# the raw (unsorted) mapping.
top_keywords[5], keyword_counts_raw[top_keywords[5]]

('data', 83604)

## Term Co-occurrence

In [13]:
def compute_term_pair_cooccurrences(terms, lemmatized_documents, term_pair_map, display_count):
    """Accumulate co-occurrence counts into ``term_pair_map`` in place.

    Every ordered pair of token positions in each document is examined, and
    the last element of ``term_pair_map[(token_a, token_b)]`` is incremented
    when that exact key exists in the map:

    * two different tokens: one increment per matching position pair, so a
      token repeated within a document contributes more than once;
    * the same token twice: at most one increment per document, and only if
      the token occurs at two different positions.

    Args:
        terms: Not read by this function; kept for signature compatibility.
        lemmatized_documents: Iterable of token sequences, one per document.
        term_pair_map: Dict keyed by ``(term, term)`` tuples whose values are
            mutable sequences with the running count in the last slot.
        display_count: Not read by this function.

    Returns:
        None. ``term_pair_map`` is mutated.
    """
    for tokens in lemmatized_documents:
        # Terms whose self-pair has already been counted for this document.
        self_pairs_counted = set()
        for left_pos, left in enumerate(tokens):
            for right_pos, right in enumerate(tokens):
                pair = (left, right)
                if pair not in term_pair_map:
                    continue
                if left == right:
                    # A token only co-occurs with itself when it appears at
                    # two distinct positions, and that is counted once.
                    if left_pos == right_pos or left in self_pairs_counted:
                        continue
                    self_pairs_counted.add(left)
                term_pair_map[pair][-1] += 1

In [14]:
def build_term_pair_cooccurrences(terms, lemmatized_documents, display_count):
    """Create the term-pair map and fill it with co-occurrence counts.

    The map gets one entry per pair ``(terms[i], terms[j])`` with ``j >= i``
    (each unordered pair once, plus every term paired with itself). Each
    value is ``[i, j, count]``, where ``count`` starts at 0 and is filled in
    by ``compute_term_pair_cooccurrences``.

    Args:
        terms: Terms to pair up; iterated twice, in order.
        lemmatized_documents: Iterable of token sequences, one per document.
        display_count: Forwarded unchanged to the counting helper.

    Returns:
        Dict mapping ``(term_i, term_j)`` to ``[i, j, count]``.
    """
    term_pair_map = {
        (row_term, col_term): [row, col, 0]
        for row, row_term in enumerate(terms)
        for col, col_term in enumerate(terms)
        if col >= row
    }

    compute_term_pair_cooccurrences(terms, lemmatized_documents, term_pair_map, display_count)
    return term_pair_map

In [25]:
def build_term_co_occurrence_matrix(terms, lemmatized_documents, display_count):
    """Build a symmetric term-by-term co-occurrence table.

    Counts come from ``build_term_pair_cooccurrences``, which yields one
    ``[i, j, count]`` entry per term pair with ``j >= i``. Each count is
    written to both ``[i, j]`` and ``[j, i]`` so the table is symmetric.

    The counts are collected in an int64 NumPy array and wrapped in a
    DataFrame once at the end. The earlier version created an empty
    object-dtype DataFrame and filled it cell by cell through ``.iat``,
    which left every column as ``object`` rather than an integer dtype.

    Args:
        terms: Sequence of terms; used as both row and column labels.
        lemmatized_documents: Iterable of token sequences, one per document.
        display_count: Forwarded unchanged to the pair-counting helper.

    Returns:
        pandas.DataFrame of shape ``(len(terms), len(terms))`` with integer
        counts, indexed and labelled by ``terms``.
    """
    n = len(terms)
    counts = np.zeros((n, n), dtype=np.int64)

    term_pair_map = build_term_pair_cooccurrences(terms, lemmatized_documents, display_count)

    for index_and_cooccurence in term_pair_map.values():
        i, j = index_and_cooccurence[0], index_and_cooccurence[1]
        # Only the (i, j) orientation with j >= i is stored; mirror it.
        counts[i, j] = index_and_cooccurence[-1]
        counts[j, i] = index_and_cooccurence[-1]

    return pd.DataFrame(counts, index=terms, columns=terms)

In [26]:
# Build the co-occurrence matrix of the top keywords over all lemmatized titles.
# NOTE(review): the last argument (display_count) is passed down the helper
# chain but is never read by the functions defined in this notebook.
term_cooccurrence_matrix = build_term_co_occurrence_matrix(top_keywords, title_lemma, 5)

In [27]:
# Display the keyword-by-keyword co-occurrence table.
term_cooccurrence_matrix

Unnamed: 0,system,network,model,analysis,algorithm,data,method,approach,problem,application,...,video,strategy,computer,pattern,matrix,support,review,stochastic,random,implementation
system,3717,7639,8918,11606,6256,5962,6327,8270,2660,7208,...,1178,1776,2126,1024,951,4405,1499,3123,889,2830
network,7639,4050,8339,10373,9344,7429,3653,6834,3027,4820,...,1827,1913,1138,1229,569,1175,668,2065,1838,1126
model,8918,8339,1740,6362,3483,5616,3313,3598,1924,4392,...,667,823,797,870,489,989,631,2463,1468,912
analysis,11606,10373,6362,853,4365,8615,5359,4075,2039,4288,...,848,765,828,1345,751,784,756,1291,800,862
algorithm,6256,9344,3483,4365,765,3888,2032,1662,10158,3554,...,825,611,519,845,1575,570,374,1146,764,1495
data,5962,7429,5616,8615,3888,2833,3717,4230,888,3532,...,448,765,398,1060,489,976,657,290,531,507
method,6327,3653,3313,5359,2032,3717,824,1123,6722,3514,...,498,374,480,672,1275,556,572,1252,605,730
approach,8270,6834,3598,4075,1662,4230,1123,96,3882,2055,...,531,424,413,791,636,732,505,1053,428,383
problem,2660,3027,1924,2039,10158,888,6722,3882,769,2552,...,29,597,346,257,727,251,254,1449,525,272
application,7208,4820,4392,4288,3554,3532,3514,2055,2552,197,...,677,450,758,713,850,738,802,672,550,692


## Save outputs 

In [30]:
# Persist the co-occurrence matrix as a CSV file under ./data.
term_cooccurrence_matrix.to_csv("./data/term_cooccurrence_matrix.csv")