In [3]:
import json
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import collections
import random
from scipy import stats

In [4]:
# Load the embedding table from a text file using np.loadtxt's defaults.
# Column 0 is read as each row's id and columns 1.. as its vector by the code below.
# NOTE(review): the name suggests 32-dimensional vectors -- confirm against the file.
d32 = np.loadtxt('./emb/d32combined.txt')

In [5]:
# First column of d32 cast to int; the helper functions compare against this
# array to find the row that belongs to a given id.
d32_ids = d32[:,0].astype(int)

In [6]:
def _load_json_map(path):
    """Parse the JSON document stored in the file at ``path``.

    The file is read line by line and every line is parsed with
    ``json.loads``; the value parsed from the last line is returned. The
    file handle is closed on exit, including when parsing fails.

    Args:
        path: Location of the JSON file.

    Returns:
        The object decoded from the last line of the file.

    Raises:
        ValueError: If the file contains no lines, or a line is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    loaded = None
    saw_line = False
    with open(path, 'r') as json_file:
        for line in json_file:
            loaded = json.loads(line)
            saw_line = True
    if not saw_line:
        # Fail here rather than leaving the target name undefined (or stale
        # from an earlier kernel run) and failing somewhere far away.
        raise ValueError('no JSON content found in ' + path)
    return loaded


# Keyed by str(id as stored in d32_ids); used to get back the id that
# pub2vec_distance / id_map expect.
inv_id_map = _load_json_map('inv_id_map.json')

# Keyed by str(paper id); the value is compared against d32_ids.
id_map = _load_json_map('id_map.json')

# NOTE(review): not referenced by the code visible in this part of the
# notebook -- presumably citation data per node; confirm.
citation_map = _load_json_map('citation_map.json')

# Keyed by str(id as stored in d32_ids); each value is a dict whose keys and
# numeric weights are compared in fos_distance.
fos_map = _load_json_map('fos_map.json')

# Keyed by str(id as stored in d32_ids); values are printed as paper titles.
name_map = _load_json_map('name_map.json')


In [7]:
import json
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import collections
import random
from scipy import stats

def fos_distance(id_1, id_2):
    """Squared Euclidean distance between two nodes' ``fos_map`` weights.

    Both ids are converted with ``str`` and looked up in the module-level
    ``fos_map``, whose entries are dicts of key -> numeric weight. A key that
    only one node has (or whose stored weight is falsy) counts as 0.0 on the
    other side. No square root is taken.

    Args:
        id_1: First node id (any value whose ``str`` is a ``fos_map`` key).
        id_2: Second node id, same convention.

    Returns:
        The sum of squared weight differences over the union of keys, or the
        sentinel ``100.0`` when either id has no entry in ``fos_map``.
    """
    key_a, key_b = str(id_1), str(id_2)

    # Unknown node on either side: fall back to the fixed sentinel value.
    if key_a not in fos_map or key_b not in fos_map:
        return 100.0

    weights_a = fos_map[key_a]
    weights_b = fos_map[key_b]

    # Union of the keys present for either node.
    all_fields = set()
    all_fields.update(weights_a.keys())
    all_fields.update(weights_b.keys())

    total = 0.0
    for field in all_fields:
        # A missing key, or a falsy stored weight, contributes 0.0.
        gap = (weights_a.get(field) or 0.0) - (weights_b.get(field) or 0.0)
        total += np.power(gap, 2)

    return total

def pub2vec_distance(id_1, id_2):
    """Euclidean (L2) distance between the embedding vectors of two papers.

    Each id is stringified and translated through the module-level
    ``id_map``; the translated value is matched against ``d32_ids`` to find
    the row of ``d32``, and columns 1 onward of that row form the vector.

    Args:
        id_1: First paper id (its ``str`` must be a key of ``id_map``).
        id_2: Second paper id, same convention.

    Returns:
        ``np.linalg.norm`` of the difference between the two vectors.

    Raises:
        KeyError: If an id is not present in ``id_map``.
        IndexError: If a translated id matches no entry of ``d32_ids``.
    """
    rows = []
    for paper_id in (id_1, id_2):
        matches = np.flatnonzero(d32_ids == id_map[str(paper_id)])
        rows.append(matches[0])  # first matching row; IndexError when none

    vec_a = d32[rows[0], 1:]
    vec_b = d32[rows[1], 1:]
    return np.linalg.norm(vec_a - vec_b)

def pub2vec_closest_k(node_id, k):
    """Return the ids of the ``k`` embeddings nearest to ``node_id``'s vector.

    ``node_id`` is stringified and translated through ``id_map`` to find its
    row in ``d32``. Every row is then ranked by squared L2 distance to that
    row's vector (columns 1 onward), nearest first. The query row has
    distance zero to itself, so it is normally the first entry returned.

    Args:
        node_id: Paper id (its ``str`` must be a key of ``id_map``).
        k: Number of ids to return; fewer are returned if ``d32`` has fewer rows.

    Returns:
        A slice of ``d32_ids`` (values from the embedding file's first
        column) ordered from nearest to farthest.

    Raises:
        KeyError: If ``node_id`` is not present in ``id_map``.
        IndexError: If the translated id matches no entry of ``d32_ids``.
    """
    target_row = np.nonzero(d32_ids == id_map[str(node_id)])[0][0]

    vectors = d32[:, 1:]
    offsets = vectors - vectors[target_row]
    squared_dist = np.power(offsets, 2).sum(axis=1)

    nearest_first = np.argsort(squared_dist)
    return d32_ids[nearest_first[:k]]
    
def return_distances(node_idx, k):
    """Distances from ``node_idx`` to its ``k`` nearest embedding neighbours.

    ``k + 1`` neighbours are requested from ``pub2vec_closest_k`` and the
    first one is skipped (it is expected to be the query itself). For each
    remaining neighbour both distance measures are computed:

    * ``fos_distance`` is given the ``id_map``-translated query id and the
      neighbour id as returned by ``pub2vec_closest_k``;
    * ``pub2vec_distance`` is given ``node_idx`` and the neighbour id mapped
      back through ``inv_id_map``.

    Args:
        node_idx: Paper id of the query (its ``str`` must be in ``id_map``).
        k: Number of neighbours to measure.

    Returns:
        ``(fos_vector, p2v_vector)``: two lists of length ``k``, aligned by
        neighbour rank.

    Raises:
        IndexError: If fewer than ``k + 1`` neighbours are available.
    """
    neighbours = pub2vec_closest_k(node_idx, k + 1)

    # One (fos, p2v) pair per neighbour rank; rank 0 is skipped on purpose.
    pairs = [
        (
            fos_distance(id_map[str(node_idx)], str(neighbours[rank])),
            pub2vec_distance(node_idx, inv_id_map[str(neighbours[rank])]),
        )
        for rank in range(1, k + 1)
    ]

    fos_vector = [fos for fos, _ in pairs]
    p2v_vector = [p2v for _, p2v in pairs]
    return (fos_vector, p2v_vector)

def random_distances(node_idx, k):
    """Distances from ``node_idx`` to ``k`` randomly sampled papers.

    ``k`` distinct keys are drawn from ``id_map`` with ``random.sample``.
    For each one both ``fos_distance`` (on ``id_map``-translated ids) and
    ``pub2vec_distance`` (on the untranslated ids) are computed.

    A sampled paper is left out of the result when

    * either distance cannot be computed because an id lookup fails, or
    * both distances are zero (for example the query sampled itself).

    The returned lists can therefore be shorter than ``k``.

    Args:
        node_idx: Paper id of the query (its ``str`` must be in ``id_map``).
        k: Number of papers to sample.

    Returns:
        ``(fos_vector, p2v_vector)``: two equal-length lists, aligned by
        sampled paper.

    Raises:
        ValueError: If ``k`` is larger than the number of keys in ``id_map``
            (raised by ``random.sample``).
    """
    # random.sample requires a sequence; passing a dict view raises TypeError
    # on Python 3.11+, so materialise the keys first (same order as before).
    node_ids = random.sample(list(id_map), k)
    fos_vector = []
    p2v_vector = []

    for random_node in node_ids:
        try:
            fos_d = fos_distance(id_map[str(node_idx)], id_map[str(random_node)])
            p2v_d = pub2vec_distance(node_idx, random_node)
        except LookupError:
            # KeyError / IndexError from a missing id or embedding row. Skip
            # the whole pair so a half-computed (fos_d, 0) entry is never
            # recorded, and let unrelated errors and KeyboardInterrupt
            # propagate.
            continue

        # Both distances zero: not an informative sample, leave it out.
        if fos_d == 0 and p2v_d == 0:
            continue

        fos_vector.append(fos_d)
        p2v_vector.append(p2v_d)

    return (fos_vector, p2v_vector)

# Here are some node IDs for the papers I have used

K-means++: 2073459066

Googlenet: 2097117768

Adagrad: 2146502635

node2vec: 2366141641

Edge Detection: 2145023731

RL: 2107726111

Quicksort: 2082357899

Latent Dirichlet Allocation: 1880262756

A*: 1969483458

Spark: 2189465200

In [8]:
# Print the titles of the 10 embeddings nearest to paper 2366141641
# (listed as "node2vec" in the node IDs above).
top_k = pub2vec_closest_k(2366141641, 10)
for node_id in top_k:
    # node_id is a value taken from d32_ids; name_map is keyed by its str().
    # NOTE(review): similar_node_id is assigned but not used in this cell.
    similar_node_id = inv_id_map[str(node_id)]
    print(name_map[str(node_id)])


node2vec: Scalable Feature Learning for Networks
LINE: Large-scale Information Network Embedding
A High-Performance Semi-Supervised Learning Method for Text Chunking
Dependency Tree-based Sentiment Classification using CRFs with Hidden Variables
Learning Sentiment-Specific Word Embedding for Twitter Sentiment Classification
Zero-Shot Learning Through Cross-Modal Transfer
Learning Multilevel Distributed Representations for High-Dimensional Sequences
Relational learning via latent social dimensions
Baselines and Bigrams: Simple, Good Sentiment and Topic Classification
A New Baseline for Image Annotation


In [9]:
# Print the titles of the 10 embeddings nearest to paper 2073459066
# (listed as "K-means++" in the node IDs above).
top_k = pub2vec_closest_k(2073459066, 10)
for node_id in top_k:
    # node_id is a value taken from d32_ids; name_map is keyed by its str().
    # NOTE(review): similar_node_id is assigned but not used in this cell.
    similar_node_id = inv_id_map[str(node_id)]
    print(name_map[str(node_id)])

k-means++: the advantages of careful seeding
Clustering of the self-organizing map
Integrating constraints and metric learning in semi-supervised clustering
Data clustering: 50 years beyond K-means
Using the triangle inequality to accelerate k-means
Comparing Clusterings by the Variation of Information
A stability based method for discovering structure in clustered data
Comparing clusterings: an axiomatic view
SIMPLIcity: semantics-sensitive integrated matching for picture libraries
Consensus Clustering: A Resampling-Based Method for Class Discovery and Visualization of Gene Expression Microarray Data


In [10]:
# Print the titles of the 10 embeddings nearest to paper 2189465200
# (listed as "Spark" in the node IDs above).
top_k = pub2vec_closest_k(2189465200, 10)
for node_id in top_k:
    # node_id is a value taken from d32_ids; name_map is keyed by its str().
    # NOTE(review): similar_node_id is assigned but not used in this cell.
    similar_node_id = inv_id_map[str(node_id)]
    print(name_map[str(node_id)])

Spark: cluster computing with working sets
Resilient distributed datasets: a fault-tolerant abstraction for in-memory cluster computing
A comparison of approaches to large-scale data analysis
Map-reduce-merge: simplified relational data processing on large clusters
Evaluating MapReduce for Multi-core and Multiprocessor Systems
SCOPE: easy and efficient parallel processing of massive data sets
Pig latin: a not-so-foreign language for data processing
Hive: a warehousing solution over a map-reduce framework
Improving MapReduce performance in heterogeneous environments
Dryad: distributed data-parallel programs from sequential building blocks


In [11]:
# Print the titles of the 10 embeddings nearest to paper 2097117768
# (listed as "Googlenet" in the node IDs above).
top_k = pub2vec_closest_k(2097117768, 10)
for node_id in top_k:
    # node_id is a value taken from d32_ids; name_map is keyed by its str().
    # NOTE(review): similar_node_id is assigned but not used in this cell.
    similar_node_id = inv_id_map[str(node_id)]
    print(name_map[str(node_id)])

Going deeper with convolutions
Very Deep Convolutional Networks for Large-Scale Image Recognition
Caffe: Convolutional Architecture for Fast Feature Embedding
ImageNet Classification with Deep Convolutional Neural Networks
Dropout: a simple way to prevent neural networks from overfitting
OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks
Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation
Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
Fully convolutional networks for semantic segmentation
ImageNet classification with deep convolutional neural networks
