In [1]:
import pandas as pd
import numpy as np
import os
import csv
import networkx as nx
from networkx.algorithms import community

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
import umap
from bqplot import (
    LogScale, LinearScale, OrdinalColorScale, ColorAxis,
    Axis, Scatter, Lines, CATEGORY10, Label, Figure, Tooltip
)
from ipywidgets import HBox, VBox, IntSlider, Play, jslink
from ipywidgets.embed import embed_minimal_html
from ipywidgets import interact, interactive, fixed, interact_manual
import matplotlib.pyplot as plt
import seaborn
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler

In [2]:
NODES_FILENAME = 'dblp_nodes.csv'
EDGES_FILENAME = 'dblp_edges.csv'
EMBEDDING_FILENAME = 'embedding.csv'
UMAP_FILENAME = 'embedding_umap.csv'

In [3]:
nodes_titles = {}
with open(NODES_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter='|')
    for row in csv_reader:
        nodes_titles[int(row[1])] = row[2]

In [4]:
G = nx.Graph()
attrs = {}
min_year = 2500
max_year = 1000

with open(EDGES_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    for row in csv_reader:
        e = (row[0], row[1])
        year = int(row[2])
        min_year = min(min_year,year)
        max_year = max(max_year,year)
        G.add_edge(row[0], row[1])
        attrs[e] = {'year': year} 
        
nx.set_edge_attributes(G, attrs)

In [5]:
# node2vec
node2vec = Node2Vec(G, dimensions=32, walk_length=10, num_walks=100, workers=4)  # Use temp_folder for big graphs
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

Computing transition probabilities: 100%|██████████| 114/114 [00:00<00:00, 7927.69it/s]


In [6]:
from networkx.algorithms.community.centrality import girvan_newman

communities_iter = girvan_newman(G)
communities_gn = []

for community_gn in next(communities_iter):
    communities_gn.append(community_gn)

In [7]:
from networkx.algorithms.community.modularity_max import greedy_modularity_communities

communities_gmc = greedy_modularity_communities(G)

communities_gmc = [list(c) for c in communities_gmc]
communities_gmc_dict = {}

c_idx = 0
for c in communities_gmc:
    for node in c:
        communities_gmc_dict[node] = c_idx
    c_idx += 1

In [8]:
# node2vec
idx_list = []
communities_dict = {}
data = []
with open(EMBEDDING_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    next(csv_reader)
    for row in csv_reader:
        idx_list.append(row[0])
        community = communities_gmc_dict[row[0]]
        communities_dict[int(row[0])] = community
        data.append((row[0],community,float(row[1]),float(row[2])))

In [9]:
# UMAP
node_vectors_loaded = KeyedVectors.load_word2vec_format(EMBEDDING_FILENAME)
wv = node_vectors_loaded.vectors
umap_obj = umap.UMAP(n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')
transformed = umap_obj.fit_transform(wv) # as per https://umap-learn.readthedocs.io/en/latest/basic_usage.html it preserves original ordering
np.savetxt(UMAP_FILENAME, transformed, delimiter=' ')

In [10]:
# UMAP
data_umap = []
with open(UMAP_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    i = 0
    for row in csv_reader:
        data_umap.append((idx_list[i],nodes_titles[int(idx_list[i])],communities_dict[int(idx_list[i])],row[0],row[1]))
        i += 1

In [11]:
df = pd.DataFrame(data_umap, columns = ['id', 'name', 'community', 'x', 'y']) 
x_data = round(df['x'].astype(float),3)
y_data = round(df['y'].astype(float),3)

In [12]:
time_label = Label(x=[0.75], y=[0.10], default_size=46, font_weight='bolder', colors=['orange'],
                   text=[str(0)], enable_move=True)

In [13]:
x_sc = LinearScale(min=0, max=1)
y_sc = LinearScale(min=0, max=1)
c_sc = OrdinalColorScale(domain=df['community'].unique().tolist(), colors=CATEGORY10)

In [14]:
ax_y = Axis(label='y', scale=y_sc, orientation='vertical', grid_lines='solid')
ax_x = Axis(label='x', scale=x_sc, orientation='horizontal', grid_lines='solid')

In [15]:
tt = Tooltip(fields=['name', 'x', 'y'], labels=['name', 'x', 'y'])

In [16]:
scat = Scatter(x=x_data, y=y_data, color=df['community'],
          names=df['name'], display_names=False,
          scales={'x': x_sc, 'y': y_sc, 'color': c_sc},
          default_size=500, tooltip=tt,
          unhovered_style={'opacity': 0.5}, animate=True)

In [17]:
fig = Figure(marks=[scat, time_label], axes=[ax_x, ax_y], title='Graph')

In [18]:
time_slider = IntSlider(min=min_year, max=max_year, step=1, description='Time', value=0)

In [19]:
time_interval = 1

In [20]:
def edge_or_reverse_matches_time(e,attrs,year):
    if e in attrs:
        if int(attrs[e]['year']) <= int(year):
            return True
        else:
            return False
    if (e[1],e[0] in attrs):
        if int(attrs[(e[1],e[0])]['year']) <= int(year):
            return True
        else:
            return False

In [21]:
def generate_filtered_UMAP_embeddings(years):
    data_dict = {}
    
    for year in years:
        filtered_edges = [e for e in G.edges() if edge_or_reverse_matches_time(e,attrs,year)]
        
        if len(filtered_edges)<=1:
            data_dict[year]=(None,None,None,None,None)
        else:
            G_filtered = G.__class__()
            G_filtered.add_edges_from(filtered_edges)

            # node2vec
            node2vec = Node2Vec(G_filtered, dimensions=32, walk_length=10, num_walks=100, workers=4)  # Use temp_folder for big graphs
            model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
            edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
            model.wv.save_word2vec_format(EMBEDDING_FILENAME)                    

            # UMAP
            node_vectors_loaded = KeyedVectors.load_word2vec_format(EMBEDDING_FILENAME)
            wv = node_vectors_loaded.vectors
            umap_obj = umap.UMAP(n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')
            transformed = umap_obj.fit_transform(wv) # as per https://umap-learn.readthedocs.io/en/latest/basic_usage.html it preserves original ordering
            np.savetxt(UMAP_FILENAME, transformed, delimiter=' ')

            order = []
            with open(EMBEDDING_FILENAME) as f:
                csv_reader = csv.reader(f, delimiter=' ')
                next(csv_reader)
                for row in csv_reader:
                    order.append(int(row[0]))

            # UMAP
            data_umap = []
            with open(UMAP_FILENAME) as f:
                csv_reader = csv.reader(f, delimiter=' ')
                i = 0
                for row in csv_reader:
                    idx = order[i]
                    data_umap.append((idx,nodes_titles[idx],communities_dict[idx],row[0],row[1]))
                    i += 1

            df = pd.DataFrame(data_umap, columns = ['id', 'name', 'community', 'x', 'y']) 

            names = df['name']
            x_data = round(df['x'].astype(float),3)
            y_data = round(df['y'].astype(float),3)
            colors = df['community']

            data_dict[year] = (names,colors,x_data,y_data)
    
    return data_dict


In [22]:
years = [y+min_year for y in list(range(max_year-min_year+1))]

In [23]:
data_dict = generate_filtered_UMAP_embeddings(years)

Computing transition probabilities: 100%|██████████| 5/5 [00:00<00:00, 3872.86it/s]
  "n_neighbors is larger than the dataset size; truncating to "
Computing transition probabilities: 100%|██████████| 5/5 [00:00<00:00, 14915.73it/s]
  "n_neighbors is larger than the dataset size; truncating to "
Computing transition probabilities: 100%|██████████| 5/5 [00:00<00:00, 14614.30it/s]
  "n_neighbors is larger than the dataset size; truncating to "
Computing transition probabilities: 100%|██████████| 16/16 [00:00<00:00, 15283.28it/s]
Computing transition probabilities: 100%|██████████| 21/21 [00:00<00:00, 12529.22it/s]
Computing transition probabilities: 100%|██████████| 30/30 [00:00<00:00, 15673.78it/s]
Computing transition probabilities: 100%|██████████| 33/33 [00:00<00:00, 11113.86it/s]
Computing transition probabilities: 100%|██████████| 36/36 [00:00<00:00, 13413.43it/s]
Computing transition probabilities: 100%|██████████| 40/40 [00:00<00:00, 18224.22it/s]
Computing transition probabiliti

In [24]:
scaler = MinMaxScaler()
    
def get_data(time):
    names = data_dict[time][0]
    colors = data_dict[time][1]
    result_x = data_dict[time][2]
    result_y = data_dict[time][3]
    
    if result_x is not None:
        result_x = (result_x - result_x.min()) / (result_x.max() - result_x.min())
    if result_y is not None:
        result_y = (result_y - result_y.min()) / (result_y.max() - result_y.min())

    return names, colors, result_x, result_y

In [25]:
def time_changed(change):
    scat.names, scat.color, scat.x, scat.y = get_data(time_slider.value)
    time_label.text = [str(time_slider.value)]

time_slider.observe(time_changed, 'value')

In [26]:
play_button = Play(min=min_year, max=max_year, interval=time_interval)
jslink((play_button, 'value'), (time_slider, 'value'))

In [27]:
VBox([HBox([play_button, time_slider]), fig])

VBox(children=(HBox(children=(Play(value=1985, interval=1, max=2009, min=1985), IntSlider(value=1985, descript…

In [28]:
embed_minimal_html('export.html', views=[VBox([fig])], title='Widgets export')

In [29]:
def interactive_umap(n_neighbors, min_dist, n_components):
    umap_obj = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components)
    transformed = umap_obj.fit_transform(wv)
    plt.figure(figsize=(7.5,7.5))
    seaborn.regplot(x=transformed[:, 0], y=transformed[:, 1], fit_reg=False)    
    
    plt.show()

In [30]:
interactive(interactive_umap, n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')

interactive(children=(IntSlider(value=15, description='n_neighbors', max=45, min=-15), FloatSlider(value=0.5, …