In [1]:
import pandas as pd
import numpy as np
import os
import csv
import networkx as nx
from networkx.algorithms import community

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
import umap
from bqplot import (
    LogScale, LinearScale, OrdinalColorScale, ColorAxis,
    Axis, Scatter, Lines, CATEGORY10, Label, Figure, Tooltip
)
from ipywidgets import HBox, VBox, IntSlider, Play, jslink
from ipywidgets.embed import embed_minimal_html
from ipywidgets import interact, interactive, fixed, interact_manual
import matplotlib.pyplot as plt
import seaborn
from gensim.models import KeyedVectors

In [2]:
DATA_FILENAME = 'data.csv'
EMBEDDING_FILENAME = 'embedding.csv'
UMAP_FILENAME = 'embedding_umap.csv'

In [63]:
G = nx.Graph()
attrs = {}

with open(DATA_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    for row in csv_reader:
        e = (row[0], row[1])
        G.add_edge(row[0], row[1])
        attrs[e] = {'year': int(row[0])%10} 
        
nx.set_edge_attributes(G, attrs)

In [64]:
# node2vec
node2vec = Node2Vec(G, dimensions=32, walk_length=10, num_walks=100, workers=4)  # Use temp_folder for big graphs
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

Computing transition probabilities: 100%|██████████| 62/62 [00:00<00:00, 4566.47it/s]


In [65]:
from networkx.algorithms.community.centrality import girvan_newman

communities_iter = girvan_newman(G)
communities_gn = []

for community_gn in next(communities_iter):
    communities_gn.append(community_gn)

In [66]:
from networkx.algorithms.community.modularity_max import greedy_modularity_communities

communities_gmc = greedy_modularity_communities(G)

communities_gmc = [list(c) for c in communities_gmc]
communities_gmc_dict = {}

c_idx = 0
for c in communities_gmc:
    for node in c:
        communities_gmc_dict[node] = c_idx
    c_idx += 1

In [67]:
# node2vec
idx_list = []
communities_dict = {}
data = []
with open(EMBEDDING_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    next(csv_reader)
    for row in csv_reader:
        idx_list.append(row[0])
        community = communities_gmc_dict[row[0]]
        communities_dict[int(row[0])] = community
        data.append((row[0],community,float(row[1]),float(row[2])))

In [68]:
# UMAP
node_vectors_loaded = KeyedVectors.load_word2vec_format(EMBEDDING_FILENAME)
wv = node_vectors_loaded.vectors
umap_obj = umap.UMAP(n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')
transformed = umap_obj.fit_transform(wv) # as per https://umap-learn.readthedocs.io/en/latest/basic_usage.html it preserves original ordering
np.savetxt(UMAP_FILENAME, transformed, delimiter=' ')

In [69]:
# UMAP
data_umap = []
with open(UMAP_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    i = 0
    for row in csv_reader:
        data_umap.append((idx_list[i],communities_dict[int(idx_list[i])],row[0],row[1]))
        i += 1

In [70]:
df = pd.DataFrame(data_umap, columns = ['name', 'community', 'x', 'y']) 
x_data = round(df['x'].astype(float),3)
y_data = round(df['y'].astype(float),3)

In [71]:
time_label = Label(x=[0.75], y=[0.10], default_size=46, font_weight='bolder', colors=['orange'],
                   text=[str(0)], enable_move=True)

In [72]:
x_sc = LinearScale(min=-15, max=15)
y_sc = LinearScale(min=-15, max=15)
c_sc = OrdinalColorScale(domain=df['community'].unique().tolist(), colors=CATEGORY10)

In [73]:
ax_y = Axis(label='y', scale=y_sc, orientation='vertical', grid_lines='solid')
ax_x = Axis(label='x', scale=x_sc, orientation='horizontal', grid_lines='solid')

In [74]:
tt = Tooltip(fields=['name', 'community', 'x', 'y'], labels=['name', 'community', 'x', 'y'])

In [75]:
scat = Scatter(x=x_data, y=y_data, color=df['community'],
          names=df['name'], display_names=False,
          scales={'x': x_sc, 'y': y_sc, 'color': c_sc},
          default_size=500, tooltip=tt,
          unhovered_style={'opacity': 0.5}, animate=True)

In [76]:
time_interval = 1

In [77]:
fig = Figure(marks=[scat, time_label], axes=[ax_x, ax_y], title='Graph')

In [94]:
time_slider = IntSlider(min=0, max=9, step=1, description='Time', value=0)

In [95]:
# x_min, x_max = float(np.min(x_data.apply(np.min))), float(np.max(x_data.apply(np.max)))
# y_min, y_max = float(np.min(y_data.apply(np.min))), float(np.max(y_data.apply(np.max)))
# x_sc = LinearScale(min=x_min, max=x_max)
# y_sc = LinearScale(min=y_min, max=y_max)

In [96]:
def edge_or_reverse_matches_time(e,attrs,year):
    if e in attrs:
        if int(attrs[e]['year']) <= int(year):
            return True
        else:
            return False
    if (e[1],e[0] in attrs):
        if int(attrs[(e[1],e[0])]['year']) <= int(year):
            return True
        else:
            return False

In [97]:
def generate_filtered_UMAP_embeddings(years):
    data_dict = {}
    
    for year in years:
        filtered_edges = [e for e in G.edges() if edge_or_reverse_matches_time(e,attrs,year)]
        
        if len(filtered_edges)==0:
            data_dict[year]=(None,None,None,None)
        else:
            G_filtered = G.__class__()
            G_filtered.add_edges_from(filtered_edges)

            # node2vec
            node2vec = Node2Vec(G_filtered, dimensions=32, walk_length=10, num_walks=100, workers=4)  # Use temp_folder for big graphs
            model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
            edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
            model.wv.save_word2vec_format(EMBEDDING_FILENAME)                    

            # UMAP
            node_vectors_loaded = KeyedVectors.load_word2vec_format(EMBEDDING_FILENAME)
            wv = node_vectors_loaded.vectors
            umap_obj = umap.UMAP(n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')
            transformed = umap_obj.fit_transform(wv) # as per https://umap-learn.readthedocs.io/en/latest/basic_usage.html it preserves original ordering
            np.savetxt(UMAP_FILENAME, transformed, delimiter=' ')

            order = []
            with open(EMBEDDING_FILENAME) as f:
                csv_reader = csv.reader(f, delimiter=' ')
                next(csv_reader)
                for row in csv_reader:
                    order.append(int(row[0]))

            # UMAP
            data_umap = []
            with open(UMAP_FILENAME) as f:
                csv_reader = csv.reader(f, delimiter=' ')
                i = 0
                for row in csv_reader:
                    idx = order[i]
                    data_umap.append((idx,communities_dict[idx],row[0],row[1]))
                    i += 1

            df = pd.DataFrame(data_umap, columns = ['name', 'community', 'x', 'y']) 
            names = df['name']
            x_data = round(df['x'].astype(float),3)
            y_data = round(df['y'].astype(float),3)
            colors = df['community']

            data_dict[year] = (names,colors,x_data,y_data)
    
    return data_dict


Computing transition probabilities: 100%|██████████| 21/21 [00:00<00:00, 11259.16it/s]
Computing transition probabilities: 100%|██████████| 35/35 [00:00<00:00, 12423.89it/s]
Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 9598.44it/s]
Computing transition probabilities: 100%|██████████| 48/48 [00:00<00:00, 9592.92it/s]


In [102]:
data_dict = generate_filtered_UMAP_embeddings([0,1,2,3,4,5,6,7,8,9])

Computing transition probabilities: 100%|██████████| 21/21 [00:00<00:00, 10615.93it/s]
Computing transition probabilities: 100%|██████████| 35/35 [00:00<00:00, 10782.27it/s]
Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 11320.32it/s]
Computing transition probabilities: 100%|██████████| 48/48 [00:00<00:00, 7672.80it/s]
Computing transition probabilities: 100%|██████████| 50/50 [00:00<00:00, 5085.48it/s]
Computing transition probabilities: 100%|██████████| 52/52 [00:00<00:00, 6894.60it/s]
Computing transition probabilities: 100%|██████████| 55/55 [00:00<00:00, 7598.63it/s]
Computing transition probabilities: 100%|██████████| 57/57 [00:00<00:00, 4480.84it/s]
Computing transition probabilities: 100%|██████████| 61/61 [00:00<00:00, 2332.00it/s]
Computing transition probabilities: 100%|██████████| 62/62 [00:00<00:00, 5201.35it/s]


In [103]:
def get_data(time):
    names = data_dict[time][0]
    colors = data_dict[time][1]
    result_x = data_dict[time][2]
    result_y = data_dict[time][3]
    return names, colors, result_x, result_y

In [104]:
def time_changed(change):
    scat.names, scat.color, scat.x, scat.y = get_data(time_slider.value)
    time_label.text = [str(time_slider.value)]
    print(str(time_slider.value) + " " + str(len(scat.x)))

time_slider.observe(time_changed, 'value')

In [105]:
play_button = Play(min=0, max=9, interval=time_interval)
jslink((play_button, 'value'), (time_slider, 'value'))

In [106]:
VBox([HBox([play_button, time_slider]), fig])

VBox(children=(HBox(children=(Play(value=0, interval=1, max=9), IntSlider(value=5, description='Time', max=9))…

1 35
1 35
2 45
2 45
3 48
3 48
4 50
4 50
5 52
5 52
6 55
6 55
7 57
7 57
8 61
8 61
9 62
9 62
8 61
8 61
7 57
7 57
6 55
6 55
5 52
5 52
4 50
4 50
3 48
3 48
2 45
2 45
1 35
1 35
0 21
0 21
1 35
1 35
2 45
2 45
3 48
3 48
4 50
4 50
5 52
5 52
6 55
6 55
7 57
7 57
8 61
8 61
9 62
9 62
8 61
8 61
6 55
6 55
0 21
0 21
1 35
1 35
2 45
2 45
3 48
3 48
4 50
4 50
5 52
5 52
6 55
6 55
7 57
7 57
8 61
8 61
7 57
7 57
6 55
6 55
5 52
5 52
4 50
4 50
3 48
3 48
2 45
2 45


In [26]:
embed_minimal_html('export.html', views=[VBox([fig])], title='Widgets export')

In [27]:
def interactive_umap(n_neighbors, min_dist, n_components):
    umap_obj = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components)
    transformed = umap_obj.fit_transform(wv)
    plt.figure(figsize=(7.5,7.5))
    seaborn.regplot(x=transformed[:, 0], y=transformed[:, 1], fit_reg=False)    
    
    plt.show()

In [28]:
interactive(interactive_umap, n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')

interactive(children=(IntSlider(value=15, description='n_neighbors', max=45, min=-15), FloatSlider(value=0.5, …