In [165]:
import pandas as pd
import numpy as np
import os
import csv
import networkx as nx
from networkx.algorithms import community

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
import umap
from bqplot import (
    LogScale, LinearScale, OrdinalColorScale, ColorAxis,
    Axis, Scatter, Lines, CATEGORY10, Label, Figure, Tooltip
)
from ipywidgets import HBox, VBox, IntSlider, Play, jslink
from ipywidgets.embed import embed_minimal_html
from ipywidgets import interact, interactive, fixed, interact_manual
import matplotlib.pyplot as plt
import seaborn
from gensim.models import KeyedVectors

In [166]:
DATA_FILENAME = 'data.csv'
EMBEDDING_FILENAME = 'embedding.csv'
UMAP_FILENAME = 'embedding_umap.csv'

In [167]:
G = nx.Graph()
attrs = {}

with open(DATA_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    for row in csv_reader:
        G.add_edge(row[0],row[1])
        attrs[(row[0],row[1])] = {'year': 2007}
        
nx.set_edge_attributes(G, attrs)

In [168]:
# node2vec
node2vec = Node2Vec(G, dimensions=32, walk_length=10, num_walks=100, workers=4)  # Use temp_folder for big graphs
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

Computing transition probabilities: 100%|██████████| 62/62 [00:00<00:00, 3998.44it/s]


In [169]:
from networkx.algorithms.community.centrality import girvan_newman

communities_iter = girvan_newman(G)
communities_gn = []

for community_gn in next(communities_iter):
    communities_gn.append(community_gn)

In [170]:
from networkx.algorithms.community.modularity_max import greedy_modularity_communities

communities_gmc = greedy_modularity_communities(G)

communities_gmc = [list(c) for c in communities_gmc]
communities_gmc_dict = {}

c_idx = 0
for c in communities_gmc:
    for node in c:
        communities_gmc_dict[node] = c_idx
    c_idx += 1

In [171]:
# UMAP
node_vectors_loaded = KeyedVectors.load_word2vec_format(EMBEDDING_FILENAME)
wv = node_vectors_loaded.vectors
umap_obj = umap.UMAP(n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')
transformed = umap_obj.fit_transform(wv) # as per https://umap-learn.readthedocs.io/en/latest/basic_usage.html it preserves original ordering
np.savetxt(UMAP_FILENAME, transformed, delimiter=' ')

In [172]:
# node2vec
idx_list = []
communities_idx = []
data = []
with open(EMBEDDING_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    next(csv_reader)
    for row in csv_reader:
        idx_list.append(row[0])
        community = communities_gmc_dict[row[0]]
        communities_idx.append(community)
        data.append((row[0],0,float(row[1]),float(row[2])))

In [173]:
# UMAP
data_umap = []
with open(UMAP_FILENAME) as f:
    csv_reader = csv.reader(f, delimiter=' ')
    idx = 0
    for row in csv_reader:
        data_umap.append((idx_list[idx],communities_idx[idx],row[0],row[1]))
        idx += 1

In [174]:
df = pd.DataFrame(data_umap, columns = ['name', 'community', 'x', 'y']) 
x_data = round(df['x'].astype(float),3)
y_data = round(df['y'].astype(float),3)

In [175]:
# df['community']

In [176]:
x_min, x_max = float(np.min(x_data.apply(np.min))), float(np.max(x_data.apply(np.max)))
y_min, y_max = float(np.min(y_data.apply(np.min))), float(np.max(y_data.apply(np.max)))

In [177]:
x_sc = LinearScale(min=x_min, max=x_max)
y_sc = LinearScale(min=y_min, max=y_max)
c_sc = OrdinalColorScale(domain=df['community'].unique().tolist(), colors=CATEGORY10)

In [178]:
color_dict = {}
for domain in c_sc.domain:
    color_dict[domain] = c_sc.colors[domain]

color_list = []
for idx in range(len(x_data)):
    color_list.append(color_dict[communities_idx[idx]])
    
color_list[:3]

color_df = pd.DataFrame(color_list, columns = ['color']) 

In [179]:
ax_y = Axis(label='y', scale=y_sc, orientation='vertical', grid_lines='solid')
ax_x = Axis(label='x', scale=x_sc, orientation='horizontal', grid_lines='solid')

In [180]:
tt = Tooltip(fields=['name', 'x', 'y'], labels=['name', 'x', 'y'])

In [181]:
scat = Scatter(x=x_data, y=y_data, color=df['community'],
          names=df['name'], display_names=False,
          scales={'x': x_sc, 'y': y_sc, 'color': c_sc},
          default_size=500, tooltip=tt,
          unhovered_style={'opacity': 0.5})

In [182]:
time_interval = 10

In [183]:
fig = Figure(marks=[scat], axes=[ax_x, ax_y], title='Graph')

In [184]:
VBox([fig])

VBox(children=(Figure(axes=[Axis(label='x', scale=LinearScale(max=5.958, min=-3.23)), Axis(label='y', orientat…

In [185]:
embed_minimal_html('export.html', views=[VBox([fig])], title='Widgets export')

In [64]:
def interactive_umap(n_neighbors, min_dist, n_components):
    umap_obj = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components)
    transformed = umap_obj.fit_transform(wv)
    plt.figure(figsize=(7.5,7.5))
    seaborn.regplot(x=transformed[:, 0], y=transformed[:, 1], fit_reg=False)    
    
    plt.show()

In [65]:
interactive(interactive_umap, n_neighbors=15, min_dist=0.5, n_components=2, metric='correlation')

interactive(children=(IntSlider(value=15, description='n_neighbors', max=45, min=-15), FloatSlider(value=0.5, …