In [1]:
from igraph import *
import numpy as np
import xnet
from collections import defaultdict
import glob
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from itertools import combinations
from scipy import spatial
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd
from floweaver import *



In [2]:
%matplotlib notebook

In [3]:
files = 'colabs/wbasic/*selected*pac5_cluster.xnet'

files = glob.glob(files)
files = sorted(files)

# data = xnet.xnet2igraph('../data/citation_network_ge1990_pacs.xnet')

In [None]:
bows = []
names = []
i = 0
for f in files:
    print(f)
    colab_net = xnet.xnet2igraph(f)
    bows.append(colab_net.vs['reduced_bow'])
    names.append(colab_net.vs['name'])

In [4]:
def to_list(bow):
    return [ast.literal_eval(b) for b in bow]

def bows_to_list(bows):
    new_bows = []
    for b in bows:
        new_bows.append(to_list(b))
    return new_bows

In [None]:
new_bows = bows_to_list(bows)

In [5]:
def find_k(new_bow):
    range_n_clusters = [50,60,70,80,90,100,110,120]
    X = new_bow
    sils = []
    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters,random_state=10)
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        sils.append(silhouette_avg)
    return np.argmax(sils)

In [6]:
range_n_clusters = [50,60,70,80,90,100,110,120]

def clustering(bow,names):
    #arg = find_k(bow)
    #k = range_n_clusters[arg]
    k = 100
    alg = KMeans(n_clusters=k,random_state=10)
    alg.fit(bow)
    c = 0
    return alg.labels_

def to_sets(names,labels):
    sets = dict()
    unique = set(labels)
    for l in unique:
        sets[l] = set()
    for n,l in zip(names,labels):
        sets[l].add(n)
    return sets

In [None]:
sets_series = []
for bow,name in zip(new_bows,names):
    labels = clustering(bow,name)
    sets = to_sets(name,labels)
    sets_series.append(sets)

In [7]:
def match_pair(sets_t,sets_t1,mink=50):
    out = []
    for l1,s1 in sets_t.items():
        if len(s1) < mink:
            continue
        min_dist = 100000
        label = -1
        for l2,s2 in sets_t1.items():
            if len(s2) < mink:
                continue
            d = 1 - len(s1&s2)/len(s1|s2)
            if d < min_dist:
                label = l2
                min_dist = d
        out.append((l1,label))
    # retorna quem dá continuidade a quem
    return out

def match(sets_t,sets_t1,mink=50):
    out1 = match_pair(sets_t,sets_t1,mink)
    out2 = match_pair(sets_t1,sets_t,mink)
    
    path = []
    for (a,b) in out1:
        if (b,a) in out2:
            path.append((a,b))
    return path

In [8]:
def match_all_series(sets_series,mink):
    n = len(sets_series)
    paths = []
    for i in range(n-1):
        ps = match(sets_series[i],sets_series[i+1],mink)
        paths.append(ps)
    return paths

In [9]:
def get_arg(i,paths):
    for arg,p in enumerate(paths):
        if p[0] == i:
            return arg
    return -1

def get_paths_graph(paths,nv=100):
    paths_graph = Graph(directed=True)
    n = len(paths)
    names = []
    for i in range(n+1):
        for j in range(nv):
            names.append(str(i) + '_' + str(j))
    paths_graph.add_vertices(len(names))
    paths_graph.vs['name'] = names

    for idx_begin in range(n):
        for i in range(nv):
            idx = idx_begin
            arg = get_arg(i,paths[idx])
            while arg >= 0:
                step = paths[idx][arg]
                a = str(idx) + '_' + str(step[0])
                b = str(idx+1) + '_' + str(step[1])
                paths_graph.add_edge(a,b)
                j = paths[idx][arg][1]
                idx += 1
                if idx == len(paths):
                    break
                arg = get_arg(j,paths[idx])
    paths_graph.simplify()
    return paths_graph

In [10]:
def get_paths(paths_graph,mink):
    paths_graph.to_undirected()
    valid_paths = []
    for comp in paths_graph.decompose(mode=WEAK,minelements=mink):
        valid_paths.append(comp.vs['name'])
    return valid_paths

In [66]:
files = 'pacs/infomap/pac_net_*.xnet'

files = glob.glob(files)
files = sorted(files)

In [67]:
names = []
communities = []
for f in files:
    print(f)
    pac_net = xnet.xnet2igraph(f)
    names.append(pac_net.vs['name'])
    communities.append([int(c) for c in pac_net.vs['community']])

pacs/infomap/pac_net_1990_infomap.xnet
pacs/infomap/pac_net_1991_infomap.xnet
pacs/infomap/pac_net_1992_infomap.xnet
pacs/infomap/pac_net_1993_infomap.xnet
pacs/infomap/pac_net_1994_infomap.xnet
pacs/infomap/pac_net_1995_infomap.xnet
pacs/infomap/pac_net_1996_infomap.xnet
pacs/infomap/pac_net_1997_infomap.xnet
pacs/infomap/pac_net_1998_infomap.xnet
pacs/infomap/pac_net_1999_infomap.xnet
pacs/infomap/pac_net_2000_infomap.xnet
pacs/infomap/pac_net_2001_infomap.xnet
pacs/infomap/pac_net_2002_infomap.xnet
pacs/infomap/pac_net_2003_infomap.xnet
pacs/infomap/pac_net_2004_infomap.xnet
pacs/infomap/pac_net_2005_infomap.xnet
pacs/infomap/pac_net_2006_infomap.xnet
pacs/infomap/pac_net_2007_infomap.xnet
pacs/infomap/pac_net_2008_infomap.xnet
pacs/infomap/pac_net_2009_infomap.xnet
pacs/infomap/pac_net_2010_infomap.xnet


In [68]:
sets = [to_sets(names1,communities1) for names1,communities1 in zip(names,communities)]

In [69]:
paths_sets = match_all_series(sets,1)
g_pacs_paths = get_paths_graph(paths_sets,nv=len(names[0]))
valid_paths = get_paths(g_pacs_paths,2)

In [93]:
pacs_names = {'01' : 'Communication, education, history, and philosophy'
,'02' : 'Mathematical methods in physics'
,'03' : 'Quantum mechanics, field theories, and special relativity'
,'04' : 'General relativity and gravitation'
,'05' : 'Statistical physics, thermodynamics, and nonlinear dynamical systems'
,'06' : 'Metrology, measurements, and laboratory procedures'
,'07' : 'Instruments, apparatus, and components common to several branches of physics and astronomy'
,'11' : 'General theory of fields and particles'
,'12' : 'Specific theories and interaction models; particle systematics'
,'13' : 'Specific reactions and phenomenology'
,'14' : 'Properties of specific particles'
,'21' : 'Nuclear structure'
,'23' : 'Radioactive decay and in-beam spectroscopy'
,'24' : 'Nuclear reactions: general'
,'25' : 'Nuclear reactions: specific reactions'
,'26' : 'Nuclear astrophysics'
,'27' : 'Properties of specific nuclei listed by mass ranges'
,'28' : 'Nuclear engineering and nuclear power studies'
,'29' : 'Experimental methods and instrumentation for elementary-particle and nuclear physics'
,'31' : 'Electronic structure of atoms and molecules: theory'
,'32' : 'Atomic properties and interactions with photons'
,'33' : 'Molecular properties and interactions with photons'
,'34' : 'Atomic and molecular collision processes and interactions'
,'36' : 'Exotic atoms and molecules; macromolecules; clusters'
,'37' : 'Mechanical control of atoms, molecules, and ions'
,'41' : 'Electromagnetism; electron and ion optics'
,'42' : 'Optics'
,'43' : 'Acoustics'
,'44' : 'Heat transfer'
,'45' : 'Classical mechanics of discrete systems'
,'46' : 'Continuum mechanics of solids'
,'47' : 'Fluid dynamics'
,'51' : 'Physics of gases'
,'52' : 'Physics of plasmas and electric discharges'
,'61' : 'Structure of solids and liquids; crystallography'
,'62' : 'Mechanical and acoustical properties of condensed matter'
,'63' : 'Lattice dynamics'
,'64' : 'Equations of state, phase equilibria, and phase transitions'
,'65' : 'Thermal properties of condensed matter'
,'66' : 'Nonelectronic transport properties of condensed matter'
,'67' : 'Quantum fluids and solids'
,'68' : 'Surfaces and interfaces; thin films and nanosystems (structure and nonelectronic properties)'
,'71' : 'Electronic structure of bulk materials'
,'72' : 'Electronic transport in condensed matter'
,'73' : 'Electronic structure and electrical properties of surfaces, interfaces, thin films, and low-dimensional structures'
,'74' : 'Superconductivity'
,'75' : 'Magnetic properties and materials'
,'76' : 'Magnetic resonances and relaxations in condensed matter, Mössbauer effect'
,'77' : 'Dielectrics, piezoelectrics, and ferroelectrics and their properties'
,'78' : 'Optical properties, condensed-matter spectroscopy and other interactions of radiation and particles with condensed matter'
,'79' : 'Electron and ion emission by liquids and solids; impact phenomena'
,'81' : 'Materials science'
,'82' : 'Physical chemistry and chemical physics'
,'83' : 'Rheology'
,'84' : 'Electronics; radiowave and microwave technology; direct energy conversion and storage'
,'85' : 'Electronic and magnetic devices; microelectronics'
,'87' : 'Biological and medical physics'
,'88' : 'Renewable energy resources and applications'
,'89' : 'Other areas of applied and interdisciplinary physics'
,'91' : 'Solid Earth physics'
,'92' : 'Hydrospheric and atmospheric geophysics'
,'93' : 'Geophysical observations, instrumentation, and techniques'
,'94' : 'Physics of the ionosphere and magnetosphere'
,'95' : 'Fundamental astronomy and astrophysics; instrumentation, techniques, and astronomical observations'
,'96' : 'Solar system; planetology'
,'97' : 'Stars'
,'98' : 'Stellar systems; interstellar medium; galactic and extragalactic objects and systems; the Universe'}

In [94]:
for c,ps in enumerate(valid_paths):
    pacs_in_cluster = set()
    for p in ps:
        p = p.split('_')
        idx = int(p[0])
        comm = int(p[1])
        pacs_in_cluster |= sets[idx][comm]
    print(c)
    for p in pacs_in_cluster:
        print(pacs_names[p])
    print()

0
Physical chemistry and chemical physics
Renewable energy resources and applications
Electron and ion emission by liquids and solids; impact phenomena
Electronic and magnetic devices; microelectronics
Electromagnetism; electron and ion optics
Surfaces and interfaces; thin films and nanosystems (structure and nonelectronic properties)
Electronic transport in condensed matter
Molecular properties and interactions with photons
Quantum fluids and solids
Mathematical methods in physics
Other areas of applied and interdisciplinary physics
Heat transfer
Hydrospheric and atmospheric geophysics


KeyError: '86'

In [70]:
def get_groups(valid_paths):
    groups_by_pac = dict()
    for i,ps in enumerate(valid_paths):
        for p in ps:
            p = p.split('_')
            idx = int(p[0])
            c = int(p[1])
            for pac in sets[idx][c]:
                if not pac in groups_by_pac:
                    groups_by_pac[pac] = []
                groups_by_pac[pac].append((idx+1990,i))
    return groups_by_pac

In [71]:
pac_hist = get_groups(valid_paths)

In [72]:
n = len(valid_paths)
transitions = dict()
for i in range(1990,2010):
    for j in range(n):
        for k in range(n):
            transitions[((i,j),(i+1,k))] = 0
for pac,hist in pac_hist.items():
    hist = sorted(hist)
    hist_len = len(hist)
    for i in range(hist_len-1):
        key = (hist[i],hist[i+1])
        if key in transitions:
            transitions[key] += 1

In [73]:
from collections import defaultdict

# conv = defaultdict(lambda:len(conv)+1)
source = []
target = []
value = []
colors = []
for k,v in transitions.items():
    print(k,v)
    if v > 0:
        colors.append(k[0][1])
        source.append(str(k[0]))
        target.append(str(k[1]))
        value.append(v)

((1990, 0), (1991, 0)) 49
((1990, 0), (1991, 1)) 0
((1990, 0), (1991, 2)) 0
((1990, 0), (1991, 3)) 0
((1990, 0), (1991, 4)) 0
((1990, 0), (1991, 5)) 0
((1990, 0), (1991, 6)) 0
((1990, 0), (1991, 7)) 0
((1990, 0), (1991, 8)) 0
((1990, 0), (1991, 9)) 0
((1990, 0), (1991, 10)) 1
((1990, 0), (1991, 11)) 0
((1990, 0), (1991, 12)) 0
((1990, 0), (1991, 13)) 0
((1990, 0), (1991, 14)) 0
((1990, 0), (1991, 15)) 0
((1990, 1), (1991, 0)) 0
((1990, 1), (1991, 1)) 5
((1990, 1), (1991, 2)) 0
((1990, 1), (1991, 3)) 0
((1990, 1), (1991, 4)) 0
((1990, 1), (1991, 5)) 0
((1990, 1), (1991, 6)) 0
((1990, 1), (1991, 7)) 0
((1990, 1), (1991, 8)) 0
((1990, 1), (1991, 9)) 0
((1990, 1), (1991, 10)) 0
((1990, 1), (1991, 11)) 0
((1990, 1), (1991, 12)) 0
((1990, 1), (1991, 13)) 0
((1990, 1), (1991, 14)) 0
((1990, 1), (1991, 15)) 0
((1990, 2), (1991, 0)) 0
((1990, 2), (1991, 1)) 0
((1990, 2), (1991, 2)) 5
((1990, 2), (1991, 3)) 0
((1990, 2), (1991, 4)) 0
((1990, 2), (1991, 5)) 0
((1990, 2), (1991, 6)) 0
((1990, 2), 

((1994, 9), (1995, 15)) 0
((1994, 10), (1995, 0)) 0
((1994, 10), (1995, 1)) 0
((1994, 10), (1995, 2)) 0
((1994, 10), (1995, 3)) 0
((1994, 10), (1995, 4)) 0
((1994, 10), (1995, 5)) 0
((1994, 10), (1995, 6)) 0
((1994, 10), (1995, 7)) 0
((1994, 10), (1995, 8)) 0
((1994, 10), (1995, 9)) 0
((1994, 10), (1995, 10)) 1
((1994, 10), (1995, 11)) 0
((1994, 10), (1995, 12)) 0
((1994, 10), (1995, 13)) 0
((1994, 10), (1995, 14)) 0
((1994, 10), (1995, 15)) 0
((1994, 11), (1995, 0)) 0
((1994, 11), (1995, 1)) 0
((1994, 11), (1995, 2)) 0
((1994, 11), (1995, 3)) 0
((1994, 11), (1995, 4)) 0
((1994, 11), (1995, 5)) 0
((1994, 11), (1995, 6)) 0
((1994, 11), (1995, 7)) 0
((1994, 11), (1995, 8)) 0
((1994, 11), (1995, 9)) 0
((1994, 11), (1995, 10)) 0
((1994, 11), (1995, 11)) 0
((1994, 11), (1995, 12)) 0
((1994, 11), (1995, 13)) 0
((1994, 11), (1995, 14)) 0
((1994, 11), (1995, 15)) 0
((1994, 12), (1995, 0)) 0
((1994, 12), (1995, 1)) 0
((1994, 12), (1995, 2)) 0
((1994, 12), (1995, 3)) 0
((1994, 12), (1995, 4)) 0


((1998, 8), (1999, 7)) 0
((1998, 8), (1999, 8)) 0
((1998, 8), (1999, 9)) 0
((1998, 8), (1999, 10)) 0
((1998, 8), (1999, 11)) 0
((1998, 8), (1999, 12)) 0
((1998, 8), (1999, 13)) 0
((1998, 8), (1999, 14)) 0
((1998, 8), (1999, 15)) 0
((1998, 9), (1999, 0)) 0
((1998, 9), (1999, 1)) 0
((1998, 9), (1999, 2)) 0
((1998, 9), (1999, 3)) 0
((1998, 9), (1999, 4)) 0
((1998, 9), (1999, 5)) 0
((1998, 9), (1999, 6)) 0
((1998, 9), (1999, 7)) 0
((1998, 9), (1999, 8)) 0
((1998, 9), (1999, 9)) 0
((1998, 9), (1999, 10)) 0
((1998, 9), (1999, 11)) 0
((1998, 9), (1999, 12)) 0
((1998, 9), (1999, 13)) 0
((1998, 9), (1999, 14)) 0
((1998, 9), (1999, 15)) 0
((1998, 10), (1999, 0)) 0
((1998, 10), (1999, 1)) 0
((1998, 10), (1999, 2)) 0
((1998, 10), (1999, 3)) 0
((1998, 10), (1999, 4)) 0
((1998, 10), (1999, 5)) 0
((1998, 10), (1999, 6)) 0
((1998, 10), (1999, 7)) 0
((1998, 10), (1999, 8)) 0
((1998, 10), (1999, 9)) 0
((1998, 10), (1999, 10)) 0
((1998, 10), (1999, 11)) 0
((1998, 10), (1999, 12)) 0
((1998, 10), (1999, 13

((2003, 6), (2004, 9)) 0
((2003, 6), (2004, 10)) 0
((2003, 6), (2004, 11)) 0
((2003, 6), (2004, 12)) 0
((2003, 6), (2004, 13)) 0
((2003, 6), (2004, 14)) 0
((2003, 6), (2004, 15)) 0
((2003, 7), (2004, 0)) 0
((2003, 7), (2004, 1)) 0
((2003, 7), (2004, 2)) 0
((2003, 7), (2004, 3)) 0
((2003, 7), (2004, 4)) 0
((2003, 7), (2004, 5)) 0
((2003, 7), (2004, 6)) 0
((2003, 7), (2004, 7)) 1
((2003, 7), (2004, 8)) 0
((2003, 7), (2004, 9)) 0
((2003, 7), (2004, 10)) 0
((2003, 7), (2004, 11)) 0
((2003, 7), (2004, 12)) 0
((2003, 7), (2004, 13)) 0
((2003, 7), (2004, 14)) 0
((2003, 7), (2004, 15)) 0
((2003, 8), (2004, 0)) 0
((2003, 8), (2004, 1)) 0
((2003, 8), (2004, 2)) 0
((2003, 8), (2004, 3)) 0
((2003, 8), (2004, 4)) 0
((2003, 8), (2004, 5)) 0
((2003, 8), (2004, 6)) 0
((2003, 8), (2004, 7)) 0
((2003, 8), (2004, 8)) 0
((2003, 8), (2004, 9)) 0
((2003, 8), (2004, 10)) 0
((2003, 8), (2004, 11)) 0
((2003, 8), (2004, 12)) 0
((2003, 8), (2004, 13)) 0
((2003, 8), (2004, 14)) 0
((2003, 8), (2004, 15)) 0
((2003,

((2008, 4), (2009, 10)) 0
((2008, 4), (2009, 11)) 0
((2008, 4), (2009, 12)) 0
((2008, 4), (2009, 13)) 0
((2008, 4), (2009, 14)) 0
((2008, 4), (2009, 15)) 0
((2008, 5), (2009, 0)) 0
((2008, 5), (2009, 1)) 0
((2008, 5), (2009, 2)) 0
((2008, 5), (2009, 3)) 0
((2008, 5), (2009, 4)) 0
((2008, 5), (2009, 5)) 0
((2008, 5), (2009, 6)) 0
((2008, 5), (2009, 7)) 0
((2008, 5), (2009, 8)) 0
((2008, 5), (2009, 9)) 0
((2008, 5), (2009, 10)) 0
((2008, 5), (2009, 11)) 0
((2008, 5), (2009, 12)) 0
((2008, 5), (2009, 13)) 0
((2008, 5), (2009, 14)) 0
((2008, 5), (2009, 15)) 0
((2008, 6), (2009, 0)) 0
((2008, 6), (2009, 1)) 0
((2008, 6), (2009, 2)) 0
((2008, 6), (2009, 3)) 0
((2008, 6), (2009, 4)) 0
((2008, 6), (2009, 5)) 0
((2008, 6), (2009, 6)) 0
((2008, 6), (2009, 7)) 0
((2008, 6), (2009, 8)) 0
((2008, 6), (2009, 9)) 0
((2008, 6), (2009, 10)) 0
((2008, 6), (2009, 11)) 0
((2008, 6), (2009, 12)) 0
((2008, 6), (2009, 13)) 0
((2008, 6), (2009, 14)) 0
((2008, 6), (2009, 15)) 0
((2008, 7), (2009, 0)) 0
((2008,

In [74]:
d = {'source':source,'target':target,'value':value,'color':colors}
dataframe = Dataset(pd.DataFrame(data=d))

In [75]:
def get_clusters_idx(year,source,target):
    clusters = set()
    for s in source:
        if year in s:
            clusters.add(s)
            
    for t in target:
        if year in t:
            clusters.add(t)
    
    clusters = sorted(list(clusters))
    return clusters

size = dict(width=1800, height=800)

nodes = dict()
ordering = []
for y in range(1990,2011):
    y = str(y)
    clusters = get_clusters_idx(y,source,target)
    nodes[y] = ProcessGroup(clusters)
    nodes[y].partition = Partition.Simple('process',clusters)
    ordering.append([y])

In [76]:
bundles = [Bundle(ordering[i][0],ordering[i+1][0]) for i in range(len(ordering)-1)]
sdd = SankeyDefinition(nodes, bundles, ordering,flow_partition=dataframe.partition('color'))
weave(sdd, dataframe, palette='Paired_12').to_widget(**size).auto_save_png('pacs_tracking.png')

SankeyWidget(groups=[{'id': '1990', 'type': 'process', 'title': '', 'nodes': ['1990^(1990, 0)', '1990^(1990, 1…

In [19]:
# pytl.set_credentials_file(username='caro_mb', api_key='HIu2L6F757oTdLhP5db9')

# data_trace = dict(
#     type='sankey',
#     domain = dict(
#       x =  [0,1],
#       y =  [0,1]
#     ),
#     orientation = "h",
#     valueformat = ".0f",
#     node = dict(
#       pad = 5,
#       thickness = 20,
#       line = dict(
#         color = "black",
#         width = 0.5
#       ),
#       label =  labels,
#       color = 'gray'
#     ),
#     link = dict(
#       source = source,
#       target = target,
#       value = value,
#   )
# )

# layout =  dict(
#     title = "Clusters tracking",
#     height = 800,
#     width = 1800,
#     font = dict(
#       size = 10
#     ),    
    
# )


# fig = dict(data=[data_trace], layout=layout)
# py.iplot(fig, validate=False)

In [32]:
# https://plot.ly/~alishobeiri/1591/plotly-sankey-diagrams/#/