In [1]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans

from collections import defaultdict
from itertools import combinations
from scipy import spatial
from floweaver import *
from igraph import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xnet
import glob
import ast

In [2]:
%matplotlib notebook

In [3]:
def to_list(bow):
    """Parse every serialized entry of ``bow`` with ``ast.literal_eval``.

    Returns a new list holding the parsed values in the original order.
    """
    return list(map(ast.literal_eval, bow))

def bows_to_list(bows):
    """Apply ``to_list`` to each element of ``bows``.

    Returns a list with one parsed list per input element, in the same order.
    """
    return [to_list(bow) for bow in bows]

In [4]:
def find_k(new_bow, range_n_clusters=(50, 60, 70, 80, 90, 100, 110, 120)):
    """Score candidate cluster counts by mean silhouette and pick the best.

    For every candidate a KMeans model (``random_state=10``) is fitted on
    ``new_bow`` and the average silhouette score of the resulting labels is
    printed and recorded.

    Parameters
    ----------
    new_bow :
        Feature matrix passed unchanged to ``KMeans.fit_predict`` and
        ``silhouette_score``.
    range_n_clusters : sequence of int, optional
        Candidate numbers of clusters. Defaults to the values that were
        previously hard-coded in this function.

    Returns
    -------
    The POSITION (as returned by ``np.argmax``) of the best-scoring candidate
    inside ``range_n_clusters`` -- not the number of clusters itself.
    """
    sils = []
    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(new_bow)
        silhouette_avg = silhouette_score(new_bow, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        sils.append(silhouette_avg)
    return np.argmax(sils)

In [5]:
# Candidate numbers of clusters: 50, 60, ..., 120.
range_n_clusters = list(range(50, 130, 10))

def clustering(bow, names, k=100):
    """Cluster the rows of ``bow`` with KMeans and return their labels.

    Parameters
    ----------
    bow :
        Feature matrix passed to ``KMeans.fit``.
    names :
        Not used by this function; kept so existing callers keep working.
    k : int, optional
        Number of clusters. Defaults to 100, the value that used to be
        hard-coded here. ``find_k`` can be used to pick a value from
        ``range_n_clusters`` instead (``k = range_n_clusters[find_k(bow)]``).

    Returns
    -------
    The ``labels_`` attribute of the fitted KMeans model (one label per row).
    """
    alg = KMeans(n_clusters=k, random_state=10)
    alg.fit(bow)
    return alg.labels_

def to_sets(names,labels):
    """Group ``names`` by their label.

    Every distinct value in ``labels`` gets an entry (initially an empty set);
    names and labels are then paired positionally and each name is added to
    the set of its label.

    Returns a dict mapping label -> set of names.
    """
    groups = {label: set() for label in set(labels)}
    for name, label in zip(names, labels):
        groups[label].add(name)
    return groups

In [6]:
def match_pair(sets_t,sets_t1,mink=50):
    """For each large-enough set in ``sets_t`` find its closest set in ``sets_t1``.

    Closeness is the Jaccard distance ``1 - |A & B| / |A | B|``. Sets with
    fewer than ``mink`` members are ignored on both sides. Among equally
    distant candidates the first one in ``sets_t1`` iteration order wins; when
    no candidate qualifies the partner label is ``-1``.

    Returns a list of ``(label_in_sets_t, label_in_sets_t1)`` pairs, i.e. which
    set is continued by which.
    """
    def jaccard_distance(left, right):
        return 1 - len(left & right)/len(left | right)

    pairs = []
    for label, members in sets_t.items():
        if len(members) < mink:
            continue
        scored = (
            (jaccard_distance(members, other), other_label)
            for other_label, other in sets_t1.items()
            if len(other) >= mink
        )
        # min() keeps the first of several equally distant candidates; the
        # default reproduces the "nothing found" sentinel (distance 100000, label -1).
        _, partner = min(scored, key=lambda pair: pair[0], default=(100000, -1))
        pairs.append((label, partner))
    return pairs

def match(sets_t,sets_t1,mink=50):
    """Return the reciprocal best matches between two snapshots of sets.

    ``match_pair`` is run in both directions; a pair ``(a, b)`` is kept only
    when ``b`` is the best match of ``a`` AND ``a`` is the best match of ``b``.

    Parameters
    ----------
    sets_t, sets_t1 : dict
        Mappings label -> set for the earlier and the later snapshot.
    mink : int, optional
        Minimum set size forwarded to ``match_pair``.

    Returns
    -------
    list of ``(label_in_sets_t, label_in_sets_t1)`` tuples, in the order
    produced by the forward ``match_pair`` call.
    """
    forward = match_pair(sets_t, sets_t1, mink)
    # A set gives constant-time reciprocity checks instead of scanning the
    # backward list once per forward pair.
    backward = set(match_pair(sets_t1, sets_t, mink))
    return [(a, b) for (a, b) in forward if (b, a) in backward]

In [7]:
def match_all_series(sets_series,mink):
    """Match every snapshot in ``sets_series`` against the one that follows it.

    Returns a list with ``len(sets_series) - 1`` entries (empty when there are
    fewer than two snapshots); entry ``i`` is ``match(sets_series[i],
    sets_series[i+1], mink)``.
    """
    return [
        match(sets_series[step], sets_series[step + 1], mink)
        for step in range(len(sets_series) - 1)
    ]

In [8]:
def get_arg(i,paths):
    """Return the position of the first entry of ``paths`` whose first item equals ``i``.

    Returns ``-1`` when no entry matches.
    """
    return next((pos for pos, step in enumerate(paths) if step[0] == i), -1)

def get_paths_graph(paths,nv=100):
    """Build a directed graph linking matched clusters of consecutive snapshots.

    One vertex named ``'<layer>_<label>'`` is created for every layer in
    ``0..len(paths)`` and every label in ``0..nv-1``. Starting from each
    (layer, label) combination the matches in ``paths`` are followed forward
    layer by layer, adding an edge for every step taken. Walks started at
    different layers retrace the same steps, so the duplicated edges are
    collapsed with ``simplify()`` before the graph is returned.

    NOTE(review): walks only start from the integer labels ``0..nv-1`` --
    presumably ``nv`` is meant to equal the number of clusters per snapshot;
    confirm against the caller.
    """
    n_steps = len(paths)

    graph = Graph(directed=True)
    vertex_names = [
        str(layer) + '_' + str(label)
        for layer in range(n_steps + 1)
        for label in range(nv)
    ]
    graph.add_vertices(len(vertex_names))
    graph.vs['name'] = vertex_names

    for start_layer in range(n_steps):
        for start_label in range(nv):
            layer, label = start_layer, start_label
            # Follow the chain of matches until it breaks or the last layer is reached.
            while layer < n_steps:
                pos = get_arg(label, paths[layer])
                if pos < 0:
                    break
                step = paths[layer][pos]
                graph.add_edge(str(layer) + '_' + str(step[0]),
                               str(layer + 1) + '_' + str(step[1]))
                label = step[1]
                layer += 1
    graph.simplify()
    return graph

In [9]:
def get_paths(paths_graph,mink):
    """List the vertex names of every sufficiently large connected component.

    The components are computed on an undirected COPY of ``paths_graph`` so
    the caller's graph is left untouched (the previous implementation
    converted the argument to undirected in place).

    Parameters
    ----------
    paths_graph : igraph.Graph
        Graph whose vertices carry a ``'name'`` attribute.
    mink : int
        Minimum number of vertices a component must have to be reported
        (forwarded as ``minelements`` to ``decompose``).

    Returns
    -------
    list of lists of vertex names, one inner list per component.
    """
    undirected = paths_graph.as_undirected()
    return [
        comp.vs['name']
        for comp in undirected.decompose(mode=WEAK, minelements=mink)
    ]

In [10]:
def get_groups(valid_paths,sets,base_year=1990):
    """Record, for every member, which path it belongs to in which year.

    Parameters
    ----------
    valid_paths : list of lists of str
        Vertex names of the form ``'<layer>_<cluster>'`` (both parts must be
        parseable as integers), one inner list per path.
    sets : sequence of dict
        ``sets[layer][cluster]`` is the collection of members of that cluster.
    base_year : int, optional
        Year assigned to layer 0; layer ``i`` is reported as ``base_year + i``.
        Defaults to 1990, the value that used to be hard-coded.

    Returns
    -------
    dict mapping each member to a list of ``(year, path_index)`` tuples, in
    the order the paths and their vertices are visited.
    """
    groups_by_pac = dict()
    for path_id, vertex_names in enumerate(valid_paths):
        for vertex_name in vertex_names:
            parts = vertex_name.split('_')
            idx = int(parts[0])
            c = int(parts[1])
            for pac in sets[idx][c]:
                groups_by_pac.setdefault(pac, []).append((idx + base_year, path_id))
    return groups_by_pac

def get_transitions(valid_paths,hists,first_year=1990,last_year=2010):
    """Count how many members move from one path to another between consecutive years.

    Parameters
    ----------
    valid_paths : sequence
        Only its length ``n`` is used: path indices range over ``0..n-1``.
    hists : dict
        Maps each member to an iterable of ``(year, path_index)`` tuples.
    first_year, last_year : int, optional
        Source years ``first_year .. last_year-1`` are tracked, each paired
        with the following year. Defaults (1990, 2010) are the values that
        used to be hard-coded.

    Returns
    -------
    dict keyed by ``((year, j), (year + 1, k))`` for every tracked year and
    every pair of path indices, holding the number of members whose sorted
    history contains those two entries next to each other. Keys with no
    such member stay at 0; history steps that do not correspond to a key
    (e.g. a gap of more than one year) are ignored.
    """
    n = len(valid_paths)
    transitions = {
        ((year, j), (year + 1, k)): 0
        for year in range(first_year, last_year)
        for j in range(n)
        for k in range(n)
    }
    for hist in hists.values():
        ordered = sorted(hist)
        # Pair every history entry with its successor.
        for key in zip(ordered, ordered[1:]):
            if key in transitions:
                transitions[key] += 1
    return transitions

def get_dataframe(transitions):
    """Turn the non-zero transition counts into a floweaver ``Dataset``.

    Every entry of ``transitions`` with a positive count becomes one row with
    columns ``source`` and ``target`` (string forms of the two halves of the
    key), ``value`` (the count) and ``color`` (second item of the source half).

    Returns ``(dataset, source, target)`` where ``source`` and ``target`` are
    the lists of strings used for the corresponding columns.
    """
    kept = [(key, count) for key, count in transitions.items() if count > 0]

    source = [str(key[0]) for key, _ in kept]
    target = [str(key[1]) for key, _ in kept]
    value = [count for _, count in kept]
    colors = [key[0][1] for key, _ in kept]

    frame = pd.DataFrame(
        data={'source': source, 'target': target, 'value': value, 'color': colors}
    )
    return Dataset(frame), source, target

def get_clusters_idx(year,source,target):
    """Collect the distinct entries of ``source`` and ``target`` that contain ``year``.

    ``year`` is tested with the ``in`` operator against each entry (a
    substring test when both are strings). Returns the matches as a sorted
    list without duplicates.
    """
    from_source = {s for s in source if year in s}
    from_target = {t for t in target if year in t}
    return sorted(from_source | from_target)

In [11]:
def pre_plot(series_sets,bows1,names1,years,minsize=1000,minlen=5):
    """Build the floweaver Sankey definition and dataset for a series of clusterings.

    The snapshots are matched pairwise, the matches are chained into paths,
    and the flows of members between paths from one year to the next are
    counted. The number of paths found is printed.

    Parameters
    ----------
    series_sets : sequence of dict
        One mapping label -> set of members per snapshot. The function now
        reads this argument; it previously read the notebook-level global
        ``sets_series`` and only worked while that global held the same value.
    bows1, names1 :
        Not used by this function; kept so existing callers keep working.
    years : iterable
        Years to show; each becomes one column of the diagram (as ``str(year)``).
    minsize : int, optional
        Minimum set size forwarded to ``match_all_series``.
    minlen : int, optional
        Minimum number of vertices a path must have, forwarded to ``get_paths``.

    Returns
    -------
    ``(sdd, dataframe)``: the ``SankeyDefinition`` and the ``Dataset`` to pass
    to ``weave``.
    """
    paths_sets = match_all_series(series_sets,minsize)
    g_pacs_paths = get_paths_graph(paths_sets)
    valid_paths = get_paths(g_pacs_paths,minlen)
    print(len(valid_paths))

    hists = get_groups(valid_paths,series_sets)
    transitions = get_transitions(valid_paths,hists)
    dataframe,source,target = get_dataframe(transitions)

    nodes = dict()
    ordering = []
    for y in years:
        y = str(y)
        clusters = get_clusters_idx(y,source,target)
        nodes[y] = ProcessGroup(clusters)
        nodes[y].partition = Partition.Simple('process',clusters)
        ordering.append([y])

    # One bundle between every pair of consecutive year columns.
    bundles = [Bundle(ordering[i][0],ordering[i+1][0]) for i in range(len(ordering)-1)]
    sdd = SankeyDefinition(nodes, bundles, ordering,flow_partition=dataframe.partition('color'))

    return sdd,dataframe

In [31]:
# Load every matching collaboration network (sorted by file name) and collect,
# per file, its 'bow' and 'name' vertex attributes.
files = sorted(glob.glob('colabs/wbasic/*selected*pac5.xnet'))

bows = []
names = []
for f in files:
    print(f)
    colab_net = xnet.xnet2igraph(f)
    bows.append(colab_net.vs['bow'])
    names.append(colab_net.vs['name'])

colabs/wbasic/colab_1990_1993.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1991_1994.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1992_1995.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1993_1996.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1994_1997.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1995_1998.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1996_1999.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1997_2000.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1998_2001.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_1999_2002.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2000_2003.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2001_2004.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2002_2005.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2003_2006.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2004_2007.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2005_2008.xnet_0.5_selected_wb_pac5.xnet
colabs/wbasic/colab_2006

In [32]:
# Parse the serialized 'bow' entries of every network into Python values.
# NOTE(review): `bows` is rebound to its own parsed version, so this cell is
# not idempotent -- running it twice without re-running the load cell hands
# already-parsed values to ast.literal_eval (via to_list).
bows = bows_to_list(bows)
# bows1 = bows[:10]
# bows2 = bows[10:]
# names1 = names[:10]
# names2 = names[10:]

In [33]:
# Cluster every time window except the last two, then build and save the Sankey diagram.
years = list(range(1990, 1990 + len(names)))[:-2]

sets_series = []
for bow, name in zip(bows[:-2], names[:-2]):
    labels = clustering(bow, name)
    sets = to_sets(name, labels)
    sets_series.append(sets)

sdd, dataframe = pre_plot(sets_series, bows[:-2], names[:-2], years)

filename = 'authors_by_pac_vecs.png'
size = {'width': 1800, 'height': 800}
(
    weave(sdd, dataframe, palette='Paired_12')
    .to_widget(**size)
    .auto_save_png(filename)
)

18


SankeyWidget(groups=[{'id': '1990', 'type': 'process', 'title': '', 'nodes': ['1990^(1990, 0)', '1990^(1990, 1…

In [34]:
# years = [1990+i for i in range(len(names))][10:15]
# sets_series = []
# for bow,name in zip(bows2[:5],names2[:5]):
#     labels = clustering(bow,name)
#     sets = to_sets(name,labels)
#     sets_series.append(sets)
# filename = 'authors_pacs_part2.png'
# size = dict(width=1800, height=800)
# sdd,dataframe = pre_plot(sets_series,bows2[:5],names2[:5],years) 
# weave(sdd, dataframe, palette='Paired_12').to_widget(**size).auto_save_png(filename)

In [13]:
# Load every matching "_cluster" collaboration network (sorted by file name)
# and collect, per file, its 'reduced_bow' and 'name' vertex attributes.
files = sorted(glob.glob('colabs/wbasic/*selected*pac5_cluster.xnet'))

bows = []
names = []
for f in files:
    print(f)
    colab_net = xnet.xnet2igraph(f)
    bows.append(colab_net.vs['reduced_bow'])
    names.append(colab_net.vs['name'])

# Parse the serialized entries, then split the series into the first ten
# windows and the remainder.
bows = bows_to_list(bows)
bows_1 = bows[:10]
bows_2 = bows[10:]
names_1 = names[:10]
names_2 = names[10:]

colabs/wbasic/colab_1990_1993.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1991_1994.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1992_1995.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1993_1996.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1994_1997.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1995_1998.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1996_1999.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1997_2000.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1998_2001.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_1999_2002.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_2000_2003.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_2001_2004.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_2002_2005.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_2003_2006.xnet_0.5_selected_wb_pac5_cluster.xnet
colabs/wbasic/colab_2004_2007.xnet

In [None]:
# Cluster the first ten time windows, then build and save the Sankey diagram.
years = list(range(1990, 1990 + len(names)))[:10]

sets_series = []
for bow, name in zip(bows_1, names_1):
    labels = clustering(bow, name)
    sets = to_sets(name, labels)
    sets_series.append(sets)

sdd, dataframe = pre_plot(sets_series, bows_1, names_1, years)

filename = 'authors_pacs_by_cluster_part1.png'
size = {'width': 1800, 'height': 800}
(
    weave(sdd, dataframe, palette='Paired_12')
    .to_widget(**size)
    .auto_save_png(filename)
)

In [30]:
# Cluster every time window except the last two (minimum set size 1000),
# then build and save the Sankey diagram.
years = list(range(1990, 1990 + len(names)))[:-2]

sets_series = []
for bow, name in zip(bows[:-2], names[:-2]):
    labels = clustering(bow, name)
    sets = to_sets(name, labels)
    sets_series.append(sets)

sdd, dataframe = pre_plot(sets_series, bows[:-2], names[:-2], years, 1000)

filename = 'authors_pacs_by_cluster.png'
size = {'width': 2000, 'height': 800}
(
    weave(sdd, dataframe, palette='Paired_12')
    .to_widget(**size)
    .auto_save_png(filename)
)

18


SankeyWidget(groups=[{'id': '1990', 'type': 'process', 'title': '', 'nodes': ['1990^(1990, 0)', '1990^(1990, 1…

In [13]:
# Load every matching PAC network (sorted by file name), keeping the graph
# itself plus its 'name' and 'community' vertex attributes.
files = sorted(glob.glob('pacs/pac_net_*_w1_*.xnet'))

names, communities, nets = [], [], []
for f in files:
    print(f)
    pac_net = xnet.xnet2igraph(f)
    nets.append(pac_net)
    names.append(pac_net.vs['name'])
    communities.append(list(pac_net.vs['community']))

pacs/pac_net_1990_w1_infomap.xnet
pacs/pac_net_1991_w1_infomap.xnet
pacs/pac_net_1992_w1_infomap.xnet
pacs/pac_net_1993_w1_infomap.xnet
pacs/pac_net_1994_w1_infomap.xnet
pacs/pac_net_1995_w1_infomap.xnet
pacs/pac_net_1996_w1_infomap.xnet
pacs/pac_net_1997_w1_infomap.xnet
pacs/pac_net_1998_w1_infomap.xnet
pacs/pac_net_1999_w1_infomap.xnet
pacs/pac_net_2000_w1_infomap.xnet
pacs/pac_net_2001_w1_infomap.xnet
pacs/pac_net_2002_w1_infomap.xnet
pacs/pac_net_2003_w1_infomap.xnet
pacs/pac_net_2004_w1_infomap.xnet
pacs/pac_net_2005_w1_infomap.xnet
pacs/pac_net_2006_w1_infomap.xnet
pacs/pac_net_2007_w1_infomap.xnet
pacs/pac_net_2008_w1_infomap.xnet
pacs/pac_net_2009_w1_infomap.xnet
pacs/pac_net_2010_w1_infomap.xnet


In [18]:
# For every network, show its distinct community labels and how often each occurs.
for comms in communities:
    unique_labels, counts = np.unique(comms, return_counts=True)
    print((unique_labels, counts))

(array(['1:1', '1:10', '1:11', '1:12', '1:13', '1:14', '1:15', '1:16',
       '1:17', '1:18', '1:19', '1:2', '1:20', '1:21', '1:22', '1:23',
       '1:24', '1:25', '1:26', '1:27', '1:28', '1:29', '1:3', '1:30',
       '1:31', '1:32', '1:33', '1:34', '1:35', '1:36', '1:37', '1:38',
       '1:39', '1:4', '1:40', '1:41', '1:42', '1:43', '1:44', '1:45',
       '1:46', '1:47', '1:48', '1:5', '1:6', '1:7', '1:8', '1:9', '2:1',
       '2:2', '2:3', '2:4', '2:5', '3:1', '3:2', '3:3', '3:4', '3:5',
       '3:6', '4:1', '4:2', '4:3', '4:4', '5:1', '6:1', '7:1', '8:1'],
      dtype='<U4'), array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1]))
(array(['1:1', '1:10', '1:11', '1:12', '1:13', '1:14', '1:15', '1:16',
       '1:17', '1:18', '1:19', '1:2', '1:20', '1:21', '1:22', '1:23',
       '1:24', '1:25', '1:26', '1:27', '1:

In [29]:
# Group the PAC codes of every network by community, then build and save the Sankey diagram.
# NOTE(review): the saved run of this cell printed 0 paths. get_paths_graph only
# looks up integer labels 0..nv-1, so confirm the 'community' labels are integers.
years = list(range(1990, 1990 + len(names)))

sets_series = []
for comm, name in zip(communities, names):
    sets = to_sets(name, comm)
    sets_series.append(sets)

sdd, dataframe = pre_plot(sets_series, communities, names, years, minsize=2, minlen=2)

filename = 'pac_clusters.png'
size = {'width': 1800, 'height': 800}
(
    weave(sdd, dataframe, palette='Paired_12')
    .to_widget(**size)
    .auto_save_png(filename)
)

0


SankeyWidget(layout=Layout(height='800', width='1800'), margins={'top': 25, 'bottom': 10, 'left': 130, 'right'…

In [87]:
# Two-digit PAC code -> title shown when printing cluster contents.
# Entries whose title consists only of question marks are placeholders.
pacs_names = {
    '01': 'Communication, education, history, and philosophy',
    '02': 'Mathematical methods in physics',
    '03': 'Quantum mechanics, field theories, and special relativity',
    '04': 'General relativity and gravitation',
    '05': 'Statistical physics, thermodynamics, and nonlinear dynamical systems',
    '06': 'Metrology, measurements, and laboratory procedures',
    '07': 'Instruments, apparatus, and components common to several branches of physics and astronomy',
    '11': 'General theory of fields and particles',
    '12': 'Specific theories and interaction models; particle systematics',
    '13': 'Specific reactions and phenomenology',
    '14': 'Properties of specific particles',
    '21': 'Nuclear structure',
    '22': '?????',
    '23': 'Radioactive decay and in-beam spectroscopy',
    '24': 'Nuclear reactions: general',
    '25': 'Nuclear reactions: specific reactions',
    '26': 'Nuclear astrophysics',
    '27': 'Properties of specific nuclei listed by mass ranges',
    '28': 'Nuclear engineering and nuclear power studies',
    '29': 'Experimental methods and instrumentation for elementary-particle and nuclear physics',
    '31': 'Electronic structure of atoms and molecules: theory',
    '32': 'Atomic properties and interactions with photons',
    '33': 'Molecular properties and interactions with photons',
    '34': 'Atomic and molecular collision processes and interactions',
    '35': '???????',
    '36': 'Exotic atoms and molecules; macromolecules; clusters',
    '37': 'Mechanical control of atoms, molecules, and ions',
    '41': 'Electromagnetism; electron and ion optics',
    '42': 'Optics',
    '43': 'Acoustics',
    '44': 'Heat transfer',
    '45': 'Classical mechanics of discrete systems',
    '46': 'Continuum mechanics of solids',
    '47': 'Fluid dynamics',
    '51': 'Physics of gases',
    '52': 'Physics of plasmas and electric discharges',
    '61': 'Structure of solids and liquids; crystallography',
    '62': 'Mechanical and acoustical properties of condensed matter',
    '63': 'Lattice dynamics',
    '64': 'Equations of state, phase equilibria, and phase transitions',
    '65': 'Thermal properties of condensed matter',
    '66': 'Nonelectronic transport properties of condensed matter',
    '67': 'Quantum fluids and solids',
    '68': 'Surfaces and interfaces; thin films and nanosystems (structure and nonelectronic properties)',
    '71': 'Electronic structure of bulk materials',
    '72': 'Electronic transport in condensed matter',
    '73': 'Electronic structure and electrical properties of surfaces, interfaces, thin films, and low-dimensional structures',
    '74': 'Superconductivity',
    '75': 'Magnetic properties and materials',
    '76': 'Magnetic resonances and relaxations in condensed matter, Mössbauer effect',
    '77': 'Dielectrics, piezoelectrics, and ferroelectrics and their properties',
    '78': 'Optical properties, condensed-matter spectroscopy and other interactions of radiation and particles with condensed matter',
    '79': 'Electron and ion emission by liquids and solids; impact phenomena',
    '81': 'Materials science',
    '82': 'Physical chemistry and chemical physics',
    '83': 'Rheology',
    '84': 'Electronics; radiowave and microwave technology; direct energy conversion and storage',
    '85': 'Electronic and magnetic devices; microelectronics',
    '86': '??????',
    '87': 'Biological and medical physics',
    '88': 'Renewable energy resources and applications',
    '89': 'Other areas of applied and interdisciplinary physics',
    '91': 'Solid Earth physics',
    '92': 'Hydrospheric and atmospheric geophysics',
    '93': 'Geophysical observations, instrumentation, and techniques',
    '94': 'Physics of the ionosphere and magnetosphere',
    '95': 'Fundamental astronomy and astrophysics; instrumentation, techniques, and astronomical observations',
    '96': 'Solar system; planetology',
    '97': 'Stars',
    '98': 'Stellar systems; interstellar medium; galactic and extragalactic objects and systems; the Universe',
}

In [74]:
def print_elements_in_clusters(sets,minsize=2,minlen=2):
    """Print the most frequent two-character PAC prefixes of every tracked path.

    The snapshots in ``sets`` are matched and chained into paths; for each
    path the members of all its clusters are pooled, reduced to their first
    two characters and counted. The path index is printed followed by up to
    ten lines ``<prefix> <title from pacs_names> <count>``, most frequent first.

    Parameters
    ----------
    sets : sequence of dict
        ``sets[layer][cluster]`` is the collection of members of that cluster.
        The matching step now uses this argument too; it previously used the
        notebook-level global ``sets_series`` while reading members from ``sets``.
    minsize : int, optional
        Minimum set size forwarded to ``match_all_series``.
    minlen : int, optional
        Minimum number of vertices a path must have, forwarded to ``get_paths``.
    """
    paths_sets = match_all_series(sets,minsize)
    g_pacs_paths = get_paths_graph(paths_sets)
    valid_paths = get_paths(g_pacs_paths,minlen)
    for c,ps in enumerate(valid_paths):
        pacs_in_cluster = []
        print(c)
        for p in ps:
            # Vertex names have the form '<layer>_<cluster>'.
            p = p.split('_')
            idx = int(p[0])
            comm = int(p[1])
            pacs_in_cluster += list(sets[idx][comm])
        pacs = [p[:2] for p in pacs_in_cluster]
        us,cs = np.unique(pacs,return_counts=True)
        # Negating the counts sorts them in descending order; keep the top ten.
        idxs_sorted = np.argsort(-cs)[:10]
        for i in idxs_sorted:
            print(us[i],pacs_names[us[i]],cs[i])
        print()

In [75]:
def plot_graphs(nets):
    """Draw every graph in ``nets`` to its own PDF file.

    Graphs are numbered consecutively starting at 1990; graph number ``y`` is
    written to ``graphs/3lvls_infomap_<y>.pdf``. Before plotting, the vertex
    and edge counts are printed and each vertex's ``label`` attribute is set
    to its ``name``.
    """
    for y, g in enumerate(nets, 1990):
        print(g.vcount(), g.ecount())
        g.vs["label"] = g.vs["name"]
        target = "graphs/3lvls_infomap_" + str(y) + ".pdf"
        plot(g, target, bbox = (300, 300), margin = 20)

In [88]:
# Print, for each tracked path, its most frequent two-character PAC prefixes (up to ten).
# NOTE(review): `sets_series` is whatever the most recently executed cell left
# behind -- confirm it holds the PAC community sets before running this.
print_elements_in_clusters(sets_series)

0
05 Statistical physics, thermodynamics, and nonlinear dynamical systems 21
61 Structure of solids and liquids; crystallography 21
71 Electronic structure of bulk materials 21
73 Electronic structure and electrical properties of surfaces, interfaces, thin films, and low-dimensional structures 21
75 Magnetic properties and materials 21
78 Optical properties, condensed-matter spectroscopy and other interactions of radiation and particles with condensed matter 21
42 Optics 18
68 Surfaces and interfaces; thin films and nanosystems (structure and nonelectronic properties) 16
03 Quantum mechanics, field theories, and special relativity 13
64 Equations of state, phase equilibria, and phase transitions 9

1
11 General theory of fields and particles 21
12 Specific theories and interaction models; particle systematics 21
13 Specific reactions and phenomenology 21
14 Properties of specific particles 21
96 Solar system; planetology 10
04 General relativity and gravitation 5
95 Fundamental astrono

In [35]:
# Write one PDF per loaded PAC network under graphs/ and print each network's
# vertex and edge counts.
plot_graphs(nets)

420 10015
424 10730
429 11185
433 11420
432 11690
438 11961
454 12402
454 13113
463 13523
460 13763
449 13778
476 13808
535 14195
536 14314
539 14586
536 15148
450 15057
449 15084
439 13329
431 11268
421 8222
