In [4]:
!pip install nltk
!pip install Cython

from os.path import join as PJ
import struct
import os
import numpy as np
import operator
import gensim
import ujson
import igraph as ig
import xnetwork as xn

import glob
import xnetwork as xn
from tqdm.auto import tqdm
from os.path import join as PJ

import matplotlib.pyplot as plt 



In [11]:
from scipy import integrate
from itertools import combinations
from collections import defaultdict

In [2]:
%load_ext Cython

In [3]:
%%cython

def calculateOverlapSimilarity(list journalAuthors):
    cdef list edges = [];
    cdef list weights = [];
    cdef float minWeight = 0.01;
    cdef list fromAuthors;
    cdef long fromSize;
    cdef list toAuthors;
    cdef long toSize;
    cdef long intersectionCount;
    cdef double overlapSimilarity;
    cdef long fromAuthorIndex;
    cdef long toAuthorIndex;
    cdef long fromAuthorID;
    cdef long  toAuthorID;
    for fromJournalIndex in range(len(journalAuthors)):
        if(fromJournalIndex%1000==0):
            print("%d/%d (%d)                      "%(fromJournalIndex,len(journalAuthors),len(edges)),end="\r",flush=True);
        fromAuthors = journalAuthors[fromJournalIndex];
        fromSize = len(fromAuthors);
        if(fromSize>0):
            for toJournalIndex in range(fromJournalIndex+1,len(journalAuthors)):
                toAuthors = journalAuthors[toJournalIndex];
                toSize = len(toAuthors);
                if(toSize>0):
                    fromAuthorIndex = 0;
                    toAuthorIndex = 0;
                    intersectionCount = 0;
                    while fromAuthorIndex<fromSize and toAuthorIndex<toSize:
                        fromAuthorID = fromAuthors[fromAuthorIndex];
                        toAuthorID = toAuthors[toAuthorIndex];
                        if(fromAuthorID==toAuthorID):
                            intersectionCount += 1;
                            fromAuthorIndex+=1;
                            toAuthorIndex+=1;
                        elif(fromAuthorID>toAuthorID):
                            toAuthorIndex+=1;
                        else:
                            fromAuthorIndex+=1;
                    overlapSimilarity = intersectionCount/min(fromSize,toSize);
                    if(overlapSimilarity>minWeight):
                        edges.append((fromJournalIndex,toJournalIndex));
                        weights.append(overlapSimilarity);
    return (edges,weights);

def calculateIntersection(list journalAuthors):
    cdef list edges = [];
    cdef list weights = [];
    cdef float minWeight = 0;
    cdef list fromAuthors;
    cdef long fromSize;
    cdef list toAuthors;
    cdef long toSize;
    cdef long intersectionCount;
    cdef double overlapSimilarity;
    cdef long fromAuthorIndex;
    cdef long toAuthorIndex;
    cdef long fromAuthorID;
    cdef long  toAuthorID;
    for fromJournalIndex in range(len(journalAuthors)):
        if(fromJournalIndex%1000==0):
            print("%d/%d (%d)                      "%(fromJournalIndex,len(journalAuthors),len(edges)),end="\r",flush=True);
        fromAuthors = journalAuthors[fromJournalIndex];
        fromSize = len(fromAuthors);
        if(fromSize>0):
            for toJournalIndex in range(fromJournalIndex+1,len(journalAuthors)):
                toAuthors = journalAuthors[toJournalIndex];
                toSize = len(toAuthors);
                if(toSize>0):
                    fromAuthorIndex = 0;
                    toAuthorIndex = 0;
                    intersectionCount = 0;
                    while fromAuthorIndex<fromSize and toAuthorIndex<toSize:
                        fromAuthorID = fromAuthors[fromAuthorIndex];
                        toAuthorID = toAuthors[toAuthorIndex];
                        if(fromAuthorID==toAuthorID):
                            intersectionCount += 1;
                            fromAuthorIndex+=1;
                            toAuthorIndex+=1;
                        elif(fromAuthorID>toAuthorID):
                            toAuthorIndex+=1;
                        else:
                            fromAuthorIndex+=1;
                    overlapSimilarity = intersectionCount;
                    if(overlapSimilarity>minWeight):
                        edges.append((fromJournalIndex,toJournalIndex));
                        weights.append(overlapSimilarity);
    return (edges,weights);


def calculateCosineSimilarity(list journalAuthors):
    cdef list edges = [];
    cdef list weights = [];
    cdef float minWeight = 0.0;
    cdef list fromAuthors;
    cdef long fromSize;
    cdef list toAuthors;
    cdef long toSize;
    cdef long intersectionCount;
    cdef double overlapSimilarity;
    cdef long fromAuthorIndex;
    cdef long toAuthorIndex;
    cdef long fromAuthorID;
    cdef long  toAuthorID;
    for fromJournalIndex in range(len(journalAuthors)):
        if(fromJournalIndex%1000==0):
            print("%d/%d (%d)                      "%(fromJournalIndex,len(journalAuthors),len(edges)),end="\r",flush=True);
        fromAuthors = journalAuthors[fromJournalIndex];
        fromSize = len(fromAuthors);
        if(fromSize>0):
            for toJournalIndex in range(fromJournalIndex+1,len(journalAuthors)):
                toAuthors = journalAuthors[toJournalIndex];
                toSize = len(toAuthors);
                if(toSize>0):
                    fromAuthorIndex = 0;
                    toAuthorIndex = 0;
                    intersectionCount = 0;
                    while fromAuthorIndex<fromSize and toAuthorIndex<toSize:
                        fromAuthorID = fromAuthors[fromAuthorIndex];
                        toAuthorID = toAuthors[toAuthorIndex];
                        if(fromAuthorID==toAuthorID):
                            intersectionCount += 1;
                            fromAuthorIndex+=1;
                            toAuthorIndex+=1;
                        elif(fromAuthorID>toAuthorID):
                            toAuthorIndex+=1;
                        else:
                            fromAuthorIndex+=1;
                    overlapSimilarity = intersectionCount/(fromSize*toSize);
                    if(overlapSimilarity>minWeight):
                        edges.append((fromJournalIndex,toJournalIndex));
                        weights.append(overlapSimilarity);
    return (edges,weights);

In [None]:
g = xn.xnet2igraph('citation_net.xnet')
print(g.vcount(), g.ecount())

for journal in set(g.vs['journal']):
    print(journal)
    vertex_seq = g.vs.select(journal_eq=journal)

    citationsList = [r.split(',') for r in vertex_seq['refs']]
    
    print('begin conv')
    wos_id = set()
    for cit_list in citationsList:
        wos_id |= set(cit_list)
    
    wos_map = dict()
    for i,w in enumerate(wos_id):
        wos_map[w] = i
    
    cit_list_id = []
    for cit_list in citationsList:
        c_id = []
        for r in cit_list:
            c_id.append(wos_map[r])
        cit_list_id.append(c_id)
    
    print('end conv')
    
    citationsList = cit_list_id
    
    edges, weights = calculateCosineSimilarity(citationsList)
    couplingNetwork = ig.Graph(n=len(citationsList), edges=edges, directed=False, 
                               edge_attrs={"weight":weights})
    
    couplingNetwork.vs['name'] = vertex_seq['name']
    couplingNetwork.vs['year'] = vertex_seq['year']
    couplingNetwork.vs['refs'] = vertex_seq['refs']
    couplingNetwork.vs['title'] = vertex_seq['title']
    couplingNetwork.vs['journal'] = vertex_seq['journal']
    couplingNetwork.vs['abstract'] = vertex_seq['abstract']
    
    xn.igraph2xnet(couplingNetwork, 'coref_cos_net_%s.xnet' % journal);

120475 606630
journal of materials chemistry
begin conv
end conv
nature nanotechnology                    
begin conv
end conv
acs nano4 (40406)                      
begin conv
end conv
acs applied materials & interfaces        
begin conv
end conv
nature materials361)                      
begin conv
end conv
langmuir2 (278630)                      
begin conv


In [5]:
import glob
files = glob.glob('coref_cos_net_*.xnet')
print(files)

for file in files:
    g = xn.xnet2igraph(file)
    text = []
    for v in g.vs:
        text.append(v['title'] + '\n' + v['abstract'])
    g.vs['text'] = text
    xn.igraph2xnet(g, file.replace('.xnet', '_text.xnet'))

['coref_cos_net_acs applied materials & interfaces.xnet', 'coref_cos_net_acs nano.xnet', 'coref_cos_net_journal of materials chemistry.xnet', 'coref_cos_net_langmuir.xnet', 'coref_cos_net_nano letters.xnet', 'coref_cos_net_nature materials.xnet', 'coref_cos_net_nature nanotechnology.xnet']


In [26]:
from igraph import *

# source: https://github.com/aekpalakorn/python-backbone-network/blob/master/backbone.py
def disparity_filter(g):
	total_vtx = g.vcount()
	g.es['alpha_ij'] = 1

	for v in range(total_vtx):
		edges = g.incident(v)

		k = len(edges)
		if k > 1:
			sum_w = sum([g.es[e]['weight'] for e in edges])
			for e in edges:
				w = g.es[e]['weight']
				p_ij = w/sum_w
				alpha_ij = 1 - (k-1) * integrate.quad(lambda x: (1-x)**(k-2), 0, p_ij)[0]
				g.es[e]['alpha_ij'] = min(alpha_ij,g.es[e]['alpha_ij'])

def alpha_cut(alpha,g):
    g_copy = g.copy()
    to_delete = g_copy.es.select(alpha_ij_ge=alpha)
    g_copy.delete_edges(to_delete)
    return g_copy

def get_largest_component_size(g):
    components = g.components()
    giant = components.giant()
    return giant.vcount()

def get_best_cut(net,a_min,a_max):
    vcount = net.components().giant().vcount()
    possible_edges = vcount*(vcount-1)/2
    p001_edges = 0.001*possible_edges
    
    for c in reversed(np.linspace(a_min, a_max, 20)):
        cuted_net = alpha_cut(c, net)
        print(c, p001_edges, cuted_net.ecount())
        if cuted_net.ecount() <= p001_edges:
            return cuted_net
    
    return net
    
def apply_backbone(net,a_min,a_max,preserve=0.8):
    disparity_filter(net)
    best = get_best_cut(net,a_min,a_max)
    return best

In [None]:
files = glob.glob('*_Comm.xnet')
for file in files: 
    g = xn.xnet2igraph(file)
    g_filtered = apply_backbone(g, 0.0001, 1)
    xn.igraph2xnet(g_filtered, file.replace('.xnet', '_filtered_001p.xnet'))