In [1]:
import xnet
import json
import glob

import numpy as np
import matplotlib.pyplot as plt

from igraph import *
from scipy.stats import pearsonr
from collections import defaultdict
from util import get_attr_pacs,get_pac_list

In [2]:
attr_pacs = get_attr_pacs()
pac_list = get_pac_list()

In [3]:
def save(data,filename):
	with open(filename, 'w') as f:
		json.dump(data, f)

def load(filename):
	data = None
	with open(filename, 'r') as f:
		data = json.load(f)
	return data

In [36]:
def get_pac_comm(pac_nets,paper,data,get_pacs):
    p_pacs = get_pacs(paper,data)
    p_comms = []
    for pacs,n_pacs in p_pacs:
        comms = []
        for pac in pacs:
            comms.append(pac_nets.vs.find(name=pac)['community'])
        p_comms.append((comms,n_pacs))
    return p_comms

def get_pacs_paper_published(paper,data):
	pacs = []
	for pac_code in attr_pacs:
		pac = paper[pac_code][:2]
		if pac in pac_list:
			pacs.append(pac)
	return [(pacs,len(pacs))]

def get_pacs_paper_cited_by(paper,data):
	pacs = []
	p_neighbors = data.neighbors(paper,mode=IN)
	for idx in p_neighbors:
		neighbor = data.vs[idx]

		pacs += get_pacs_paper_published(neighbor,data) 
	return pacs

def get_pacs_paper_citing(paper,data):
    pacs = []
    p_neighbors = data.neighbors(paper,mode=OUT)
    for idx in p_neighbors:
        neighbor = data.vs[idx]

        pacs += get_pacs_paper_published(neighbor,data) 
    return pacs

In [6]:
def norm(history,y):
	for author in history.keys():
		comms_freq = history[author][y]
		total_papers = sum(comms_freq.values())
		for comm in comms_freq.keys():
			comms_freq[comm] /= total_papers
		history[author][y] = comms_freq
	return history

'''
{
	"author_idx":
	{
		"1990":
		{
		"0":0.9
		"1":0.1
		}
	}

}

'''

def get_pac_comm_freq(data,pac_nets,get_papers,filename):

	history = defaultdict(lambda:defaultdict(lambda:defaultdict(lambda:0)))

	delta = 3
	year_begin = 1990
	year_end = 2010

	for i,year in enumerate(range(year_begin,year_end+1)):
		print("current year %d" % year)
		subset = data.vs.select(year_ge=year,year_le=year+delta)
		subgraph = data.subgraph(subset)
		for paper in subgraph.vs:
			authors_idxs = paper['authors_idxs'].split(',')
			comms_by_paper = get_pac_comm(pac_nets[i],paper,subgraph,get_papers)
			for author in authors_idxs:
				for comms,n_comms in comms_by_paper:
					for comm in comms:
						history[author][year][comm] += 1/n_comms
		history = norm(history,year)
		save(history,filename)

	return history

In [58]:
def get_div(values):
	div = np.exp(-np.sum(values*np.log(values)))
	return div

def author_div(a_history,filename):
    X = []
    Y = []
    for year,comms_freq in a_history.items():
        if len(comms_freq) > 0:
            a_div = get_div(list(comms_freq.values()))
        else:
            a_div = 0
        X.append(year)
        Y.append(a_div)
    # plot(X,Y,filename)
    return X,Y

def authors_div(history):
	authors = dict()
	for author,a_history in history.items():
		X,Y = author_div(a_history,"temp/authors_div/author_%s_divs.pdf"%author)
		authors[author] = (X,Y)
	return authors

def plot(X,Y,filename):
	plt.figure(figsize=(10,2))
	plt.plot(X,Y,color='green',marker='o',ls='-')
	plt.xticks(rotation=45)
	plt.xlabel('year')
	plt.ylabel('diversity')
	plt.tight_layout()
	plt.savefig(filename)
	plt.close()

def plot_average_div(authors,filename):
	divs = defaultdict(lambda:[])
	for authors,(X,Y) in authors.items():
		for x,y in zip(X,Y):
			divs[x].append(y)

	X = []
	Y = []
	yerr = []
	for year,divs in divs.items():
		X.append(year)
		Y.append(np.mean(divs))
		yerr.append(np.std(divs))

	X = np.asarray(X)
	Y = np.asarray(Y)
	yerr = np.asarray(yerr)

	idxs = np.argsort(X)
	X = X[idxs]
	Y = Y[idxs]
	yerr = yerr[idxs]

	plt.figure(figsize=(10,3))
	plt.errorbar(X,Y,yerr=yerr,marker='o',ls='-')
	plt.xticks(rotation=45)
	plt.xlabel('year')
	plt.ylabel('diversity')
	plt.tight_layout()
	plt.savefig(filename)
	plt.close()

In [9]:
data = xnet.xnet2igraph('data/citation_network_ge1990_pacs.xnet')

filenames = sorted(glob.glob('data/pacs/2lvls/*_multilevel0.xnet'))
pac_nets = []
for filename in filenames:
	net = xnet.xnet2igraph(filename)
	pac_nets.append(net)

In [37]:
# get_pac_comm_freq(data,pac_nets,get_pacs_paper_citing,'data/authors_pac_comm_papers_citing.json')
# get_pac_comm_freq(data,pac_nets,get_pacs_paper_published,'data/authors_pac_comm_papers_published.json')
# get_pac_comm_freq(data,pac_nets,get_pacs_paper_cited_by,'data/authors_pac_comm_cited_by.json')

current year 1990
current year 1991
current year 1992
current year 1993
current year 1994
current year 1995
current year 1996
current year 1997
current year 1998
current year 1999
current year 2000
current year 2001
current year 2002
current year 2003
current year 2004
current year 2005
current year 2006
current year 2007
current year 2008
current year 2009
current year 2010


defaultdict(<function __main__.get_pac_comm_freq.<locals>.<lambda>()>,
            {'1902042639': defaultdict(<function __main__.get_pac_comm_freq.<locals>.<lambda>.<locals>.<lambda>()>,
                         {1990: defaultdict(<function __main__.get_pac_comm_freq.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>()>,
                                      {'9': 0.8571428571428572,
                                       '5': 0.14285714285714285}),
                          1991: defaultdict(<function __main__.get_pac_comm_freq.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>()>,
                                      {'10': 0.9078947368421053,
                                       '5': 0.057017543859649106,
                                       '1': 0.035087719298245605}),
                          1992: defaultdict(<function __main__.get_pac_comm_freq.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>()>,
                                      {'11': 0.9675925925925927,
 

In [59]:
history_papers_citing = load('data/authors_pac_comm_papers_citing.json')
authors_citing = authors_div(history_papers_citing)

In [61]:
history_papers_published = load('data/authors_pac_comm_papers_published.json')
authors_papers_div = authors_div(history_papers_published)

In [62]:
history_cited_by = load('data/authors_pac_comm_cited_by.json')
authors_cited_by_div = authors_div(history_cited_by)

In [53]:
plot_average_div(authors_citing,"average_div_citing.pdf")

In [31]:
def get_y(xs,X1,Y1):
    ys1 = []
    for x in xs:
        x = str(x)
        try:
            i = X1.index(x)
            v = Y1[i]
        except:
            v = 0
        ys1.append(v)
    return ys1

In [47]:
def get_corr(authors1,authors2):
    corr = []
    for author,(X1,Y1) in authors1.items():
        if author in authors_papers_div:
            X2,Y2 = authors2[author]
            minx = min(min(X1),min(X2))
            maxx = max(max(X1),max(X2))
            xs = list(range(int(minx),int(maxx)))
            ys1 = get_y(xs,X1,Y1)
            ys2 = get_y(xs,X2,Y2)
            corr.append(pearsonr(ys1,ys2)[0])

    return np.nanmean(corr),np.nanstd(corr)

In [63]:
get_corr(authors_citing,authors_papers_div)

(0.6975516426082046, 0.27986201932335963)

In [64]:
get_corr(authors_cited_by,authors_papers_div)

(0.27490941508186384, 0.5279885780284598)