In [1]:
import xnet
import json
import glob
import util

import numpy as np
import matplotlib.pyplot as plt

from igraph import *
from util import save,load
from scipy.stats import pearsonr
from collections import defaultdict
from matplotlib.ticker import MaxNLocator

In [2]:
attr_pacs = util.get_attr_pacs() # names of the vertex attributes that hold the PACS codes
pac_list = util.get_pac_list() # list of valid codes

In [3]:
# Takes the PACS lists associated with a paper (as selected by get_pacs) and
# converts them to the corresponding communities.
def get_pac_comm(pac_nets,paper,data,get_pacs):
    """Translate a paper's PACS codes into community labels.

    Parameters
    ----------
    pac_nets : graph-like
        A single PACS network (despite the plural name); it must expose
        ``vs.find(name=...)`` returning a vertex with a ``'community'`` attribute.
    paper, data :
        Passed unchanged to ``get_pacs``.
    get_pacs : callable
        ``get_pacs(paper, data)`` must return an iterable of
        ``(pacs, n_pacs)`` pairs, where ``pacs`` is an iterable of PACS names.

    Returns
    -------
    list of (list, n_pacs)
        One entry per pair returned by ``get_pacs``; each PACS name is replaced
        by the ``'community'`` attribute of the vertex carrying that name and
        ``n_pacs`` is passed through untouched.
    """
    def community_of(pac):
        # Look the code up by vertex name in the PACS network.
        return pac_nets.vs.find(name=pac)['community']

    return [
        ([community_of(pac) for pac in pacs], n_pacs)
        for pacs, n_pacs in get_pacs(paper,data)
    ]

In [4]:
# Computes the number of citations of each paper and credits that value to
# every one of its authors.
def get_num_citations(data,filename,delta=4,year_begin=1986,year_end=2006):
    """Accumulate, per author and per sliding window, the citations of their papers.

    For every ``year`` in ``[year_begin, year_end]`` the papers whose ``year``
    attribute lies in ``[year, year + delta]`` are extracted as a subgraph.
    Each paper's citation count is its number of in-neighbours *inside that
    subgraph*, and it is added to every author listed in the paper's
    comma-separated ``'authors_idxs'`` attribute under the key ``year + delta``.

    Parameters
    ----------
    data : igraph.Graph
        Citation network; vertices need ``year`` and ``authors_idxs`` attributes.
    filename : str
        Path handed to ``save``; the partial result is written after every
        processed year so an interrupted run keeps its progress.
    delta : int, optional
        Window length in years (default 4, the value previously hard-coded).
    year_begin, year_end : int, optional
        First and last window start years, inclusive (defaults 1986 and 2006,
        the values previously hard-coded).

    Returns
    -------
    defaultdict
        ``history[author][year + delta] -> accumulated citation count``.
    """
    history = defaultdict(lambda: defaultdict(int))

    for year in range(year_begin,year_end+1):
        print("current year %d" % year)
        window_end = year+delta
        subset = data.vs.select(year_ge=year,year_le=window_end)
        subgraph = data.subgraph(subset)
        for paper in subgraph.vs:
            num_citations = len(paper.neighbors(mode=IN))
            for author in paper['authors_idxs'].split(','):
                history[author][window_end] += num_citations
        # Checkpoint after each window.
        save(history,filename)

    return history

In [5]:
# Normalizes, for a given year, the community values of every author.
def norm(history,y):
    """Rescale each author's community counts for year ``y`` so they sum to 1.

    Layout of ``history`` (example that accompanied this function)::

        {
            "author_idx":
            {
                "1990":
                {
                    "0":0.9, # PACS community
                    "1":0.1
                }
            }
        }

    ``history[author][y]`` is read for every author. With the nested
    ``defaultdict`` built by ``get_pac_comm_freq`` this creates an empty entry
    for authors that have no data for ``y``; with plain dicts a missing year
    raises ``KeyError``. Empty entries are left empty (no division happens).

    The structure is modified in place and also returned.
    """
    for per_year in history.values():
        freqs = per_year[y]
        total = sum(freqs.values())
        # Build all quotients first, then write them back over the same keys.
        freqs.update({comm: count / total for comm, count in freqs.items()})
        per_year[y] = freqs
    return history

def get_pac_comm_freq(data,pac_nets,get_papers,delta,filename):
    """Build, per author and per sliding window, a normalized PACS-community profile.

    For each ``year`` from 1986 to 2006 (inclusive) the papers whose ``year``
    attribute lies in ``[year, year + delta]`` are extracted as a subgraph.
    For every paper, ``get_pac_comm`` maps the PACS lists selected by
    ``get_papers`` to communities of ``pac_nets[i]``, where ``i`` is the
    zero-based position of ``year`` in the loop. Each community found adds
    ``1 / n_comms`` to ``history[author][year + delta][community]`` for every
    author in the paper's comma-separated ``'authors_idxs'`` attribute.
    After each window the values stored under ``year + delta`` are normalized
    with ``norm`` and the whole structure is written with ``save``.

    Parameters
    ----------
    data : igraph.Graph
        Citation network with ``year`` and ``authors_idxs`` vertex attributes.
    pac_nets : sequence
        PACS networks indexed by window position (``pac_nets[0]`` is used for
        1986, and so on).
    get_papers : callable
        Forwarded to ``get_pac_comm`` as its ``get_pacs`` argument.
    delta : int
        Window length in years.
    filename : str
        Path handed to ``save`` after every window.

    Returns
    -------
    defaultdict
        ``history[author][year + delta][community] -> normalized frequency``.
    """
    history = defaultdict(lambda:defaultdict(lambda:defaultdict(lambda:0)))

    year_begin = 1986
    year_end = 2006

    for i,year in enumerate(range(year_begin,year_end+1)):
        print("current year %d" % year)
        subset = data.vs.select(year_ge=year,year_le=year+delta)
        subgraph = data.subgraph(subset)
        for paper in subgraph.vs:
            authors_idxs = paper['authors_idxs'].split(',')
            comms_by_paper = get_pac_comm(pac_nets[i],paper,subgraph,get_papers)
            for author in authors_idxs:
                for comms,n_comms in comms_by_paper:
                    # Report entries that yielded no community at all. The
                    # original tested the inner loop variable `comm`, which is
                    # unbound when the first list is empty and is a single
                    # label (not the list) otherwise.
                    if len(comms) == 0:
                        print(authors_idxs)
                        continue
                    for comm in comms:
                        history[author][year+delta][comm] += 1/n_comms
        history = norm(history,year+delta)
        # Checkpoint after each window.
        save(history,filename)

    return history

In [21]:
def get_div(values):
    """Return the diversity ``exp(-sum(p * log(p)))`` of a frequency vector.

    This is the exponential of the Shannon entropy (natural log) of ``values``.

    Parameters
    ----------
    values : array-like of float
        Frequencies; the callers in this notebook pass values that sum to 1.

    Returns
    -------
    numpy.float64
        The diversity. An empty input gives 1.0.

    Notes
    -----
    Entries equal to zero are skipped: ``0 * log(0)`` evaluates to ``nan`` in
    floating point, whereas its limit (and the entropy convention) is 0.
    """
    probs = np.asarray(values,dtype=float)
    probs = probs[probs != 0]  # zero-frequency entries contribute nothing
    return np.exp(-np.sum(probs*np.log(probs)))

def author_div(a_history,filename):
    """Compute one author's diversity for every year in their history.

    Parameters
    ----------
    a_history : dict
        ``year -> {community: frequency}``.
    filename : str
        Not used by the current code (kept for interface compatibility).

    Returns
    -------
    dict
        ``year -> diversity``. Years with an empty community mapping get 0;
        all others get ``get_div`` of the frequency values.
    """
    return {
        year: get_div(list(comms_freq.values())) if len(comms_freq) > 0 else 0
        for year, comms_freq in a_history.items()
    }

def authors_div(history):
    """Compute the per-year diversity series of every author.

    Parameters
    ----------
    history : dict
        ``author -> {year -> {community: frequency}}``.

    Returns
    -------
    dict
        ``author -> {year -> diversity}``, each inner dict produced by
        ``author_div``.
    """
    return {
        author: author_div(a_history,"temp/authors_div/author_%s_divs.pdf"%author)
        for author, a_history in history.items()
    }

# def plot(X,Y,filename):
# 	plt.figure(figsize=(10,2))
# 	plt.plot(X,Y,color='green',marker='o',ls='-')
# 	plt.xticks(rotation=45)
# 	plt.xlabel('year')
# 	plt.ylabel('diversity')
# 	plt.tight_layout()
# 	plt.savefig(filename)
# 	plt.close()

def plot_average_div(authors_divs,valid_authors,filename,title):
    """Plot the yearly mean diversity (with standard deviation) of selected authors.

    Parameters
    ----------
    authors_divs : dict
        ``author -> {year -> diversity}``.
    valid_authors : dict
        ``year -> iterable of authors`` to include for that year.
    filename : str
        Path the figure is saved to.
    title : str
        Figure title.

    Authors missing from ``authors_divs`` are skipped silently; authors present
    but lacking a value for the year are reported with ``print(year, author)``.
    Years are sorted with ``np.argsort`` before plotting.
    """
    per_year = defaultdict(list)
    for year, selected in valid_authors.items():
        for author in selected:
            if author not in authors_divs:
                continue
            series = authors_divs[author]
            if year in series:
                per_year[year].append(series[year])
            else:
                print(year,author)

    # One point per year: mean as the marker, standard deviation as the error bar.
    years = np.asarray(list(per_year.keys()))
    means = np.asarray([np.mean(values) for values in per_year.values()])
    spreads = np.asarray([np.std(values) for values in per_year.values()])

    order = np.argsort(years)
    years, means, spreads = years[order], means[order], spreads[order]

    plt.figure(figsize=(10,3))
    plt.errorbar(years,means,yerr=spreads,marker='o',ls='-')
    plt.xticks(rotation=45)
    plt.xlabel('year')
    plt.ylabel('diversity')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

In [7]:
# Load the citation network.
data = xnet.xnet2igraph('data/citation_network_ge1985_pacs.xnet')

# Load the PACS networks in sorted filename order; get_pac_comm_freq indexes
# this list by window position (pac_nets[i]).
# NOTE(review): presumably one file per window start year — confirm the
# filename ordering matches the year order.
filenames = sorted(glob.glob('data/pacs/2lvls/*_delta4_multilevel2.xnet'))
pac_nets = []
for filename in filenames:
	net = xnet.xnet2igraph(filename)
	pac_nets.append(net)

In [None]:
# Long-running: each call walks every window and writes its partial result to
# the given path after each year. The first two return values are discarded;
# later cells read the saved files back with load().
# NOTE(review): the third path has no 'data/' prefix, unlike the first two — confirm this is intended.
get_pac_comm_freq(data,pac_nets,util.get_pacs_out,4,'data/authors_pac_out.json')
get_pac_comm_freq(data,pac_nets,util.get_pacs_in,4,'data/authors_pac_in.json')
placeholder = get_num_citations(data,'authors_citations.json')

current year 1986
current year 1987
current year 1988
current year 1989
current year 1990
current year 1991
current year 1992
current year 1993


In [8]:
history_out = load('data/authors_pac_out.json')
authors_out_div = authors_div(history_out) # diversity of the cited publications (author out)

In [9]:
# history = load('data/authors_pac_comm_papers_published.json')
# authors_papers_div = authors_div(history_papers_published)

In [10]:
history_in = load('data/authors_pac_in.json')
authors_in_div = authors_div(history_in) # diversity of the publications that cite the author (author in)

In [11]:
# authors_num_in = load('num_of_citations.json') # número de citações do autor
# authors_num_out = load('num_of_refs.json') # número de referências do autor
# ranking_in = util.authors_ranking(authors_num_in) # autores com mais citações (out)
# ranking_out = util.authors_ranking(authors_num_out) # autores que mais referenciam (in)

In [12]:
def filter_min_by_year(author_hist,min_val,min_years=18):
    """Select authors with a long enough history whose values never drop below a threshold.

    Parameters
    ----------
    author_hist : dict
        ``author -> {year -> value}``.
    min_val : number
        An author is rejected if any of their yearly values is ``< min_val``.
    min_years : int, optional
        Minimum number of yearly entries an author must have (default 18, the
        value previously hard-coded).

    Returns
    -------
    list
        Authors that pass both checks, in the iteration order of ``author_hist``.
    """
    return [
        author
        for author, a_hist in author_hist.items()
        # `not any(val < min_val)` rather than `all(val >= min_val)` so that
        # values which compare False either way (e.g. NaN) are still accepted.
        if len(a_hist) >= min_years
        and not any(val < min_val for val in a_hist.values())
    ]

In [13]:
# Precomputed per-year author rankings. As the display of top1000_in in the
# next cell shows, keys are year strings and values are lists of author ids.
# NOTE(review): per the plot titles used later, "_in" looks like "most cited"
# and "_out" like "most references" — confirm against the code that wrote these files.
top1000_in = load('authors_in_ranking1000_by_year.json')
top1000_out = load('authors_out_ranking1000_by_year.json')
top500_in = load('authors_in_ranking500_by_year.json')
top500_out = load('authors_out_ranking500_by_year.json')

In [14]:
# Inspect the loaded ranking; this displays the entire dict, so the output is very long.
top1000_in

{'1990': ['367014603',
  '2114212128',
  '258269623',
  '2082134624',
  '415830914',
  '2588889940',
  '2551170085',
  '1902717784',
  '2046542289',
  '2499759550',
  '1970405048',
  '2751311546',
  '942714141',
  '726162634',
  '2561096027',
  '2421055501',
  '2341201477',
  '2293352094',
  '2232739718',
  '2210547541',
  '2208845361',
  '2199871126',
  '2190760242',
  '2186832700',
  '2168334039',
  '2168032007',
  '2164321411',
  '2161866215',
  '2156494747',
  '2156405972',
  '2134444711',
  '2126184341',
  '2118683054',
  '2117417375',
  '2100384647',
  '2077388550',
  '2033156912',
  '2004387483',
  '1872662435',
  '1494708750',
  '2762454066',
  '2610441685',
  '2594997269',
  '2589026345',
  '2519142017',
  '2257798526',
  '2202936718',
  '2151130140',
  '2147528637',
  '2124305693',
  '2115834903',
  '2101130344',
  '1966898067',
  '1912949591',
  '1558888678',
  '2128942559',
  '1744530486',
  '2587651803',
  '2153951243',
  '1989666459',
  '536628356',
  '428805443',
  '4036

In [16]:
# Average diversity per year for the top-1000 rankings: in/out diversity for
# the authors in top1000_in, then out/in diversity for the authors in top1000_out.
plot_average_div(authors_in_div,top1000_in,'temp/top1000_in_average_div_in.pdf','média de diversidade das citações do top 1000 autores com mais citações')
plot_average_div(authors_out_div,top1000_in,'temp/top1000_in_average_div_out.pdf','média de diversidade das referências do top 1000 autores com mais citações')
plot_average_div(authors_out_div,top1000_out,'temp/top1000_out_average_div_out.pdf','média de diversidade das referências do top 1000 autores com mais referências')
plot_average_div(authors_in_div,top1000_out,'temp/top1000_out_average_div_in.pdf','média de diversidade das citações do top 1000 autores com mais referências')

1990 {'1993': 1.0, '1994': 1.0, '1995': 1.0966017253362883, '1996': 1.3503986765817482, '1997': 1.2480433376294269, '1998': 1.3324561443168572, '1999': 1.4269489382684197, '2000': 1.3613753164072617, '2001': 1.5691925832141966, '2002': 0, '2003': 0, '2004': 0, '2005': 1.0, '2006': 1.3322056912494478, '2007': 1.3322056912494478, '2008': 1.4575692649810903, '2009': 1.0, '2010': 1.3889334980019865}
1990 {'1992': 1.8612009803029155, '1993': 2.1865370587518393, '1994': 2.257536728636524, '1995': 2.5506104557044016, '1996': 2.132530616596334, '1997': 2.077339627172897, '1998': 3.3914842639894283, '1999': 2.445929875743962, '2000': 2.2694304105562253, '2001': 2.2471449819933667, '2002': 2.3358598976579787, '2003': 2.407947692324133, '2004': 2.3139383731369714, '2005': 2.0141840166698257, '2006': 2.6973738686641013, '2007': 2.981414926740764, '2008': 2.7607325340798208, '2009': 1.9747163561392531, '2010': 2.070774813356892}
1990 {'1991': 1.4706169999235712, '1992': 1.4706169999235712, '1993': 

1990 {'1991': 2.453464429529421, '1992': 2.853520411720118, '1993': 3.2176802026526086, '1994': 3.088263260017317, '1995': 2.53932475861451, '1996': 2.5438381420934255, '1997': 2.5656342453038365, '1998': 3.664423347303316, '1999': 3.734418913840042, '2000': 3.0881377205532514, '2001': 1.8769713232796827, '2002': 2.0993853361914936, '2003': 1.876433905654279, '2004': 1.7332313363909686, '2005': 1.9475118712813582, '2006': 1.5421534755178354, '2007': 1.61057380735538, '2008': 1.5350925218999332, '2009': 1.9777838378420523, '2010': 2.206233144402706}
1990 {'1991': 1.0, '1992': 1.0, '1993': 1.0, '1994': 1.0, '1995': 1.0, '1996': 1.404777363959132, '1997': 1.3395168277661151, '1998': 1.404777363959132, '1999': 1.0, '2000': 1.0, '2001': 1.0, '2002': 1.4981757081359444, '2003': 1.7661401862977961, '2004': 1.629385387029676, '2005': 1.52776150018052, '2006': 1.2868527657480797, '2007': 1.2009696503313139, '2008': 1.3400061481863226, '2009': 1.3747944573556883, '2010': 1.5140501802831168}
1990

In [17]:
# Average diversity per year for the top-500 rankings: in/out diversity for
# the authors in top500_in, then out/in diversity for the authors in top500_out.
plot_average_div(authors_in_div,top500_in,'temp/top500_in_average_div_in.pdf','média de diversidade das citações do top 500 autores com mais citações')
plot_average_div(authors_out_div,top500_in,'temp/top500_in_average_div_out.pdf','média de diversidade das referências do top 500 autores com mais citações')
plot_average_div(authors_out_div,top500_out,'temp/top500_out_average_div_out.pdf','média de diversidade das referências do top 500 autores com mais referências')
plot_average_div(authors_in_div,top500_out,'temp/top500_out_average_div_in.pdf','média de diversidade das citações do top 500 autores com mais referências')

1990 {'1993': 1.0, '1994': 1.0, '1995': 1.0966017253362883, '1996': 1.3503986765817482, '1997': 1.2480433376294269, '1998': 1.3324561443168572, '1999': 1.4269489382684197, '2000': 1.3613753164072617, '2001': 1.5691925832141966, '2002': 0, '2003': 0, '2004': 0, '2005': 1.0, '2006': 1.3322056912494478, '2007': 1.3322056912494478, '2008': 1.4575692649810903, '2009': 1.0, '2010': 1.3889334980019865}
1990 {'1992': 1.8612009803029155, '1993': 2.1865370587518393, '1994': 2.257536728636524, '1995': 2.5506104557044016, '1996': 2.132530616596334, '1997': 2.077339627172897, '1998': 3.3914842639894283, '1999': 2.445929875743962, '2000': 2.2694304105562253, '2001': 2.2471449819933667, '2002': 2.3358598976579787, '2003': 2.407947692324133, '2004': 2.3139383731369714, '2005': 2.0141840166698257, '2006': 2.6973738686641013, '2007': 2.981414926740764, '2008': 2.7607325340798208, '2009': 1.9747163561392531, '2010': 2.070774813356892}
1991 {'1993': 1.0, '1994': 1.0, '1995': 1.0966017253362883, '1996': 1.

In [18]:
def get_y(xs,X1,Y1):
    """Look up, for each ``x`` in ``xs``, the ``Y1`` value aligned with ``str(x)`` in ``X1``.

    Parameters
    ----------
    xs : iterable
        Keys to look up; each one is converted with ``str`` before matching.
    X1 : sequence
        Keys; must provide ``.index`` (e.g. a list of strings).
    Y1 : sequence
        Values aligned position-by-position with ``X1``.

    Returns
    -------
    list
        One value per element of ``xs``; 0 when ``str(x)`` is not in ``X1`` or
        ``Y1`` has no element at the matching position.
    """
    ys1 = []
    for x in xs:
        try:
            v = Y1[X1.index(str(x))]
        except (ValueError, IndexError):
            # Only "key not found" / "no matching value" fall back to 0;
            # any other error (wrong argument type, interrupt, ...) propagates
            # instead of being silently turned into zeros.
            v = 0
        ys1.append(v)
    return ys1

In [19]:
def get_div_points(valid,authors1,authors2):
    """Pair up two per-author yearly series for the authors selected in each year.

    Parameters
    ----------
    valid : dict
        ``year -> iterable of authors`` to consider for that year.
    authors1, authors2 : dict
        ``author -> {year -> value}``.

    Returns
    -------
    defaultdict(list)
        ``year -> [(value1, value2), ...]`` with one pair for every selected
        author that appears in both mappings and has a value for that year in
        both. Years without any pair get no key.
    """
    points_by_year = defaultdict(list)
    for year, selected in valid.items():
        for author in selected:
            if author not in authors1 or author not in authors2:
                continue
            first, second = authors1[author], authors2[author]
            if year not in first or year not in second:
                continue
            points_by_year[year].append((first[year],second[year]))
    return points_by_year

def plot_div_points(div_points,xlabel,ylabel,name):
    """Save one scatter plot per year plus a summary plot of the yearly Pearson correlation.

    Parameters
    ----------
    div_points : dict
        ``year -> sequence of (x, y) pairs`` (as returned by ``get_div_points``).
    xlabel, ylabel : str
        Axis labels; also embedded in the output filenames.
    name : str
        Tag embedded in the output filenames.

    Output files
    ------------
    ``temp/div_points_<name>_<year>_<xlabel>_<ylabel>.pdf`` for every year and
    ``correlacao_<name>_<xlabel>_<ylabel>.pdf`` in the working directory.
    """
    corrs = []
    years = []
    for year,points in div_points.items():
        points = np.asarray(points)
        xs,ys = points[:,0],points[:,1]

        # pearsonr raises for fewer than two observations; record NaN for
        # such years instead of aborting the whole series of plots.
        pear = pearsonr(xs,ys)[0] if len(points) >= 2 else float('nan')
        years.append(year)
        corrs.append(pear)

        plt.figure()
        plt.scatter(xs,ys,alpha=0.3)
        plt.title("%s (pearson=%.3f)" % (year,pear))
        # `integer` is a boolean flag of the tick locator; the original passed
        # a MaxNLocator instance, which only worked because it is truthy.
        plt.locator_params(axis='x',integer=True)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        plt.savefig('temp/div_points_%s_%s_%s_%s.pdf'%(name,year,xlabel,ylabel))
        plt.close()

    # Start from a fresh figure so the summary never lands on a stale one.
    plt.figure()
    plt.scatter(years,corrs)
    plt.title('correlação (pearson entre \n %s e %s)' % (xlabel,ylabel))
    plt.xticks(rotation=45)
    plt.locator_params(axis='x',integer=True)
    plt.savefig('correlacao_%s_%s_%s.pdf' % (name,xlabel,ylabel))
    plt.close()

In [20]:
# In-diversity vs out-diversity, per year, for the authors listed in top1000_in.
div_points = get_div_points(top1000_in,authors_in_div,authors_out_div)
plot_div_points(div_points,'div in','div out','top 1000')



In [18]:
# In-diversity vs out-diversity, per year, for the authors listed in top500_in.
div_points = get_div_points(top500_in,authors_in_div,authors_out_div)
plot_div_points(div_points,'div in','div out','top 500')



In [19]:
# Load the per-author "freq" series used below as the second axis of the scatter plots.
# NOTE(review): the contents of these files are not shown here; get_div_points
# requires the layout author -> {year -> value} — confirm.
freq_in = load('authors_in_freq.json')
freq_out = load('authors_out_freq.json')

In [20]:
# In-diversity vs freq_in, per year, for the authors listed in top1000_in.
div_points = get_div_points(top1000_in,authors_in_div,freq_in)
plot_div_points(div_points,'div in','freq in','top 1000')



In [21]:
# In-diversity vs freq_out, per year, for the authors listed in top1000_in.
div_points = get_div_points(top1000_in,authors_in_div,freq_out)
plot_div_points(div_points,'div in','freq out','top 1000')



In [22]:
# Out-diversity vs freq_in, per year, for the authors listed in top1000_in.
div_points = get_div_points(top1000_in,authors_out_div,freq_in)
plot_div_points(div_points,'div out','freq in','top 1000')



In [23]:
# Out-diversity vs freq_out, per year, for the authors listed in top1000_in.
div_points = get_div_points(top1000_in,authors_out_div,freq_out)
plot_div_points(div_points,'div out','freq out','top 1000')



In [None]:
def plot_div_dist(authors):
    """Show one histogram per year of the values collected across all authors.

    Parameters
    ----------
    authors : dict
        ``author -> (X, Y)`` where ``X`` and ``Y`` are aligned sequences; each
        ``y`` is filed under its matching ``x`` (used as the histogram title).

    Every histogram uses 10 bins and is displayed with ``plt.show()``.
    """
    values_by_year = defaultdict(list)
    for years, values in authors.values():
        for year, value in zip(years,values):
            values_by_year[year].append(value)

    for year in values_by_year:
        plt.hist(values_by_year[year],bins=10)
        plt.title(year)
        plt.show()
        plt.clf()

In [None]:
# NOTE(review): filtered_authors_in / filtered_authors_out are not defined in
# any visible cell of this notebook — confirm where they come from before
# running on a fresh kernel. plot_div_dist expects author -> (X, Y) pairs.
plot_div_dist(filtered_authors_in)
plot_div_dist(filtered_authors_out)