# CONFIGURAÇÕES EM AC2

Análises realizadas sobre a rede obtida por meio da extração no WoS (BIOLOGIA MATEMÁTICA E COMPUTACIONAL).

Configurando a rede AC2 de forma a coincidir com a rede WW1.

In [1]:
# carregando bibliotecas

import numpy as np
import pandas as pd

import time

import gc
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the citation, journal and article tables from outputs/.
# All three files share the same read options.
csv_options = {'index_col': 0, 'encoding': "ISO-8859-1"}

citations = pd.read_csv('outputs/citations.csv', **csv_options)
journals_cit = pd.read_csv('outputs/journals_cit.csv', **csv_options)
articles_cit = pd.read_csv('outputs/articles_cit.csv', **csv_options)

In [3]:
citations.head()

Unnamed: 0,SOURCE_A,SOURCE_J,TARGET_A,TARGET_J,YEAR
1,WOS:000418951400033,1553-734X,WOS:A1997XU79300002,0305-1048,2017
2,WOS:000418951400033,1553-734X,WOS:000382258600111,1932-6203,2017
3,WOS:000418951400033,1553-734X,WOS:000354024900017,0969-2126,2017
4,WOS:000418951400033,1553-734X,WOS:000176271000003,0907-4449,2017
5,WOS:000418951400033,1553-734X,WOS:000084896300069,0305-1048,2017


In [4]:
journals_cit.head()

Unnamed: 0_level_0,TITLE,CITED_OUT,CITED_IN,TOT_ART
ISSN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1553-734X,PLoS Computational Biology,224085.0,12190.0,4088.0
1367-4803,BIOINFORMATICS,132071.0,52320.0,5840.0
1752-0509,BMC Systems Biology,65446.0,4328.0,1443.0
1471-2105,BMC BIOINFORMATICS,149982.0,22495.0,4742.0
0303-6812,JOURNAL OF MATHEMATICAL BIOLOGY,27093.0,6551.0,845.0


In [5]:
articles_cit.head()

Unnamed: 0,ID_WOS,TITLE,JOURNAL_ID,YEAR,CITED_IN,CITED_OUT,CITED_OUT_OB,CITED_OUT_T
1,WOS:000418951400033,High-resolution global peptide-protein docking...,1,2017,0,62,3,62
2,WOS:000418951400005,A cyber-linked undergraduate research experien...,1,2017,0,34,7,47
3,WOS:000418951400031,Physiological models of the lateral superior o...,1,2017,0,170,3,185
4,WOS:000418951400028,MAGPIE: Simplifying access and execution of co...,1,2017,0,24,3,28
5,WOS:000418951400009,Stabilizing patterns in time: Neural network a...,1,2017,0,26,5,30


## 1. Identificando os periódicos periféricos

* Na rede AC2, identificar os periódicos periféricos que irão seguir o comportamento estratégico.


* Na tese do Walter, os periódicos periféricos foram definidos como os que possuem o seu respectivo FI2 (fator de impacto - 2 anos) no último quartil. 

Logo, a ideia aqui é calcular o FI2 de 2017 para cada periódico. Em seguida, ordenar esses valores e encontrar o último quartil.

### Calculando o FI2

O WoS possui uma tabela com FI2 calculado. No entanto, vamos recalcular esses valores para a base extraída, pois ela é apenas uma fatia da rede utilizada para o cálculo.

In [6]:
# Impact factors as computed by WoS (read from outputs/IF.csv)

FI = pd.read_csv('outputs/IF.csv', index_col=0)
FI.head()

Unnamed: 0,TITLE,2010,2011,2012,2013,2014,2015,2016
1,PLoS Computational Biology,5.515,5.215,4.867,4.829,4.62,4.587,4.542
2,BIOINFORMATICS,4.877,5.468,5.323,4.621,4.981,5.766,7.307
3,BMC Systems Biology,3.565,3.148,2.982,2.853,2.435,2.208,2.303
4,BMC BIOINFORMATICS,3.029,2.751,3.024,2.672,2.576,2.435,2.448
5,JOURNAL OF MATHEMATICAL BIOLOGY,3.021,2.963,2.366,2.388,1.846,1.716,1.566


In [7]:
# Start the FI2 table from the first 59 rows of journals_cit; the '2017'
# column is left empty here.
base_journals = journals_cit.iloc[:59]

FI2 = pd.DataFrame(columns=['TITLE', 'ISSN', '2017'])
FI2['TITLE'] = base_journals['TITLE'].tolist()
FI2['ISSN'] = base_journals.index.tolist()
FI2.head()

Unnamed: 0,TITLE,ISSN,2017
0,PLoS Computational Biology,1553-734X,
1,BIOINFORMATICS,1367-4803,
2,BMC Systems Biology,1752-0509,
3,BMC BIOINFORMATICS,1471-2105,
4,JOURNAL OF MATHEMATICAL BIOLOGY,0303-6812,


In [8]:
# função para calcular o FI2

def calc_FI2(citations, FI2, year):
    """Fill ``FI2`` in place with a two-year impact-factor proxy for ``year``.

    For every journal listed in ``FI2['ISSN']`` the stored value is::

        rows with TARGET_J == issn and YEAR in {year - 1, year - 2}
        ------------------------------------------------------------
        rows with SOURCE_J == issn and YEAR in {year - 1, year - 2}

    Parameters
    ----------
    citations : pandas.DataFrame
        One row per citation, with columns ``SOURCE_J``, ``TARGET_J`` and
        ``YEAR``.
    FI2 : pandas.DataFrame
        Table with an ``ISSN`` column. The column named ``str(year)`` is
        created or overwritten; rows are matched by position, so any index
        is accepted.
    year : int
        Reference year of the indicator.

    Returns
    -------
    None
        ``FI2`` is modified in place.

    Notes
    -----
    * The result column is named after ``year``. It used to be hard-coded
      to ``'2017'``, so any other year silently overwrote that column.
    * Journals whose denominator is zero receive a real missing value
      (``numpy.nan``) instead of the string ``'NaN'``. This keeps the column
      numeric, so it can be sorted and passed to ``quantile`` directly; use
      ``.isna()`` / ``.notna()`` to select those rows.
    * NOTE(review): the denominator counts citation rows issued by the
      journal, not distinct articles, and the numerator uses the year of
      the citing row. This differs from the usual impact-factor definition;
      the original computation is kept unchanged - confirm it is intended.
    """
    column = str(year)

    # Restrict to the two-year window once, then count per journal in a
    # single pass instead of rescanning the whole frame for every journal.
    in_window = citations['YEAR'].isin([year - 1, year - 2])
    window = citations[in_window]
    received = window['TARGET_J'].value_counts()
    issued = window['SOURCE_J'].value_counts()

    values = []
    for issn_i in FI2['ISSN']:
        numerador = int(received.get(issn_i, 0))
        denominador = int(issued.get(issn_i, 0))
        values.append(numerador / denominador if denominador else np.nan)

    # Assign the whole column at once: chained ``FI2[col][i] = ...`` writes
    # are unreliable and do not write through under pandas copy-on-write.
    FI2[column] = values
    

In [9]:
# Compute the 2017 FI2 (calc_FI2 fills the FI2 table in place)
calc_FI2(citations, FI2, 2017)

Durante o intervalo de tempo extraído, alguns periódicos surgiram e outros "morreram". Portanto, ao calcularmos o FI2 de 2017 para os periódicos extintos, o denominador será zero, pois não produziram nada no biênio anterior. 

* Aos periódicos que não conseguimos calcular o FI2 foi atribuído `NaN`. 

* Não foi possível calcular o FI2 de apenas um periódico. Ou seja, passamos a contar com 58 periódicos para selecionar o grupo de periódicos periféricos.

In [15]:
# Disabled clean-up steps: the first drops the rows marked 'NaN', the second
# was presumably meant to renumber the remaining rows from 1.
# NOTE(review): DataFrame.reindex selects rows by label, it does not
# renumber them. reindex(index=range(1, len(FI2)+1)) discards label 0 and
# inserts an all-NaN row for any label that is missing. The output shown
# for this cell starts at label 1 - verify that the first journal was not
# lost; reset_index(drop=True) is the usual way to renumber.
#FI2 = FI2[FI2['2017'] != 'NaN']
#FI2 = FI2.reindex(index=range(1,len(FI2)+1))

# Persist the table; the disabled line below reloads it from disk.
FI2.to_csv('outputs/FI2.csv')
#FI2 = pd.read_csv('outputs/FI2.csv', index_col=0)
FI2.head()

Unnamed: 0,TITLE,ISSN,2017
1,BIOINFORMATICS,1367-4803,0.382975
2,BMC Systems Biology,1752-0509,0.112549
3,BMC BIOINFORMATICS,1471-2105,0.159809
4,JOURNAL OF MATHEMATICAL BIOLOGY,0303-6812,0.214788
5,Algorithms for Molecular Biology,1748-7188,0.138109


### Periódicos Periféricos

In [16]:
# Order the journals by their 2017 FI2, lowest value first.
FI2_sort = FI2.sort_values('2017')
FI2_sort

Unnamed: 0,TITLE,ISSN,2017
45,Hokkaido Mathematical Journal,0385-4035,0.003976
57,Computational Intelligence and Neuroscience,1687-5265,0.007658
47,Journal of Medical Imaging and Health Informatics,2156-7018,0.009829
51,Interdisciplinary Sciences-Computational Life ...,1913-2751,0.01176
7,Evolutionary Bioinformatics,1176-9343,0.012359
29,Computational and Mathematical Methods in Medi...,1748-670X,0.015056
28,Current Bioinformatics,1574-8936,0.015993
52,IEEE Journal of Biomedical and Health Informatics,2168-2194,0.023654
50,Journal of Bioinformatics and Computational Bi...,0219-7200,0.026288
41,International Journal of Biostatistics,2194-573X,0.029925


In [17]:
# Peripheral journals: 2017 FI2 at or below the first-quartile value.
q1_threshold = FI2_sort['2017'].quantile(.25)
perifericos = FI2_sort[FI2_sort['2017'] <= q1_threshold]
print(f'Total de Periódicos Periféricos: {len(perifericos)}')
perifericos

Total de Periódicos Periféricos: 15


Unnamed: 0,TITLE,ISSN,2017
45,Hokkaido Mathematical Journal,0385-4035,0.003976
57,Computational Intelligence and Neuroscience,1687-5265,0.007658
47,Journal of Medical Imaging and Health Informatics,2156-7018,0.009829
51,Interdisciplinary Sciences-Computational Life ...,1913-2751,0.01176
7,Evolutionary Bioinformatics,1176-9343,0.012359
29,Computational and Mathematical Methods in Medi...,1748-670X,0.015056
28,Current Bioinformatics,1574-8936,0.015993
52,IEEE Journal of Biomedical and Health Informatics,2168-2194,0.023654
50,Journal of Bioinformatics and Computational Bi...,0219-7200,0.026288
41,International Journal of Biostatistics,2194-573X,0.029925


In [18]:
perifericos.to_csv('outputs/perifericos_2017.csv')

## 2. Mudar a base (5 cols para 6 cols)

A ideia é tentar aproveitar os códigos em Matlab desenvolvidos na tese do Walter e no meu TCC. Portanto, há a necessidade dessa mudança na estrutura da base. 

A base gerada pelo Walter (salva no `Matlab` em `var_cit`) é formada pelas 6 colunas:
    1. CÓDIGO ARTIGO SOURCE (código sequencial numérico)
    2. ANO ARTIGO SOURCE 
    3. PERIÓDICO DO ARTIGO SOURCE (código sequencial numérico)
    4. CÓDIGO DO ARTIGO TARGET (código sequencial numérico)
    5. ANO ARTIGO TARGET
    6. PERIÓDICO DO ARTIGO TARGET  (código sequencial numérico)
    
 No entanto, a base gerada pela extração do WoS é formada por 5 colunas:
     1. SOURCE_A (ID_WOS do artigo que está citando) - código não sequencial alfanumérico.
     2. SOURCE_J (ISSN do periódico do artigo em questão) - código não sequencial alfanumérico.
     3. TARGET_A (ID_WOS do artigo que está sendo citado) - código não sequencial alfanumérico.
     4. TARGET_J (ISSN do periódico do artigo em questão) - código não sequencial alfanumérico.
     5. YEAR (ano em que ocorreu a citação - ou seja, o ano do artigo SOURCE_A)

### CÓDIGO ARTIGO SOURCE / CÓDIGO ARTIGO TARGET
* Em `articles_cit`, criar um dicionário atribuindo cada código alfanumérico ID_WOS a um código numérico sequencial.

In [19]:
articles_cit.head()

Unnamed: 0,ID_WOS,TITLE,JOURNAL_ID,YEAR,CITED_IN,CITED_OUT,CITED_OUT_OB,CITED_OUT_T
1,WOS:000418951400033,High-resolution global peptide-protein docking...,1,2017,0,62,3,62
2,WOS:000418951400005,A cyber-linked undergraduate research experien...,1,2017,0,34,7,47
3,WOS:000418951400031,Physiological models of the lateral superior o...,1,2017,0,170,3,185
4,WOS:000418951400028,MAGPIE: Simplifying access and execution of co...,1,2017,0,24,3,28
5,WOS:000418951400009,Stabilizing patterns in time: Neural network a...,1,2017,0,26,5,30


In [20]:
len(articles_cit)

3956082

In [21]:
# Drop the rows whose ID_WOS is the 'FORA DA BASE' placeholder.
articles_cit_b = articles_cit[articles_cit['ID_WOS'] != 'FORA DA BASE']

In [22]:
len(articles_cit_b)

3956023

In [23]:
all_articles = list(articles_cit_b['ID_WOS'].unique())

In [24]:
len(all_articles)

690444

In [31]:
# One row per article: keep the first occurrence of each ID_WOS.
years = articles_cit_b.drop_duplicates(subset=['ID_WOS'], keep='first')

In [32]:
all_years = list(years.YEAR)

In [33]:
# Map each ID_WOS to [sequential code starting at 1, year]; all_articles and
# all_years are walked in lockstep.
articles_dic = {}
for seq, (id_wos, pub_year) in enumerate(zip(all_articles, all_years), start=1):
    articles_dic[id_wos] = [seq, pub_year]

In [34]:
articles_dic

{'WOS:000418951400033': [1, 2017],
 'WOS:000418951400005': [2, 2017],
 'WOS:000418951400031': [3, 2017],
 'WOS:000418951400028': [4, 2017],
 'WOS:000418951400009': [5, 2017],
 'WOS:000418951400026': [6, 2017],
 'WOS:000418951400036': [7, 2017],
 'WOS:000418951400021': [8, 2017],
 'WOS:000418951400003': [9, 2017],
 'WOS:000418951400015': [10, 2017],
 'WOS:000418951400023': [11, 2017],
 'WOS:000418951400035': [12, 2017],
 'WOS:000418951400020': [13, 2017],
 'WOS:000418951400014': [14, 2017],
 'WOS:000418951400027': [15, 2017],
 'WOS:000418951400041': [16, 2017],
 'WOS:000418951400016': [17, 2017],
 'WOS:000418951400038': [18, 2017],
 'WOS:000418951400025': [19, 2017],
 'WOS:000418951400002': [20, 2017],
 'WOS:000418951400001': [21, 2017],
 'WOS:000418951400024': [22, 2017],
 'WOS:000418951400018': [23, 2017],
 'WOS:000418951400011': [24, 2017],
 'WOS:000418951400008': [25, 2017],
 'WOS:000418951400032': [26, 2017],
 'WOS:000418951400006': [27, 2017],
 'WOS:000418951400013': [28, 2017],
 

### PERIÓDICO DO ARTIGO SOURCE / PERIÓDICO DO ARTIGO TARGET
* Em `journals_cit`, criar um dicionário atribuindo cada código alfanumérico ISSN a um código numérico sequencial.

In [35]:
journals_cit.head()

Unnamed: 0_level_0,TITLE,CITED_OUT,CITED_IN,TOT_ART
ISSN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1553-734X,PLoS Computational Biology,224085.0,12190.0,4088.0
1367-4803,BIOINFORMATICS,132071.0,52320.0,5840.0
1752-0509,BMC Systems Biology,65446.0,4328.0,1443.0
1471-2105,BMC BIOINFORMATICS,149982.0,22495.0,4742.0
0303-6812,JOURNAL OF MATHEMATICAL BIOLOGY,27093.0,6551.0,845.0


In [36]:
len(journals_cit)

61107

In [37]:
all_journals = list(journals_cit.index)

In [38]:
# Map each ISSN to a sequential numeric code starting at 1.
journals_dic = {issn: code for code, issn in enumerate(all_journals, start=1)}

In [39]:
journals_dic

{'1553-734X': 1,
 '1367-4803': 2,
 '1752-0509': 3,
 '1471-2105': 4,
 '0303-6812': 5,
 '1748-7188': 6,
 '1465-4644': 7,
 '1176-9343': 8,
 '1662-5188': 9,
 '0022-5193': 10,
 '0277-6715': 11,
 '0929-5313': 12,
 '1093-3263': 13,
 '0092-8240': 14,
 '0006-3444': 15,
 '0140-0118': 16,
 '0962-2802': 17,
 '0006-341X': 18,
 '1751-8849': 19,
 '1089-7771': 20,
 '1793-5245': 21,
 '1742-4682': 22,
 '1066-5277': 23,
 '0025-5564': 24,
 '1062-936X': 25,
 '0323-3847': 26,
 '1477-8599': 27,
 '0010-4825': 28,
 '1574-8936': 29,
 '1748-670X': 30,
 '1085-7117': 31,
 '0973-5348': 32,
 '1748-5673': 33,
 '1938-7989': 34,
 '0218-3390': 35,
 '2040-7939': 36,
 '1868-1743': 37,
 '1467-5463': 38,
 '1758-0463': 39,
 '0303-2647': 40,
 '0001-5342': 41,
 '2194-573X': 42,
 '1547-1063': 43,
 '1431-7613': 44,
 '1946-6315': 45,
 '0385-4035': 46,
 '1759-0876': 47,
 '2156-7018': 48,
 '1756-0381': 49,
 '2194-6302': 50,
 '0219-7200': 51,
 '1913-2751': 52,
 '2168-2194': 53,
 '1759-2879': 54,
 '1662-5196': 55,
 '2041-1480': 56,
 

In [40]:
# Work only with citations whose target article is in the WoS base,
# i.e. whose TARGET_A is not the 'FORA DA BASE' placeholder.
target_in_base = citations['TARGET_A'] != 'FORA DA BASE'
citations_b = citations[target_in_base]
citations_b.to_csv('outputs/citations_b.csv')

In [41]:
print(len(citations) - len(citations_b))

138470


In [42]:
citations_b.head()

Unnamed: 0,SOURCE_A,SOURCE_J,TARGET_A,TARGET_J,YEAR
1,WOS:000418951400033,1553-734X,WOS:A1997XU79300002,0305-1048,2017
2,WOS:000418951400033,1553-734X,WOS:000382258600111,1932-6203,2017
3,WOS:000418951400033,1553-734X,WOS:000354024900017,0969-2126,2017
4,WOS:000418951400033,1553-734X,WOS:000176271000003,0907-4449,2017
5,WOS:000418951400033,1553-734X,WOS:000084896300069,0305-1048,2017


In [47]:
#var_cit = pd.DataFrame(columns = ['SOURCE_A_ID', 'SOURCE_A_YEAR', 'SOURCE_J', 'TARGET_A_ID', 'TARGET_A_YEAR', 'TARGET_J'])

In [48]:
#var_cit = pd.read_csv('outputs/var_cit.csv', index_col=0, encoding = "ISO-8859-1")

In [49]:
#start = time.time()
#ids = list(citations_b.index)
#for i in ids[480169+1:]:
#        aux = citations_b[citations_b.index == i].values[0]
#
#        SOURCE = articles_dic[aux[0]]
#        SOURCE_A_ID = SOURCE[0]
#        SOURCE_A_YEAR = SOURCE[1]
#        SOURCE_J = journals_dic[aux[1]]
#
#        TARGET = articles_dic[aux[2]]
#        TARGET_A_ID = TARGET[0]
#        TARGET_A_YEAR = TARGET[1]
#        TARGET_J = journals_dic[aux[3]]
#
#        var_cit.loc[i] = [SOURCE_A_ID, SOURCE_A_YEAR, SOURCE_J, TARGET_A_ID, TARGET_A_YEAR, TARGET_J]
#        #var_cit.to_csv('outputs/var_cit_1.csv')
#
#        print(i)
#        gc.collect()
#
#end = time.time()
#print(end-start)
# a partir de 512542 - time: 636421.9620666504

In [51]:
#var_cit.to_csv('outputs/var_cit_1.csv')

In [52]:
var_cit = pd.read_csv('outputs/var_cit_1.csv', index_col=0)

In [55]:
var_cit.head()

Unnamed: 0,SOURCE_A_ID,SOURCE_A_YEAR,SOURCE_J,TARGET_A_ID,TARGET_A_YEAR,TARGET_J
1,1,2017,1,50130,1997,60
2,1,2017,1,50131,2016,61
3,1,2017,1,50132,2015,62
4,1,2017,1,50133,2002,63
5,1,2017,1,50134,2000,60


In [56]:
var_cit.tail()

Unnamed: 0,SOURCE_A_ID,SOURCE_A_YEAR,SOURCE_J,TARGET_A_ID,TARGET_A_YEAR,TARGET_J
1646670,50129,2010,59,690444,2005,1370
1646671,50129,2010,59,201240,2005,1150
1646672,50129,2010,59,111921,2007,455
1646673,50129,2010,59,124770,1967,335
1646674,50129,2010,59,206602,2007,345


In [57]:
len(var_cit)

1508204

In [58]:
len(citations_b)

1508204