In [None]:
import pandas as pd
import ast
from ast import literal_eval
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib import rcParams
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
from collections import Counter

import sys
import networkx as nx

In [None]:
def read_data(path, sep):
    """Load a delimited file and decode its list-valued columns.

    The 'AA' and 'W' columns hold Python-literal text in the file; every
    string cell in them is parsed with ``ast.literal_eval``.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.
    sep : str
        Field delimiter passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with 'AA' and 'W' holding the parsed objects.
        Cells that are not strings (e.g. NaN for an empty field) are
        returned unchanged.
    """
    def _parse(cell):
        # literal_eval only accepts text (or an AST node); an empty field
        # arrives as a float NaN and would raise ValueError.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    df = pd.read_csv(path, sep=sep)
    for column in ('AA', 'W'):
        df[column] = df[column].apply(_parse)
    return df

In [None]:
def _write_scores(path, nodes, scores):
    """Write one line per node (node, tab, score) to the text file ``path``."""
    with open(path, 'w') as out:
        for node in nodes:
            # Same argument layout as the original print() call, so the
            # file content is unchanged.
            print(node, '\t', scores[node], file=out)


def _merge_scores(info, scores, id_col, name):
    """Left-join a {node: score} mapping onto ``info`` and add '<name> rank'."""
    table = pd.DataFrame(list(scores.items()), columns=[id_col, name])
    # The edge list is read without a nodetype, so node labels are text;
    # cast them to float before joining on ``id_col``.
    table[id_col] = table[id_col].astype(float)
    merged = info.merge(table, how='left', on=id_col)
    merged[name + ' rank'] = merged[name].rank(method='dense', ascending=False)
    return merged


def centrality_ind(filenameInput, info, id_col = 'Id'):
    """Compute three centrality measures for a directed graph and attach them to ``info``.

    The graph is read from a tab-delimited weighted edge list. PageRank and
    eigenvector centrality use the 'weight' edge attribute; betweenness is
    computed without a weight argument. For every measure the per-node scores
    are written to a text file in the working directory ('out_Pagerank.txt',
    'out_Eigen.txt', 'out_Between.txt') and left-joined onto ``info``.

    Parameters
    ----------
    filenameInput : str
        Path of the tab-delimited weighted edge list.
    info : pandas.DataFrame
        Table with one row per node; must contain ``id_col``.
    id_col : str
        Name of the node-id column in ``info``.

    Returns
    -------
    pandas.DataFrame
        ``info`` extended with 'PageRank', 'Eigen' and 'Between' columns and a
        dense, descending '<name> rank' column for each of them.
    """
    Gdirect = nx.read_weighted_edgelist(filenameInput, delimiter='\t', create_using=nx.DiGraph)
    print("network loaded" )
    nodes = list(Gdirect.nodes())

    # Scores are written with print(..., file=...) inside a with-block rather
    # than by rebinding sys.stdout, so an exception cannot leave stdout
    # redirected or the output file open.
    print("Pagerank centrality, directed")
    prc = nx.pagerank(Gdirect, alpha=0.9, personalization=None, max_iter=100, tol=1e-04, nstart=None, weight='weight', dangling=None)
    _write_scores('out_Pagerank.txt', nodes, prc)
    info = _merge_scores(info, prc, id_col, 'PageRank')

    print("Eigenvector centrality, directed")
    eigc = nx.eigenvector_centrality(Gdirect, max_iter=1000, tol=1e-04, nstart=None, weight='weight')
    _write_scores('out_Eigen.txt', nodes, eigc)
    info = _merge_scores(info, eigc, id_col, 'Eigen')

    print("Betweenness centrality, directed")
    b = nx.algorithms.centrality.betweenness_centrality(Gdirect)
    _write_scores('out_Between.txt', nodes, b)
    info = _merge_scores(info, b, id_col, 'Between')

    print("done ")

    return info

In [None]:
def period_ind(id_name, net, info, id_col = 'Id', years = range(2015, 2022)):
    """Compute per-year PageRank, eigenvector and betweenness centrality tables.

    For every year the matching edges of ``net`` are written to
    'net_<year>.csv' in the working directory, read back as a directed
    weighted graph, and the three centralities are computed. Each result is
    left-joined onto a copy of ``id_name`` as a column named after the year.
    Missing values are then replaced by 0 and the all-period score and rank
    columns from ``info`` are appended.

    Parameters
    ----------
    id_name : pandas.DataFrame
        Node table containing ``id_col``.
    net : pandas.DataFrame
        Edge table with columns ``id_col + '1'``, ``id_col + '2'``, 'weight'
        and 'Y'.
    info : pandas.DataFrame
        Table containing ``id_col`` plus 'PageRank', 'PageRank rank', 'Eigen',
        'Eigen rank', 'Between' and 'Between rank'.
    id_col : str
        Name of the node-id column.
    years : iterable
        Values compared against ``net['Y']``; defaults to 2015..2021, the
        range that was previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(pagerank, eigen, between)``.
    """
    def _as_column(scores, label):
        # Node labels are read from the edge list as text; cast them to float
        # before joining on ``id_col``.
        table = pd.DataFrame(list(scores.items()), columns=[id_col, label])
        table[id_col] = table[id_col].astype(float)
        return table

    pagerank = id_name.copy()
    eigen = id_name.copy()
    between = id_name.copy()

    for Y in years:
        filenameInput = f'net_{Y}.csv'
        net_Y = net[net['Y'] == Y]
        net_Y[[id_col + '1', id_col + '2', 'weight']].to_csv(filenameInput, sep = '\t', index = False, header = False)
        print(Y)

        Gdirect = nx.read_weighted_edgelist(filenameInput, delimiter='\t', create_using=nx.DiGraph)
        print("network loaded" )

        print("Pagerank centrality, directed")
        prc = nx.pagerank(Gdirect, alpha=0.9, personalization=None, max_iter=100, tol=1e-04, nstart=None, weight='weight', dangling=None)
        pagerank = pagerank.merge(_as_column(prc, Y), how = 'left', on = id_col)

        print("Eigenvector centrality, directed")
        eigc = nx.eigenvector_centrality(Gdirect, max_iter=1000, tol=1e-04, nstart=None, weight='weight')
        eigen = eigen.merge(_as_column(eigc, Y), how = 'left', on = id_col)

        print("Betweenness centrality, directed")
        b = nx.algorithms.centrality.betweenness_centrality(Gdirect)
        between = between.merge(_as_column(b, Y), how = 'left', on = id_col)

    # Nodes absent from a year's graph have no score for that year; they get 0.
    pagerank = pagerank.fillna(0).merge(info[[id_col, 'PageRank', 'PageRank rank']], on = id_col)
    eigen = eigen.fillna(0).merge(info[[id_col, 'Eigen', 'Eigen rank']], on = id_col)
    between = between.fillna(0).merge(info[[id_col, 'Between', 'Between rank']], on = id_col)

    print("done ")

    return pagerank, eigen, between

In [None]:
# Mount Google Drive inside the Colab runtime; later cells read and write under 'drive/MyDrive/'.
# NOTE(review): those later paths are relative, so they resolve only while the
# working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# `data`: paper records, with the 'AA' and 'W' columns decoded by read_data.
# `net`: edge table that later cells join on its 'ID1' / 'ID2' columns.
data = read_data('drive/MyDrive/filtered_data.csv', sep = ';')
net = pd.read_csv('drive/MyDrive/excluded_net.csv', sep = ';')

In [None]:
# Decode the stringified 'RId' column. Only string cells are parsed, so
# re-running this cell (values already decoded) or a missing value does not raise.
data['RId'] = data['RId'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
# Papers that have author information. The index is reset after dropping rows
# because later cells address rows as author_data.loc[k] for k in
# range(len(author_data)), which requires a gap-free 0..n-1 index.
author_data = (
    data[['Id', 'AA', 'Y']]
    .rename(columns = {'Y': 'Year'})
    .dropna(subset = ['AA'])
    .reset_index(drop = True)
)

# Coauthor network

In [None]:
# Build three tables from the per-paper author lists:
#   co_author_net - one weighted edge per (author pair, year)
#   id_name       - distinct (author id, author name) records
#   isolated      - authors who appear only as the sole author of a paper
co_author_net = []
id_name = []
isolated = []

# Rows are iterated directly instead of via author_data.loc[k] with
# k in range(len(author_data)): positional integers are valid labels only
# while the index is a gap-free 0..n-1 range, which dropna() does not guarantee.
for authors, year in tqdm(zip(author_data['AA'], author_data['Year']), total = len(author_data)):
    for author in authors:
        id_name.append([author['AuId'], author['DAuN'], author['AfId'], author['DAfN']])

    # Sorted and de-duplicated, so every pair taken below has first < second.
    unique_authors = sorted({author['AuId'] for author in authors})

    if len(unique_authors) == 1:
        isolated.append([unique_authors[0], year, 1])

    for pos, author1 in enumerate(unique_authors):
        for author2 in unique_authors[pos + 1:]:
            co_author_net.append([author1, author2, year, 1])

# Column names are given up front so the groupby below also works when a list is empty.
co_author_net = pd.DataFrame(co_author_net, columns = ['AuId1', 'AuId2', 'Y', 'weight'])
isolated = pd.DataFrame(isolated, columns = ['AuId', 'Y', 'weight'])
id_name = pd.DataFrame(id_name).rename(columns = {0:'AuId', 1:'AuN'})
id_name = id_name.drop_duplicates(['AuId', 'AuN'])

# Sum the per-paper unit weights into one edge per pair and year.
co_author_net = co_author_net.groupby(['AuId1', 'AuId2', 'Y']).sum().reset_index()

# An author with at least one co-authorship edge is not isolated.
linked_authors = set(co_author_net['AuId1']) | set(co_author_net['AuId2'])
isolated = isolated[~isolated['AuId'].isin(linked_authors)]
isolated = isolated.groupby(['AuId', 'Y']).sum().reset_index()

In [None]:
# Distinct (AuId1, AuId2) pairs across all years, as a list of plain tuples.
co_authors = list(co_author_net[['AuId1', 'AuId2']].drop_duplicates().itertuples(index=False, name=None))

In [None]:
import pickle

# Persist the co-author pairs; the with-block closes the file even if dump() raises.
with open('drive/MyDrive/co_authors.pkl', 'wb') as output:
    pickle.dump(co_authors, output)

In [None]:
import pickle

# NOTE: pickle.load can execute code embedded in the file; only load a pickle
# that this notebook wrote itself.
with open('drive/MyDrive/co_authors.pkl', 'rb') as pkl_file:
    co_authors = pickle.load(pkl_file)

In [None]:
# Two-column frame of co-author pairs ('A1', 'A2'), used as the lookup table in the merges below.
co_authors_df = pd.DataFrame(co_authors).rename(columns = {0:'A1', 1:'A2'})

# Independent affiliation citation network 

In [None]:
# One row per (paper, author): the paper Id, the author's AuId and the AfId
# recorded for that author on that paper.
# Rows are iterated directly instead of via author_data.loc[i] with
# i in range(len(author_data)), which is only correct for a gap-free 0..n-1 index.
all_author_data = pd.DataFrame(
    [
        [paper_id, author['AuId'], author['AfId']]
        for paper_id, authors in tqdm(zip(author_data['Id'], author_data['AA']), total = len(author_data))
        for author in authors
    ],
    columns = ['Id', 'AuId', 'AfId'],
)

  0%|          | 0/39811 [00:00<?, ?it/s]

In [None]:
# Attach author and affiliation ids to both endpoints of every paper-level
# edge: first for the paper in 'ID1', then for the paper in 'ID2'.
author_net = net
for side in ('1', '2'):
    author_net = (
        author_net
        .merge(all_author_data, left_on = 'ID' + side, right_on = 'Id')
        .rename(columns = {'AuId': 'AuId' + side, 'AfId': 'AfId' + side})
        .drop(columns = 'Id')
    )

In [None]:
author_net

Unnamed: 0,ID1,ID2,weight,Y,AuId1,AfId1,AuId2,AfId2
0,2754967293,2339791932,1,2017,130753112,39854758.0,2229563412,168635309.0
1,2754967293,2339791932,1,2017,130753112,39854758.0,2273634235,
2,2754967293,2339791932,1,2017,130753112,39854758.0,2063789477,
3,2754967293,2339791932,1,2017,130753112,39854758.0,2282261216,
4,2754967293,2339791932,1,2017,130753112,39854758.0,2074779875,
...,...,...,...,...,...,...,...,...
23670301,3210614838,2950902575,1,2021,3210059833,,2660466461,
23670302,3210614838,2950902575,1,2021,3210059833,,2191477668,123044942.0
23670303,2547599175,1924335830,1,2016,2656812379,,1849388696,
23670304,2547599175,1924335830,1,2016,2656812379,,2164406578,11983389.0


In [None]:
# Keep only rows where both endpoints carry an affiliation id.
all_author_net = author_net.dropna(subset = ['AfId1', 'AfId2']).reset_index(drop = True)

In [None]:
# Collapse the author-level rows into one summed weight per (AfId1, AfId2, Y).
all_author_net = all_author_net.groupby(['AfId1', 'AfId2', 'Y'])['weight'].sum().reset_index()

In [None]:
all_author_net

Unnamed: 0,AfId1,AfId2,Y,weight
0,4.605000e+03,5.720697e+07,2017,2
1,4.605000e+03,1.299024e+08,2017,1
2,4.605000e+03,1.380062e+08,2017,2
3,4.605000e+03,1.330343e+09,2017,2
4,9.507000e+03,9.507000e+03,2021,4
...,...,...,...,...
1103292,3.151616e+09,1.307629e+09,2020,1
1103293,3.151616e+09,2.800365e+09,2020,5
1103294,3.151616e+09,2.801148e+09,2020,1
1103295,3.151616e+09,2.801442e+09,2020,1


In [None]:
all_author_net.to_csv('drive/MyDrive/Aff_net_independent/all_author_net.csv', sep = ';', index = False)

In [None]:
# join info about coauthors to net with all authors citations
# co_authors_df stores each pair once (smaller id in 'A1'), so the lookup is done
# in both orientations: a match on (AuId1, AuId2) ends up in columns A3/A4,
# a match on (AuId2, AuId1) in columns A1/A2. Unmatched rows keep NaN there.
author_net_co_a = author_net.merge(co_authors_df, how = 'left', left_on = ['AuId1', 'AuId2'], right_on = ['A1', 'A2']).rename(columns = {'A1':'A3', 'A2':'A4'})
author_net_co_a = author_net_co_a.merge(co_authors_df, how = 'left', left_on = ['AuId2', 'AuId1'], right_on = ['A1', 'A2'])

In [None]:
author_net_co_a

Unnamed: 0,ID1,ID2,weight,Y,AuId1,AfId1,AuId2,AfId2,A3,A4,A1,A2
0,2754967293,2339791932,1,2017,130753112,39854758.0,2229563412,168635309.0,,,,
1,2754967293,2339791932,1,2017,130753112,39854758.0,2273634235,,,,,
2,2754967293,2339791932,1,2017,130753112,39854758.0,2063789477,,,,,
3,2754967293,2339791932,1,2017,130753112,39854758.0,2282261216,,,,,
4,2754967293,2339791932,1,2017,130753112,39854758.0,2074779875,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
23670301,3210614838,2950902575,1,2021,3210059833,,2660466461,,,,,
23670302,3210614838,2950902575,1,2021,3210059833,,2191477668,123044942.0,,,,
23670303,2547599175,1924335830,1,2016,2656812379,,1849388696,,,,,
23670304,2547599175,1924335830,1,2016,2656812379,,2164406578,11983389.0,,,,


In [None]:
# A row links two co-authors when either orientation of the pair matched,
# i.e. at least one of the lookup columns A1..A4 is populated.
has_coauthor_match = author_net_co_a[['A1', 'A2', 'A3', 'A4']].notna().any(axis = 1)
co_author_cit = author_net_co_a[has_coauthor_match].reset_index()

In [None]:
co_author_cit

Unnamed: 0,index,ID1,ID2,weight,Y,AuId1,AfId1,AuId2,AfId2,A3,A4,A1,A2
0,2533,2897637471,2339791932,1,2019,2024161122,2.037639e+08,2282261216,,2.024161e+09,2.282261e+09,,
1,2628,3108179903,2339791932,1,2021,2024161122,1.852618e+08,2282261216,,2.024161e+09,2.282261e+09,,
2,2633,3108179903,2339791932,1,2021,2024161122,2.037639e+08,2282261216,,2.024161e+09,2.282261e+09,,
3,3569,2800142452,2339791932,1,2018,2049710663,1.077210e+08,2074779875,,2.049711e+09,2.074780e+09,,
4,7130,2112455323,1920030402,1,2015,2047227543,2.800294e+09,2175448217,2.801533e+09,2.047228e+09,2.175448e+09,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3483768,23670128,3134128686,2613414590,1,2021,3186990654,6.960165e+07,2119071778,,,,2.119072e+09,3.186991e+09
3483769,23670129,3134128686,2613414590,1,2021,3134925025,2.018509e+08,3186990654,,3.134925e+09,3.186991e+09,,
3483770,23670131,3134128686,2613414590,1,2021,3134925025,2.018509e+08,2119071778,,,,2.119072e+09,3.134925e+09
3483771,23670132,3134128686,2613414590,1,2021,2119071778,6.960165e+07,3186990654,,2.119072e+09,3.186991e+09,,


In [None]:
# Affiliation-level weights of the co-author rows: drop rows lacking an
# affiliation id on either side, then sum per (AfId1, AfId2, Y).
co_author_cit_aff = (
    co_author_cit[['AfId1', 'AfId2', 'weight', 'Y']]
    .dropna(subset = ['AfId1', 'AfId2'])
    .groupby(['AfId1', 'AfId2', 'Y'])
    .sum()
    .reset_index()
)

In [None]:
co_author_cit_aff

Unnamed: 0,AfId1,AfId2,Y,weight
0,4.605000e+03,57206974.0,2017,1
1,5.232500e+04,52325.0,2017,1
2,5.232500e+04,5023651.0,2021,8
3,5.232500e+04,25112270.0,2020,6
4,5.232500e+04,40034438.0,2016,5
...,...,...,...,...
182659,3.148998e+09,27837315.0,2016,5
182660,3.151616e+09,4576418.0,2020,2
182661,3.151616e+09,110525433.0,2020,1
182662,3.151616e+09,120514687.0,2020,1


In [None]:
co_author_cit_aff.to_csv('drive/MyDrive/Aff_net_independent/co_author_cit_aff.csv', sep = ';', index = False)

In [None]:
co_author_cit_papers = co_author_cit[['ID1', 'ID2']].drop_duplicates()

In [None]:
co_author_cit_papers.to_csv('drive/MyDrive/Aff_net_independent/co_author_cit_papers.csv', sep = ';', index = False)

In [None]:
# author_net_co_a.to_csv('drive/MyDrive/Aff_net_independent/all_co_author_cit_net.csv', sep = ';', index = False)

In [None]:
co_author_cit_papers

Unnamed: 0,ID1,ID2
0,2897637471,2339791932
1,3108179903,2339791932
3,2800142452,2339791932
4,2112455323,1920030402
344,2917107887,1920030402
...,...,...
3483745,3164697975,2990022896
3483747,3164697975,3131252885
3483759,2460929359,2297296783
3483761,3134128686,3096193136


In [None]:
# Rows where neither orientation of the author pair matched a co-author pair.
no_coauthor_match = author_net_co_a[['A1', 'A2', 'A3', 'A4']].isna().all(axis = 1)
filtered_net = author_net_co_a[no_coauthor_match].reset_index(drop = True)

In [None]:
# Discard the (all-NaN) lookup columns and keep rows with an affiliation id on both sides.
filtered_net = (
    filtered_net
    .drop(columns = ['A1', 'A2', 'A3', 'A4'])
    .dropna(subset = ['AfId1', 'AfId2'])
    .reset_index(drop = True)
)

In [None]:
filtered_net

Unnamed: 0,ID1,ID2,weight,Y,AuId1,AfId1,AuId2,AfId2
0,2754967293,2339791932,1,2017,130753112,3.985476e+07,2229563412,1.686353e+08
1,2754967293,2339791932,1,2017,2336267196,1.651438e+08,2229563412,1.686353e+08
2,2754967293,2339791932,1,2017,2526738623,1.443707e+06,2229563412,1.686353e+08
3,2754967293,2339791932,1,2017,2167087174,1.454875e+08,2229563412,1.686353e+08
4,2754967293,2339791932,1,2017,1586390273,1.202509e+08,2229563412,1.686353e+08
...,...,...,...,...,...,...,...,...
14061097,3140855448,3028333743,1,2021,2961061762,2.802508e+09,2965151694,1.285302e+09
14061098,3140855448,3028333743,1,2021,2961061762,2.802508e+09,2226498062,1.285302e+09
14061099,3140855448,3028333743,1,2021,3149102611,2.802508e+09,2546617100,1.285302e+09
14061100,3140855448,3028333743,1,2021,3149102611,2.802508e+09,2965151694,1.285302e+09


In [None]:
filtered_net.to_csv('drive/MyDrive/Aff_net_independent/independent_net.csv', sep = ';', index = False)

In [None]:
# One summed weight per (AfId1, AfId2, Y) over the rows kept in filtered_net.
affiliation_net = filtered_net.groupby(['AfId1', 'AfId2', 'Y'])['weight'].sum().reset_index()

In [None]:
affiliation_net.to_csv('drive/MyDrive/Aff_net_independent/affiliation_independent_net.csv', sep = ';', index = False)

In [None]:
# Size of the per-year affiliation edge table and of its node set.
vertex_ids = set(affiliation_net['AfId1'].values).union(affiliation_net['AfId2'].values)
print('The number of edges:', len(affiliation_net))
print('The number of vertices:', len(vertex_ids))

The number of edges: 1062074
The number of vertices: 4682


# Metrics

In [None]:
affiliation_net = pd.read_csv('drive/MyDrive/Aff_net_independent/affiliation_independent_net.csv', sep = ';')

In [None]:
# Sum the edge weights over all years: one row per (AfId1, AfId2).
aff_net_no_periods = affiliation_net.groupby(['AfId1', 'AfId2'])['weight'].sum().reset_index()

In [None]:
aff_net_no_periods.to_csv('aff_net_indep_no_periods.csv', sep = '\t', index = False, header = False)

In [None]:
Gdirect = nx.read_weighted_edgelist('aff_net_indep_no_periods.csv', delimiter='\t', create_using=nx.DiGraph)

In [None]:
degrees = [val for (node, val) in Gdirect.degree()]

In [None]:
# Summary statistics of the directed affiliation graph.
# NOTE(review): `degrees` is built from Gdirect.degree() without a weight
# argument, so the three "citation number" figures look like counts of incident
# edges rather than summed citation weights — confirm the labels are intended.
print('Vertices:', len(Gdirect.nodes()))
print('Edges:', len(Gdirect.edges()))
print('Density:', nx.density(Gdirect))
print('Minimum citation number:', np.min(degrees))
print('Maximum citation number:', np.max(degrees))
print('Average citation number:', np.mean(degrees))


Vertices: 4682
Edges: 670890
Density: 0.030611264364900106
Minimum citation number: 1
Maximum citation number: 3711
Average citation number: 286.5826569841948


### Connected components

In [None]:
comp_df = pd.DataFrame()

In [None]:
# Label every node of the edge list with the number (starting at 1) of the
# connected component it belongs to. The graph is read without create_using,
# i.e. with networkx's default graph class.
G = nx.read_weighted_edgelist('aff_net_indep_no_periods.csv', delimiter='\t')

# All rows are collected first and the frame is built once; assigning
# comp_df.loc[aff, 'component'] per node enlarges the frame on every step.
# The label is stored as float, the dtype the row-wise assignment produced.
component_rows = [
    (aff, float(label))
    for label, comp in enumerate(nx.connected_components(G), start = 1)
    for aff in comp
]
comp_df = pd.DataFrame(component_rows, columns = ['AfId', 'component'])

comp_df.to_csv('aff_components.csv', sep = ';', index = False)

In [None]:
comp_df

Unnamed: 0,AfId,component
0,4264650.0,1.0
1,177909021.0,1.0
2,173887773.0,1.0
3,150037166.0,1.0
4,2800966917.0,1.0
...,...,...
4677,8764889.0,1.0
4678,148202161.0,1.0
4679,66068411.0,1.0
4680,40542001.0,1.0


In [None]:
affiliation_net[affiliation_net['AfId1'] == 165053356]

Unnamed: 0,AfId1,AfId2,Y,weight
644197,165053356.0,165053356.0,2017,1


In [None]:
# `affiliations` is otherwise first assigned in a later section, so it is loaded
# here to keep this lookup runnable when the notebook is executed top to bottom.
affiliations = pd.read_csv('drive/MyDrive/affiliations.csv', sep = ';')
affiliations[affiliations['AfId'] == 165053356]

Unnamed: 0,AfId,AfN
1696,165053356,IMS Health


# Centrality indices for affiliation independent net

## No periods

In [None]:
affiliation_net = pd.read_csv('drive/MyDrive/Aff_net_independent/affiliation_independent_net.csv', sep = ';')

In [None]:
id_name = pd.read_csv('drive/MyDrive/affiliations.csv', sep = ';')

In [None]:
# Sum the edge weights over all years: one row per (AfId1, AfId2).
aff_net_no_periods = affiliation_net.groupby(['AfId1', 'AfId2'])['weight'].sum().reset_index()

In [None]:
aff_net_no_periods.to_csv('aff_net_indep_no_periods.csv', sep = '\t', index = False, header = False)

In [None]:
# Read back the edge list written by the previous cell. The file name must match
# the one written there ('aff_net_indep_no_periods.csv'), which is also the file
# passed to centrality_ind below; the earlier name 'aff_net_no_periods.csv' is
# not produced anywhere in this notebook.
aff_net = pd.read_csv('aff_net_indep_no_periods.csv', sep = '\t', header = None)
aff_net = aff_net.rename(columns = {0:'AfId1', 1:'AfId2', 2:'weight'})

In [None]:
# Restrict the affiliation table to the nodes that occur in the network.
network_ids = set(aff_net['AfId1'].values) | set(aff_net['AfId2'].values)
affiliations_info = id_name[id_name['AfId'].isin(network_ids)].copy()

# Total weight of the edges pointing at each affiliation (grouped on 'AfId2').
weights = aff_net.groupby('AfId2')['weight'].sum().reset_index()
affiliations_info = affiliations_info.merge(weights, how = 'left', left_on = 'AfId', right_on = 'AfId2')[['AfId', 'AfN', 'weight']]
# Assign the filled column back: fillna(inplace=True) on a column selection
# acts on a temporary object under pandas Copy-on-Write and leaves the frame unchanged.
affiliations_info['weight'] = affiliations_info['weight'].fillna(0)

# In-degree = share of the total edge weight received; dense rank, largest first.
affiliations_info['In-degree'] = affiliations_info['weight'] / aff_net['weight'].sum()
affiliations_info['In-degree rank'] = affiliations_info['In-degree'].rank(method = 'dense', ascending = False)

In [None]:
affiliations_info = centrality_ind('aff_net_indep_no_periods.csv', affiliations_info, 'AfId')

network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
done 


In [None]:
affiliations_info.sort_values('In-degree rank').head(10)

Unnamed: 0,AfId,AfN,weight,In-degree,In-degree rank,PageRank,PageRank rank,Eigen,Eigen rank,Between,Between rank
83,1299303238,National Institutes of Health,280384.0,0.01994,1.0,0.015684,1.0,0.381232,1.0,0.015405,2.0
174,115076166,UCL Institute of Neurology,195254.0,0.013886,2.0,0.011222,2.0,0.266265,2.0,0.010338,7.0
140,79576946,University of Pennsylvania,183289.0,0.013035,3.0,0.01106,3.0,0.231488,3.0,0.010606,6.0
31,241749,University of Cambridge,175178.0,0.012458,4.0,0.010854,4.0,0.193421,7.0,0.015098,3.0
151,1330342723,Mayo Clinic,156084.0,0.0111,5.0,0.00929,5.0,0.20651,4.0,0.00986,9.0
60,45129253,University College London,148868.0,0.010587,6.0,0.009206,6.0,0.199709,5.0,0.012899,4.0
273,2799853436,Johns Hopkins University School of Medicine,148855.0,0.010586,7.0,0.008151,10.0,0.194817,6.0,0.003457,68.0
139,2801533059,German Center for Neurodegenerative Diseases,146905.0,0.010448,8.0,0.008425,8.0,0.17639,9.0,0.008422,14.0
17,136199984,Harvard University,142000.0,0.010099,9.0,0.008999,7.0,0.163959,10.0,0.015553,1.0
20,40120149,University of Oxford,137847.0,0.009803,10.0,0.00829,9.0,0.176953,8.0,0.008344,15.0


In [None]:
affiliations_info.to_csv('aff_info_indep_classic.csv', sep = ';', index = False)

In [None]:
affiliations_info = pd.read_csv('aff_info_indep_classic.csv', sep = ';')

In [None]:
affiliations_info[['AfId', 'In-degree']].to_csv('IndexCI_indep.txt', sep = '\t', index = False)

## Periods

In [None]:
affiliations = pd.read_csv('drive/MyDrive/affiliations.csv', sep = ';')
affiliations_info = pd.read_csv('aff_info_indep_classic.csv', sep = ';')

In [None]:
aff_net = pd.read_csv('drive/MyDrive/Aff_net_independent/affiliation_independent_net.csv', sep = ';')

In [None]:
in_degree = affiliations.copy()

In [None]:
# Per-year in-degree: for every year, the share of that year's total edge weight
# received by each affiliation, stored in a column named after the year.
# Starting from a fresh copy makes the cell safe to re-run on its own.
in_degree = affiliations.copy()

for Y in range(2015, 2022, 1):
    aff_net_Y = aff_net[aff_net['Y'] == Y]
    print(Y)

    weights_Y = aff_net_Y.groupby('AfId2')['weight'].sum().reset_index()
    aff_info_Y = affiliations.merge(weights_Y, how = 'left', left_on = 'AfId', right_on = 'AfId2')[['AfId', 'AfN', 'weight']]
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # acts on a temporary object under pandas Copy-on-Write.
    aff_info_Y['weight'] = aff_info_Y['weight'].fillna(0)

    aff_info_Y['In-degree'] = aff_info_Y['weight'] / aff_net_Y['weight'].sum()

    # Only 'AfId' and 'In-degree' are merged in, so there is no 'weight' column to rename.
    in_degree = in_degree.merge(aff_info_Y[['AfId', 'In-degree']], on = ['AfId']).rename(columns = {'In-degree': Y})

2015
2016
2017
2018
2019
2020
2021


In [None]:
in_degree = in_degree.merge(affiliations_info[['AfId', 'In-degree', 'In-degree rank']], on = 'AfId')

In [None]:
pagerank, eigen, between = period_ind(affiliations, aff_net, affiliations_info, id_col = 'AfId')

2015
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2016
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2017
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2018
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2019
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2020
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2021
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
done 


In [None]:
in_degree.to_csv('In_degree_indep.txt', sep = '\t', index = False)
pagerank.to_csv('PageRank_indep.txt', sep = '\t', index = False)
eigen.to_csv('Eigen_indep.txt', sep = '\t', index = False)
between.to_csv('Between_indep.txt', sep = '\t', index = False)

# Centrality indices for all affiliation citation net

## No periods

In [None]:
affiliation_net = pd.read_csv('drive/MyDrive/Aff_net_independent/all_author_net.csv', sep = ';')

In [None]:
id_name = pd.read_csv('drive/MyDrive/affiliations.csv', sep = ';')

In [None]:
# Sum the edge weights over all years: one row per (AfId1, AfId2).
aff_net_no_periods = affiliation_net.groupby(['AfId1', 'AfId2'])['weight'].sum().reset_index()

In [None]:
aff_net_no_periods.to_csv('aff_net_all_no_periods.csv', sep = '\t', index = False, header = False)

In [None]:
aff_net = pd.read_csv('aff_net_all_no_periods.csv', sep = '\t', header = None)
aff_net.rename(columns = {0:'AfId1', 1:'AfId2', 2:'weight'}, inplace = True)

In [None]:
# Restrict the affiliation table to the nodes that occur in the network.
network_ids = set(aff_net['AfId1'].values) | set(aff_net['AfId2'].values)
affiliations_info = id_name[id_name['AfId'].isin(network_ids)].copy()

# Total weight of the edges pointing at each affiliation (grouped on 'AfId2').
weights = aff_net.groupby('AfId2')['weight'].sum().reset_index()
affiliations_info = affiliations_info.merge(weights, how = 'left', left_on = 'AfId', right_on = 'AfId2')[['AfId', 'AfN', 'weight']]
# Assign the filled column back: fillna(inplace=True) on a column selection
# acts on a temporary object under pandas Copy-on-Write and leaves the frame unchanged.
affiliations_info['weight'] = affiliations_info['weight'].fillna(0)

# In-degree = share of the total edge weight received; dense rank, largest first.
affiliations_info['In-degree'] = affiliations_info['weight'] / aff_net['weight'].sum()
affiliations_info['In-degree rank'] = affiliations_info['In-degree'].rank(method = 'dense', ascending = False)

In [None]:
affiliations_info = centrality_ind('aff_net_all_no_periods.csv', affiliations_info, 'AfId')

network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
done 


In [None]:
affiliations_info.sort_values('In-degree rank')

Unnamed: 0,AfId,AfN,weight,In-degree,In-degree rank,PageRank,PageRank rank,Eigen,Eigen rank,Between,Between rank
83,1299303238,National Institutes of Health,386700.0,0.023392,1.0,0.017313,1.0,5.540161e-01,1.0,0.014976,3.0
174,115076166,UCL Institute of Neurology,251960.0,0.015241,2.0,0.011649,2.0,3.225918e-01,2.0,0.010084,7.0
31,241749,University of Cambridge,222117.0,0.013436,3.0,0.011612,3.0,1.988013e-01,4.0,0.015227,2.0
140,79576946,University of Pennsylvania,214834.0,0.012996,4.0,0.010971,4.0,1.944736e-01,6.0,0.010262,6.0
139,2801533059,German Center for Neurodegenerative Diseases,188351.0,0.011394,5.0,0.008774,8.0,1.951758e-01,5.0,0.008512,13.0
...,...,...,...,...,...,...,...,...,...,...,...
2783,183716014,John Carroll University,0.0,0.000000,1848.0,0.000024,4018.0,2.987441e-21,4017.0,0.000000,3624.0
2121,320755355,Children's of Alabama,0.0,0.000000,1848.0,0.000024,4018.0,2.987441e-21,4017.0,0.000000,3624.0
4251,2801183603,Mahatma Gandhi Memorial Medical College,0.0,0.000000,1848.0,0.000024,4018.0,2.987441e-21,4017.0,0.000000,3624.0
2114,117963711,Kuban State University,0.0,0.000000,1848.0,0.000024,4018.0,2.987441e-21,4017.0,0.000000,3624.0


In [None]:
affiliations_info.to_csv('aff_info_all_classic.csv', sep = ';', index = False)

In [None]:
affiliations_info = pd.read_csv('aff_info_all_classic.csv', sep = ';')

In [None]:
affiliations_info[['AfId', 'In-degree']].to_csv('IndexCI_all.txt', sep = '\t', index = False)

## Periods

In [None]:
affiliations = pd.read_csv('drive/MyDrive/affiliations.csv', sep = ';')
affiliations_info = pd.read_csv('aff_info_all_classic.csv', sep = ';')

In [None]:
aff_net = pd.read_csv('drive/MyDrive/Aff_net_independent/all_author_net.csv', sep = ';')

In [None]:
in_degree = affiliations.copy()

In [None]:
# Per-year in-degree: for every year, the share of that year's total edge weight
# received by each affiliation, stored in a column named after the year.
# Starting from a fresh copy makes the cell safe to re-run on its own.
in_degree = affiliations.copy()

for Y in range(2015, 2022, 1):
    aff_net_Y = aff_net[aff_net['Y'] == Y]
    print(Y)

    weights_Y = aff_net_Y.groupby('AfId2')['weight'].sum().reset_index()
    aff_info_Y = affiliations.merge(weights_Y, how = 'left', left_on = 'AfId', right_on = 'AfId2')[['AfId', 'AfN', 'weight']]
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # acts on a temporary object under pandas Copy-on-Write.
    aff_info_Y['weight'] = aff_info_Y['weight'].fillna(0)

    aff_info_Y['In-degree'] = aff_info_Y['weight'] / aff_net_Y['weight'].sum()

    # Only 'AfId' and 'In-degree' are merged in, so there is no 'weight' column to rename.
    in_degree = in_degree.merge(aff_info_Y[['AfId', 'In-degree']], on = ['AfId']).rename(columns = {'In-degree': Y})

2015
2016
2017
2018
2019
2020
2021


In [None]:
in_degree = in_degree.merge(affiliations_info[['AfId', 'In-degree', 'In-degree rank']], on = 'AfId')

In [None]:
pagerank, eigen, between = period_ind(affiliations, aff_net, affiliations_info, id_col = 'AfId')

2015
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2016
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2017
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2018
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2019
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2020
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2021
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
done 


In [None]:
in_degree.to_csv('In_degree_all.txt', sep = '\t', index = False)
pagerank.to_csv('PageRank_all.txt', sep = '\t', index = False)
eigen.to_csv('Eigen_all.txt', sep = '\t', index = False)
between.to_csv('Between_all.txt', sep = '\t', index = False)

# Coauthor citation affiliations

In [None]:
affiliations_info = pd.read_csv('aff_info_all_classic.csv', sep = ';')

In [None]:
co_author_cit_aff = pd.read_csv('drive/MyDrive/Aff_net_independent/co_author_cit_aff.csv', sep = ';')

In [None]:
# Add the affiliation name for each endpoint: 'AfN1' for 'AfId1', 'AfN2' for 'AfId2'.
for side in ('1', '2'):
    co_author_cit_aff = (
        co_author_cit_aff
        .merge(affiliations, how = 'left', left_on = 'AfId' + side, right_on = 'AfId')
        .rename(columns = {'AfN': 'AfN' + side})
        .drop(columns = 'AfId')
    )

In [None]:
co_author_cit_aff

Unnamed: 0,AfId1,AfId2,Y,weight,AfN1,AfN2
0,4.605000e+03,57206974.0,2017,1,Illinois College of Optometry,New York University
1,5.232500e+04,52325.0,2017,1,Oswaldo Cruz Foundation,Oswaldo Cruz Foundation
2,5.232500e+04,5023651.0,2021,8,Oswaldo Cruz Foundation,McGill University
3,5.232500e+04,25112270.0,2020,6,Oswaldo Cruz Foundation,Federal University of Pernambuco
4,5.232500e+04,40034438.0,2016,5,Oswaldo Cruz Foundation,Rio de Janeiro State University
...,...,...,...,...,...,...
182659,3.148998e+09,27837315.0,2016,5,Institute for Systems Biology,University of Michigan
182660,3.151616e+09,4576418.0,2020,2,University of Kurdistan Hewler,Universiti Teknologi Malaysia
182661,3.151616e+09,110525433.0,2020,1,University of Kurdistan Hewler,Islamic Azad University
182662,3.151616e+09,120514687.0,2020,1,University of Kurdistan Hewler,Baqiyatallah University of Medical Sciences


In [None]:
# Sum the weights over all years for every named affiliation pair.
co_author_cit_aff_grouped = (
    co_author_cit_aff
    .groupby(['AfId1', 'AfId2', 'AfN1', 'AfN2'])['weight']
    .sum()
    .reset_index()
)

In [None]:
co_author_cit_aff_grouped

Unnamed: 0,AfId1,AfId2,AfN1,AfN2,weight
0,4.605000e+03,57206974.0,Illinois College of Optometry,New York University,1
1,5.232500e+04,52325.0,Oswaldo Cruz Foundation,Oswaldo Cruz Foundation,1
2,5.232500e+04,5023651.0,Oswaldo Cruz Foundation,McGill University,8
3,5.232500e+04,25112270.0,Oswaldo Cruz Foundation,Federal University of Pernambuco,6
4,5.232500e+04,40034438.0,Oswaldo Cruz Foundation,Rio de Janeiro State University,27
...,...,...,...,...,...
96268,3.148998e+09,27837315.0,Institute for Systems Biology,University of Michigan,5
96269,3.151616e+09,4576418.0,University of Kurdistan Hewler,Universiti Teknologi Malaysia,2
96270,3.151616e+09,110525433.0,University of Kurdistan Hewler,Islamic Azad University,1
96271,3.151616e+09,120514687.0,University of Kurdistan Hewler,Baqiyatallah University of Medical Sciences,1


In [None]:
co_author_cit_aff_grouped.sort_values('weight', ascending = False).head(10)

Unnamed: 0,AfId1,AfId2,AfN1,AfN2,weight
80401,1299303000.0,1299303000.0,National Institutes of Health,National Institutes of Health,26322
16,241749.0,241749.0,University of Cambridge,University of Cambridge,17870
46326,139660500.0,139660500.0,Central South University,Central South University,14520
14057,34077900.0,34077900.0,Juntendo University,Juntendo University,11897
87614,2799853000.0,2799853000.0,Johns Hopkins University School of Medicine,Johns Hopkins University School of Medicine,11082
16777,40120150.0,40120150.0,University of Oxford,University of Oxford,10709
59418,173911200.0,173911200.0,Iowa State University,Iowa State University,10519
78888,1292860000.0,1292860000.0,Montreal Neurological Institute and Hospital,Montreal Neurological Institute and Hospital,10200
80135,1299303000.0,115076200.0,National Institutes of Health,UCL Institute of Neurology,9449
28007,79576950.0,79576950.0,University of Pennsylvania,University of Pennsylvania,9033


In [None]:
# Co-author weight received per affiliation (grouped on 'AfId2').
# Only 'weight' is aggregated: a bare groupby().sum() over the whole frame also
# sums 'AfId1' and, on pandas >= 2.0, concatenates the text columns 'AfN1'/'AfN2'
# instead of dropping them.
co_author_cit_aff_grouped_citations = (
    co_author_cit_aff_grouped
    .groupby('AfId2')['weight']
    .sum()
    .reset_index()
    .rename(columns = {'weight': 'coauthor_citations'})
)

# Right join keeps every affiliation in affiliations_info, including those with no co-author rows.
co_author_cit_aff_grouped_citations = (
    co_author_cit_aff_grouped_citations
    .merge(affiliations_info, how = 'right', right_on = 'AfId', left_on = 'AfId2')
    .drop(columns = 'AfId2')
)

# Assign back instead of fillna(inplace=True) on a column selection, which acts
# on a temporary object under pandas Copy-on-Write.
co_author_cit_aff_grouped_citations['coauthor_citations'] = co_author_cit_aff_grouped_citations['coauthor_citations'].fillna(0)

# Share of each affiliation's total received weight that comes from co-author rows.
# Affiliations whose 'weight' is 0 yield NaN here (0 / 0).
co_author_cit_aff_grouped_citations['coauthor_citations_proportion'] = (
    co_author_cit_aff_grouped_citations['coauthor_citations'] / co_author_cit_aff_grouped_citations['weight']
)

In [None]:
co_author_cit_aff_grouped_citations.sort_values('coauthor_citations', ascending = False).head(10)

Unnamed: 0,coauthor_citations,AfId,AfN,weight,In-degree,In-degree rank,PageRank,PageRank rank,Eigen,Eigen rank,Between,Between rank,coauthor_citations_proportion
83,106316.0,1299303238,National Institutes of Health,386700.0,0.023392,1.0,0.017313,1.0,0.554016,1.0,0.014976,3.0,0.274931
174,56706.0,115076166,UCL Institute of Neurology,251960.0,0.015241,2.0,0.011649,2.0,0.322592,2.0,0.010084,7.0,0.22506
31,46939.0,241749,University of Cambridge,222117.0,0.013436,3.0,0.011612,3.0,0.198801,4.0,0.015227,2.0,0.211326
164,44095.0,5023651,McGill University,125963.0,0.00762,12.0,0.005201,23.0,0.14853,11.0,0.005484,34.0,0.350063
179,41575.0,1292859797,Montreal Neurological Institute and Hospital,113701.0,0.006878,16.0,0.004489,30.0,0.14296,12.0,0.003593,66.0,0.365652
139,41446.0,2801533059,German Center for Neurodegenerative Diseases,188351.0,0.011394,5.0,0.008774,8.0,0.195176,5.0,0.008512,13.0,0.220047
178,37633.0,8087733,University of Tübingen,143665.0,0.008691,11.0,0.006713,12.0,0.168897,8.0,0.006354,28.0,0.26195
60,33329.0,45129253,University College London,182197.0,0.011021,7.0,0.009253,5.0,0.20118,3.0,0.013226,4.0,0.182928
20,32206.0,40120149,University of Oxford,170053.0,0.010287,8.0,0.008476,9.0,0.152725,9.0,0.008152,16.0,0.189388
140,31545.0,79576946,University of Pennsylvania,214834.0,0.012996,4.0,0.010971,4.0,0.194474,6.0,0.010262,6.0,0.146834


In [None]:
co_author_cit_aff_grouped_citations[co_author_cit_aff_grouped_citations['weight'] > 1000].sort_values('coauthor_citations_proportion', ascending = False).head(10)

Unnamed: 0,coauthor_citations,AfId,AfN,weight,In-degree,In-degree rank,PageRank,PageRank rank,Eigen,Eigen rank,Between,Between rank,coauthor_citations_proportion
427,2927.0,36522303,University of Greifswald,4425.0,0.000268,588.0,0.000235,686.0,0.002297,563.0,9.405449e-06,1728.0,0.661469
433,800.0,46543356,Epilepsy Society,1299.0,7.9e-05,1049.0,8.6e-05,1266.0,0.000678,1010.0,6.622474e-08,3182.0,0.615858
3584,653.0,72702400,Athens State University,1124.0,6.8e-05,1111.0,6.3e-05,1559.0,0.000449,1208.0,6.534963e-07,2619.0,0.580961
3937,3308.0,28006308,Shandong Normal University,6150.0,0.000372,478.0,0.000239,675.0,0.000285,1406.0,2.010914e-05,1490.0,0.537886
2269,1459.0,190085865,Universidade Federal de Sergipe,2881.0,0.000174,727.0,0.000145,928.0,0.000436,1220.0,0.0001020276,939.0,0.506421
451,549.0,3018323443,Centre Hospitalier Universitaire de Bordeaux,1091.0,6.6e-05,1120.0,7.5e-05,1380.0,0.000512,1148.0,2.880489e-06,2142.0,0.503208
1147,3530.0,76835614,University of Missouri,7017.0,0.000424,447.0,0.000371,490.0,0.000943,857.0,0.0001415906,857.0,0.503064
561,1142.0,2800491743,Sacred Heart Hospital,2389.0,0.000145,809.0,0.000112,1066.0,0.002695,520.0,8.14974e-07,2540.0,0.478024
720,994.0,129043915,University of Udine,2204.0,0.000133,835.0,9.8e-05,1180.0,0.001973,603.0,8.946075e-06,1750.0,0.450998
361,3894.0,22465464,University of Münster,9279.0,0.000561,363.0,0.000456,415.0,0.007639,253.0,0.0005442747,396.0,0.419657


In [None]:
co_author_cit_aff_grouped_citations[co_author_cit_aff_grouped_citations['In-degree rank'] < 100].sort_values('coauthor_citations_proportion', ascending = False).head(10)

Unnamed: 0,coauthor_citations,AfId,AfN,weight,In-degree,In-degree rank,PageRank,PageRank rank,Eigen,Eigen rank,Between,Between rank,coauthor_citations_proportion
179,41575.0,1292859797,Montreal Neurological Institute and Hospital,113701.0,0.006878,16.0,0.004489,30.0,0.14296,12.0,0.003593,66.0,0.365652
164,44095.0,5023651,McGill University,125963.0,0.00762,12.0,0.005201,23.0,0.14853,11.0,0.005484,34.0,0.350063
180,14197.0,1281400175,Oslo University Hospital,42182.0,0.002552,92.0,0.001755,109.0,0.068584,28.0,0.000694,332.0,0.336565
83,106316.0,1299303238,National Institutes of Health,386700.0,0.023392,1.0,0.017313,1.0,0.554016,1.0,0.014976,3.0,0.274931
654,12170.0,173911158,Iowa State University,45133.0,0.00273,85.0,0.002217,86.0,0.019421,117.0,0.001476,171.0,0.269647
178,37633.0,8087733,University of Tübingen,143665.0,0.008691,11.0,0.006713,12.0,0.168897,8.0,0.006354,28.0,0.26195
224,16085.0,204730241,University of Paris,63325.0,0.003831,52.0,0.002907,63.0,0.066656,30.0,0.003291,72.0,0.254007
512,9908.0,2799807261,Tel Aviv Sourasky Medical Center,42353.0,0.002562,91.0,0.002298,83.0,0.029897,83.0,0.001048,234.0,0.233939
465,15717.0,139660479,Central South University,69261.0,0.00419,48.0,0.002587,75.0,0.046033,54.0,0.003407,68.0,0.226924
1,17477.0,165143802,University of Queensland,77500.0,0.004688,36.0,0.003773,40.0,0.092728,15.0,0.004786,38.0,0.22551


In [None]:
affiliations_info

# Comparison of two networks

In [None]:
affiliations = pd.read_csv('drive/MyDrive/affiliations.csv', sep = ';')

In [None]:
affiliation_independent_net = pd.read_csv('drive/MyDrive/Aff_net_independent/affiliation_independent_net.csv', sep = ';')
affiliation_independent_net_no_periods = pd.read_csv('aff_net_indep_no_periods.csv', sep = '\t', header = None).rename(columns = {0:'AfId1', 1:'AfId2', 2:'weight'})
affiliation_independent_info = pd.read_csv('aff_info_indep_classic.csv', sep = ';')

In [None]:
affiliation_all_net = pd.read_csv('drive/MyDrive/Aff_net_independent/all_author_net.csv', sep = ';')
affiliation_all_net_no_periods = pd.read_csv('aff_net_all_no_periods.csv', sep = '\t', header = None).rename(columns = {0:'AfId1', 1:'AfId2', 2:'weight'})
affiliation_all_info = pd.read_csv('aff_info_all_classic.csv', sep = ';')

In [None]:
ranks = affiliation_all_info[['AfId', 'In-degree rank']].rename(columns = {'In-degree rank':'1'}).merge(affiliation_independent_info[['AfId', 'In-degree rank']].rename(columns = {'In-degree rank':'2'}), on = 'AfId')

In [None]:
ranks

Unnamed: 0,AfId,1,2
0,39854758,457.0,443.0
1,165143802,36.0,46.0
2,1443707,495.0,476.0
3,145487455,243.0,227.0
4,120250893,1008.0,968.0
...,...,...,...
4677,167724894,1848.0,1790.0
4678,913583111,1848.0,1790.0
4679,201850948,1848.0,1790.0
4680,148195559,1848.0,1790.0


In [None]:
affiliation_independent_net_no_periods.loc[:, 'Y'] = '2'

In [None]:
affiliation_all_net_no_periods.loc[:, 'Y'] ='1'

In [None]:
# Stack the two labelled edge lists: the all-author network (Y == '1') and the
# independent network (Y == '2'). Previously the all-author frame was concatenated with itself.
net_all = pd.concat([affiliation_all_net_no_periods, affiliation_independent_net_no_periods]).reset_index(drop = True)

In [None]:
def distance(M1, M2):
    """Return the sum of absolute element-wise differences between M1 and M2."""
    first = np.array(M1)
    second = np.array(M2)
    elementwise_gap = np.abs(first - second)
    return elementwise_gap.sum()
    
def net_stability(net, ranks, eps):
    """Build one pairwise rank-comparison matrix for each of the rank columns '1' and '2'.

    Parameters
    ----------
    net : not read by this function; kept so existing calls keep working.
    ranks : pandas.DataFrame with columns '1' and '2', one rank value per row.
    eps : threshold on the rank difference.

    Returns
    -------
    list
        Two matrices (for column '1', then column '2'), each a list of rows of ints.
        Entry [i][j] is 1 when ranks[i] - ranks[j] > eps and 0 otherwise
        (comparisons involving NaN give 0).
    """
    R = []

    for year in ['1', '2']:
        # Positional values, so the result does not depend on the frame having a 0..n-1 index.
        column = ranks[str(year)].to_numpy()
        # Broadcasting gives all pairwise differences at once: gaps[i, j] = column[i] - column[j].
        gaps = column[:, None] - column[None, :]
        R.append((gaps > eps).astype(int).tolist())

    return R
       

In [None]:
R = net_stability(net_all, ranks, 5)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4682 [00:00<?, ?it/s]

  0%|          | 0/4682 [00:00<?, ?it/s]

In [None]:
distance(R[0], R[1])

287806

In [None]:
# Distance restricted to off-diagonal entries: subtract the sum of absolute diagonal
# differences. (|trace| would let diagonal differences of opposite sign cancel out.)
np.abs(np.array(R[0]) - np.array(R[1])).sum() - np.abs(np.diag(np.array(R[0]) - np.array(R[1]))).sum()

287806

In [None]:
distance(R[0], R[1]) / (len(ranks) * (len(ranks) - 1))

0.013131967314767607

In [None]:
def distance(M1, M2):
    """Sum of absolute element-wise differences between two array-likes."""
    gap = np.array(M1) - np.array(M2)
    return np.abs(gap).sum()
    
def net_stability(net, ranks, eps):
    """Build one matrix of pairwise rank differences for each of the rank columns '1' and '2'.

    Parameters
    ----------
    net : not read by this function; kept so existing calls keep working.
    ranks : pandas.DataFrame with columns '1' and '2', one rank value per row.
    eps : not used by this variant; kept for signature compatibility.

    Returns
    -------
    list
        Two matrices (for column '1', then column '2'), each a list of rows of ints.
        Entry [i][j] is ranks[i] - ranks[j] truncated toward zero.

    Raises
    ------
    ValueError
        If any rank difference is NaN (a NaN cannot be converted to an integer).
    """
    R = []

    for year in ['1', '2']:
        # Positional values, so the result does not depend on the frame having a 0..n-1 index.
        column = ranks[str(year)].to_numpy(dtype = float)
        # Broadcasting gives all pairwise differences at once: gaps[i, j] = column[i] - column[j].
        gaps = column[:, None] - column[None, :]
        if np.isnan(gaps).any():
            raise ValueError('cannot convert float NaN to integer')
        # astype(int) truncates toward zero, like int() on a float.
        R.append(gaps.astype(int).tolist())

    return R
        

In [1]:
R1 = net_stability(net_all, ranks, 5)

In [None]:
np.abs(np.array(R1[0]) - np.array(R1[1])).sum()

In [None]:
distance(R1[0], R1[1]) 

In [None]:
# Average absolute change in pairwise rank difference per ordered pair of rows.
# Computed from R1 rather than from a number copied out of an earlier cell's output.
distance(R1[0], R1[1]) / (len(ranks) * (len(ranks) - 1))

31.512016868431473

In [None]:
distance(R1[0], R1[1]) / (len(ranks) * (len(ranks) - 1)*max(int(ranks['2'].max()), int(ranks['1'].max())) )

0.01705195717988716

In [None]:
max(int(ranks['2'].max()), int(ranks['1'].max()))

1848

## Periods

In [None]:
in_degree_all = pd.read_csv('In_degree_all.txt', sep = '\t')
in_degree_indep = pd.read_csv('In_degree_indep.txt', sep = '\t')

In [None]:
# Align the independent-network table to the rows of the all-author table;
# rows without a match are filled with 0 in every column.
in_degree_indep = (
    in_degree_all[['AfId', 'AfN']]
    .merge(in_degree_indep, on = ['AfId', 'AfN'], how = 'left')
    .fillna(0)
)

In [None]:
# Yearly columns are named by the year as a string: '2015' ... '2021'.
year_cols = [str(y) for y in range(2015, 2022)]
id_cols = ['AfN', 'AfId']

in_degree_all_ranks = in_degree_all[id_cols + year_cols].copy()
in_degree_indep_ranks = in_degree_indep[id_cols + year_cols].copy()

# Replace every yearly value by its dense rank (rank 1 = largest value).
for year in year_cols:
    for ranks_frame in (in_degree_all_ranks, in_degree_indep_ranks):
        ranks_frame.loc[:, year] = ranks_frame[year].rank(method = 'dense', ascending = False)

In [None]:
in_degree_all_ranks.loc[:, year]

0        362.0
1         29.0
2        398.0
3        192.0
4        848.0
         ...  
4684    1382.0
4685    1382.0
4686    1382.0
4687    1382.0
4688    1382.0
Name: 2021, Length: 4689, dtype: float64

In [None]:
# Per-year comparison of the all-author network ('1') with the independent network ('2').
# NOTE(review): net_stability is unpacked into five values below, but the net_stability
# definitions visible earlier in this notebook return only the list R. This cell presumably
# ran against a different definition that lived in the kernel — confirm and restore it
# before re-running.
stabililies = []
dR = []
dC = []

for year in ['2015',	'2016',	'2017',	'2018',	'2019',	'2020',	'2021']:
    print(year)
    # Rank table for this year: column '1' = all-author ranks, column '2' = independent ranks.
    ranks = in_degree_all_ranks[['AfId', year]].rename(columns = {year:'1'}).merge(in_degree_indep_ranks[['AfId', year]].rename(columns = {year:'2'}), on = 'AfId')
    # Edges of each network for this year; 'Y' is then overwritten with the network label.
    net_1 = affiliation_all_net[affiliation_all_net['Y'] == int(year)].reset_index(drop = True)
    net_2 = affiliation_independent_net[affiliation_independent_net['Y'] == int(year)].reset_index(drop = True)
    net_1.loc[:, 'Y'] = '1'
    net_2.loc[:, 'Y'] = '2'
    net_all = pd.concat([net_1, net_2]).reset_index(drop = True)
    stability, distance_R, distance_C, R, C = net_stability(net_all, ranks, 5)
    stabililies.append(stability)
    dR.append(distance_R)
    dC.append(distance_C)
    print('Stability:', stability)
    print('distance_R:', distance_R)
    print('distance_C:', distance_C)

2015


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Stability: [0.00707733388224003]
distance_R: [0.010008856324110528]
distance_C: [1.0239370983754196e-05]
2016


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Stability: [0.008887272918563177]
distance_R: [0.012568492303168333]
distance_C: [1.5526855808094388e-05]
2017


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Stability: [0.008198990796391964]
distance_R: [0.011595106403266086]
distance_C: [2.0190482363945297e-05]
2018


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Stability: [0.008525780121861436]
distance_R: [0.012057256581193223]
distance_C: [2.042322031203481e-05]
2019


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

  0%|          | 0/4689 [00:00<?, ?it/s]

# Papers independent net

In [None]:
co_author_cit_papers = pd.read_csv('drive/MyDrive/Aff_net_independent/co_author_cit_papers.csv', sep = ';')

In [None]:
paper_net = pd.read_csv('drive/MyDrive/excluded_net.csv', sep = ';')

In [None]:
# Anti-join: keep only the edges of paper_net that have no match in co_author_cit_papers.
# 'tmp' is a marker column that is non-null after the left join only for matched edges.
co_author_cit_papers['tmp'] = 1
merged_edges = paper_net.merge(co_author_cit_papers, on = ['ID1', 'ID2'], how = 'left')
unmatched = merged_edges['tmp'].isna()
independent_paper_net = merged_edges[unmatched].drop(columns = 'tmp').reset_index(drop = True)

In [None]:
independent_paper_net

Unnamed: 0,ID1,ID2,weight,Y
0,2754967293,2339791932,1,2017
1,2177834950,2123627348,1,2015
2,2558041282,1934236512,1,2016
3,2558041282,1947901277,1,2016
4,2558041282,1564387586,1,2016
...,...,...,...,...
227511,2895767795,1967224666,1,2019
227512,2895767795,2274550138,1,2019
227513,2895767795,2617488731,1,2019
227514,2619593042,2545724250,1,2017


In [None]:
independent_paper_net.to_csv('drive/MyDrive/Aff_net_independent/independent_paper_net.csv', sep = ';', index = False)

In [None]:
independent_paper_net = pd.read_csv('drive/MyDrive/Aff_net_independent/independent_paper_net.csv', sep = ';')

In [None]:
# A vertex is any id that appears in either endpoint column of an edge list.
full_vertices = set(paper_net['ID1']) | set(paper_net['ID2'])
independent_vertices = set(independent_paper_net['ID1']) | set(independent_paper_net['ID2'])
n_full_edges = len(paper_net)
n_independent_edges = len(independent_paper_net)

print('Vertices in full net:', len(full_vertices))
print('Edges in full net:', n_full_edges)
print('Vertices in independent net:', len(independent_vertices))
print('Edges in independent net:', n_independent_edges)
print('Difference vertices:', len(full_vertices) - len(independent_vertices))
print('Difference edges:', n_full_edges - n_independent_edges)

Vertices in full net: 39811
Edges in full net: 310829
Vertices in independent net: 38450
Edges in independent net: 227516
Difference vertices: 1361
Difference edges: 83313


In [None]:
independent_paper_net[['ID1', 'ID2', 'weight']].to_csv('independent_paper_net_no_periods.csv', sep = '\t', index = False, header = False)

In [None]:
# Connected components of the independent paper network, read as an undirected graph.
G = nx.read_weighted_edgelist('independent_paper_net_no_periods.csv', delimiter='\t')
components = nx.connected_components(G)

# One (node, component number) record per node; components are numbered from 1 in the
# order networkx yields them. The frame is built once instead of being grown row by row
# with .loc. The number is stored as float so the frame and the CSV keep the format the
# row-by-row version produced (1.0, 2.0, ...).
component_records = [
    (node, float(component_number))
    for component_number, component in enumerate(components, start = 1)
    for node in component
]
comp_df = pd.DataFrame(component_records, columns = ['Id', 'component'])

comp_df.to_csv('independent_paper_components.csv', sep = ';', index = False)

In [None]:
comp_df.groupby('component').count().reset_index().sort_values('Id', ascending  = False)

Unnamed: 0,component,Id
0,1.0,38282
2,3.0,5
56,57.0,4
70,71.0,3
22,23.0,3
...,...,...
29,30.0,2
28,29.0,2
27,28.0,2
25,26.0,2


In [None]:
paper_net[['ID1', 'ID2', 'weight']].to_csv('paper_net_no_periods.csv', sep = '\t', index = False, header = False)

In [None]:
# Connected components of the full paper network, read as an undirected graph.
G = nx.read_weighted_edgelist('paper_net_no_periods.csv', delimiter='\t')
components = nx.connected_components(G)

# One (node, component number) record per node; components are numbered from 1 in the
# order networkx yields them. The frame is built once instead of being grown row by row
# with .loc. The number is stored as float so the frame and the CSV keep the format the
# row-by-row version produced (1.0, 2.0, ...).
component_records = [
    (node, float(component_number))
    for component_number, component in enumerate(components, start = 1)
    for node in component
]
comp_df1 = pd.DataFrame(component_records, columns = ['Id', 'component'])

comp_df1.to_csv('paper_components.csv', sep = ';', index = False)

In [None]:
comp_df1.groupby('component').count().reset_index().sort_values('Id', ascending  = False)

Unnamed: 0,component,Id
0,1.0,39618
86,87.0,3
31,32.0,3
17,18.0,3
16,17.0,3
...,...,...
32,33.0,2
30,31.0,2
29,30.0,2
28,29.0,2


# Centrality indices for papers independent net

## No periods

In [None]:
independent_paper_net = pd.read_csv('drive/MyDrive/Aff_net_independent/independent_paper_net.csv', sep = ';')

In [None]:
independent_paper_net[['ID1', 'ID2', 'weight']].to_csv('independent_paper_net_no_periods.csv', sep = '\t', index = False, header = False)

In [None]:
# Headerless tab-separated edge list: the positional columns 0, 1, 2 are given their names.
independent_paper_net_no_periods = (
    pd.read_csv('independent_paper_net_no_periods.csv', sep = '\t', header = None)
    .rename(columns = {0:'ID1', 1:'ID2', 2:'weight'})
)

In [None]:
data = read_data('drive/MyDrive/filtered_data.csv', sep = ';')

In [None]:
# Papers that occur as either endpoint of an edge in the independent paper network.
papers_info = data[data['Id'].isin(set(independent_paper_net['ID1'].values) | set(independent_paper_net['ID2'].values))].copy()

# Summed weight of the edges pointing at each paper (grouped on ID2); 'weight' is selected
# before aggregating so no other column is summed.
weights = independent_paper_net_no_periods.groupby('ID2')['weight'].sum().reset_index()
papers_info = papers_info.merge(weights, how = 'left', left_on = 'Id', right_on = 'ID2')[['DN', 'Id', 'Y', 'DOI', 'J', 'weight']]
# Papers with no incoming edge get weight 0. The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to modify the frame.
papers_info['weight'] = papers_info['weight'].fillna(0)

# In-degree normalised by the total edge weight of the network, and its dense rank (1 = highest).
papers_info.loc[:, 'In-degree'] = papers_info['weight'] / independent_paper_net_no_periods['weight'].sum()
papers_info.loc[:, 'In-degree rank'] = papers_info['In-degree'].rank(method = 'dense', ascending = False)

In [None]:
papers_info = centrality_ind('independent_paper_net_no_periods.csv', papers_info, 'Id')

network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
done 


In [None]:
papers_info.sort_values('In-degree rank')

Unnamed: 0,DN,Id,Y,DOI,J,weight,In-degree,In-degree rank,PageRank,PageRank rank,Eigen,Eigen rank,Between,Between rank
1,MDS clinical diagnostic criteria for Parkinson...,2112455323,2015,10.1002/MDS.26424,"{'JN': 'movement disorders', 'JId': 163027424}",632.0,0.002778,1.0,0.002550,1.0,2.341650e-02,149.0,0.000000,15835.0
15,Epidemiology of Parkinson's disease.,2584311212,2017,10.1007/S00702-017-1686-Y,"{'JN': 'journal of neural transmission', 'JId'...",470.0,0.002066,2.0,0.001344,3.0,1.061770e-02,376.0,0.000065,63.0
3,Gut Microbiota Regulate Motor Deficits and Neu...,2558041282,2016,10.1016/J.CELL.2016.11.018,"{'JN': 'cell', 'JId': 110447773}",460.0,0.002022,3.0,0.000981,9.0,1.023573e-01,13.0,0.000039,140.0
9,"The Roles of PINK1, Parkin and Mitochondrial F...",2082425146,2015,10.1016/J.NEURON.2014.12.007,"{'JN': 'neuron', 'JId': 45757444}",433.0,0.001903,4.0,0.001788,2.0,1.911566e-01,4.0,0.000000,15835.0
12,The epidemiology of Parkinson's disease: risk ...,2531872507,2016,10.1016/S1474-4422(16)30230-7,"{'JN': 'lancet neurology', 'JId': 70053155}",416.0,0.001828,5.0,0.001287,4.0,1.195880e-02,341.0,0.000116,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30275,Unraveling Pathophysiological Mechanisms of Pa...,3094738391,2020,10.1177/1177271920964077,"{'JN': 'biomarker insights', 'JId': 50152778}",0.0,0.000000,170.0,0.000008,17933.0,1.234957e-08,12201.0,0.000000,15835.0
30278,Synaptic density and neuronal metabolic functi...,3170760877,2021,10.1101/2021.05.27.444950,"{'JN': 'biorxiv', 'JId': 2734324842}",0.0,0.000000,170.0,0.000008,17933.0,1.234957e-08,12201.0,0.000000,15835.0
21608,Extensive functional repertoire underpins comp...,2982292669,2019,10.1101/823849,"{'JN': 'biorxiv', 'JId': 2734324842}",0.0,0.000000,170.0,0.000008,17933.0,1.234957e-08,12201.0,0.000000,15835.0
11690,Cholinesterase inhibitor to prevent falls in P...,3209517239,2021,10.1186/S12883-021-02430-2,"{'JN': 'bmc neurology', 'JId': 120289491}",0.0,0.000000,170.0,0.000008,17933.0,1.234957e-08,12201.0,0.000000,15835.0


In [None]:
papers_info.to_csv('papers_info_indep_classic.csv', sep = ';', index = False)

In [None]:
papers_info[['Id', 'In-degree']].to_csv('IndexCI_indep_papers.txt', sep = '\t', index = False)

## Periods

In [None]:
papers_info = pd.read_csv('papers_info_indep_classic.csv', sep = ';')

In [None]:
independent_paper_net = pd.read_csv('drive/MyDrive/Aff_net_independent/independent_paper_net.csv', sep = ';')

In [None]:
in_degree = papers_info[['DN', 'Id', 'Y', 'DOI', 'J']].copy()

In [None]:
# Start from the paper metadata so that re-running this cell does not stack a second set
# of yearly columns onto in_degree.
in_degree = papers_info[['DN', 'Id', 'Y', 'DOI', 'J']].copy()

for Y in range(2015, 2022, 1):
    paper_net_Y = independent_paper_net[independent_paper_net['Y'] == Y]
    print(Y)

    # Summed weight of this year's edges pointing at each paper (grouped on ID2).
    weights_Y = paper_net_Y.groupby('ID2')['weight'].sum().reset_index()
    paper_info_Y = papers_info[['Id']].merge(weights_Y, how = 'left', left_on = 'Id', right_on = 'ID2')
    # Assign back: fillna(inplace=True) on a column selection is not guaranteed to modify the frame.
    paper_info_Y['weight'] = paper_info_Y['weight'].fillna(0)

    # In-degree normalised by the total edge weight of this year's network.
    paper_info_Y.loc[:, 'In-degree'] = paper_info_Y['weight'] / paper_net_Y['weight'].sum()

    # Merge only the two new columns. Merging the full yearly frame would re-attach the
    # metadata columns and ID2 on every pass and make the merge suffixes collide.
    in_degree = in_degree.merge(paper_info_Y[['Id', 'weight', 'In-degree']], on = ['Id']).rename(columns = {'In-degree':Y, 'weight' : f'weight_{Y}'})

2015
2016
2017
2018
2019
2020
2021


In [None]:
in_degree = in_degree.merge(papers_info[['Id', 'In-degree', 'In-degree rank']], on = 'Id')

In [None]:
independent_paper_net.rename(columns = {'ID1':'Id1', 'ID2':'Id2'}, inplace = True)

In [None]:
pagerank, eigen, between = period_ind(papers_info[['DN', 'Id', 'Y', 'DOI', 'J']], independent_paper_net, papers_info, id_col = 'Id')

2015
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2016
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2017
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2018
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2019
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2020
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
2021
network loaded
Pagerank centrality, directed
Eigenvector centrality, directed
Betweenness centrality, directed
done 


In [None]:
in_degree.to_csv('In_degree_papers_indep.txt', sep = '\t', index = False)
pagerank.to_csv('PageRank_papers_indep.txt', sep = '\t', index = False)
eigen.to_csv('Eigen_papers_indep.txt', sep = '\t', index = False)
between.to_csv('Between_papers_indep.txt', sep = '\t', index = False)

In [None]:
# One sheet per centrality index. The context manager saves and closes the workbook on
# exit, also on error; ExcelWriter.save() is not available in pandas 2.x.
with pd.ExcelWriter('index_papers_indep_periods.xlsx', engine='openpyxl') as writer:
    in_degree.to_excel(writer, sheet_name='In-degree', index = False)
    between.to_excel(writer, sheet_name='Between', index = False)
    eigen.to_excel(writer, sheet_name='Eigen', index = False)
    pagerank.to_excel(writer, sheet_name='PageRank', index = False)

# Independent index

In [None]:
def independent_index(net, co_authors_df, data, all_author_data):
    """Split each paper's incoming edge weight into a co-author part and an independent part.

    Parameters
    ----------
    net : pandas.DataFrame
        Paper edge list with columns 'ID1', 'ID2' and 'weight'.
    co_authors_df : pandas.DataFrame
        Author pairs with columns 'A1' and 'A2'.
    data : pandas.DataFrame
        Paper metadata with at least the columns 'Id', 'DN', 'Y', 'DOI', 'J'.
    all_author_data : pandas.DataFrame
        One row per (paper, author) with columns 'Id', 'AuId', 'AfId'.

    Returns
    -------
    pandas.DataFrame
        One row per paper that occurs in `net`, with the columns 'DN', 'Id', 'Y', 'DOI', 'J',
        'weight' (summed weight of edges whose ID2 is the paper), 'D-degree' (number of distinct
        ID1 papers linked to it through at least one co-author pair), 'I-degree'
        (weight - D-degree), 'Independent index' (I-degree / weight, NaN when weight is 0),
        'I-degree norm', 'I-degree rank' and 'Independent index rank' (dense ranks, 1 = highest).
    """
    # join info about authors to paper citation network
    author_net = net.merge(all_author_data, left_on = 'ID1', right_on = 'Id').rename(columns = {'AuId':'AuId1', 'AfId':'AfId1'}).drop(columns = 'Id')
    author_net = author_net.merge(all_author_data, left_on = 'ID2', right_on = 'Id').rename(columns = {'AuId':'AuId2', 'AfId':'AfId2'}).drop(columns = 'Id')

    # join info about coauthors to net with all authors citations, once per orientation of the
    # pair: (AuId1, AuId2) matches land in A3/A4, (AuId2, AuId1) matches land in A1/A2
    author_net_co_a = author_net.merge(co_authors_df, how = 'left', left_on = ['AuId1', 'AuId2'], right_on = ['A1', 'A2']).rename(columns = {'A1':'A3', 'A2':'A4'})
    author_net_co_a = author_net_co_a.merge(co_authors_df, how = 'left', left_on = ['AuId2', 'AuId1'], right_on = ['A1', 'A2'])

    # if they are coauthors there is not null info in A1 or A2 or A3 or A4
    is_co_author_pair = author_net_co_a[['A1', 'A2', 'A3', 'A4']].notna().any(axis = 1)
    co_author_cit = author_net_co_a[is_co_author_pair].reset_index(drop = True)

    # a paper pair counts once, however many co-author pairs connect it
    co_author_cit_papers = co_author_cit[['ID1', 'ID2']].drop_duplicates()

    dependent_degree = co_author_cit_papers.groupby('ID2').count().reset_index().rename(columns = {'ID1':'D-degree'})

    papers_info = data[data['Id'].isin(set(net['ID1'].values) | set(net['ID2'].values))].copy()

    # select 'weight' before aggregating so no other column of net is summed
    weights = net.groupby('ID2')['weight'].sum().reset_index()
    papers_info = papers_info.merge(weights, how = 'left', left_on = 'Id', right_on = 'ID2')[['DN', 'Id', 'Y', 'DOI', 'J', 'weight']]
    # assign back: fillna(inplace=True) on a column selection is not guaranteed to modify the frame
    papers_info['weight'] = papers_info['weight'].fillna(0)

    papers_info = papers_info.merge(dependent_degree, left_on = 'Id', right_on = 'ID2', how = 'left')
    papers_info['D-degree'] = papers_info['D-degree'].fillna(0)
    papers_info = papers_info.drop(columns = ['ID2'])

    papers_info.loc[:, 'I-degree'] = papers_info['weight'] - papers_info['D-degree']
    # 0 / 0 gives NaN here, so papers without incoming weight have no independent index
    papers_info.loc[:, 'Independent index'] = papers_info['I-degree']/papers_info['weight']

    papers_info.loc[:, 'I-degree norm'] = papers_info['I-degree']/papers_info['I-degree'].sum()

    papers_info.loc[:, 'I-degree rank'] = papers_info['I-degree'].rank(method = 'dense', ascending = False)
    papers_info.loc[:, 'Independent index rank'] = papers_info['Independent index'].rank(method = 'dense', ascending = False)

    return papers_info

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while loading; only read this file
# if it comes from a trusted source.
# The with-block closes the file even when loading raises.
with open('drive/MyDrive/co_authors.pkl', 'rb') as pkl_file:
    co_authors = pickle.load(pkl_file)

In [None]:
co_authors_df = pd.DataFrame(co_authors).rename(columns = {0:'A1', 1:'A2'})

In [None]:
data = read_data('drive/MyDrive/filtered_data.csv', sep = ';')
net = pd.read_csv('drive/MyDrive/excluded_net.csv', sep = ';')

In [None]:
# Paper id, parsed author list and year for every paper that has an author list.
author_data = data[['Id', 'AA', 'Y']].copy().rename(columns = {'Y':'Year'})
author_data = author_data.dropna(subset = ['AA'], axis = 0)

# One [paper id, author id, affiliation id] record per author entry of each paper.
# The columns are iterated directly rather than looked up by label with range(len(...)),
# so rows removed by dropna cannot cause a KeyError or shift the lookup.
all_author_data = [
    [paper_id, author['AuId'], author['AfId']]
    for paper_id, authors in tqdm(zip(author_data['Id'], author_data['AA']), total = len(author_data))
    for author in authors
]

all_author_data = pd.DataFrame(all_author_data, columns = ['Id', 'AuId', 'AfId'])

  0%|          | 0/39811 [00:00<?, ?it/s]

In [None]:
papers_info_indep_index = independent_index(net, co_authors_df, data, all_author_data)

In [None]:
papers_info_indep_index.sort_values('I-degree rank')

Unnamed: 0,DN,Id,Y,DOI,J,weight,D-degree,I-degree,Independent index,I-degree norm,I-degree rank,Independent index rank
1,MDS clinical diagnostic criteria for Parkinson...,2112455323,2015,10.1002/MDS.26424,"{'JN': 'movement disorders', 'JId': 163027424}",1563.0,931.0,632.0,0.404351,0.002778,1.0,785.0
15,Epidemiology of Parkinson's disease.,2584311212,2017,10.1007/S00702-017-1686-Y,"{'JN': 'journal of neural transmission', 'JId'...",485.0,15.0,470.0,0.969072,0.002066,2.0,56.0
3,Gut Microbiota Regulate Motor Deficits and Neu...,2558041282,2016,10.1016/J.CELL.2016.11.018,"{'JN': 'cell', 'JId': 110447773}",500.0,40.0,460.0,0.920000,0.002022,3.0,135.0
9,"The Roles of PINK1, Parkin and Mitochondrial F...",2082425146,2015,10.1016/J.NEURON.2014.12.007,"{'JN': 'neuron', 'JId': 45757444}",458.0,25.0,433.0,0.945415,0.001903,4.0,99.0
12,The epidemiology of Parkinson's disease: risk ...,2531872507,2016,10.1016/S1474-4422(16)30230-7,"{'JN': 'lancet neurology', 'JId': 70053155}",469.0,53.0,416.0,0.886994,0.001828,5.0,179.0
...,...,...,...,...,...,...,...,...,...,...,...,...
15097,Attenuated NoGo-related beta desynchronisation...,2944713507,2019,10.1038/S41598-019-43762-X,"{'JN': 'scientific reports', 'JId': 196734849}",0.0,0.0,0.0,,0.000000,170.0,
15095,Mitochondria and lipid peroxidation in the mec...,3043203703,2021,10.1002/MED.21712,"{'JN': 'medicinal research reviews', 'JId': 22...",1.0,1.0,0.0,0.000000,0.000000,170.0,1008.0
30617,"Ubisol-Q10, a Nanomicellar and Water-Dispersib...",3162450207,2021,10.3390/ANTIOX10050764,"{'JN': 'antioxidants', 'JId': 2737566431}",0.0,0.0,0.0,,0.000000,170.0,
15123,α-Synuclein Propagation Mouse Models of Parkin...,3164597337,2021,10.1007/978-1-0716-1495-2_12,"{'JN': 'methods of molecular biology', 'JId': ...",0.0,0.0,0.0,,0.000000,170.0,


In [None]:
papers_info_indep_index.to_csv('papers_info_indep_index.csv', index = False, sep = ';')