In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
%matplotlib inline

**scopus_authors_links_2016.csv** - articles, downloaded from Scopus, which were published in 2016 with affiliation to National Research University Higher School of Economics. File contains:

- Author name;

- Link (by default)

In [93]:
with open('scopus_authors_links_2016.csv', 'r') as file:
    articles = []
    for row in file.readlines():
        splitted_line = [x.strip() for x in row.split(',')]
        while '"' in splitted_line[0] and 'http' in splitted_line[-1]:
            splitted_line[0] = splitted_line[0].replace('"', '')
            splitted_line = splitted_line[:-1]
        articles.append(splitted_line)  
        articles.sort()
    # We delete the most noticable outliers and extra lines  
    # E.g. collective works written by more than 80 authors
    del articles[1096] # delete NCD Risk Factor Collaboration
    del articles[437:444] # delete GBD 2015 collaborative papers(Vlasov with 100500 non-HSE authors)
    del articles[-6] # delete node 'author was not found'
    del articles[-1] # delete "header"

In [94]:
articles[:6]

[['Abankina I.',
  'Aleskerov F.',
  'Belousova V.',
  'Gokhberg L.',
  'Kiselgof S.',
  'Petrushchenko V.',
  'Shvydun S.',
  'Zinkovsky K.'],
 ['Abankina I.V.', 'Filatova L.M.', 'Vynaryk V.A.'],
 ['Abankina T.V.', 'Derkachev P.V.'],
 ['Abdrakhmanova G.'],
 ['Abelev M.Y.', 'Averin I.V.', 'Korableva U.A.'],
 ['Abramov R.']]

In [14]:
# number of articles
len(articles)

1635

In [117]:
# Assign unique id to each author

unique_authors = {}
ids = 1
for article in articles:
    for name in article:
        print(name)
        if name not in unique_authors:
            unique_authors[name] = ids
            ids += 1
        else:
            continue

sorted(unique_authors)

Abankina I.
Aleskerov F.
Belousova V.
Gokhberg L.
Kiselgof S.
Petrushchenko V.
Shvydun S.
Zinkovsky K.
Abankina I.V.
Filatova L.M.
Vynaryk V.A.
Abankina T.V.
Derkachev P.V.
Abdrakhmanova G.
Abelev M.Y.
Averin I.V.
Korableva U.A.
Abramov R.
Abramov R.N.
Abramov R.N.
Gruzdev I.A.
Terentyev E.A.
Abrashkin A.A.
Oshmarina O.E.
Abrashkin A.A.
Yakubovich E.I.
Abylkalikov S.I.
Abylkalikov S.I.
Abylkalikov S.I.
Aevskiy V.
Chetverikov V.
Afanas'ev V.
Afanasiev M.P.
Shash N.N.
Afanasiev M.P.
Shash N.N.
Afanasiev M.P.
Shash N.N.
Afanasyev D.O.
Fedorova E.A.
Afanasyeva L.G.
Tkachenko A.V.
Afanasyeva L.G.
Tkachenko A.V.
Agranovich M.S.
Ahmadi M.
Ulyanov D.
Semenov S.
Trofimov M.
Giacinto G.
Aistov A.
Aleksandrova E.
Akaev A.
Ichkitidze Y.
Sarygulov A.
Sokolov V.
Akayev A.A.
Ichkitidze Yu.R.
Sarygulov A.I.
Sokolov V.N.
Akelev E.V.
Wilson L.
Akhmedov E.T.
Godazgar H.
Popov F.K.
Akhmedov E.T.
Kalinov D.A.
Popov F.K.
Akhmedova V.
Zabrodin A.
Akhmet'ev P.M.
Kudryavtseva E.A.
Smirnov A.Y.
Akhremenko A.S.


['Abaev T.',
 'Abankina I.',
 'Abankina I.V.',
 'Abankina T.V.',
 'Abd-Allah F.',
 'Abdrakhmanova G.',
 'Abdulrab H.',
 'Abelev M.Y.',
 'Abera S.F.',
 'Abraham J.P.',
 'Abrameshin A.E.',
 'Abramov R.',
 'Abramov R.N.',
 'Abramov-Maximov V.E.',
 'Abrashkin A.A.',
 'Abu-Raddad L.J.',
 'Abubakar I.',
 'Abuhamdeh S.',
 'Abylkalikov S.I.',
 'Achoui M.',
 'Adofo K.',
 'Aevskiy V.',
 "Afanas'ev V.",
 'Afanasiev M.P.',
 'Afanasyev D.O.',
 'Afanasyeva L.G.',
 'Afanas’ev V.V.',
 'Agalakov Y.',
 'Ageykin M.',
 'Agranovich M.S.',
 'Ahmadi M.',
 'Aistov A.',
 'Akaev A.',
 'Akayev A.A.',
 'Akelev E.',
 'Akelev E.V.',
 'Akhmedov E.T.',
 'Akhmedova V.',
 "Akhmet'ev P.M.",
 'Akhremenko A.S.',
 'Akimova L.',
 'Akinina Y.S.',
 'Akolzin I.',
 'Akopov A.S.',
 'Akopov S.',
 'Akotia C.S.',
 'Aksenov S.A.',
 'Aköz K.K.',
 'Al Buhairan F.S.',
 'Al-Shukri S.H.',
 'Aladyshkina A.',
 'Alaee R.',
 'Alberti G.',
 'Aldhafri S.',
 'Aleiner I.L.',
 'Aleksandrov A.A.',
 'Aleksandrova A.B.',
 'Aleksandrova E.',
 'Aleksa

In [118]:
# number of authors
len(unique_authors)

3455

In [119]:
splitted_names = []
for name in sorted(unique_authors):
    name = name.split(" ")
    splitted_names.append(name)

In [120]:
splitted_names

[['Abaev', 'T.'],
 ['Abankina', 'I.'],
 ['Abankina', 'I.V.'],
 ['Abankina', 'T.V.'],
 ['Abd-Allah', 'F.'],
 ['Abdrakhmanova', 'G.'],
 ['Abdulrab', 'H.'],
 ['Abelev', 'M.Y.'],
 ['Abera', 'S.F.'],
 ['Abraham', 'J.P.'],
 ['Abrameshin', 'A.E.'],
 ['Abramov', 'R.'],
 ['Abramov', 'R.N.'],
 ['Abramov-Maximov', 'V.E.'],
 ['Abrashkin', 'A.A.'],
 ['Abu-Raddad', 'L.J.'],
 ['Abubakar', 'I.'],
 ['Abuhamdeh', 'S.'],
 ['Abylkalikov', 'S.I.'],
 ['Achoui', 'M.'],
 ['Adofo', 'K.'],
 ['Aevskiy', 'V.'],
 ["Afanas'ev", 'V.'],
 ['Afanasiev', 'M.P.'],
 ['Afanasyev', 'D.O.'],
 ['Afanasyeva', 'L.G.'],
 ['Afanas’ev', 'V.V.'],
 ['Agalakov', 'Y.'],
 ['Ageykin', 'M.'],
 ['Agranovich', 'M.S.'],
 ['Ahmadi', 'M.'],
 ['Aistov', 'A.'],
 ['Akaev', 'A.'],
 ['Akayev', 'A.A.'],
 ['Akelev', 'E.'],
 ['Akelev', 'E.V.'],
 ['Akhmedov', 'E.T.'],
 ['Akhmedova', 'V.'],
 ["Akhmet'ev", 'P.M.'],
 ['Akhremenko', 'A.S.'],
 ['Akimova', 'L.'],
 ['Akinina', 'Y.S.'],
 ['Akolzin', 'I.'],
 ['Akopov', 'A.S.'],
 ['Akopov', 'S.'],
 ['Akotia', '

In [121]:
# Убрать дубли

double_check = []
check = []
sorted_splitted_names = sorted(splitted_names)
for i in range(len(sorted_splitted_names)):
    if sorted_splitted_names[i][0] == sorted_splitted_names[i-1][0]:
        double_check.append(sorted_splitted_names[i-1])
        double_check.append(sorted_splitted_names[i])
        #if splitted_names[i][1][0] == splitted_names[i-1][1][0]:
            #check.append(splitted_names[i-1])

len(double_check)

1184

In [122]:
articles

[['Abankina I.',
  'Aleskerov F.',
  'Belousova V.',
  'Gokhberg L.',
  'Kiselgof S.',
  'Petrushchenko V.',
  'Shvydun S.',
  'Zinkovsky K.'],
 ['Abankina I.V.', 'Filatova L.M.', 'Vynaryk V.A.'],
 ['Abankina T.V.', 'Derkachev P.V.'],
 ['Abdrakhmanova G.'],
 ['Abelev M.Y.', 'Averin I.V.', 'Korableva U.A.'],
 ['Abramov R.'],
 ['Abramov R.N.'],
 ['Abramov R.N.', 'Gruzdev I.A.', 'Terentyev E.A.'],
 ['Abrashkin A.A.', 'Oshmarina O.E.'],
 ['Abrashkin A.A.', 'Yakubovich E.I.'],
 ['Abylkalikov S.I.'],
 ['Abylkalikov S.I.'],
 ['Abylkalikov S.I.'],
 ['Aevskiy V.', 'Chetverikov V.'],
 ["Afanas'ev V."],
 ['Afanasiev M.P.', 'Shash N.N.'],
 ['Afanasiev M.P.', 'Shash N.N.'],
 ['Afanasiev M.P.', 'Shash N.N.'],
 ['Afanasyev D.O.', 'Fedorova E.A.'],
 ['Afanasyeva L.G.', 'Tkachenko A.V.'],
 ['Afanasyeva L.G.', 'Tkachenko A.V.'],
 ['Agranovich M.S.'],
 ['Ahmadi M.', 'Ulyanov D.', 'Semenov S.', 'Trofimov M.', 'Giacinto G.'],
 ['Aistov A.', 'Aleksandrova E.'],
 ['Akaev A.', 'Ichkitidze Y.', 'Sarygulov A.',

In [129]:
unique_authors

{'Abankina I.': 1,
 'Aleskerov F.': 2,
 'Belousova V.': 3,
 'Gokhberg L.': 4,
 'Kiselgof S.': 5,
 'Petrushchenko V.': 6,
 'Shvydun S.': 7,
 'Zinkovsky K.': 8,
 'Abankina I.V.': 9,
 'Filatova L.M.': 10,
 'Vynaryk V.A.': 11,
 'Abankina T.V.': 12,
 'Derkachev P.V.': 13,
 'Abdrakhmanova G.': 14,
 'Abelev M.Y.': 15,
 'Averin I.V.': 16,
 'Korableva U.A.': 17,
 'Abramov R.': 18,
 'Abramov R.N.': 19,
 'Gruzdev I.A.': 20,
 'Terentyev E.A.': 21,
 'Abrashkin A.A.': 22,
 'Oshmarina O.E.': 23,
 'Yakubovich E.I.': 24,
 'Abylkalikov S.I.': 25,
 'Aevskiy V.': 26,
 'Chetverikov V.': 27,
 "Afanas'ev V.": 28,
 'Afanasiev M.P.': 29,
 'Shash N.N.': 30,
 'Afanasyev D.O.': 31,
 'Fedorova E.A.': 32,
 'Afanasyeva L.G.': 33,
 'Tkachenko A.V.': 34,
 'Agranovich M.S.': 35,
 'Ahmadi M.': 36,
 'Ulyanov D.': 37,
 'Semenov S.': 38,
 'Trofimov M.': 39,
 'Giacinto G.': 40,
 'Aistov A.': 41,
 'Aleksandrova E.': 42,
 'Akaev A.': 43,
 'Ichkitidze Y.': 44,
 'Sarygulov A.': 45,
 'Sokolov V.': 46,
 'Akayev A.A.': 47,
 'Ichki

In [174]:
# количество авторов
len(unique_authors)

3455

In [169]:
# количество статей
len(articles)

1635

In [177]:
from tqdm import tqdm_notebook

graph_edges = []
for line in tqdm_notebook(articles, total = 1635):
    i, j, k = 0, 0, 1
    for name in line[i:len(line)+1]:
        for co_author in line[k:len(line)+1]:
            if co_author in unique_authors:
                #print(unique_authors[name], unique_authors[co_author])
                #print(name, co_author)
                row = str(unique_authors[name]) + ' ' + str(unique_authors[co_author])
                graph_edges.append(row)
                k += 1
            else:
                print('%s' % co_author, 'is missing in dict')
                pass
        i += 1
        j += 1
        k = j + 1
print(graph_edges)


['1 2', '1 3', '1 4', '1 5', '1 6', '1 7', '1 8', '2 3', '2 4', '2 5', '2 6', '2 7', '2 8', '3 4', '3 5', '3 6', '3 7', '3 8', '4 5', '4 6', '4 7', '4 8', '5 6', '5 7', '5 8', '6 7', '6 8', '7 8', '9 10', '9 11', '10 11', '12 13', '15 16', '15 17', '16 17', '19 20', '19 21', '20 21', '22 23', '22 24', '26 27', '29 30', '29 30', '29 30', '31 32', '33 34', '33 34', '36 37', '36 38', '36 39', '36 40', '37 38', '37 39', '37 40', '38 39', '38 40', '39 40', '41 42', '43 44', '43 45', '43 46', '44 45', '44 46', '45 46', '47 48', '47 49', '47 50', '48 49', '48 50', '49 50', '51 52', '53 54', '53 55', '54 55', '53 56', '53 55', '56 55', '57 58', '59 60', '59 61', '60 61', '62 63', '62 64', '63 64', '65 66', '68 69', '68 70', '69 70', '68 71', '68 69', '68 72', '71 69', '71 72', '69 72', '73 74', '75 76', '75 77', '76 77', '78 79', '78 80', '78 81', '78 82', '79 80', '79 81', '79 82', '80 81', '80 82', '81 82', '78 81', '78 80', '78 79', '78 82', '81 80', '81 79', '81 82', '80 79', '80 82', '79

In [180]:
#for pair in tqdm_notebook(graph_edges, total = 31204):
    #extra = pair[::-1]
    #while extra in graph_edges:
        #graph_edges.remove(extra)




In [173]:
# количество ребер неориентированного графа
# граф без удаленных дублей (!)
len(graph_edges)

31204

In [172]:
with open('test_graph.csv', 'w') as graph:
    for pair in graph_edges:
        graph.write(pair + '\n')

In [182]:
import networkx as nx

In [186]:
graph_edges_splitted = [i.split() for i in graph_edges]

In [187]:
Citation_graph = nx.from_edgelist(graph_edges_splitted)