In [1]:
import requests
import cv2
import warnings
import urllib.request, json
import re
import http.client, urllib.request, urllib.parse, urllib.error, base64
import csv

import scholarly as slr
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from PIL import Image
from io import BytesIO
from math import ceil

# warnings.filterwarnings('ignore')

%matplotlib inline

ROWS = 1000

In [4]:
class FakeInfo:
    def __init__(self, name):
        self.name = name
        self.id = hash(name)
        self.url_picture = '/citations/images/avatar_scholar_128.jpg'

In [5]:
class Network:
    def __init__(self):
        self.id_info = {}
        self.name_id = {}
    
    def split_full_name(self, author_name):
        full_name = author_name.replace('.', '').split(' ')
        names, surname = full_name[:-1], full_name[-1]
        if len(names) == 1 and names[0].isupper():
            names = [ch for ch in names[0]]
        return names, surname
    
    def possible_names(self, names, surname):
        surnames = surname.split('-')
        result = []
        for surname in surnames:
            surname = surname.title()
            for name in names:
                result += [name+' '+surname, name[0]+' '+surname, name[0]+'. '+surname]
        return result
    
    def compare_name(self, name1, name2):
        if len(name1) > 1 and len(name2) > 1:
            return name1 in name2 or name2 in name1
        return name1[0] == name2[0]
    
    def compare_names(self, fullname1, fullname2):
        names1, surname1 = self.split_full_name(fullname1)
        names2, surname2 = self.split_full_name(fullname2)
        
        if '-' in surname1 or '-' in surname2:
            res = self.compare_names(surname1.replace('-', ' ') + " abc", surname2.replace('-', ' ') + " abc")
        else:
            res = self.compare_name(surname1, surname2)

        if res:
            ptr1, ptr2 = 0, 0
            
            while ptr1 < len(names1) and ptr2 < len(names2):
                if self.compare_name(names1[ptr1], names2[ptr2]):
                    ptr1 += 1
                    ptr2 += 1
                else:
                    if len(names1) - ptr1 == len(names2) - ptr2:
                        return False
                    if len(names1) - ptr1 > len(names2) - ptr2:
                        ptr1 += 1
                    else:
                        ptr2 += 1
            return True
            
        return False
    
    def get_info(self, author_name):
        names, surname = self.split_full_name(author_name)
        
        for name in self.possible_names(names, surname):
            if name in self.name_id:
                print('\t {} is already in the database.'.format(author_name))
                if author_name not in self.name_id:
                    for other_name in self.possible_names(names, surname):
                        self.name_id[other_name] = self.name_id[name]
                return self.id_info[self.name_id[name]]
        
        author_info = FakeInfo(author_name)
        
        print('\t Adding {} in database.'.format(author_name))

        self.id_info[author_info.id] = author_info

        for name in self.possible_names(names, surname):
            self.name_id[name] = author_info.id

        if author_name != author_info.name:
            for name in self.possible_names(*self.split_full_name(author_info.name)):
                self.name_id[name] = author_info.id
            
        return author_info
        
    def add_author(self, G, author_name):
        print('Adding {}:'.format(author_name))
        
        author_info = self.get_info(author_name)
        
        if author_info.id in G.nodes():
            print('\t Author id is already in the network.')            
            return
        
        print('\t Adding {} to network.'.format(author_name))
        G.add_node(author_info.id)
                
    def add_coauthors(self, G, author_name):
        print("Adding {}'s co-authors:".format(author_name))
        author_info = self.id_info[self.name_id[author_name]]
        author_id = author_info.id
        
        coauthors = set()

        # Crossref
        print("\t Adding co-authors from Crossref")
        
        names, surname = self.split_full_name(author_name)
        with urllib.request.urlopen('https://api.crossref.org/works?query.author={}&rows=0'.format(surname)) as url:
            data = json.loads(url.read().decode())
        
        for i in range(ceil(data['message']['total-results'] / ROWS)):
            try:
                with urllib.request.urlopen(
                    'https://api.crossref.org/works?query.author={s}&rows={r}&offset={of}'.format(s=surname, r=ROWS,
                                                                                                 of=i*ROWS)) as url:
                    data = json.loads(url.read().decode())

            except:
                continue
                    
            for pub in data['message']['items']:
                for author in pub['author']:
                    try:
                        if surname in author['family']:
                            for name in names:
                                if len(name) > 1:
                                    if name in author['family'] or name in author['given']:
                                        for a in pub['author']:
                                            coauthors.add(a['given'] + ' ' + a['family'])
                    except:
                        continue
        
        # Microsoft Academic
        print("\t Adding co-authors from Microsoft Academic Knowledge")
        
        headers = {'Ocp-Apim-Subscription-Key': 'e2a75d980b2e4752bbe5e7f87fd72eea'}
        params = urllib.parse.urlencode({
            'expr': "Composite(AA.AuN='{}')".format(author_name.lower()),
            'attributes': 'AA.AuN'
        })

        try:
            conn = http.client.HTTPSConnection('westus.api.cognitive.microsoft.com')
            conn.request("GET", "/academic/v1.0/evaluate?%s" % params, "{body}", headers)
            response = conn.getresponse()
            data = response.read()
            conn.close()
        except Exception as e:
            print("[Errno {0}] {1}".format(e.errno, e.strerror))
            
        for match in re.finditer(b"\"AuN\"\:\"([\w ]+)\"", data):
            coauthors.add(match.group(1).decode().title())

        print()
        
        surnames = {}
        
        for name in coauthors:
            surname = name.replace('.', '').split(' ')[-1]
            if surname not in surnames or (len(surnames[surname]) < len(name) and 
                                           self.compare_names(name, surnames[surname])):
                surnames[surname] = name
        
        for surname in surnames:
            coauthor_info = self.get_info(surnames[surname])
            
            if coauthor_info.id != author_id:
                self.add_author(G, coauthor_info.name)
                print('\t Adding connection between {a} and {c}.\n'.format(a=author_name, c=coauthor_info.name))
                G.add_edge(author_id, coauthor_info.id)

In [6]:
G = nx.Graph()

In [8]:
gn = Network()

In [9]:
gn.add_author(G, 'Arseny Alexeev')

Adding Arseny Alexeev:
	 Adding Arseny Alexeev in database.
	 Adding Arseny Alexeev to network.


In [39]:
gn.add_coauthors(G, 'M Portnoi')

Adding M Portnoi's co-authors:
	 Adding co-authors from Crossref
	 Adding co-authors from Microsoft Academic Knowledge

	 Adding Valerie Malan in database.
Adding Valerie Malan:
	 Valerie Malan is already in the database.
	 Adding Valerie Malan to network.
	 Adding connection between M Portnoi and Valerie Malan.

	 Adding Gerard Pals in database.
Adding Gerard Pals:
	 Gerard Pals is already in the database.
	 Adding Gerard Pals to network.
	 Adding connection between M Portnoi and Gerard Pals.

	 Adding Vladimir Mordachev in database.
Adding Vladimir Mordachev:
	 Vladimir Mordachev is already in the database.
	 Adding Vladimir Mordachev to network.
	 Adding connection between M Portnoi and Vladimir Mordachev.

	 Adding Christel Thauvinrobinet in database.
Adding Christel Thauvinrobinet:
	 Christel Thauvinrobinet is already in the database.
	 Adding Christel Thauvinrobinet to network.
	 Adding connection between M Portnoi and Christel Thauvinrobinet.

	 Marie France Portnoi is already i

In [11]:
for i in G.neighbors_iter(gn.name_id['Arseny Alexeev']):
    print(gn.id_info[i].name)

I A Shelykh
C. David Wright
Venkata Karthik Nagareddy
Andrea Ferrari
Alessandro Curioni
Monica Felicia Craciun
Tobias Bachmann
Anna Ott
C Dou
Matthew D. Barnes
C Scheu
Wabe Koelmans
Chunmeng Duo
Khue T. Lai
Matthias Wuttig
S Zhang
Richard R Hartmann
Vara Sudananda Prasad Jonnalagadda
Federico Zipoli
Evangelos S Eleftheriou
Abu Sebastian
Oana Cojocarumiredin
M E Portnoi


In [42]:
response = requests.get('https://scholar.google.com' + '/citations/images/avatar_scholar_128.jpg')
img = Image.open(BytesIO(response.content))

In [77]:
labels = {}

for n in G.nodes():
    labels[n] = gn.id_info[n].name

In [87]:
f = plt.figure(figsize=(200,400))
ax = f.add_subplot(111)
pos=nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, ax=ax, node_shape='_', node_color='blue')
nx.draw_networkx_edges(G, pos, edge_color='grey')
nx.draw_networkx_labels(G, pos, ax=ax, labels=labels)
f.savefig("graph.png")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [40]:
nx.write_graphml(G, 'crossref-network')

In [41]:
w = csv.writer(open("crossref-name_id.csv", "w"))
for key, val in gn.name_id.items():
    w.writerow([key, val])

In [None]:
for key, val in csv.reader(open("crossref-name_id.csv")):
    gn.name_id[key] = val