In [43]:
import requests
import cv2
import warnings
import urllib.request, json

import scholarly as slr
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from PIL import Image
from io import BytesIO
from math import ceil

# warnings.filterwarnings('ignore')

%matplotlib inline

ROWS = 1000

In [2]:
class ImageToFace:
    def __init__(self):
        # Model parameters
        dir_path = "/usr/local/Cellar/opencv/2.4.13.2/share/OpenCV/haarcascades"
        filename = "haarcascade_frontalface_default.xml" # for frontal faces
        #filename = "haarcascade_profileface.xml" # for profile faces
        model_path = dir_path + "/" + filename

        # Create the classifier
        self.clf = cv2.CascadeClassifier(model_path)
        
    def get_face(self, img):
        image = np.array(img.convert('RGB'))[:,:,::-1]
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Detect faces on image
        faces = self.clf.detectMultiScale(
            gray,
            scaleFactor=1.1,
            minNeighbors=5,
            minSize=(30, 30),
            flags=cv2.CASCADE_SCALE_IMAGE
        )
        
        if len(faces) > 0:
            x, y, w, h = faces[0]
        else:
            return img
        
        return Image.fromarray(cv2.cvtColor(image[y:y+h, x:x+w], cv2.COLOR_BGR2RGB))
    
imtf = ImageToFace()

get_face = imtf.get_face

In [32]:
class FakeInfo:
    def __init__(self, name):
        self.name = name
        self.id = hash(name)
        self.url_picture = '/citations/images/avatar_scholar_128.jpg'
        
    def fill(self):
        self.publications = []

In [47]:
class GoogleNetwork:
    def __init__(self):
        self.site_prefix = 'https://scholar.google.com'
        self.id_info = {}
        self.name_id = {}
    
    def split_full_name(self, author_name):
        full_name = author_name.split(' ')
        return full_name[:-1], full_name[-1]
    
    def possible_names(self, names, surname): # Name Middle Surname
        result = []
        for name in names:
            result += [name+' '+surname, name[0]+' '+surname, name[0]+'. '+surname]
        return result
    
    def get_info(self, author_name):
        names, surname = self.split_full_name(author_name)
        
        for name in self.possible_names(names, surname):
            if name in self.name_id:
                print('\t {} is already in the database.'.format(author_name))
                if author_name not in self.name_id:
                    for other_name in self.possible_names(names, surname):
                        self.name_id[other_name] = self.name_id[name]
                return self.id_info[self.name_id[name]]
        try:
            print('\t Searching {} in google scholar.'.format(author_name))
            query = slr.search_author(surname)
            while True:
                author_info = next(query)
                found = False
                for name in names:
                    if name in author_info.name:
                        found = True
                if found:
                    break
        except:
            print('\t \t No match, creating fake id.')
            author_info = FakeInfo(author_name)

        print('\t Adding {} in database.'.format(author_name))

        self.id_info[author_info.id] = author_info

        for name in self.possible_names(names, surname):
            self.name_id[name] = author_info.id

        if author_name != author_info.name:
            for name in self.possible_names(*self.split_full_name(author_info.name)):
                self.name_id[name] = author_info.id
            
        return author_info
        
    def add_author(self, G, author_name):
        print('Adding {}:'.format(author_name))
        
        author_info = self.get_info(author_name)
        
        if author_info.id in G.nodes():
            print('\t Author id is already in the network.')            
            return
        
        print('\t Adding {} to network.'.format(author_name))
        G.add_node(author_info.id)
        
        response = requests.get(self.site_prefix + author_info.url_picture)
        img = Image.open(BytesIO(response.content))
        G.node[author_info.id]['image'] = get_face(img)
                
    def add_coauthors(self, G, author_name):
        print("Adding {}'s co-authors:".format(author_name))
        author_info = self.id_info[self.name_id[author_name]]
        author_id = author_info.id
        author_info.fill()
        
        G.node[author_id]['co-authors'] = set()
        
        # Google Scholar
        
        for pub in author_info.publications:
            pub.fill()
            union = G.node[author_id]['co-authors'].union(pub.bib['author'].split(' and '))
            G.node[author_id]['co-authors'] = union
        
        # Crossref
        
        names, surname = self.split_full_name(author_name)
        with urllib.request.urlopen('https://api.crossref.org/works?query.author={}&rows=0'.format(surname)) as url:
            data = json.loads(url.read().decode())
        
        for i in range(ceil(data['message']['total-results'] / ROWS)):
            with urllib.request.urlopen(
                'https://api.crossref.org/works?query.author={s}&rows={r}&offset={of}'.format(s=surname, r=ROWS,
                                                                                             of=i*ROWS)) as url:
                data = json.loads(url.read().decode())

            for pub in data['message']['items']:
                for author in pub['author']:
                    try:
                        if surname in author['family']:
                            for name in names:
                                if len(name) > 1:
                                    if name in author['family'] or name in author['given']:
                                        for a in pub['author']:
                                            G.node[author_id]['co-authors'].add(a['given'] + ' ' + a['family'])
                    except:
                        continue
        
        for coauthor_name in G.node[author_id]['co-authors']:
            coauthor_info = self.get_info(coauthor_name)
            
            if coauthor_info.id != author_id:
                self.add_author(G, coauthor_info.name)
                print('\t Addind connection between {a} and {c}.\n'.format(a=author_name, c=coauthor_name))
                G.add_edge(author_id, coauthor_info.id)

In [48]:
G = nx.Graph()

In [49]:
gn = GoogleNetwork()

In [52]:
gn.name_id

{'A Alexeev': 8360796958314390197,
 'A. Alexeev': 8360796958314390197,
 'Arseny Alexeev': 8360796958314390197}

In [51]:
gn.add_author(G, 'Arseny Alexeev')

Adding Arseny Alexeev:
	 Searching Arseny Alexeev in google scholar.
	 	 No match, creating fake id.
	 Adding Arseny Alexeev in database.
	 Adding Arseny Alexeev to network.


In [31]:
gn.add_coauthors(G, 'Arseny Alexeev')

Adding Arseny Alexeev's co-authors:


AttributeError: 'Author' object has no attribute 'publications'

In [30]:
for pub in gn.id_info[gn.name_id['A Alexeev']].publications:
    print(pub)

AttributeError: 'Author' object has no attribute 'publications'

In [11]:
for i in G[gn.name_id['A Alexeev']]:
    print(gn.id_info[i].name)

In [None]:
pos = nx.spring_layout(G)
fig = plt.figure(figsize=(20,20))
ax = plt.subplot(111)
ax.set_aspect('equal')
nx.draw_networkx_edges(G, pos, ax=ax)

plt.axis('off')

trans = ax.transData.transform
trans2 = fig.transFigure.inverted().transform

piesize = 0.05 # this is the image size
p2 = piesize / 2.0
for n in G:
    xx, yy = trans(pos[n]) # figure coordinates
    xa, ya = trans2((xx, yy)) # axes coordinates
    a = plt.axes([xa - p2, ya - p2, piesize, piesize])
    a.set_aspect('equal')
    a.imshow(G.node[n]['image'])
    a.annotate(gn.id_info[n].name, xy=(0, 0), horizontalalignment='left', verticalalignment='bottom')
    a.axis('off')
    
plt.savefig('google-scholar-first-circle')