In [1]:
2019-05-06 Scholarly-modified
Modification by Austin Swafford
This notebook is a modification of https://github.com/OrganicIrradiation/scholarly,
itself a fork of https://github.com/1ucian0/chalmers-web
    
Google has since changed several URLs, so the code needed to be modified. These changes appear to be covered by open
pull requests on the OrganicIrradiation repository, but they have not yet been merged.

The environment needed to run this notebook is at gscholar.yaml in this same repo

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

from bs4 import BeautifulSoup

import arrow
import bibtexparser
import codecs
import hashlib
import pprint
import random
import re
import requests
import sys
import time
import pandas as pd


In [4]:
# Pseudo-random 16-hex-character ID: the first 16 characters of the MD5 digest
# of a random float's string form. Regenerated every time this cell runs.
_GOOGLEID = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest()[:16]
# Cookie sent with every request made by _get_page.
# NOTE(review): presumably CF=4 asks Scholar to show the "Import into BibTeX"
# link that Publication looks for -- TODO confirm.
_COOKIES = {'GSP': 'ID={0}:CF=4'.format(_GOOGLEID)}
# Browser-like request headers sent with every request made by _get_page.
_HEADERS = {
    'accept-language': 'en-US,en',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/41.0.2272.76 Chrome/41.0.2272.76 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml'
    }
# Base URL; every template below is a path appended to it, with `{0}` filled
# in through str.format.
_HOST = 'https://scholar.google.com'
# Author search by name (used by search_author_alt).
_AUTHSEARCH = '/citations?view_op=search_authors&hl=en&mauthors={0}'
# Author profile page by user id (used by Author.fill).
_CITATIONAUTH = '/citations?user={0}&hl=en'
# Publication detail page by citation id (used by Publication.fill).
_CITATIONPUB = '/citations?view_op=view_citation&citation_for_view={0}'
# Author search by interest label (not referenced by the code in this notebook).
_KEYWORDSEARCH = '/citations?view_op=search_authors&hl=en&mauthors=label:{0}'
# Free-text publication search (not referenced by the code in this notebook).
_PUBSEARCH = '/scholar?q={0}'
# Listing of articles citing a publication (used by Publication.get_citedby).
_SCHOLARPUB = '/scholar?oi=bibs&hl=en&cites={0}'

# Captures the user id from a profile link's href.
_CITATIONAUTHRE = r'user=([\w-]*)'
# Captures the "<user>:<publication>" id from a publication link.
_CITATIONPUBRE = r'citation_for_view=([\w-]*:[\w-]*)'
# Captures the first argument of a gs_ocit(event, '...') call
# (not referenced by the code in this notebook).
_SCHOLARCITERE = r'gs_ocit\(event,\'([\w-]*)\''
# Captures the id from a "cites=" query parameter.
_SCHOLARPUBRE = r'cites=([\w-]*)'
# Prefix that Author.__init__ replaces with '@' in the search-result email text.
_EMAILAUTHORRE = r'Verified email at '

# One shared HTTP session used for all page fetches.
_SESSION = requests.Session()
# Page size requested per profile page, and the paging step, in Author.fill.
_PAGESIZE = 100

In [6]:
def _get_page(pagerequest):
    """Return the data for a page on scholar.google.com"""
    # Note that we include a sleep to avoid overloading the scholar server
    time.sleep(5+random.uniform(0, 5))
    resp = _SESSION.get(pagerequest, headers=_HEADERS, cookies=_COOKIES)
    if resp.status_code == 200:
        return resp.text
    if resp.status_code == 503:
        # Inelegant way of dealing with the G captcha
        raise Exception('Error: {0} {1}'.format(resp.status_code, resp.reason))
        # TODO: Need to fix captcha handling
        # dest_url = requests.utils.quote(_SCHOLARHOST+pagerequest)
        # soup = BeautifulSoup(resp.text, 'html.parser')
        # captcha_url = soup.find('img').get('src')
        # resp = _handle_captcha(captcha_url)
        # return _get_page(re.findall(r'https:\/\/(?:.*?)(\/.*)', resp)[0])
    else:
        raise Exception('Error: {0} {1}'.format(resp.status_code, resp.reason))
        
def _get_soup(pagerequest):
    """Return the BeautifulSoup for a page on scholar.google.com"""
    html = _get_page(pagerequest)
    html = html.replace(u'\xa0', u' ')
    return BeautifulSoup(html, 'html.parser')

def search_author_alt(name):
    """Look an author up by name.

    The name is URL-quoted, the author-search page is fetched, and the
    generator of Author objects built from that page is returned.
    """
    quoted_name = requests.utils.quote(name)
    results_page = _get_soup(_HOST + _AUTHSEARCH.format(quoted_name))
    return _search_citation_soup(results_page)

def _search_citation_soup(soup):
    """Generator that returns Author objects from the author search page"""
    while True:
        for row in soup.find_all('div', 'gsc_1usr'):
            #print(row)
            yield Author(row)
        next_button = soup.find(class_='gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx')
        if next_button and 'disabled' not in next_button.attrs:
            url = next_button['onclick'][17:-1]
            url = codecs.getdecoder("unicode_escape")(url)[0]
            soup = _get_soup(_HOST+url)
        else:
            break

In [7]:
class Author(object):
    """A single Google Scholar author.

    An Author is built either from a user-id string (only ``id`` is set) or
    from one result row of the author-search page, in which case ``id``,
    ``url_picture``, ``name`` and ``interests`` are set, plus ``affiliation``,
    ``email`` and ``citedby`` when the row contains them. Call :meth:`fill`
    to load the complete profile.
    """
    def __init__(self, __data):
        # Python name-mangles `__data` inside the class body, so this
        # argument can only be passed positionally.
        if isinstance(__data, str):
            self.id = __data
        else:
            self.id = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0]
            self.url_picture = _HOST+'/citations?view_op=medium_photo&user={}'.format(self.id)
            self.name = __data.find('h3', class_='gs_ai_name').text
            affiliation = __data.find('div', class_='gs_ai_aff')
            if affiliation:
                self.affiliation = affiliation.text
            email = __data.find('div', class_='gs_ai_eml')
            if email:
                self.email = re.sub(_EMAILAUTHORRE, r'@', email.text)
            self.interests = [i.text.strip() for i in
                              __data.find_all('a', class_='gs_ai_one_int')]
            citedby = __data.find('div', class_='gs_ai_cby')
            if citedby and citedby.text != '':
                # The first 9 characters of the text are skipped before the
                # number is parsed.
                self.citedby = int(citedby.text[9:])
        self._filled = False

    def fill(self):
        """Populate the Author with information from their profile page.

        Fetches the profile (printing its URL first), then sets ``name``,
        ``affiliation``, ``interests``, the citation metrics,
        ``cites_per_year``, ``coauthors`` and ``publications``. Further
        publication pages are fetched until the "show more" button is
        disabled or absent.

        Returns:
            This Author, so calls can be chained.
        """
        gs_prefix = 'gsc'
        url_citations = _CITATIONAUTH.format(self.id)
        url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
        self.gs_website = _HOST+url
        print(_HOST+url)
        soup = _get_soup(_HOST+url)
        self.name = soup.find('div', id=gs_prefix + '_prf_in').text
        self.affiliation = soup.find('div', class_=gs_prefix + '_prf_il').text
        self.interests = [i.text.strip() for i in soup.find_all('a', class_=gs_prefix + '_prf_inta')]

        # citations, h-index and i10-index, each followed by its value for
        # the last 5 years
        index = soup.find_all('td', class_='gs_rsb_std')
        if index:
            self.citedby = int(index[0].text)
            self.citedby5y = int(index[1].text)
            self.hindex = int(index[2].text)
            self.hindex5y = int(index[3].text)
            self.i10index = int(index[4].text)
            self.i10index5y = int(index[5].text)
        else:
            # No metrics table on the page: default every metric, so code
            # that sums `citedby` across authors never hits a missing
            # attribute. A count already taken from the search row is kept.
            self.citedby = getattr(self, 'citedby', 0)
            self.citedby5y = 0
            self.hindex = self.hindex5y = self.i10index = self.i10index5y = 0

        # number of citations per year
        years = [int(y.text) for y in soup.find_all('span', class_=gs_prefix + '_g_t')]
        cites = [int(c.text) for c in soup.find_all('span', class_=gs_prefix + '_g_al')]
        self.cites_per_year = dict(zip(years, cites))

        # co-authors
        self.coauthors = []
        for row in soup.find_all('span', class_=gs_prefix + '_rsb_a_desc'):
            new_coauthor = Author(re.findall(_CITATIONAUTHRE, row('a')[0]['href'])[0])
            new_coauthor.name = row.find(tabindex="-1").text
            new_coauthor.affiliation = row.find(class_=gs_prefix + "_rsb_a_ext").text
            self.coauthors.append(new_coauthor)

        # publications, one page of _PAGESIZE rows at a time
        self.publications = list()
        pubstart = 0
        while True:
            for row in soup.find_all('tr', class_=gs_prefix + '_a_tr'):
                self.publications.append(Publication(row, 'citations'))
            more_button = soup.find('button', id=gs_prefix + '_bpf_more')
            # A missing button is treated like a disabled one: there is no
            # indication of further pages, so stop instead of crashing.
            if more_button is None or 'disabled' in more_button.attrs:
                break
            pubstart += _PAGESIZE
            url = '{0}&cstart={1}&pagesize={2}'.format(url_citations, pubstart, _PAGESIZE)
            soup = _get_soup(_HOST+url)

        self._filled = True
        return self

    def __str__(self):
        return pprint.pformat(self.__dict__)

In [9]:
class Publication(object):
    """A single publication.

    ``pubtype`` selects how the given parsed HTML fragment is read:
    'citations' for a row of an author's profile table, 'scholar' for an
    entry of a Scholar result listing. With any other value nothing is read
    and only an empty ``bib`` dict is created. Call :meth:`fill` to load the
    remaining bibliographic details.
    """

    # Detail-page field labels that are copied verbatim into `bib` under the
    # given key.
    _FIELD_TO_BIBKEY = {
        'Journal': 'journal',
        'Volume': 'volume',
        'Issue': 'number',
        'Pages': 'pages',
        'Publisher': 'publisher',
    }

    def __init__(self, __data, pubtype=None):
        # Python name-mangles `__data` inside the class body, so this
        # argument can only be passed positionally.
        self.bib = dict()
        self.source = pubtype
        if self.source == 'citations':
            title_link = __data.find('a', class_='gsc_a_at')
            self.bib['title'] = title_link.text
            self.id_citations = re.findall(_CITATIONPUBRE, title_link['data-href'])[0]
            citedby = __data.find(class_='gsc_a_ac')
            if citedby and citedby.text.strip() != '':
                self.citedby = int(citedby.text)
            year = __data.find(class_='gsc_a_h')
            if year and year.text and not year.text.isspace():
                self.bib['year'] = int(year.text)
        elif self.source == 'scholar':
            databox = __data.find('div', class_='gs_ri')
            title = databox.find('h3', class_='gs_rt')
            # Drop the leading marker span (class gs_ctu: a citation,
            # class gs_ctc: a book or PDF) so it does not end up in the title.
            if title.find('span', class_='gs_ctu') or title.find('span', class_='gs_ctc'):
                title.span.extract()
            self.bib['title'] = title.text.strip()
            title_link = title.find('a')
            if title_link:
                self.bib['url'] = title_link['href']
            authorinfo = databox.find('div', class_='gs_a')
            # Authors are the comma-separated names before the first ' - '.
            self.bib['author'] = ' and '.join([i.strip() for i in authorinfo.text.split(' - ')[0].split(',')])
            snippet = databox.find('div', class_='gs_rs')
            if snippet:
                self.bib['abstract'] = snippet.text
                if self.bib['abstract'][0:8].lower() == 'abstract':
                    self.bib['abstract'] = self.bib['abstract'][9:].strip()
            lowerlinks = databox.find('div', class_='gs_fl').find_all('a')
            for link in lowerlinks:
                if 'Import into BibTeX' in link.text:
                    self.url_scholarbib = link['href']
                if 'Cited by' in link.text:
                    self.citedby = int(re.findall(r'\d+', link.text)[0])
                    self.id_scholarcitedby = re.findall(_SCHOLARPUBRE, link['href'])[0]
            eprint = __data.find('div', class_='gs_ggs gs_fl')
            if eprint:
                self.bib['eprint'] = eprint.a['href']
        self._filled = False

    def fill(self):
        """Populate the Publication with information from its detail page.

        For 'citations' publications the detail page is fetched and its
        fields are copied into ``bib``; for 'scholar' publications the
        BibTeX export is downloaded and merged into ``bib``.

        Returns:
            This Publication, so calls can be chained.
        """
        if self.source == 'citations':
            url = _CITATIONPUB.format(self.id_citations)
            soup = _get_soup(_HOST+url)
            self.bib['title'] = soup.find('div', id='gsc_vcd_title').text
            title_link = soup.find('a', class_='gsc_vcd_title_link')
            if title_link:
                self.bib['url'] = title_link['href']
            for item in soup.find_all('div', class_='gs_scl'):
                key = item.find(class_='gsc_vcd_field').text
                val = item.find(class_='gsc_vcd_value')
                if key in self._FIELD_TO_BIBKEY:
                    self.bib[self._FIELD_TO_BIBKEY[key]] = val.text
                elif key == 'Authors':
                    self.bib['author'] = ' and '.join([i.strip() for i in val.text.split(',')])
                elif key == 'Publication date':
                    self.bib['year'] = arrow.get(val.text).year
                elif key == 'Description':
                    # Always store text: previously the parsed HTML node
                    # itself was stored whenever the description did not
                    # start with "abstract".
                    abstract = val.text
                    if abstract[0:8].lower() == 'abstract':
                        abstract = abstract[9:].strip()
                    self.bib['abstract'] = abstract
                elif key == 'Total citations':
                    self.id_scholarcitedby = re.findall(_SCHOLARPUBRE, val.a['href'])[0]
            eprint = soup.find('div', class_='gsc_vcd_title_ggi')
            if eprint:
                self.bib['eprint'] = eprint.a['href']
            self._filled = True
        elif self.source == 'scholar':
            bibtex = _get_page(self.url_scholarbib)
            self.bib.update(bibtexparser.loads(bibtex).entries[0])
            self._filled = True
        return self

    def get_citedby(self):
        """Searches GScholar for other articles that cite this Publication and
        returns a Publication generator.

        Returns an empty list when no "cited by" id is known even after
        calling :meth:`fill`.
        """
        # NOTE(review): _search_scholar_soup is not defined in the visible
        # part of this notebook -- confirm it exists before relying on this.
        if not hasattr(self, 'id_scholarcitedby'):
            self.fill()
        if hasattr(self, 'id_scholarcitedby'):
            url = _SCHOLARPUB.format(requests.utils.quote(self.id_scholarcitedby))
            soup = _get_soup(_HOST+url)
            return _search_scholar_soup(soup)
        else:
            return []

    def __str__(self):
        return pprint.pformat(self.__dict__)

In [13]:
# These are new functions, added so that a whole list of authors can be looked up at once.
def multi_author_search(list_of_authors):
    """Look up every name in ``list_of_authors`` and fill the first hit.

    Each name is printed, searched with search_author_alt, and the first
    Author returned is filled. The lookup is best-effort: a name without any
    search hit, or one whose lookup fails, is reported on stdout and skipped
    so the remaining names are still processed.

    Args:
        list_of_authors: Iterable of author names.

    Returns:
        dict mapping each successfully retrieved name to its filled Author.
    """
    author_dict = {}
    for name in list_of_authors:
        print(name)
        try:
            author_dict[name] = next(search_author_alt(name)).fill()
        except StopIteration:
            # The search produced no author at all.
            print(name + ' not found in Google Scholar')
        except Exception as err:
            # Network, HTTP or parsing failure: report what actually went
            # wrong instead of claiming the author does not exist.
            # KeyboardInterrupt is deliberately not caught, so a long run
            # can still be stopped.
            print('{0} could not be retrieved from Google Scholar: {1}'.format(name, err))
    return author_dict

def get_citations(author_dict, name='Total', year='Total'):
    """Return a citation count taken from filled Author objects.

    Args:
        author_dict: dict mapping author names to Author objects, as
            returned by multi_author_search.
        name: Key of a single author in ``author_dict``, or 'Total' to sum
            over every author.
        year: A year (anything ``int()`` accepts) to read from
            ``cites_per_year``, or 'Total' to use the all-time ``citedby``.

    Returns:
        The citation count as an int. An author without an entry for the
        requested year, or without a ``citedby`` attribute, counts as 0, so
        one such author no longer aborts the whole sum.

    Raises:
        KeyError: If ``name`` is neither 'Total' nor a key of ``author_dict``.
    """
    if name == 'Total':
        selected = list(author_dict.values())
    else:
        selected = [author_dict[name]]

    if year == 'Total':
        return sum(getattr(author, 'citedby', 0) for author in selected)

    wanted_year = int(year)
    return sum(author.cites_per_year.get(wanted_year, 0) for author in selected)

def get_author_list(filename, head=0, index=0, delim='\t'):
    """Read author names from the index column of a delimited text file.

    Args:
        filename: Path of the file to read.
        head: Row number holding the column names (``header`` of read_csv).
        index: Column used as the index, i.e. the one holding the names.
        delim: Field separator.

    Returns:
        list of the index values, in file order, leaving out every value
        whose string form is 'nan'.
    """
    table = pd.read_csv(filename, header=head, index_col=index, sep=delim)
    names = []
    for entry in table.index:
        if str(entry) != 'nan':
            names.append(entry)
    return names
        

In [16]:
# Read the author names from the faculty list; with get_author_list's
# defaults the file is parsed as tab-delimited and the first column holds
# the names.
authors= get_author_list('./20190410_list_of_CMI_faculty_for_brochure.txt')
# Bare expression so the notebook shows the list as the cell output.
authors

['Laura Crotty Alexander',
 'Sanjay Nigam',
 'Louise Laurent',
 'Lars Bode',
 'Gabriel Haddad',
 'Suzi Hong',
 'Rohit Loomba',
 'Kyung Rhee',
 'Vipin Kumar',
 'Peter Ernst',
 'Douglas Conrad',
 'Pieter Dorrestein',
 'Paul Jensen',
 'Rob Knight',
 'Victor Nizet',
 'Bill Sandborn',
 'Karsten Zengler',
 'Manuela Raffatellu',
 'Brigid Boland',
 'Abraham Palmer',
 'Sheila Podell',
 'Jennifer Smith',
 'Michael McCarthy',
 'Shane Crotty',
 'Wenxian Fu',
 'Matt Daugherty',
 'Adam DeConde',
 'Rommie E. Amaro',
 'Kit Pogliano',
 'Sandip Patel',
 'John Bradley',
 'Emily Lukacz',
 'Lisa Eyler',
 'Chris Glass',
 'Maria Rosario Araneta',
 'Pascal Gagneux',
 'Lada Rasochova',
 'Shelley Lawrence',
 'Jim Moore',
 'Jill Mesirov',
 'Sheila Crowe',
 'Wayne Pfeiffer',
 'Lakshmi Chilukuri',
 'James Golden',
 'Lihini Aluwihare',
 'Scott Baden',
 'Nan Hao',
 'Justin Meyer',
 'Joseph Vinetz',
 'Bernd Schnabl',
 'Siavash Mirarab',
 'Amir Zarrinpar',
 'Mohit Jain',
 'Ilkay Altintas',
 'David Smith',
 'Nick Webst

In [17]:
# Search and fill the profile of every author in the list. This is slow:
# _get_page sleeps 5-10 seconds before each request.
results = multi_author_search(authors)

Laura Crotty Alexander
https://scholar.google.com/citations?user=awF2ZroAAAAJ&hl=en&pagesize=100
Sanjay Nigam
https://scholar.google.com/citations?user=dh23XRMAAAAJ&hl=en&pagesize=100
Louise Laurent
https://scholar.google.com/citations?user=kDdMyKkAAAAJ&hl=en&pagesize=100
Lars Bode
https://scholar.google.com/citations?user=lLcpFlwAAAAJ&hl=en&pagesize=100
Gabriel Haddad
Gabriel Haddad not found in Google Scholar
Suzi Hong
Suzi Hong not found in Google Scholar
Rohit Loomba
https://scholar.google.com/citations?user=fWXBbB0AAAAJ&hl=en&pagesize=100
Kyung Rhee
https://scholar.google.com/citations?user=k7esjnUAAAAJ&hl=en&pagesize=100
Vipin Kumar
https://scholar.google.com/citations?user=BnxU9TEAAAAJ&hl=en&pagesize=100
Peter Ernst
https://scholar.google.com/citations?user=cPBQ1t0AAAAJ&hl=en&pagesize=100
Douglas Conrad
Douglas Conrad not found in Google Scholar
Pieter Dorrestein
https://scholar.google.com/citations?user=IsfIfVsAAAAJ&hl=en&pagesize=100
Paul Jensen
https://scholar.google.com/cita

Sheldon Brown
https://scholar.google.com/citations?user=otC57hcAAAAJ&hl=en&pagesize=100
Kun Zhang
https://scholar.google.com/citations?user=CYeurYgAAAAJ&hl=en&pagesize=100
Kimberly Prather
https://scholar.google.com/citations?user=CSnZbf8AAAAJ&hl=en&pagesize=100
David Gonzalez
https://scholar.google.com/citations?user=3s1QuLQAAAAJ&hl=en&pagesize=100
Sergey Kryazhimskiy
https://scholar.google.com/citations?user=cEr8jtAAAAAJ&hl=en&pagesize=100
Jerrold Olefsky
https://scholar.google.com/citations?user=SYYG0d8AAAAJ&hl=en&pagesize=100
Cinnamon Bloss
https://scholar.google.com/citations?user=DnQMAnwAAAAJ&hl=en&pagesize=100
Emily Troemel
Emily Troemel not found in Google Scholar
Ming Tsuang
https://scholar.google.com/citations?user=5AIq8e8AAAAJ&hl=en&pagesize=100
Gurol Suel
https://scholar.google.com/citations?user=x8lYIFEAAAAJ&hl=en&pagesize=100
Dilip Jeste
https://scholar.google.com/citations?user=2gl5XFYAAAAJ&hl=en&pagesize=100
Lawrence Prince
https://scholar.google.com/citations?user=pQLW

In [40]:
# With both `name` and `year` left at 'Total', this is the sum of the
# all-time citation counts of every author in `results`.
get_citations(results)

2173670