## Tool for scraping the publication info of the team from Google Scholar

Setting up the necessary imports

In [1]:
from scholarly import scholarly, ProxyGenerator
from os import path, getcwd, chdir, listdir
import markdown
import json

Define some general variables. These may need to be adapted if the structure of the repository changes.

In [2]:
# Keys of the Markdown meta tags that hold the details needed to build the
# search query for a team member.
meta_name_att = 'name'
meta_affiliation_att = 'affiliation'
# Location of the folder holding the information of all team members; it is
# resolved against the current working directory.
people_folder = path.join(getcwd(), 'people')

Set up the proxy configuration that scholarly uses for scraping

In [3]:
# %%
# Create the proxy generator object that is handed to scholarly below.
# No proxy backend is configured on it in this cell.
pg = ProxyGenerator()

# Register the same generator for both arguments of `use_proxy`.
# NOTE(review): presumably these are the primary and the secondary proxy slot, and an
# unconfigured generator means requests are sent without a proxy - confirm against the
# scholarly documentation.
scholarly.use_proxy(pg, pg)

Each member's folder contains a Markdown file that stores meta information about the member. We read this meta information in order to build the search queries.


In [None]:
# The 'people' folder holds a subfolder for each member of the team.
# The entries are sorted so that the processing order is the same on every run.
subfolders = sorted(listdir(people_folder))

# Set up a Markdown parser; the 'meta' extension exposes the meta information of the
# most recently converted document as `md_parser.Meta` (a dict mapping key -> list of values)
md_parser = markdown.Markdown(extensions=['meta'])

# Iterate through the folders of each member
for member_folder in subfolders:
  # Complete the path to the folder of the member we currently process
  full_path = path.join(people_folder, member_folder)
  # Stray files inside 'people' (e.g. a README or .DS_Store) are not member folders
  if not path.isdir(full_path):
    continue

  # Locate the Markdown file in the current subfolder. Only real '.md' files qualify
  # (a plain substring test would also match e.g. 'notes.md.bak').
  md_files = sorted(filename for filename in listdir(full_path) if filename.endswith('.md'))
  if not md_files:
    print(f'no Markdown file found in {member_folder}, proceeding to the next')
    continue
  md_path = path.join(full_path, md_files[0])

  # Read in the Markdown file; the encoding is explicit so that names with non-ASCII
  # characters are read the same way on every platform (the JSON below is UTF-8 as well)
  with open(md_path, 'r', encoding='utf-8') as md_file:
    text = md_file.read()

  # Reset the parser before every document: converting an empty file returns early
  # without touching `Meta`, which would otherwise still hold the previous member's data
  md_parser.reset()
  md_parser.convert(text)
  meta = md_parser.Meta

  # Both meta tags are required to build a meaningful search query
  missing = [key for key in (meta_name_att, meta_affiliation_att) if not meta.get(key)]
  if missing:
    print(f'missing meta information {missing} in {md_path}, proceeding to the next')
    continue

  # Retrieve the Metadata of the author
  author_name = meta[meta_name_att][0].replace('"', '')
  author_affiliation = meta[meta_affiliation_att][0].replace('"', '')
  print(author_name)

  # Use scholarly to build up a search query for the team member
  author_query = scholarly.search_author(' '.join([author_name, author_affiliation]))

  # Take the first search hit, if there is one. Using a default instead of catching
  # StopIteration keeps unrelated errors further down from being reported as "not found".
  first_hit = next(author_query, None)
  if first_hit is None:
    print('no information found for the author, proceeding to the next')
    continue

  # Execute the search query to fill up information about the team member
  author = scholarly.fill(first_hit)

  # Collect information of all publications of the team member
  publications = []
  num_pubs = len(author['publications'])
  print(f'start processing {num_pubs} publications')
  # Iterate over each entry in the publication list
  # NOTE: The entries here are not filled up yet by scholarly, but simple placeholders containing some meta info
  # that can be used for further querying
  for idx, pub_query in enumerate(author['publications']):
    if idx % 10 == 0:
      print(f'begin processing publication #{idx}')
    # use the placeholder query to retrieve the actual publication info from Google Scholar
    publications.append(scholarly.fill(pub_query))

  # Save the collected publication info next to the member's Markdown file
  with open(path.join(full_path, 'publications.json'), 'w', encoding='utf-8') as f:
    json.dump(publications, f, ensure_ascii=False, indent=4)
  
  