# Scraping Portal de la Reserca

This notebook asynchrnously scrapes information in Portal de la Reserca. It can download the following items:

- Links to author portals
- Author information from the author portals.

# Import modules

In [2]:
import pandas as pd
from requests_html import HTMLSession, AsyncHTMLSession
import time
import asyncio

# Helper functions

## Helper Functions for links

In [61]:
async def scrape_url(s, url, items='author_links'):
    """
    Async scrape URL. 
    Items = ['paper_links', 'author_links', 'papers', 'authors']
    """
    # print(f"Scraping url: {url}")
    while True:
        try:
            r = await s.get(url)
            table = r.html.find('div.panel.panel-info table.table', first=True)
            rows = table.find('tr')
        except AttributeError:
            print(".", end="")
            with open('errors.txt', 'a') as f:
                f.write(f"Could not find row in url: {url}")
            time.sleep(1)
            continue
        break
    
    result_list = []
    
    for row in rows:
        # Get columns in row
        columns = row.find('td')
        # Skip if empty row
        if len(columns) == 0:
            continue
            
        if items == 'paper_links':
            # Get paper link
            paper_link = columns[1].find('a')[0].attrs['href']
            scrape_item = paper_link
            
        elif items == 'author_links':
            # Get paper link
            author_link = columns[0].find('a')[0].attrs['href']
            scrape_item = author_link
            
        elif items == 'papers':
            # Get paper data
            paper_date    = columns[0].text
            paper_title   = columns[1].text
            paper_authors = columns[2].text
            paper_type    = columns[3].text

            paper = {}
            paper['date'] = paper_date
            paper['title'] = paper_title
            paper['type'] = paper_type

            paper_authors_list= paper_authors.split(';')
            for i, author in enumerate(paper_authors_list):
                paper[f"author_{i}"] = author

            scrape_item = paper

        elif items == 'authors':
            # Get author data
            author_name = columns[0].text
            author_last = author_name.split(',')[0]

            try:
                author_first = author_name.split(',')[1]
            except IndexError:  # If there is no comma in the name
                author_first = ''

            author_inst = columns[1].text

            author_dict = {
                'Last Name': author_last,
                'First Name': author_first,
                'Institution': author_inst,
                }

            scrape_item = author_dict
            
        # Append to paper links list
        result_list.append(scrape_item)
        
    return result_list


async def scrape_urls(s, urls, items='author_links'):
    """Wrapper of scrape_url to scrape a list of urls."""
    tasks = (scrape_url(s, url, items) for url in urls)
    return await asyncio.gather(*tasks)


def get_max_pages(url):
    """
    Get max pages from pagination box in footer.
    """
    session = HTMLSession()
    r = session.get(url)
    pagination_box = r.html.find('ul.pagination.pull-right')
    pagination_items = pagination_box[0].find('li')
    max_pages = pagination_items[-2].text.replace('.','').replace(',','')
    return int(max_pages)


async def scrape(items='author_links', n_pages=None):
    """
    Scrape Portal de la Reserca.
    Options:
        items   = ['paper_links', 'author_links', 'papers', 'authors']
        n_pages = Number of pages to scrape.
    """
    print(f"Scraping {items} from Portal de la Reserca.")
    
    if items == 'author_links':
        url_root =                                            \
            'https://portalrecerca.csuc.cat/simple-search?' + \
            'query='                                        + \
            '&location=crisrp'                              + \
            '&filter_field_1=resourcetype'                  + \
                '&filter_type_1=equals'                     + \
                '&filter_value_1=Researchers'               + \
            '&sort_by=crisrp.fullName_sort'                 + \
                '&order=asc'                                + \
            '&rpp=300'                                      + \
            '&etal=0'                                       + \
            '&start='
        
    elif items == 'paper_links':
        url_root =                                            \
            'https://portalrecerca.csuc.cat/simple-search?' + \
            'query='                                        + \
            '&location=publications'                        + \
            '&filter_field_1=resourcetype'                  + \
                '&filter_type_1=equals'                     + \
                '&filter_value_1=Items'                     + \
            '&filter_field_2=itemtype'                      + \
                '&filter_type_2=notequals'                  + \
                '&filter_value_2=Phd+Thesis'                + \
            '&sort_by=dc.contributor.authors_sort'          + \
                '&order=asc'                                + \
            '&rpp=300'                                      + \
            '&etal=0'                                       + \
            '&start='
        
    if not n_pages:
        print("Calculating number of pages to scrape.")
        max_pages = get_max_pages(url_root + '0')
        n_pages = max_pages

    urls = [url_root + str(page*300) for page in range(n_pages)]

    s = AsyncHTMLSession()
    
    print(f"Scraping {len(urls)} URLs...")
    t1 = time.perf_counter()
    result = await scrape_urls(s, urls, items=items)
    t2 = time.perf_counter()
    
    # Gather all results into single list
    full_list = [href for sublist in result for href in sublist]
    
    print(f"Scraped {len(full_list)} items in {t2-t1:.2f} seconds.")
    
    return full_list
   

## Helper functions for Author pages

In [57]:
async def scrape_author_tab(s, url, selector):
    while True:
        try:
            r = await s.get(url)
        except (SSLError, MaxRetryError):
            print(f"Failed request to url: {url}.")
            time.sleep(1)
            continue
        break
    result = r.html.find(selector)
    return result
    
    
async def scrape_author_page(s, url, item='name'):
    if item == 'name':
        selector = 'div#fullNameDiv span'
        result = await scrape_author_tab(s, url, selector)
        try:
            result = result[0].text
        except IndexError:
            result = None
    
    elif item == 'id':
        selector = 'div#orcidDiv a span'
        result = await scrape_author_tab(s, url, selector)
        try:
            result = result[0].text
        except IndexError:
            result = None
    
    elif item == 'institution':
        url_dep = url + '/researcherdepartaments.html?onlytab=true'
        selector = 'table.table tr td'
        institution = await scrape_author_tab(s, url_dep, selector)
        result = {}
        try:
            result['department'] = institution[0].text
            result['institution'] = institution[1].text
        except:
            pass
    
    elif item == 'projects':
        url_proj = url + '/publicresearcherprojects.html?onlytab=true'
        selector = 'table.table tr'
        projects = await scrape_author_tab(s, url_proj, selector)
        project_list = []
        for i in range(1,len(projects)):
            project = projects[i].find('td a')[0].attrs['href']
            project_list.append(project)
        result = project_list
    
    elif item == 'groups':
        url_group = url + '/orgs.html?onlytab=true'
        selector = 'table.table tr'
        groups = await scrape_author_tab(s, url_group, selector)
        group_list = []
        for i in range(1,len(groups)):
            group = groups[i].find('td a')[0].attrs['href']
            group_list.append(group)
        result = group_list
    
    return result


async def scrape_author(s, url):
    result = await asyncio.gather(
            scrape_author_page(s, url, 'name'),
            scrape_author_page(s, url, 'id'),
            scrape_author_page(s, url, 'institution'),
            scrape_author_page(s, url, 'projects'),
            scrape_author_page(s, url, 'groups')
        )
    
    author = {}
    author['name'] = result[0]
    author['id'] = result[1]
    try:
        author['department'] = result[2]['department']
    except KeyError:
        pass
    try:
        author['institution'] = result[2]['institution']
    except KeyError:
        pass
    author['projects'] = result[3]
    author['groups'] = result[4]
    
    return author
        
    
async def scrape_authors(s, urls):
    """Wrapper for scrape_author to scrape a list of urls."""
    tasks = (scrape_author(s, url) for url in urls)
    return await asyncio.gather(*tasks)


async def scrape_batch(urls=None, items='authors', start_pos=0, batch_size=100, out_file=None):
    """
    Scrape Portal de la Reserca in batches:.
    Options:
        urls: list of urls. If items='paper_links' this is not needed.
        items: [authors, author_links, paper_links]
        start_pos: starting position
        batch_size: batch size
        out_file: output file 
    """
    
    if items == 'paper_links':
        url_root =                                            \
            'https://portalrecerca.csuc.cat/simple-search?' + \
            'query='                                        + \
            '&location=publications'                        + \
            '&filter_field_1=resourcetype'                  + \
                '&filter_type_1=equals'                     + \
                '&filter_value_1=Items'                     + \
            '&filter_field_2=itemtype'                      + \
                '&filter_type_2=notequals'                  + \
                '&filter_value_2=Phd+Thesis'                + \
            '&sort_by=dc.contributor.authors_sort'          + \
                '&order=asc'                                + \
            '&rpp=300'                                      + \
            '&etal=0'                                       + \
            '&start='
        
        print("Calculating number of pages to scrape.")
        # max_pages = get_max_pages(url_root + '0')
        max_pages = 2240 # <- hard code result
        n_pages = max_pages

        urls = [url_root + str(page*300) for page in range(n_pages)]
    
    if not urls:
        raise TypeError("Must provide list of urls or set items='paper_links'")
    
    batch_urls = [urls[i:i+batch_size] for i in range(0, len(urls), batch_size)]
    
    print(f"Scraping {len(urls)-start_pos} {items} in {len(batch_urls)} batches of {batch_size}.")
    if out_file:
        print(f"Saving results to {out_file}.")
        if items == 'authors':
            result_df = pd.DataFrame(columns=['name', 'id', 'department', 'institution', 'projects', 'groups'])
        elif items == 'paper_links':
            result_df = pd.DataFrame()
        
    result = []
    for i, batch in enumerate(batch_urls):
        print(f"Scraping batch: {i+1}/{len(batch_urls)}. {items}: {i*batch_size}-{(i+1)*batch_size-1}.", end="\r")
        s = AsyncHTMLSession()
        
        t1 = time.perf_counter()
        if items == 'authors':
            batch_result = await scrape_authors(s, batch)
        elif items == 'paper_links':
            batch_result = await scrape_urls(s, batch, items=items)
            # Flatten result
            batch_result = [i for sublist in batch_result for i in sublist]
        t2 = time.perf_counter()
        
        # Print estimated time left
        seconds_left = (len(batch_urls)-i)*(t2-t1)
        m, s = divmod(seconds_left, 60)
        h, m = divmod(m, 60)
        
        print(f"Last batch: {t2-t1:.2f} seconds. Estimated time left: {h:.0f}h{m:.0f}m{s:.0f}s.", end=" ")
        
        result.extend(batch_result)
        
        if out_file:
            result_df = result_df.append(batch_result, ignore_index=True)
            result_df.to_csv(out_file, index=None)

    print("\nDone.")
    return result

# Run Scraper

## Nodelist: Scrape authors

### Get links to author pages

In [35]:
# Get links to author pages (takes 1m30s)
author_urls = await scrape('author_links')

## Save author URLs

# author_urls_df = pd.DataFrame(author_urls, columns=['author_urls'])
# author_urls_df.to_csv('./data/author_urls.csv', index=False)

Scraping author_links from Portal de la Reserca.
Calculating number of pages to scrape.
Scraping 64 URLs...
Scraped 19050 items in 73.96 seconds.


In [33]:
## Read author URLs

author_urls = pd.read_csv('./data/author_urls.csv')

author_urls = list(author_urls['author_urls'])

### Scrape author pages

In [34]:
# Build urls
url_root = 'https://portalrecerca.csuc.cat'
urls = [url_root + url for url in author_urls]

# Run in batch
batch_size=50
out_file = './data/nodelist_test.csv'

author_data = await scrape_batch(urls, items='authors', start_pos=0, batch_size=batch_size, out_file=out_file)

Scraping 19050 authors in 381 batches of 50.
Saving results to ./data/nodelist_test.csv.
Last batch: 17.53 seconds. Estimated time left: 1h51m19s. Scraping batch: 2/381. authors: 50-99.

NameError: name 'SSLError' is not defined

## Edgelist: scrape publications

In [60]:
# Build urls
# url_root = 'https://portalrecerca.csuc.cat'
# urls = [url_root + url for url in author_urls]

# Run in batch
batch_size=50
out_file = './data/paper_links.csv'

paper_links = await scrape_batch(items='paper_links', start_pos=0, batch_size=batch_size, out_file=out_file)

Calculating number of pages to scrape.
Scraping 2240 paper_links in 45 batches of 50.
Saving results to ./data/paper_links.csv.
Could not find row in url: https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start=4200
Could not find row in url: https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start=4500
Could not find row in url: https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+

# EXTRA CODE

## Speed Test: Sync vs Async

In [None]:
url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=300&etal=0&start='

urls = [url_root + str(page*100) for page in range(3)]

In [None]:
from requests_html import HTMLSession, AsyncHTMLSession
import time
import asyncio

def get_author_links_sync(s, url):
    print(f"Getting url: {url}")
    r = s.get(url)
    table = r.html.find('div.panel.panel-info table.table', first=True)
    rows = table.find('tr')
    return rows
    
def main_sync(urls):
    s = HTMLSession()
    result = []
    for url in urls:
        rows = get_author_links_sync(s, url)
        result.append(rows)
    return rows
        
async def get_author_links(s, url):
    print(f"Getting url: {url}")
    r = await s.get(url)
    table = r.html.find('div.panel.panel-info table.table', first=True)
    rows = table.find('tr')
    return rows

async def main(urls):
    s = AsyncHTMLSession()
    tasks = (get_author_links(s, url) for url in urls)
    return await asyncio.gather(*tasks)
        
t1 = time.perf_counter()
result = await main(urls)
t2 = time.perf_counter()
print(f"Async: {t2-t1:.2f} seconds.)")

t1 = time.perf_counter()
result_sync = main_sync(urls)
t2 = time.perf_counter()
print(f"Sync: {t2-t1:.2f} seconds.)")