# Scraping Portal de la Recerca

This notebook asynchronously scrapes information from the Portal de la Recerca. It can download the following items:

- Links to author portals
- Author information from the author portals.

# Import modules

In [20]:
import pandas as pd
from requests_html import HTMLSession, AsyncHTMLSession
import time
import asyncio

# Helper functions

## Helper Functions for links

In [21]:
async def scrape_url(s, url, items='author_links'):
    """
    Scrape one search-results page asynchronously.

    Args:
        s: Session whose awaitable ``get(url)`` returns a response exposing
            ``.html.find``.
        url: Address of the results page.
        items: What to extract from each table row. One of
            'paper_links', 'author_links', 'papers', 'authors'.

    Returns:
        A list with one entry per non-empty table row: an href string for
        the two link modes, a dict for 'papers' and 'authors'. An empty list
        when the page has no results table.

    Raises:
        ValueError: If ``items`` is not one of the supported values.
    """
    supported = ('paper_links', 'author_links', 'papers', 'authors')
    if items not in supported:
        raise ValueError(f"items must be one of {supported}, got {items!r}")

    # print(f"Scraping url: {url}")
    r = await s.get(url)
    table = r.html.find('div.panel.panel-info table.table', first=True)
    if table is None:
        # find(..., first=True) gave nothing: the page has no results table
        return []

    result_list = []
    for row in table.find('tr'):
        columns = row.find('td')
        # Rows without <td> cells carry no data and are skipped
        if not columns:
            continue
        result_list.append(_parse_result_row(columns, items))

    return result_list


def _parse_result_row(columns, items):
    """Convert the <td> cells of one results-table row into the requested item."""
    if items == 'paper_links':
        # The paper link sits in the second column
        return columns[1].find('a')[0].attrs['href']

    if items == 'author_links':
        # The author link sits in the first column
        return columns[0].find('a')[0].attrs['href']

    if items == 'papers':
        paper_date    = columns[0].text
        paper_title   = columns[1].text
        paper_authors = columns[2].text
        paper_type    = columns[3].text

        paper = {'date': paper_date, 'title': paper_title, 'type': paper_type}
        # One numbered key per author; the cell text is ';'-separated
        for i, author in enumerate(paper_authors.split(';')):
            paper[f"author_{i}"] = author
        return paper

    # items == 'authors'
    name_parts = columns[0].text.split(',')
    return {
        'Last Name': name_parts[0],
        # Empty first name when the cell text contains no comma
        'First Name': name_parts[1] if len(name_parts) > 1 else '',
        'Institution': columns[1].text,
    }


async def main(urls, items='author_links'):
    """
    Scrape every URL in ``urls`` concurrently with one shared async session.

    Run with ``await main(urls)`` in a Jupyter notebook. Returns one result
    list per URL, in the same order as ``urls``.
    """
    session = AsyncHTMLSession()
    pending = [scrape_url(session, page_url, items) for page_url in urls]
    per_page_results = await asyncio.gather(*pending)
    return per_page_results

def get_max_pages(url):
    """
    Fetch ``url`` synchronously and read the page count from its pagination box.

    Returns the text of the second-to-last ``<li>`` in the first
    ``ul.pagination.pull-right`` element, converted to an int after removing
    '.' and ',' thousands separators.
    """
    response = HTMLSession().get(url)
    pager = response.html.find('ul.pagination.pull-right')[0]
    entries = pager.find('li')
    # presumably the last <li> is a "next" control, so the one before it is
    # the highest page number — verify against the live markup
    digits = entries[-2].text.replace('.', '').replace(',', '')
    return int(digits)

async def scrape(items='author_links', start_page=0, n_pages=None):
    """
    Scrape Portal de la Reserca search results concurrently.

    Args:
        items: 'paper_links', 'author_links', 'papers' or 'authors'.
            The two author modes query the researchers listing, the two
            paper modes query the publications listing.
        start_page: Zero-based index of the first results page
            (each page holds 300 rows).
        n_pages: Number of pages to scrape. When None or 0, the total is
            read from the site's pagination box and every page from
            ``start_page`` onwards is scraped.

    Returns:
        A flat list with the items of all scraped pages.

    Raises:
        ValueError: If ``items`` is not one of the supported values.
    """
    # Pick the search URL first so an unsupported value fails immediately
    # with a clear message instead of an UnboundLocalError further down.
    if items in ('author_links', 'authors'):
        url_root = (
            'https://portalrecerca.csuc.cat/simple-search?'
            'query='
            '&location=crisrp'
            '&filter_field_1=resourcetype'
            '&filter_type_1=equals'
            '&filter_value_1=Researchers'
            '&sort_by=crisrp.fullName_sort'
            '&order=asc'
            '&rpp=300'
            '&etal=0'
            '&start='
        )
    elif items in ('paper_links', 'papers'):
        url_root = (
            'https://portalrecerca.csuc.cat/simple-search?'
            'query='
            '&location=publications'
            '&filter_field_1=resourcetype'
            '&filter_type_1=equals'
            '&filter_value_1=Items'
            '&filter_field_2=itemtype'
            '&filter_type_2=notequals'
            '&filter_value_2=Phd+Thesis'
            '&sort_by=dc.contributor.authors_sort'
            '&order=asc'
            '&rpp=300'
            '&etal=0'
            '&start='
        )
    else:
        raise ValueError(
            "items must be 'paper_links', 'author_links', 'papers' or "
            f"'authors', got {items!r}"
        )

    print(f"Scraping {items} from Portal de la Reserca.")

    if not n_pages:
        print("Calculating number of pages to scrape.")
        max_pages = get_max_pages(url_root + '0')
        n_pages = max_pages - start_page

    # The `start` query parameter is a row offset: 300 rows per page
    urls = [url_root + str(page*300) for page in range(start_page, start_page+n_pages)]

    print(f"Scraping {len(urls)} URLs starting in page {start_page}...")
    t1 = time.perf_counter()
    result = await main(urls, items=items)
    t2 = time.perf_counter()

    # Flatten the per-page lists into a single list
    full_list = [item for sublist in result for item in sublist]

    print(f"Scraped {len(full_list)} items in {t2-t1:.2f} seconds.")

    return full_list
   

## Helper functions for items in links

In [22]:
async def scrape_author_tab(s, url, selector):
    """Fetch ``url`` with session ``s`` and return the elements matching ``selector``."""
    response = await s.get(url)
    return response.html.find(selector)
    
    
async def scrape_author_page(s, url, item='name'):
    """
    Scrape one piece of information from an author's profile.

    Args:
        s: Async session passed through to ``scrape_author_tab``.
        url: Address of the author profile.
        item: Which piece to fetch:
            'name'        -> text of the first ``div#fullNameDiv span``
            'id'          -> text of the first ``div#orcidDiv a span``
            'institution' -> dict with 'department' and/or 'institution'
            'projects'    -> list of hrefs from the projects tab
            'groups'      -> list of hrefs from the groups tab

    Returns:
        The requested value. 'name' and 'id' yield None when the profile
        lacks the element, so one incomplete profile does not abort a whole
        gathered batch. The institution dict only holds the keys that were
        found.

    Raises:
        ValueError: If ``item`` is not one of the supported values.
    """
    if item == 'name':
        spans = await scrape_author_tab(s, url, 'div#fullNameDiv span')
        return _first_text(spans)

    if item == 'id':
        spans = await scrape_author_tab(s, url, 'div#orcidDiv a span')
        return _first_text(spans)

    if item == 'institution':
        url_dep = url + '/researcherdepartaments.html?onlytab=true'
        cells = await scrape_author_tab(s, url_dep, 'table.table tr td')
        result = {}
        # First cell is stored as department, second as institution;
        # either may be absent.
        if len(cells) > 0:
            result['department'] = cells[0].text
        if len(cells) > 1:
            result['institution'] = cells[1].text
        return result

    if item == 'projects':
        url_proj = url + '/publicresearcherprojects.html?onlytab=true'
        rows = await scrape_author_tab(s, url_proj, 'table.table tr')
        return _row_links(rows)

    if item == 'groups':
        url_group = url + '/orgs.html?onlytab=true'
        rows = await scrape_author_tab(s, url_group, 'table.table tr')
        return _row_links(rows)

    raise ValueError(
        "item must be 'name', 'id', 'institution', 'projects' or 'groups', "
        f"got {item!r}"
    )


def _first_text(elements):
    """Return the text of the first element, or None when the list is empty."""
    return elements[0].text if elements else None


def _row_links(rows):
    """Collect the first ``td a`` href of every row after the first one."""
    hrefs = []
    # rows[0] is skipped, as in the original loop starting at index 1
    for row in rows[1:]:
        anchors = row.find('td a')
        # Rows without a link are skipped instead of raising IndexError
        if anchors:
            hrefs.append(anchors[0].attrs['href'])
    return hrefs


async def scrape_author(s, url):
    """
    Gather all profile fields of one author into a single dict.

    The five ``scrape_author_page`` requests run concurrently. 'department'
    and 'institution' are only included when the institution lookup
    returned them.
    """
    name, orcid, affiliation, projects, groups = await asyncio.gather(
        scrape_author_page(s, url, 'name'),
        scrape_author_page(s, url, 'id'),
        scrape_author_page(s, url, 'institution'),
        scrape_author_page(s, url, 'projects'),
        scrape_author_page(s, url, 'groups'),
    )

    author = {'name': name, 'id': orcid}
    # Copy only the affiliation keys that are present
    for key in ('department', 'institution'):
        if key in affiliation:
            author[key] = affiliation[key]
    author['projects'] = projects
    author['groups'] = groups

    return author
        
    
async def scrape_authors(urls):
    """Scrape every author profile in ``urls`` concurrently; one dict per URL, in order."""
    session = AsyncHTMLSession()
    pending = [scrape_author(session, profile_url) for profile_url in urls]
    authors = await asyncio.gather(*pending)
    return authors

# Scrape authors to create Nodelist

## Scrape links

In [None]:
# Site root; the batch-scraping cell prepends it to every scraped author href
url_root = 'https://portalrecerca.csuc.cat'

# Get links to author pages (no n_pages given, so scrape() works out the page count itself)
author_urls = await scrape('author_links')

## Scrape items within links

In [None]:
# Scrape in batches to avoid being blocked
batch_size = 100

# How many authors to scrape
n_authors = len(author_urls)

print(f"Scraping {n_authors} authors in batches of {batch_size}.")

result = []
result_df = pd.DataFrame(columns=['name', 'id', 'department', 'institution', 'projects', 'groups'])

for batch_start in range(0, n_authors, batch_size):
    print(f"Scraping batch {batch_start/batch_size+1:.0f}/{n_authors/batch_size:.0f}.", end="\r")
    try:
        batch_urls = author_urls[batch_start:batch_start+batch_size]
    except IndexError:
        batch_urls = author_urls[batch_start:n_authors+1]
        
    urls = [url_root + url for url in batch_urls]
    
    t1 = time.perf_counter()
    author_result = await scrape_authors(urls)
    t2 = time.perf_counter()
    
    print(f"Last batch: {t2-t1:.2f} seconds.", end=" ")
    
    result.extend(author_result)
    result_df = result_df.append(author_result, ignore_index=True)
    result_df.to_csv('nodelist.csv')
    
print("\nDone.")

# Scrape papers

## Scrape links to papers

In [23]:
# Test chunks function

def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

# Sample input for trying out chunks()
mylist = list(range(11))

In [24]:
# Chunk size for the inline check
n=3
# Same slicing as chunks(), written as a list comprehension so the whole result is displayed
[mylist[i:i+n] for i in range(0,len(mylist),n)]

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]]

In [25]:
# Each chunks(...) call builds a fresh generator, so this returns the first chunk
next(chunks(mylist, 3))
# The inner next() returns a list; calling next() on that list raises
# TypeError because a list is iterable but not an iterator
next(next(chunks(mylist, 3)))

TypeError: 'list' object is not an iterator

In [26]:
# Search URL for publications (PhD theses excluded), 300 rows per page,
# sorted by author; the row offset is appended after 'start='.
url_root = (
    'https://portalrecerca.csuc.cat/simple-search?'
    'query='
    '&location=publications'
    '&filter_field_1=resourcetype'
    '&filter_type_1=equals'
    '&filter_value_1=Items'
    '&filter_field_2=itemtype'
    '&filter_type_2=notequals'
    '&filter_value_2=Phd+Thesis'
    '&sort_by=dc.contributor.authors_sort'
    '&order=asc'
    '&rpp=300'
    '&etal=0'
    '&start='
)

# Page count is hard-coded here; the live lookup is kept for reference:
# max_pages = get_max_pages(url_root + '0')
# n_pages = max_pages - start_page
max_pages = 2240
start_page = 0
n_pages = 2240

# One URL per results page; offsets advance by 300 rows
urls = [
    f"{url_root}{offset}"
    for offset in range(start_page * 300, (start_page + n_pages) * 300, 300)
]

In [27]:
url_root

'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

In [28]:
paper_urls = urls

In [None]:
# Scrape in batches to avoid being blocked
batch_size = 100

# How many authors to scrape
# n_pages = len(author_urls)
n_pages = 2240

print(f"Scraping {n_pages} authors in batches of {batch_size}.")

result = []
# result_df = pd.DataFrame(columns=['name', 'id', 'department', 'institution', 'projects', 'groups'])

for batch_start in range(0, n_pages, batch_size):
    print(f"Scraping batch {batch_start/batch_size+1:.0f}/{n_pages/batch_size:.0f}.", end="\r")
    # try:
        # batch_urls = paper_urls[batch_start:batch_start+batch_size]
    # except IndexError:
        # batch_urls = paper_urls[batch_start:n_pages+1]
        
    # urls = batch_urls
    # urls = [url_root + url for url in batch_urls]
    
    t1 = time.perf_counter()
    paper_links = await scrape('paper_links', n_pages=batch_size, start_page=batch_start)
    # author_result = await scrape_authors(urls)
    t2 = time.perf_counter()
    
    print(f"Last batch: {t2-t1:.2f} seconds.", end=" ")
    
    result.extend(paper_links)
    # result_df = result_df.append(author_result, ignore_index=True)
    # result_df.to_csv('nodelist.csv')
    
print("\nDone.")

Scraping 2240 authors in batches of 100.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 0...
Scraped 22200 items in 65.35 seconds.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 100...
Scraped 6600 items in 46.25 seconds.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 200...
Scraped 24000 items in 83.03 seconds.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 300...
Scraped 16800 items in 73.40 seconds.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 400...
Scraped 17700 items in 87.59 seconds.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 500...
Scraped 22500 items in 99.69 seconds.
Scraping paper_links from Portal de la Reserca.
Scraping 100 URLs starting in page 600...


In [None]:
# Result of last scrape:
result

In [None]:
# Save results
# Wraps the accumulated `result` list in a DataFrame and writes it to CSV
pd.DataFrame(result).to_csv('data/paper_links_last.csv')

In [None]:
# url_root = 'https://portalrecerca.csuc.cat'

# Get links to papers: 1000 result pages starting at page 200 (original note: takes 2m to run)
paper_links = await scrape('paper_links', n_pages=1000, start_page=200)

In [None]:
len(paper_links)

In [None]:
# Save output
# NOTE(review): the file name says pages 100-199, but the scrape cell above
# currently requests start_page=200, n_pages=1000 — confirm which range
# `paper_links` holds before writing.
paper_links_df = pd.DataFrame(paper_links, columns=['paper_links'])
paper_links_df.to_csv('./data/paper_links_100-199.csv')

## Scrape items within links to papers

# Speed Test: Sync vs Async

In [None]:
# Researchers listing, 300 rows per page; the row offset is appended after 'start='
url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=300&etal=0&start='

# Three consecutive pages. The offset advances by 300 to match rpp=300;
# a step of 100 would make the requested pages overlap.
urls = [url_root + str(page*300) for page in range(3)]

In [None]:
from requests_html import HTMLSession, AsyncHTMLSession
import time
import asyncio

def get_author_links_sync(s, url):
    """Fetch ``url`` with the blocking session ``s`` and return the results-table rows."""
    print(f"Getting url: {url}")
    page = s.get(url).html
    results_table = page.find('div.panel.panel-info table.table', first=True)
    return results_table.find('tr')
    
def main_sync(urls):
    """
    Fetch every URL one after another with a blocking session.

    Returns a list with the table rows of each page, in the order of
    ``urls`` (the same shape the async ``main`` produces), so the two
    timings compare like for like. An empty ``urls`` gives an empty list.
    """
    s = HTMLSession()
    # Return the accumulated rows of all pages, not just those of the last one
    return [get_author_links_sync(s, url) for url in urls]
        
async def get_author_links(s, url):
    """Fetch ``url`` with the async session ``s`` and return the results-table rows."""
    print(f"Getting url: {url}")
    response = await s.get(url)
    results_table = response.html.find('div.panel.panel-info table.table', first=True)
    return results_table.find('tr')

async def main(urls):
    """
    Fetch the table rows of every URL concurrently (speed-test variant).

    NOTE(review): running this cell rebinds ``main``; the earlier
    ``main(urls, items=...)`` that the async ``scrape`` calls is replaced
    until its defining cell is executed again.
    """
    session = AsyncHTMLSession()
    pending = [get_author_links(session, page_url) for page_url in urls]
    rows_per_page = await asyncio.gather(*pending)
    return rows_per_page
        
t1 = time.perf_counter()
result = await main(urls)
t2 = time.perf_counter()
print(f"Async: {t2-t1:.2f} seconds.)")

t1 = time.perf_counter()
result_sync = main_sync(urls)
t2 = time.perf_counter()
print(f"Sync: {t2-t1:.2f} seconds.)")

# EXTRA CODE

## Helper functions

In [None]:
# Get max pages from pagination box in footer
def get_max_pages(session_object):
    """
    Read the page count from an already-fetched response.

    Takes the second-to-last ``<li>`` of the first
    ``ul.pagination.pull-right`` element in ``session_object.html`` and
    converts its text to an int after removing '.' and ',' separators.
    """
    pager = session_object.html.find('ul.pagination.pull-right')[0]
    label = pager.find('li')[-2].text
    for separator in ('.', ','):
        label = label.replace(separator, '')
    return int(label)

# Synchronous scraper for the search-result pages (paper links, papers or authors)
def scrape( 
    items      = 'links', 
    url_root   = None,
    max_pages  = None, 
    page_start = 0):
    """
    Scrape Portal de la Reserca one page at a time.

    Args:
        items: 'links' (paper hrefs), 'papers' (date/title/type/authors)
            or 'authors' (last name/first name/institution).
        url_root: Search URL ending in 'start='; the row offset is appended.
            When empty, a default URL is chosen from ``items``.
        max_pages: Page index at which to stop (exclusive). When empty, it
            is read from the pagination box of the first page.
        page_start: Index of the first page to scrape.

    Returns:
        A DataFrame: a single 'paper links' column for 'links', otherwise
        one column per scraped field.

    Raises:
        ValueError: If ``items`` is not one of the supported values.
    """
    if items not in ('links', 'papers', 'authors'):
        raise ValueError(
            f"items must be 'links', 'papers' or 'authors', got {items!r}"
        )

    print(f"Scraping {items} from Portal de la Reserca.")
    
    if not url_root:
        if items == 'authors':
            url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=300&etal=0&start='
        else:
            # 'links' and 'papers' both read the publications listing
            url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start=' 
            
    session = HTMLSession()
    
    if not max_pages:
        max_pages = get_max_pages(session.get(url_root + '0'))
            
    page_number = page_start
    result_list = []
        
    while page_number < max_pages:
        # Progress
        print(f"Progress: {round(page_number/(max_pages)*100)}%. Scraping page: {page_number}/{max_pages}.", end="\r")

        # The `start` parameter is a row offset: 300 rows per page
        url = url_root + str(300*page_number)
        # Advance the counter before any `continue` below can skip it
        page_number += 1

        r = session.get(url)
        table = r.html.find('div.panel.panel-info table.table', first=True)
        if table is None:
            # No results table on this page: skip it rather than losing
            # everything scraped so far to an AttributeError.
            continue

        for row in table.find('tr'):
            columns = row.find('td')
            # Rows without <td> cells carry no data
            if not columns:
                continue
                
            if items == 'links':
                # The paper link sits in the second column
                scrape_item = columns[1].find('a')[0].attrs['href']
            
            elif items == 'papers':
                paper_date    = columns[0].text
                paper_title   = columns[1].text
                paper_authors = columns[2].text
                paper_type    = columns[3].text

                paper = {'date': paper_date, 'title': paper_title, 'type': paper_type}
                # One numbered key per author; the cell text is ';'-separated
                for i, author in enumerate(paper_authors.split(';')):
                    paper[f"author_{i}"] = author

                scrape_item = paper
                
            else:  # items == 'authors'
                name_parts = columns[0].text.split(',')
                scrape_item = {
                    'Last Name': name_parts[0],
                    # Empty first name when the cell text contains no comma
                    'First Name': name_parts[1] if len(name_parts) > 1 else '',
                    'Institution': columns[1].text,
                    }
                
            result_list.append(scrape_item)
        
    if page_number == max_pages:
        print("Progress: 100%                                  ")
        
    if items == 'links':
        result_df = pd.DataFrame(result_list, columns=['paper links'])
    else:
        result_df = pd.DataFrame.from_records(result_list)
              
    return result_df

In [None]:
def get_coauthors(paper_links_df, n_papers=None, start=0):
    """
    Get coauthor ID lists from a list of paper links.

    Args:
        paper_links_df: DataFrame whose 'paper links' column holds hrefs
            relative to the portal root.
        n_papers: Number of papers to scrape. When empty, every paper from
            ``start`` to the end is scraped.
        start: Index of the first paper to scrape.

    Returns:
        A list with one entry per scraped paper, each a list of the ID
        texts found on the linked author pages. Papers whose page lacks the
        item table or the authors row are left out; authors whose page
        lacks the ID element are left out of their paper's list.
    """
    if not n_papers:
        n_papers =  len(paper_links_df) - start
    print(f"Scraping author ID's of papers in Portal de la Reserca. Starting in paper {start} and scraping {n_papers} papers.")
    
    session = HTMLSession()
    url_root = 'https://portalrecerca.csuc.cat'
    threshold = start + n_papers
    
    result_list = []
    for i, paper_link in enumerate(paper_links_df['paper links']):
        
        if i < start:
            continue
        if i >= threshold:
            break
            
        print(f"Progress: {round(i/(threshold)*100)}%. Scraping paper: {i+1}/{threshold}.", end="\r")
        r = session.get(url_root + paper_link)

        try:
            table = r.html.find('table.itemDisplayTable')[0]
            # Author links are read from the second cell of the third row
            author_links = table.find('tr')[2].find('td')[1].find('a.author')
        except IndexError:
            # Page without the expected table layout: skip this paper
            continue

        # Distinct loop names here: the original reused `i` and `link`,
        # clobbering the outer loop's variables.
        author_hrefs = [author_link.attrs['href'] for author_link in author_links]

        # Visit author links and get id
        author_list = []
        for href in author_hrefs:
            author_page = session.get(url_root + href).html
            id_spans = author_page.find('div#orcidDiv span')
            if not id_spans:
                # Author page without an ID element: skip this author
                # instead of aborting the whole run with an IndexError.
                continue
            author_list.append(id_spans[0].text)

        result_list.append(author_list)
        
    print(f"Progress: 100%.                          ")
    return result_list

# Scrape paper links, paper authors, and author information 

## Configuration

In [None]:
session = HTMLSession()

# Choose custom URLs
# url = 'https://portalrecerca.csuc.cat/simple-search?filtername=resourcetype&filterquery=Researchers&filtertype=equals&sort_by=crisrp.fullName_sort&order=ASC&location=crisrp'
# url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=100&etal=0&start='
# Download 300 papers, ordered by author name
# url_root = \
#     'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='
# # Download IGTP papers
# url_root = \
#     'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&filter_field_3=location.coll&filter_type_3=equals&filter_value_3=349&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

## Scrape

In [None]:
# items_to_scrape = 'papers'
items_to_scrape = 'links'
# Assumes the synchronous scrape() from the "EXTRA CODE" helper cell is the
# one currently defined; max_pages=3 stops its loop before page index 3.
result_df = scrape(items_to_scrape, max_pages=3)

## Save output

In [None]:
# Destination folder for temporary scraping output
output_folder =  '../data/network of talent/research networks/Scraping/temp/'
# The file is named after the kind of item that was scraped
csv_path = output_folder + items_to_scrape + '.csv'
result_df.to_csv(csv_path, index=False)
print('')
print(f"Saved output to file {items_to_scrape}.csv")

# Scrape paper links to get coauthors

## Scrape

In [None]:
# Get links to papers (synchronous scrape(); returns a DataFrame with a 'paper links' column)
paper_links_df = scrape('links')

# Alternative: reload previously saved links instead of scraping again
# data_folder =  '../data/network of talent/research networks/Scraping/'
# paper_links_df = pd.read_csv(data_folder + 'temp/links.csv')

# Get coauthor IDs from paper links (one list of IDs per paper)
result_list = get_coauthors(paper_links_df)

In [None]:
len(result_list)

## Save output

In [None]:
# Create dataframe
result_df = pd.DataFrame.from_records(result_list)

In [None]:
result_df

In [None]:
# Save the coauthor table (without the index column) under processed/
data_folder =  '../data/network of talent/research networks/Scraping/'
result_df.to_csv(data_folder + 'processed/coauthors.csv', index=False)

In [None]:
result_list

In [None]:
## Create matrix


# Todo
1. Scrape publications list and get ids of authors. Each row should be a list of ids.
2. Check results from scraping ids of authors in publications (coauthors.csv).
3. Create matrix or edgelist of coauthors.

In [None]:
# Create nodelist

1. Get all authors links

In [None]:
# Fetch a single researcher profile by its ORCID URL with a blocking session
session = HTMLSession()
url = 'https://portalrecerca.csuc.cat/orcid/0000-0003-0763-2695'
r = session.get(url)
# Display the page's text content to inspect what can be scraped from it
r.html.text