# Scraping Portal de la Recerca

This notebook scrapes information from the Portal de la Recerca de Catalunya. It can download the following items:

- Publications with date and authors
- Publications with links to the paper page
- Researcher list
- Researcher groups list

# Setup

## Import modules

In [123]:
import pandas as pd
from requests_html import HTMLSession

## Helper functions

In [171]:
# Read the total number of result pages from the pagination box in the footer
def get_max_pages(session_object):
    """
    Return the last page number shown in the pagination box.

    Input:
    - session_object: response object whose ``.html`` supports ``find``
      (e.g. the value returned by ``HTMLSession.get``).

    The second-to-last ``li`` of the first ``ul.pagination.pull-right``
    element is read as the last page number (the final ``li`` is presumably
    the "next" control). Thousands separators ('.' and ',') are removed
    before the text is converted to ``int``.
    """
    footer = session_object.html.find('ul.pagination.pull-right')[0]
    last_page_item = footer.find('li')[-2]
    # Drop '.' and ',' in a single pass, e.g. '2.240' -> '2240'
    digits = last_page_item.text.translate(str.maketrans('', '', '.,'))
    return int(digits)

# Read the number of results per page ('rpp') from a search URL
def _results_per_page(url_root, default=300):
    """
    Return the first positive integer ``rpp`` query value of ``url_root``,
    or ``default`` when the URL has no usable ``rpp`` parameter.
    """
    from urllib.parse import parse_qs, urlsplit

    rpp_values = parse_qs(urlsplit(url_root).query).get('rpp', [])
    try:
        rpp = int(rpp_values[0])
    except (IndexError, ValueError):
        return default
    return rpp if rpp > 0 else default


# Turn the cells of one results-table row into the requested item
def _parse_result_row(items, columns):
    """
    Convert the ``td`` cells of one results row into a scraped item.

    Returns the paper href (str) for 'links', a dict for 'papers' and
    'authors', and None for a 'links' row whose title cell has no hyperlink.
    """
    if items == 'links':
        anchor = columns[1].find('a', first=True)
        if anchor is None:
            return None
        return anchor.attrs.get('href')

    if items == 'papers':
        paper = {
            'date': columns[0].text,
            'title': columns[1].text,
            'type': columns[3].text,
        }
        # Authors are ';'-separated; strip the space left after each ';'
        for i, author in enumerate(columns[2].text.split(';')):
            paper[f"author_{i}"] = author.strip()
        return paper

    # items == 'authors': names are split on ',' (last name first)
    name_parts = columns[0].text.split(',')
    return {
        'Last Name': name_parts[0].strip(),
        # Empty first name when there is no comma in the name
        'First Name': name_parts[1].strip() if len(name_parts) > 1 else '',
        'Institution': columns[1].text,
    }


# Get papers links, papers data or authors data
def scrape(
    items      = 'links',
    url_root   = None,
    max_pages  = None,
    page_start = 0,
    page_size  = None):
    """
    Scrape Portal de la Recerca search results.

    Options:
    - items = [links, papers, authors]
    - url_root: search URL ending in 'start='; the result offset is appended
      to it. Defaults to the built-in authors/publications search.
    - max_pages: page number at which to stop (exclusive). When None or 0 it
      is read from the pagination box of the first results page.
    - page_start: first page to scrape (0-based).
    - page_size: results per page used to compute the offset. When None or 0
      it is taken from the 'rpp' parameter of url_root (300 if absent).

    Returns a DataFrame: a single 'paper links' column for 'links', otherwise
    one row per record. Pages without a results table are skipped with a
    warning. Raises ValueError for an unknown ``items`` value.
    """
    valid_items = ('links', 'papers', 'authors')
    if items not in valid_items:
        # Fail early instead of hitting an undefined variable mid-scrape
        raise ValueError(
            f"Unknown items {items!r}; expected one of {valid_items}.")

    print(f"Scraping {items} from Portal de la Reserca.")

    if not url_root:
        if items == 'authors':
            url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=300&etal=0&start='
        else:
            url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

    if not page_size:
        # Keep the offset step in sync with the page size the URL asks for
        page_size = _results_per_page(url_root)

    session = HTMLSession()

    if not max_pages:
        max_pages = get_max_pages(session.get(url_root + '0'))

    page_number = page_start
    result_list = []

    while page_number < max_pages:
        # Progress
        print(f"Progress: {round(page_number/(max_pages)*100)}%. Scraping page: {page_number}/{max_pages}.", end="\r")

        # Create URL: 'start' is a record offset, not a page index
        url = url_root + str(page_size * page_number)
        # Update page counter
        page_number += 1
        # Get page
        r = session.get(url)
        # Get table
        table = r.html.find('div.panel.panel-info table.table', first=True)
        if table is None:
            # Keep what was scraped so far instead of crashing on one bad page
            print(f"Warning: no results table found at {url}; page skipped.")
            continue

        for row in table.find('tr'):
            columns = row.find('td')
            # Skip rows without data cells
            if not columns:
                continue
            scrape_item = _parse_result_row(items, columns)
            if scrape_item is not None:
                result_list.append(scrape_item)

    if page_number == max_pages:
        print("Progress: 100%                                  ")

    if items == 'links':
        result_df = pd.DataFrame(result_list, columns=['paper links'])
    else:
        result_df = pd.DataFrame.from_records(result_list)

    return result_df

In [172]:
def get_coauthors(paper_links_df, n_papers=None, start=0):
    """
    Get coauthors list from list of paper links
    Input:
    - paper_links_df: Dataframe with hyperlinks to papers (column
      'paper links', relative to https://portalrecerca.csuc.cat).
    - n_papers: number of papers to scrape (None or 0: all from ``start``).
    - start: initial paper to start scraping.
    Output:
    - List with one entry per scraped paper; each entry is the list of author
      IDs (text of 'div#orcidDiv span' on each linked author page).

    Papers whose page has no 'table.itemDisplayTable', or whose table does not
    have the expected layout, are skipped and produce no entry. Authors whose
    page shows no ORCID element are left out of the paper's list.
    """
    if not n_papers:
        n_papers =  len(paper_links_df) - start
    print(f"Scraping author ID's of papers in Portal de la Reserca. Starting in paper {start} and scraping {n_papers} papers.")

    session = HTMLSession()
    url_root = 'https://portalrecerca.csuc.cat'
    threshold = start + n_papers
    # Author href -> ID (or None). The same author appears on many papers,
    # so each author page is fetched only once per call.
    id_by_href = {}

    result_list = []
    for i, link in enumerate(paper_links_df['paper links']):

        if i < start:
            continue
        if i >= threshold:
            break

        print(f"Progress: {round(i/(threshold)*100)}%. Scraping paper: {i+1}/{threshold}.", end="\r")
        r = session.get(url_root + link)

        table = r.html.find('table.itemDisplayTable', first=True)
        if table is None:
            continue
        rows = table.find('tr')
        try:
            # The third row's second cell is assumed to hold the author links
            author_cell = rows[2].find('td')[1]
        except IndexError:
            # Unexpected table layout: skip the paper, as for a missing table
            continue

        author_hrefs = [
            anchor.attrs['href']
            for anchor in author_cell.find('a.author')
            if 'href' in anchor.attrs
        ]

        # Visit author links and get id
        author_list = []
        for href in author_hrefs:
            if href not in id_by_href:
                author_page = session.get(url_root + href).html
                id_span = author_page.find('div#orcidDiv span', first=True)
                id_by_href[href] = id_span.text if id_span is not None else None
            author_id = id_by_href[href]
            if author_id is not None:
                author_list.append(author_id)

        result_list.append(author_list)

    print("Progress: 100%.                          ")
    return result_list

# Scrape paper links, paper authors, and author information 

## Configuration

In [11]:
# Session for ad-hoc requests from this notebook.
# NOTE(review): scrape() and get_coauthors() each open their own HTMLSession,
# so this one looks unused by the scraping cells below — confirm before removing.
session = HTMLSession()

# Choose custom URLs
# url = 'https://portalrecerca.csuc.cat/simple-search?filtername=resourcetype&filterquery=Researchers&filtertype=equals&sort_by=crisrp.fullName_sort&order=ASC&location=crisrp'
# url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=100&etal=0&start='
# Download 300 papers, ordered by author name
# url_root = \
#     'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='
# # Download IGTP papers
# url_root = \
#     'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&filter_field_3=location.coll&filter_type_3=equals&filter_value_3=349&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

## Scrape

In [13]:
# Choose what to scrape; scrape() documents 'links', 'papers' and 'authors'.
# items_to_scrape = 'papers'
items_to_scrape = 'links'
# max_pages=3 stops after result pages 0, 1 and 2 (a short test run)
result_df = scrape(items_to_scrape, max_pages=3)

Scraping links from Portal de la Reserca.
Progress: 100%                                  


## Save output

In [16]:
# Write the scraped table to <output_folder>/<items_to_scrape>.csv without the index column.
# NOTE(review): the folder is not created here — presumably it must already exist.
output_folder =  '../data/network of talent/research networks/Scraping/temp/'
result_df.to_csv(output_folder + items_to_scrape + '.csv', index=False)
print('')
print(f"Saved output to file {items_to_scrape}.csv")


Saved output to file links.csv


# Scrape paper links to get coauthors

## Scrape

In [173]:
# Get links to papers (no max_pages given, so every result page is scraped)
paper_links_df = scrape('links')

# Alternative: reload links saved by a previous run instead of scraping again
# data_folder =  '../data/network of talent/research networks/Scraping/'
# paper_links_df = pd.read_csv(data_folder + 'temp/links.csv')

# Get coauthor IDs from paper links (one list of author IDs per scraped paper)
result_list = get_coauthors(paper_links_df)

Scraping links from Portal de la Reserca.
Progress: 53%. Scraping page: 1193/2240.

KeyboardInterrupt: 

In [None]:
# Number of papers for which an author-ID list was collected
len(result_list)

## Save output

In [163]:
# Create dataframe: one row per paper, one column per author position
# (papers with fewer authors are padded with missing values)
result_df = pd.DataFrame.from_records(result_list)

In [164]:
# Display the coauthor table
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0000-0002-4729-5980,,,,,,,,,,,
1,0000-0001-8337-8356,0000-0002-9605-0461,,,,,,,,,,
2,0000-0001-6035-4845,,,,,,,,,,,
3,0000-0002-2991-9593,,,,,,,,,,,
4,0000-0003-0310-8386,0000-0002-1738-3787,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
894,0000-0001-5041-6432,,,,,,,,,,,
895,0000-0001-5041-6432,,,,,,,,,,,
896,0000-0001-5041-6432,,,,,,,,,,,
897,0000-0001-5041-6432,,,,,,,,,,,


In [165]:
# Save the coauthor table as processed/coauthors.csv under the data folder (index column omitted)
data_folder =  '../data/network of talent/research networks/Scraping/'
result_df.to_csv(data_folder + 'processed/coauthors.csv', index=False)

In [126]:
# Display the raw per-paper lists of author IDs
result_list

[['0000-0002-4729-5980'],
 ['0000-0001-8337-8356', '0000-0002-9605-0461'],
 ['0000-0001-6035-4845'],
 ['0000-0002-2991-9593'],
 ['0000-0003-0310-8386', '0000-0002-1738-3787'],
 ['0000-0003-3794-3443'],
 ['0000-0002-6911-8846'],
 ['0000-0001-8946-1417'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-4276-336X'],
 ['0000-0002-3882-9135'],
 ['0000-0002-4276-336X']]

In [None]:
## Create matrix


# Todo
1. Scrape the publications list and get the IDs of the authors. Each row should be a list of IDs.
2. Check the results from scraping the author IDs in publications (coauthors.csv).
3. Create a matrix or edge list of coauthors.

In [None]:
# Create nodelist

1. Get all author links

In [175]:
# Fetch a single researcher page addressed by ORCID and display its text content
session = HTMLSession()
url = 'https://portalrecerca.csuc.cat/orcid/0000-0003-0763-2695'
r = session.get(url)
# Last expression of the cell: shown as the cell output
r.html.text

'function noShadow(){ } window.cookieconsent_options = {"message":"Aquest lloc web fa servir cookies. Si hi segueix navegant considerarem que n\'està acceptant el seu ús. ","dismiss":"Entesos!","learnMore":"Més informació sobre les cookies","link":"/legal","theme":"static/css/cookie_law.css"};\nPortal de la Recerca de Catalunya: PÃ\xa0gina de l\'investigador\n.ui-autocomplete-loading { background: url(\'../image/jdyna/indicator.gif\') no-repeat right center }\nvar j = jQuery.noConflict(); var $ = jQuery.noConflict(); var JQ = j; dspaceContextURL = "/orcid/0000-0003-0763-2695"; dspaceContextPath = ""; //Navigate function function goToBrowse(){ console.log("go to"); var URL = "http://"; var select1 = document.getElementById("select_explore"); var web = select1.options[select1.selectedIndex].value; //var web = document.explore.options[document.explore.selectedIndex].value; console.log(web); window.location.href = web; //window.open(web, \'_blank\', \'\'); } var _gaq = _gaq || []; _gaq.pus

In [None]:
import asyncio
from requests_html import AsyncHTMLSession


urls = [
    '
asession = AsyncHTMLSession()

def get_author_links():
    

In [None]:
# Read the total number of result pages from the pagination box in the footer
def get_max_pages(session_object):
    """
    Return the last page number shown in the pagination box.

    Input:
    - session_object: response object whose ``.html`` supports ``find``
      (e.g. the value returned by ``HTMLSession.get``).

    The second-to-last ``li`` of the first ``ul.pagination.pull-right``
    element is read as the last page number (the final ``li`` is presumably
    the "next" control). Thousands separators ('.' and ',') are removed
    before the text is converted to ``int``.
    """
    footer = session_object.html.find('ul.pagination.pull-right')[0]
    last_page_item = footer.find('li')[-2]
    # Drop '.' and ',' in a single pass, e.g. '2.240' -> '2240'
    digits = last_page_item.text.translate(str.maketrans('', '', '.,'))
    return int(digits)

# Read the number of results per page ('rpp') from a search URL
def _results_per_page(url_root, default=300):
    """
    Return the first positive integer ``rpp`` query value of ``url_root``,
    or ``default`` when the URL has no usable ``rpp`` parameter.
    """
    from urllib.parse import parse_qs, urlsplit

    rpp_values = parse_qs(urlsplit(url_root).query).get('rpp', [])
    try:
        rpp = int(rpp_values[0])
    except (IndexError, ValueError):
        return default
    return rpp if rpp > 0 else default


# Turn the cells of one results-table row into the requested item
def _parse_result_row(items, columns):
    """
    Convert the ``td`` cells of one results row into a scraped item.

    Returns the paper href (str) for 'links', a dict for 'papers' and
    'authors', and None for a 'links' row whose title cell has no hyperlink.
    """
    if items == 'links':
        anchor = columns[1].find('a', first=True)
        if anchor is None:
            return None
        return anchor.attrs.get('href')

    if items == 'papers':
        paper = {
            'date': columns[0].text,
            'title': columns[1].text,
            'type': columns[3].text,
        }
        # Authors are ';'-separated; strip the space left after each ';'
        for i, author in enumerate(columns[2].text.split(';')):
            paper[f"author_{i}"] = author.strip()
        return paper

    # items == 'authors': names are split on ',' (last name first)
    name_parts = columns[0].text.split(',')
    return {
        'Last Name': name_parts[0].strip(),
        # Empty first name when there is no comma in the name
        'First Name': name_parts[1].strip() if len(name_parts) > 1 else '',
        'Institution': columns[1].text,
    }


# Get papers links, papers data or authors data
def scrape(
    items      = 'links',
    url_root   = None,
    max_pages  = None,
    page_start = 0,
    page_size  = None):
    """
    Scrape Portal de la Recerca search results.

    Options:
    - items = [links, papers, authors]
    - url_root: search URL ending in 'start='; the result offset is appended
      to it. Defaults to the built-in authors/publications search.
    - max_pages: page number at which to stop (exclusive). When None or 0 it
      is read from the pagination box of the first results page.
    - page_start: first page to scrape (0-based).
    - page_size: results per page used to compute the offset. When None or 0
      it is taken from the 'rpp' parameter of url_root (300 if absent).

    Returns a DataFrame: a single 'paper links' column for 'links', otherwise
    one row per record. Pages without a results table are skipped with a
    warning. Raises ValueError for an unknown ``items`` value.
    """
    valid_items = ('links', 'papers', 'authors')
    if items not in valid_items:
        # Fail early instead of hitting an undefined variable mid-scrape
        raise ValueError(
            f"Unknown items {items!r}; expected one of {valid_items}.")

    print(f"Scraping {items} from Portal de la Reserca.")

    if not url_root:
        if items == 'authors':
            url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=300&etal=0&start='
        else:
            url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

    if not page_size:
        # Keep the offset step in sync with the page size the URL asks for
        page_size = _results_per_page(url_root)

    session = HTMLSession()

    if not max_pages:
        max_pages = get_max_pages(session.get(url_root + '0'))

    page_number = page_start
    result_list = []

    while page_number < max_pages:
        # Progress
        print(f"Progress: {round(page_number/(max_pages)*100)}%. Scraping page: {page_number}/{max_pages}.", end="\r")

        # Create URL: 'start' is a record offset, not a page index
        url = url_root + str(page_size * page_number)
        # Update page counter
        page_number += 1
        # Get page
        r = session.get(url)
        # Get table
        table = r.html.find('div.panel.panel-info table.table', first=True)
        if table is None:
            # Keep what was scraped so far instead of crashing on one bad page
            print(f"Warning: no results table found at {url}; page skipped.")
            continue

        for row in table.find('tr'):
            columns = row.find('td')
            # Skip rows without data cells
            if not columns:
                continue
            scrape_item = _parse_result_row(items, columns)
            if scrape_item is not None:
                result_list.append(scrape_item)

    if page_number == max_pages:
        print("Progress: 100%                                  ")

    if items == 'links':
        result_df = pd.DataFrame(result_list, columns=['paper links'])
    else:
        result_df = pd.DataFrame.from_records(result_list)

    return result_df