# Get Papers

## Import modules

In [None]:
from requests_html import HTMLSession
import pandas as pd

## Setup

In [None]:
# Start the requests-HTML session that every page request below reuses.
session = HTMLSession()

# Earlier researcher-listing queries, kept for reference only:
# url = 'https://portalrecerca.csuc.cat/simple-search?filtername=resourcetype&filterquery=Researchers&filtertype=equals&sort_by=crisrp.fullName_sort&order=ASC&location=crisrp'
# url_root = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=100&etal=0&start='

# Search URL roots. The offset of the first result is appended after the
# trailing ``start=``; each request returns 300 results (``rpp=300``) ordered
# by author name.

# Every publication except PhD theses.
ALL_PAPERS_URL_ROOT = (
    'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='
)

# Same query restricted to the IGTP collection (``location.coll`` equals 349).
IGTP_PAPERS_URL_ROOT = (
    'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&filter_field_3=location.coll&filter_type_3=equals&filter_value_3=349&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='
)

# Query that is actually scraped. Assign ALL_PAPERS_URL_ROOT instead to scrape
# every publication (and raise max_pages accordingly).
url_root = IGTP_PAPERS_URL_ROOT

# Example of a single paper page.
paper_link = (
    'https://portalrecerca.csuc.cat/article/doi/10.1038/nri3262'
)

# https://portalrecerca.csuc.cat/57253720

# Loop state
page_number = 0  # Index of the first result page to fetch
# Number of result pages to fetch. Values tried before: 2, 100, 2240.
# NOTE(review): 19 is presumably the page count of the IGTP query - confirm.
max_pages = 19
papers_list = []

## Loop

In [None]:
# Fetch the result pages one by one and collect one record per paper.
while page_number < max_pages:
    print(f"Progress: {round(page_number/(max_pages)*100)}%. Scraping page: {page_number}/{max_pages}.", end="\r")
    print('')

    # Each page holds 300 results, so page N starts at result offset 300*N.
    url = url_root + str(300*page_number)
    # Advance the counter before fetching so that a skipped page cannot
    # cause the same URL to be requested forever.
    page_number += 1
    # Get page
    r = session.get(url)
    # Get the results table; find(..., first=True) yields None when absent.
    table = r.html.find('div.panel.panel-info table.table', first=True)
    if table is None:
        # Keep what was scraped so far instead of crashing on one bad page.
        print(f"No results table found at {url}; skipping page.")
        continue

    for row in table.find('tr'):
        columns = row.find('td')
        # Skip rows without data cells
        if len(columns) == 0:
            continue

        # Get paper data
        paper = {
            'date': columns[0].text,
            'title': columns[1].text,
            'type': columns[3].text,
        }

        # The authors cell is a ';'-separated list: one column per author.
        for i, author in enumerate(columns[2].text.split(';')):
            paper[f"author_{i}"] = author

        # Append to papers list
        papers_list.append(paper)

# One row per paper; papers with fewer authors get empty author_<i> cells.
papers_df = pd.DataFrame.from_records(papers_list)

data_folder =  '../data/network of talent/research networks/Scraping/'
papers_df.to_csv(data_folder + 'raw/papers_IGTP.csv')
print('')
print('Saved output to file papers_IGTP.csv')

# Get full names

In [1]:
from requests_html import HTMLSession
import pandas as pd

## Helper functions

In [45]:
# Get max pages from pagination box in footer
def get_max_pages(session_object):
    """Return the number of result pages listed in the footer pagination box.

    Args:
        session_object: Response returned by ``session.get``. Its ``html``
            attribute is searched for the ``ul.pagination.pull-right`` list.

    Returns:
        int: The number shown in the second-to-last pagination item, with
        '.' and ',' thousands separators removed. Returns 1 when the page has
        no pagination box or it holds fewer than two items (assumed to mean
        that all results fit on a single page - confirm against the portal).

    Raises:
        ValueError: If the pagination label is not a number.
    """
    pagination_box = session_object.html.find('ul.pagination.pull-right')
    if not pagination_box:
        return 1

    pagination_items = pagination_box[0].find('li')
    if len(pagination_items) < 2:
        return 1

    # NOTE(review): the last item is presumably the "next" link, which is why
    # the page count is read from the one before it - confirm.
    last_page_label = pagination_items[-2].text.replace('.', '').replace(',', '')
    # Callers compare the result with an integer page counter, so it must be
    # an int rather than the raw label text.
    return int(last_page_label)

# Get papers links
# Number of results per page; matches the ``rpp=300`` parameter of the URLs.
_RESULTS_PER_PAGE = 300

# Default search URL roots; the offset of the first result is appended.
_AUTHORS_URL_ROOT = 'https://portalrecerca.csuc.cat/simple-search?query=&location=crisrp&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Researchers&sort_by=crisrp.fullName_sort&order=asc&rpp=300&etal=0&start='
_PAPERS_URL_ROOT = 'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

_DEFAULT_URL_ROOTS = {
    'authors': _AUTHORS_URL_ROOT,
    'links': _PAPERS_URL_ROOT,
    'papers': _PAPERS_URL_ROOT,
}


def _parse_link(columns):
    """Return the href of the first anchor in the second cell of a row."""
    return columns[1].find('a')[0].attrs['href']


def _parse_paper(columns):
    """Return a dict with date, title, type and one author_<i> key per author."""
    paper = {
        'date': columns[0].text,
        'title': columns[1].text,
        'type': columns[3].text,
    }
    # The authors cell is a ';'-separated list.
    for i, author in enumerate(columns[2].text.split(';')):
        paper[f"author_{i}"] = author
    return paper


def _parse_author(columns):
    """Return a dict with the author's last name, first name and institution."""
    name_parts = columns[0].text.split(',')
    return {
        'Last Name': name_parts[0],
        # Names without a comma have no separate first-name part.
        'First Name': name_parts[1] if len(name_parts) > 1 else '',
        'Institution': columns[1].text,
    }


_ROW_PARSERS = {
    'links': _parse_link,
    'papers': _parse_paper,
    'authors': _parse_author,
}


# Scrape paper links, papers or authors
def scrape(
    items      = 'links',
    url_root   = None,
    max_pages  = None,
    page_start = 0):
    """Scrape search-result pages and return the rows as a DataFrame.

    Requests are made through the module-level ``session`` object, which must
    exist before this function is called.

    Args:
        items: What to extract from each result row: 'links' (href of the
            paper), 'papers' (date, title, type and authors) or 'authors'
            (last name, first name and institution).
        url_root: Search URL ending in ``start=``; the result offset is
            appended to it. When empty, a default query for ``items`` is used.
        max_pages: Index of the page at which to stop (exclusive). When
            empty, it is read from the pagination box of the first page.
        page_start: Index of the first page to fetch.

    Returns:
        pandas.DataFrame: A single 'paper links' column for 'links';
        otherwise one column per key of the scraped records.

    Raises:
        ValueError: If ``items`` is not one of the supported kinds.
    """
    if items not in _ROW_PARSERS:
        raise ValueError(
            f"Unknown items {items!r}; expected one of {sorted(_ROW_PARSERS)}.")
    parse_row = _ROW_PARSERS[items]

    print(f"Scraping {items} from Portal de la Reserca.")

    if not url_root:
        url_root = _DEFAULT_URL_ROOTS[items]

    if not max_pages:
        r = session.get(url_root + '0')
        # Coerce to int: the page counter below is compared against it.
        max_pages = int(get_max_pages(r))

    page_number = page_start
    result_list = []

    while page_number < max_pages:
        # Progress
        print(f"Progress: {round(page_number/(max_pages)*100)}%. Scraping page: {page_number}/{max_pages}.", end="\r")

        # Page N starts at result offset _RESULTS_PER_PAGE * N.
        url = url_root + str(_RESULTS_PER_PAGE * page_number)
        # Advance the counter before fetching so that a skipped page cannot
        # cause the same URL to be requested forever.
        page_number += 1
        r = session.get(url)
        # find(..., first=True) yields None when the page has no such table.
        table = r.html.find('div.panel.panel-info table.table', first=True)
        if table is None:
            # Keep what was scraped so far instead of crashing on one bad page.
            print(f"No results table found at {url}; skipping page.")
            continue

        for row in table.find('tr'):
            columns = row.find('td')
            # Skip rows without data cells
            if not columns:
                continue
            result_list.append(parse_row(columns))

    if page_number == max_pages:
        print("Progress: 100%                                  ")

    if items == 'links':
        return pd.DataFrame(result_list, columns=['paper links'])
    return pd.DataFrame.from_records(result_list)

## Scrape

## Setup

In [46]:
# Module-level requests-HTML session; scrape() issues its requests through it.
session = HTMLSession()

# Download all papers (300 results per page), ordered by author name
# url_root = \
#     'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

# Download IGTP papers
# url_root = \
    # 'https://portalrecerca.csuc.cat/simple-search?query=&location=publications&filter_field_1=resourcetype&filter_type_1=equals&filter_value_1=Items&filter_field_2=itemtype&filter_type_2=notequals&filter_value_2=Phd+Thesis&filter_field_3=location.coll&filter_type_3=equals&filter_value_3=349&sort_by=dc.contributor.authors_sort&order=asc&rpp=300&etal=0&start='

## Scrape

In [49]:
# Kind of item to collect; it also names the output file written later.
items_to_scrape = 'papers'
# Stop before page index 3, i.e. fetch only the first three result pages.
result_df = scrape(items=items_to_scrape, max_pages=3)

Scraping papers from Portal de la Reserca.
Progress: 100%                                  


## Save output

In [34]:
# Write the scraped table to <data_folder>/raw/<items_to_scrape>.csv.
data_folder =  '../data/network of talent/research networks/Scraping'
# The f-string must be closed with the same quote character that opens it.
result_df.to_csv(f"{data_folder}/raw/{items_to_scrape}.csv")
print('')
print(f"Saved output to file {items_to_scrape}.csv")

world


In [None]:
# paper_link = \
# 'https://portalrecerca.csuc.cat/article/doi/10.1038/nri3262'



# Begin loop
# NOTE(review): page_number and max_pages are set here but nothing in this
# cell reads them - presumably left over from a removed scraping loop; confirm.
page_number = 0  # Start loop here
# max_pages = 2
# max_pages = 100
# max_pages = 2240
# max_pages = 10
max_pages = 2240

    


# Save
# NOTE(review): papers_df is not built in this cell, so this writes whatever
# an earlier cell left in it to paper_links.csv - confirm that it really holds
# paper links before relying on the file.
data_folder =  '../data/network of talent/research networks/Scraping/'
papers_df.to_csv(data_folder + 'raw/paper_links.csv')

print('')
print('Saved output to file paper_links.csv')

In [None]:
# Bare expression: shows papers_df as the cell output.
papers_df