# Scraping Portal de la Recerca

This notebook asynchronously scrapes information from Portal de la Recerca. It can download the following items:

- Links to author portals
- Author information from the author portals.
- Links to project, group and paper pages, and the information on those pages.

# Import modules

In [2]:
import pandas as pd
import asyncio
from datetime import date

# Helper functions
from src.scrape import (get_max_pages, retry_url,
                         scrape, scrape_url,
                         scrape_author, scrape_project, scrape_group)

# Run Scraper

## Nodes: Scrape authors

### Get links to author pages

In [None]:
items = 'author_links'
batch_size = 20
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/author_urls_{date_today}.csv'
author_urls = await scrape(items=items, batch_size=batch_size, out_file=out_file)

### Scrape author pages

In [3]:
# Build urls
date_today = date.today().strftime("%Y%m%d")
author_urls = pd.read_csv(f'./data/author_urls_{date_today}.csv')
author_urls = list(author_urls['0'])
url_root = 'https://portalrecerca.csuc.cat'
urls = [url_root + url for url in author_urls]

# Get author data in batch
items = 'authors'
batch_size = 100
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/nodes_{date_today}.csv'
author_data = await scrape(items=items, urls=urls, batch_size=batch_size, out_file=out_file)

Scraping authors from Portal de la Reserca.
Saving results to ./data/nodes_20220309.csv.
Scraping authors from 18,900 URLs in 189 batches of 100, starting at 0.
Progress: 3% (6/189). URLs: 500-599. Batch time: 11.02s. Time left: 0h33m47s.

CancelledError: 

In [None]:
# Last batch got stuck 
# Scraping authors from Portal de la Reserca.
# Saving results to ./data/nodes_20220301.csv.
# Scraping authors from 19,050 URLs in 191 batches of 100, starting at 0.
# Progress: 58% (111/191). URLs: 11000-11099. Batch time: 22.55s. Time left: 0h30m27s.
author_data = await scrape(items=items, urls=urls, batch_size=batch_size, out_file=out_file, start_pos=11000)

## Nodes: Scrape projects

### Get links to projects from Portal de la Recerca

In [None]:
items = 'project_links'
batch_size = 10
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/project_links_{date_today}.csv'
project_urls = await scrape(items=items, batch_size=batch_size, out_file=out_file)

### Alternative: get links to projects from nodes

In [None]:
# Disabled alternative: gather the project links from the `projects` column of
# the scraped author nodes instead of crawling the portal.
# NOTE(review): before re-enabling, add `import ast` (the import cell of this
# notebook does not import it) and check the input path -- the author cell
# writes ./data/nodes_<date>.csv, not ./data/nodes.csv.
# Presumably each `projects` value is a stringified Python list -- confirm.
# nodes_df = pd.read_csv("./data/nodes.csv")
# projects = set()
# for projects_string in nodes_df['projects']:
#     projects_list = ast.literal_eval(projects_string)
#     projects.update(projects_list)

### Build URLs

In [None]:
# Build the absolute project URLs from the links file saved today.
# The date stamp is computed here, as in the other cells, so this cell does
# not fail with a NameError when no earlier cell has defined `date_today`.
date_today = date.today().strftime("%Y%m%d")
url_root = 'https://portalrecerca.csuc.cat'
# The links are read from the column named '0' of the links CSV.
project_urls = list(pd.read_csv(f'./data/project_links_{date_today}.csv')['0'])
urls = [url_root + url for url in project_urls]

### Scrape projects

In [None]:
items = 'projects'
batch_size = 10
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/projects_{date_today}.csv'
projects = await scrape(items=items, urls=urls, batch_size=batch_size, out_file=out_file)

## Nodes: Scrape groups

### Get links to groups from Portal de la Recerca

In [None]:
items = 'group_links'
batch_size = 10
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/group_links_{date_today}.csv'
group_urls = await scrape(items=items, batch_size=batch_size, out_file=out_file)

### Alternative: get links to groups from nodes

In [None]:
# Disabled alternative: gather the group links from the `groups` column of the
# scraped author nodes instead of crawling the portal.
# NOTE(review): before re-enabling, add `import ast` (the import cell of this
# notebook does not import it) and check the input path -- the author cell
# writes ./data/nodes_<date>.csv, not ./data/nodes.csv.
# Presumably each `groups` value is a stringified Python list -- confirm.
# nodes_df = pd.read_csv("./data/nodes.csv")
# groups = set()
# for groups_string in nodes_df['groups']:
#     groups_list = ast.literal_eval(groups_string)
#     groups.update(groups_list)

### Build URLs

In [None]:
# Build the absolute group URLs from the links file saved today.
# The date stamp is computed here, as in the other cells, so this cell does
# not fail with a NameError when no earlier cell has defined `date_today`.
date_today = date.today().strftime("%Y%m%d")
url_root = 'https://portalrecerca.csuc.cat'
# The links are read from the column named '0' of the links CSV.
group_urls = list(pd.read_csv(f'./data/group_links_{date_today}.csv')['0'])
urls = [url_root + url for url in group_urls]

### Scrape groups

In [None]:
items = 'groups'
batch_size = 10
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/groups_{date_today}.csv'
groups = await scrape(items=items, urls=urls, batch_size=batch_size, out_file=out_file)

## Edgelist: scrape publications

### Get links to papers

In [None]:
items = 'paper_links'
batch_size = 10
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/paper_links_{date_today}.csv'

paper_urls = await scrape(items=items, batch_size=batch_size, out_file=out_file)

### Get coauthors in paper pages

In [None]:
# Build urls
paper_urls = pd.read_csv(f'./data/paper_links_{date_today}.csv')
paper_urls = list(paper_urls['0'])
url_root = 'https://portalrecerca.csuc.cat'
urls = [url_root + url + '?mode=full' for url in paper_urls]

# Run in batch
items = 'papers'
batch_size = 10
start_pos = 56040 # old starting position with problems
# start_pos = 0
date_today = date.today().strftime("%Y%m%d")
out_file = f'./data/papers_{date_today}.csv'

papers = await scrape(items=items, urls=urls, start_pos=start_pos, batch_size=batch_size, out_file=out_file)

# For script execution

In [None]:
def main(argv):
    """Run one scraping job selected on the command line.

    Args:
        argv: Argument vector laid out like `sys.argv`: the program name,
            then the item type, then an optional batch size (default 20).

    Returns:
        0 once the scrape has finished, or 1 when the arguments are unusable
        (a usage message is printed in that case and nothing is scraped).
    """
    usage = ("Choose argument 1: author_links, authors, project_links, projects, "
             "group_links, groups, paper_links, papers. Choose argument 2: batch size.")

    if len(argv) < 2:
        print(usage)
        return 1
    if len(argv) > 3:
        print("Too many arguments. " + usage)
        return 1

    items = argv[1]
    batch_size = 20
    if len(argv) == 3:
        # Command-line arguments arrive as strings, while every other call to
        # scrape() in this notebook passes an integer batch size.
        try:
            batch_size = int(argv[2])
        except ValueError:
            batch_size = 0
        if batch_size < 1:
            print(f"Batch size must be a positive integer, got {argv[2]!r}. " + usage)
            return 1

    date_today = date.today().strftime("%Y%m%d")
    url_root = 'https://portalrecerca.csuc.cat'

    # item type -> (links CSV to read, or None when the links themselves are
    # being scraped; suffix appended to every URL; output CSV).
    jobs = {
        'author_links': (None, '', f'./data/author_urls_{date_today}.csv'),
        'authors': (f'./data/author_urls_{date_today}.csv', '',
                    f'./data/nodes_{date_today}.csv'),
        'project_links': (None, '', f'./data/project_links_{date_today}.csv'),
        'projects': (f'./data/project_links_{date_today}.csv', '',
                     f'./data/projects_{date_today}.csv'),
        'group_links': (None, '', f'./data/group_links_{date_today}.csv'),
        'groups': (f'./data/group_links_{date_today}.csv', '',
                   f'./data/groups_{date_today}.csv'),
        'paper_links': (None, '', f'./data/paper_links_{date_today}.csv'),
        'papers': (f'./data/paper_links_{date_today}.csv', '?mode=full',
                   f'./data/papers_{date_today}.csv'),
    }

    # Reject unknown item types up front instead of failing later on
    # undefined `urls` / `out_file`.
    if items not in jobs:
        print(f"Unknown item type {items!r}. " + usage)
        return 1

    links_file, url_suffix, out_file = jobs[items]
    if links_file is None:
        urls = None
    else:
        # All links files are read from the column named '0', matching how the
        # interactive cells of this notebook read the same files.
        links = pd.read_csv(links_file)
        urls = [url_root + url + url_suffix for url in links['0']]

    asyncio.run(scrape(items=items, urls=urls, batch_size=batch_size, out_file=out_file))
    return 0


if __name__ == "__main__":
    # Meant for running the exported notebook as a script; the exit status is
    # the value returned by main().
    import sys
    sys.exit(main(sys.argv))