# Save Data

This notebook post-processed the scraped data from Portal de la Reserca and saves filtered versions of data to avoid wasting time later in the computing.

**NOTE: you only need to run this notebook once. Every run will overwrite data that is the input of the following steps.** 

# Import modules

In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from itertools import combinations
import random

# Save papers by institution

## Helper Functions

In [2]:
def convert_to_list(x):
    """Convert string column to list"""
    try:
        result = literal_eval(x)
    except ValueError:
        result = np.nan
    return result

def has_1_author(row, df):
    try:
        result = bool(set(row['orcids']) & set(df))
    except TypeError:
        result = False
    return result

def has_2_authors(row, df):
    try:
        result = len(set(row['orcids']) & set(df)) > 1
    except TypeError:
        result = False
    return result

## Combine scraped data into single file
(takes ~5mins, often gets stuck)

In [3]:
# Combine papers datasets that were downloaded in two batches
# papers0_df = pd.read_csv('./data/papers_0.csv')
# papers1_df = pd.read_csv('./data/papers_1.csv')
# papers_df = papers0_df.append(papers1_df)
# papers_df = papers_df.drop_duplicates()
# papers_df.to_csv('./data/papers.csv', index=False)

## Save papers and authors by institution

### Load papers and authors

In [4]:
# Load papers
papers_df = pd.read_csv('./data/papers.csv')
# Convert strings to list of coauthors
papers_df['orcids'] = papers_df['orcids'].apply(lambda x: convert_to_list(x))
# Load authors
authors_df = pd.read_csv('./data/nodes.csv')

### Filter papers and authors

In [6]:
authors_df.loc[authors_df['institution'] == 'CIMNE']

Unnamed: 0,name,id,department,institution,projects,groups
118,"Agelet de Saracibar Bosch, Carlos",0000-0002-0352-1720,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,"['/cris/project/pj3462653', '/cris/project/pj3...",['/cris/ou/ou167744']
478,"Alonso Perez de Agreda, Eduardo",0000-0003-2472-3951,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,"['/cris/project/pj3453409', '/cris/project/pj3...",['/cris/ou/ou167817']
1055,"Arroyo Alvarez de Toledo, Marcos",0000-0001-9384-9107,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,"['/cris/project/pj3422856', '/cris/project/pj3...",[]
1128,"Attarnejad, Reza",0000-0003-1955-2887,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,[],[]
1261,"Badia Rodríguez, Santiago Ignacio",0000-0003-2391-4086,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,[],[]
...,...,...,...,...,...,...
17898,"Ubach, Pere-Andreu",0000-0001-5509-1622,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,[],[]
18173,"Vaunat, Jean",0000-0003-3579-9652,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,"['/cris/project/pj3422856', '/cris/project/pj3...",['/cris/ou/ou167817']
18246,"Velasquez, C. A.",0000-0001-9248-0299,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,[],[]
18326,"Verdugo, Francesc",0000-0003-3667-443X,Centre Internacional de Mètodes Numèrics a l’E...,CIMNE,[],[]


In [8]:
authors_df

Unnamed: 0,name,id,department,institution,projects,groups,institution_ext
0,"Aasen, Trond",0000-0003-0763-2695,Àrea de Oncologia,VHIR,"['/cris/project/pj3440995', '/cris/project/pj3...",['/cris/ou/ou168865'],VHIR
1,"Abad Calvo, Pilar",0000-0002-6078-0956,Institut de Recerca Germans Trias i Pujol,IGTP,[],[],IGTP
2,"Abad Capa, Jorge",0000-0002-8129-7859,Departament de Medicina,UAB,[],[],UAB
3,"Abad Cuñado, Vanessa",0000-0003-4265-1656,"Departament d'Enginyeria Química, Biològica i ...",UAB,[],['/sgr/2017SGR0014'],UAB
4,"Abad Lázaro, Aina",0000-0003-0260-9170,,,[],['/cris/ou/ou167707'],
...,...,...,...,...,...,...,...
19045,"Zurczak, Marek",0000-0003-4184-3410,,,[],[],
19046,"Zuriguel Pérez, Esperanza",0000-0002-0622-8423,Àrea de Malalties Digestives i Hepàtiques,VHIR,"['/cris/project/pj3459159', '/cris/project/pj3...",['/cris/ou/ou168823'],VHIR
19047,"Zwart, Marta",0000-0003-4888-0927,Institut de Recerca Germans Trias i Pujol,IGTP,[],[],IGTP
19048,"Zweizig, John",0000-0002-1521-3397,Institut d'Estudis Espacials de Catalunya,IEEC,[],[],IEEC


In [9]:
# Create groups of institutions
authors_df['institution_group'] = authors_df['institution']

In [10]:
mask = (authors_df['institution'] == 'UPC') | (authors_df['institution'] == 'CIMNE')
authors_df.loc[mask, 'institution_group'] = 'UPC_CIMNE'

In [15]:
institution_2_list = ['Institut de Recerca contra la Leucèmia Josep Carreras', 'IrsiCaixa AIDS Research Institute']
mask = (authors_df['institution'] == 'IGTP' | (authors_df['institution_2'] in institution_2_list))
authors_df.loc[mask, 'institution_group'] = 'IGTP+'

In [16]:
authors_df.loc[mask]

Unnamed: 0,name,id,department,institution,projects,groups,institution_group


In [14]:
authors_df['institution'].unique()

array(['VHIR', 'IGTP', 'UAB', nan, 'UPC', 'URL', 'UB', 'IRTA', 'URV',
       'UAO CEU', 'UdG', 'UdL', 'UIC', 'ICP', 'UVic-UCC', 'IISPV', 'IFAE',
       'UOC', 'IREC', 'CIMNE', 'IDIBELL', 'IEEC', 'UPF', 'CRM', 'CVC',
       'CREAF', 'CRG', 'CTTC', 'CTFC', 'IRBLleida', 'ICN2', 'CREI',
       'IDIBGI', 'CED'], dtype=object)

In [None]:
institution_list = ['IGTP', 'UPC', 'CIMNE', 'UB', 'UPF', 'UVic-UCC', 'UOC']

for institution in institution_list:
    # Extract authors from institution
    print(f"Extracting authors from {institution}.")
    authors_inst_df = authors_df.loc[authors_df['institution'] == institution]
    authors_inst_df = authors_inst_df.rename(columns={'name':'label'})
    authors_inst = authors_inst_df['id'].unique()
    # Save
    authors_inst_df.to_csv(f'./data/nodes_{institution}.csv', index=None)
    print(f"Saved './data/nodes_{institution}.csv'")

    # Extract papers with authors from institution
    print(f"Extracting papers of researchers from {institution}.")
    mask = papers_df.apply(lambda x: has_2_authors(x, authors_inst), axis=1)
    papers_inst_df = papers_df[mask]
    # Save
    papers_inst_df.to_csv(f'./data/papers_{institution}.csv', index=None)
    print(f"Saved './data/papers_{institution}.csv'")

## Create papers matrix
This block is not necessary since its output is not used in the next steps.

In [None]:
for institution in institution_list:
    print(f"Institution: {institution}.")
    papers_inst_df = pd.read_csv(f'./data/papers_{institution}.csv', converters = {'orcids': eval})
    # Get papers column
    papers = papers_inst_df['orcids'].copy()
    papers = papers.reset_index(drop=True)

    # Get unique list of authors from papers
    authors_index = list(set(papers.sum()))

    authors_index.sort()

    # Create boolean matrix with papers
    paper_bool_df = pd.DataFrame(columns=authors_index, index=range(len(papers)))
    for i, paper in enumerate(papers):
        paper_bool_df.loc[i,:] = 0
        for orcid in paper:
            paper_bool_df.loc[i,orcid] = 1

    paper_bool_df.to_csv(f"./data/paper_author_matrix_{institution}.csv", index=None)