# Save Data

This notebook post-processes the scraped data from Portal de la Reserca and saves filtered versions of the data, so that later steps do not have to repeat this time-consuming work.

**NOTE: you only need to run this notebook once. Every run will overwrite data that is the input of the following steps.** 

# Import modules

In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from itertools import combinations
from datetime import date
import itertools
import random
import os
import time

# Save papers by institution

## Helper Functions

In [2]:
def convert_to_list(x):
    """Parse the string representation of a Python literal (e.g. a list).

    Parameters
    ----------
    x : str
        Cell value read from a CSV column, such as "['a', 'b']".

    Returns
    -------
    object
        The value produced by ``ast.literal_eval``, or ``np.nan`` when ``x``
        cannot be parsed (a missing value or a malformed string).
    """
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError):
        # ValueError: x is not a string holding a literal (e.g. a float NaN).
        # SyntaxError: x is a string that is not valid Python (e.g. truncated).
        return np.nan

## Combine scraped data into single file
(takes ~5mins, often gets stuck)

In [3]:
# Combine papers datasets that were downloaded in two batches
# papers0_df = pd.read_csv('./data/papers_0.csv')
# papers1_df = pd.read_csv('./data/papers_1.csv')
# papers_df = papers0_df.append(papers1_df)
# papers_df = papers_df.drop_duplicates()
# papers_df.to_csv('./data/papers.csv', index=False)

## Load papers and authors

In [4]:
# Snapshot date (YYYYMMDD) used to tag the dated input/output files.
# It is pinned so the notebook always reads the same snapshot; to tag files
# with the current date instead, use date.today().strftime("%Y%m%d").
os.environ['TZ'] = 'America/New_York'
if hasattr(time, 'tzset'):  # time.tzset() is only available on Unix
    time.tzset()
date_today = '20220309'

# Load papers; 'orcids' is stored as the string form of a list
papers_df = pd.read_csv('./data/papers.csv')
papers_df['orcids'] = papers_df['orcids'].apply(convert_to_list)

# Replace NaNs with empty lists so later code can iterate over every row.
# The replacement is built row by row and given the frame's own index, so it
# does not rely on papers_df having a default 0..n-1 index.
mask = papers_df['orcids'].isna()
papers_df['orcids'] = pd.Series(
    [[] if missing else orcids for orcids, missing in zip(papers_df['orcids'], mask)],
    index=papers_df.index,
    dtype=object,
)

# Load authors; these columns are also stored as string forms of lists
authors_df = pd.read_csv(f'./data/nodes_{date_today}.csv')
for column in ['institution', 'projects', 'groups']:
    authors_df[column] = authors_df[column].apply(convert_to_list)

## Create groups of institutions

### Setup

In [5]:
# Drop duplicate institutions, keeping first-occurrence order.
# list(set(x)) would order the strings differently from one interpreter run
# to the next (string hashes are randomized), making the saved CSVs
# non-reproducible; dict.fromkeys de-duplicates deterministically.
authors_df['institution'] = authors_df['institution'].apply(lambda x: list(dict.fromkeys(x)))

# Helper function for assignment
def assign_to_group(row, institution_list, label):
    """Return the row's institution groups, adding ``label`` on a match.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'institution' and 'institution_group', both lists.
    institution_list : iterable of str
        Institutions that make up the group.
    label : str
        Group label to add when the row belongs to any of them.

    Returns
    -------
    list
        ``row['institution_group']`` itself when none of the row's
        institutions is in ``institution_list``; otherwise a new list with
        the current groups plus ``label``. The row is never mutated.
    """
    if set(row['institution']).isdisjoint(institution_list):
        return row['institution_group']

    # Extend the groups assigned so far (not the raw institution list), so a
    # label added by an earlier call survives when an author matches several
    # groups.
    inst_groups = list(row['institution_group'])
    if label not in inst_groups:
        inst_groups.append(label)
    return inst_groups

# Acronyms that have to be appended to the 'institution' lists, mapped to the
# full institution name they are matched against in 'institution_2'.
acronym_dict = {
    'IJC':'Institut de Recerca contra la Leucèmia Josep Carreras', 
    'IrsiCaixa':'IrsiCaixa AIDS Research Institute' }

def add_acronym(row):
    """Return the row's institutions plus the acronym matching 'institution_2'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'institution' (a list) and 'institution_2' (a name).

    Returns
    -------
    list
        A new list: the row's institutions, followed by every acronym in
        ``acronym_dict`` whose full name equals ``row['institution_2']`` and
        that is not already listed.
    """
    # Work on a copy: mutating the list stored in the DataFrame would make
    # re-running the cell append the acronym again.
    institutions = list(row['institution'])
    for acronym, name in acronym_dict.items():
        # Skip acronyms already present so the affiliation count is not inflated.
        if row['institution_2'] == name and acronym not in institutions:
            institutions.append(acronym)
    return institutions

# Run add_acronym on every author row and store the returned list as the
# author's 'institution' value.
authors_df['institution'] = authors_df.apply(add_acronym, axis=1)

### Assign to groups

In [6]:
# Every author's group list starts out as their institution list
authors_df['institution_group'] = authors_df['institution']

# First group: UPC together with CIMNE
institution_list = ['UPC', 'CIMNE']
label = 'UPC_CIMNE'
authors_df['institution_group'] = authors_df.apply(
    assign_to_group, axis=1, args=(institution_list, label))

In [7]:
# Assign to IGTP+ (IGTP together with IJC and IrsiCaixa)
institution_list = ['IGTP', 'IJC', 'IrsiCaixa']
label = 'IGTP+'
authors_df['institution_group'] = authors_df.apply(
                                    lambda x: assign_to_group(x, institution_list, label), axis=1)

# Drop duplicate institution groups, keeping first-occurrence order.
# list(set(x)) would order the strings differently between interpreter runs
# (string hashes are randomized); dict.fromkeys is deterministic.
authors_df['institution_group'] = authors_df['institution_group'].apply(lambda x: list(dict.fromkeys(x)))

In [8]:
# authors_df

# institution = 'UPC'
# mask = authors_df['institution'].apply(lambda x: institution in x)
# mask.sum()
# institution = 'UPC_CIMNE'
# mask = authors_df['institution_group'].apply(lambda x: institution in x)
# mask.sum()
# mask
# authors_inst_df = authors_df.copy()[mask]
# authors_inst_df

# sorted(list(set(authors_df['institution_group'].sum())))

## Filter and save authors and papers

In [9]:
institution_group_list = ['IGTP+', 'UPC_CIMNE', 'UB', 'UPF', 'UVic-UCC', 'UOC']

# For every institution group: save the nodelist of its authors, the papers
# with at least one of those authors, and the papers with at least two.
for institution in institution_group_list:
    print(f"Processing institution group: {institution}.")

    # Extract authors from institution (copy only the selected rows)
    mask = authors_df['institution_group'].apply(lambda x: institution in x).astype(bool)
    authors_inst_df = authors_df[mask].copy()

    # Calculate number of affiliations
    authors_inst_df['n_affiliations'] = authors_inst_df['institution'].apply(len)

    # Single affiliation: the institution itself; otherwise a fixed label
    authors_inst_df['single_affiliation'] = 'Multiple affiliations'
    mask = authors_inst_df['n_affiliations'] == 1
    authors_inst_df.loc[mask, 'single_affiliation'] = authors_inst_df.loc[mask, 'institution'].apply(lambda x: x[0])

    # Add projects and groups
    print("Adding projects and groups.")
    authors_inst_df['n_projects'] = authors_inst_df['projects'].apply(len)
    authors_inst_df['n_groups']   = authors_inst_df['groups'].apply(len)

    # Save
    out_file = f'./data/nodes_{institution}_{date_today}.csv'
    authors_inst_df.to_csv(out_file, index=False)
    print(f"Saved '{out_file}'.")

    # Extract papers with authors from institution
    print(f"Extracting papers of researchers from {institution}.")
    authors_inst = authors_inst_df['id'].unique() # Get list of authors
    # Build the lookup set once instead of once per paper
    authors_inst_set = set(authors_inst)
    mask_1plus = papers_df['orcids'].apply(
        lambda x: not authors_inst_set.isdisjoint(x)).astype(bool)

    # NOTE: the condition below keeps papers with at least two distinct
    # authors from the institution, despite the wording of the message.
    print("Extracting papers with more than 2 authors.")
    mask_2plus = papers_df.loc[mask_1plus, 'orcids'].apply(
        lambda x: len(authors_inst_set.intersection(x)) > 1).astype(bool)
    papers_inst_1plus_df = papers_df[mask_1plus]
    papers_inst_2plus_df = papers_inst_1plus_df[mask_2plus]

    # Save
    out_file_1 = f'./data/papers_{institution}_{date_today}.csv'
    out_file_2 = f'./data/papers_{institution}_2plus_{date_today}.csv'
    papers_inst_1plus_df.to_csv(out_file_1, index=False)
    papers_inst_2plus_df.to_csv(out_file_2, index=False)
    print(f"Saved '{out_file_1}'.")
    print(f"Saved '{out_file_2}'.")
    print("")

Processing institution group: IGTP+.
Adding projects and groups.
Saved './data/nodes_IGTP+_20220309.csv'.
Extracting papers of researchers from IGTP+.
Extracting papers with more than 2 authors.
Saved './data/papers_IGTP+_20220309.csv'.
Saved './data/papers_IGTP+_2plus_20220309.csv'.

Processing institution group: UPC_CIMNE.
Adding projects and groups.
Saved './data/nodes_UPC_CIMNE_20220309.csv'.
Extracting papers of researchers from UPC_CIMNE.
Extracting papers with more than 2 authors.
Saved './data/papers_UPC_CIMNE_20220309.csv'.
Saved './data/papers_UPC_CIMNE_2plus_20220309.csv'.

Processing institution group: UB.
Adding projects and groups.
Saved './data/nodes_UB_20220309.csv'.
Extracting papers of researchers from UB.
Extracting papers with more than 2 authors.
Saved './data/papers_UB_20220309.csv'.
Saved './data/papers_UB_2plus_20220309.csv'.

Processing institution group: UPF.
Adding projects and groups.
Saved './data/nodes_UPF_20220309.csv'.
Extracting papers of researchers fr

In [10]:
# Sanity check for the institution group left in `institution`:
# count the authors whose group list contains it.
print(institution)
mask = authors_df['institution_group'].map(lambda groups: institution in groups)
mask.sum()

UOC


370

# Add variables to nodelist

In [12]:
date_today = '20220309'
institution_list = ['IGTP+', 'UPC_CIMNE', 'UB', 'UPF', 'UVic-UCC', 'UOC']

# Counter column for each known publication type; any other type (including
# a missing one) is counted under 'n_other'.
paper_types = {'Journal Article':'n_articles', 'Chapter in Book':'n_chapters', 'Book':'n_books'}
stat_columns = ['n_publications', 'n_articles', 'n_chapters', 'n_books', 'n_other']

for institution in institution_list:
    print(f"Institution: {institution}.")

    # Load authors
    authors_inst_df = pd.read_csv(f'./data/nodes_{institution}_{date_today}.csv')
    authors_inst_df = authors_inst_df.set_index('id')

    # Load papers. 'orcids' is stored as the string form of a list;
    # literal_eval only parses literals, whereas eval would execute any code
    # found in the file.
    papers_inst_df = pd.read_csv(f'./data/papers_{institution}_{date_today}.csv',
                                 converters={'orcids': literal_eval})
    # Get papers columns
    papers = papers_inst_df[['orcids', 'type']].copy()
    papers = papers.reset_index(drop=True)

    print("Adding publications number and types to nodelist. This might take a while...")
    # Tally in plain dicts and write each column once at the end; this gives
    # the same counts as incrementing authors_inst_df.loc[...] per author and
    # paper, without the per-element DataFrame writes.
    member_ids = set(authors_inst_df.index)
    counts = {column: dict.fromkeys(member_ids, 0) for column in stat_columns}
    for orcids, paper_type in zip(papers['orcids'], papers['type']):
        type_column = paper_types.get(paper_type, 'n_other')
        for orcid in orcids:
            if orcid not in member_ids:  # author is not in the institution
                continue
            counts['n_publications'][orcid] += 1
            counts[type_column][orcid] += 1

    for column in stat_columns:
        authors_inst_df[column] = [counts[column].get(orcid, 0)
                                   for orcid in authors_inst_df.index]

    # Save. 'id' is the index at this point, so it is moved back to a column
    # first; writing with the index suppressed would otherwise drop the
    # author ids from the file.
    out_file = f'./data/nodes_{institution}_full_{date_today}.csv'
    authors_inst_df.reset_index().to_csv(out_file, index=False)
    print("Done.")
    print(f"Saved {out_file}.")
    print("")

Institution: IGTP+.
Adding publications number and types to nodelist. This might take a while...
Done.
Saved ./data/nodes_IGTP+_full_20220309.csv.

Institution: UPC_CIMNE.
Adding publications number and types to nodelist. This might take a while...
Done.
Saved ./data/nodes_UPC_CIMNE_full_20220309.csv.

Institution: UB.
Adding publications number and types to nodelist. This might take a while...
Done.
Saved ./data/nodes_UB_full_20220309.csv.

Institution: UPF.
Adding publications number and types to nodelist. This might take a while...
Done.
Saved ./data/nodes_UPF_full_20220309.csv.

Institution: UVic-UCC.
Adding publications number and types to nodelist. This might take a while...
Done.
Saved ./data/nodes_UVic-UCC_full_20220309.csv.

Institution: UOC.
Adding publications number and types to nodelist. This might take a while...
Done.
Saved ./data/nodes_UOC_full_20220309.csv.



In [None]:
# authors_inst_df = pd.read_csv(f'./data/nodes_{institution}_{date_today}.csv')
# authors_inst_df

# EXTRA CODE

In [None]:
# authors_df['institution']
# institution = 'UB'
# authors_inst_df = authors_df.copy().loc[authors_df['institution_group'] == institution]

# Add publication variables to nodelist

In [None]:
# Scratch benchmark: time two ways of adding per-author publication counts
# to the nodelist (row-wise DataFrame.apply vs. an explicit iterrows loop).
date_today = '20220309'
# Other values tried: ['IGTP+', 'UPC_CIMNE', 'UB', 'UPF', 'UVic-UCC', 'UOC'], ['UB']
institution_list = ['IGTP+']

for institution in institution_list:
    print(f"Institution: {institution}.")

    # Load authors
    authors_inst_df = pd.read_csv(f'./data/nodes_{institution}_{date_today}.csv')
    authors_inst_df = authors_inst_df.set_index('id')

    # Load papers. 'orcids' is stored as the string form of a list;
    # literal_eval only parses literals, whereas eval would execute any code
    # found in the file.
    papers_inst_df = pd.read_csv(f'./data/papers_{institution}_{date_today}.csv',
                                 converters={'orcids': literal_eval})
    # Get papers columns
    papers = papers_inst_df[['orcids', 'type']].copy()
    papers = papers.reset_index(drop=True)

    # Counter column for each known publication type; anything else
    # (including a missing type) is counted under 'n_other'.
    paper_types = {'Journal Article':'n_articles', 'Chapter in Book':'n_chapters', 'Book':'n_books'}
    stat_columns = ['n_publications', 'n_articles', 'n_chapters', 'n_books', 'n_other']

    def reset_publication_stats():
        """Zero every counter so each timed run starts from scratch."""
        for column in stat_columns:
            authors_inst_df[column] = 0

    def add_publication_stats(paper):
        """Add one paper (a row with 'orcids' and 'type') to its authors' counters."""
        for orcid in paper['orcids']:
            # Add publication
            try:
                authors_inst_df.loc[orcid, 'n_publications'] += 1
            except KeyError: # author is not in the institution
                continue

            # Assign type
            column = paper_types.get(paper['type'], 'n_other')
            authors_inst_df.loc[orcid, column] += 1

    # Variant 1: row-wise apply (axis=1 so the function receives paper rows)
    reset_publication_stats()
    t1 = time.perf_counter()
    papers.apply(add_publication_stats, axis=1)
    t2 = time.perf_counter()
    print(f"Lambda function time: {t2-t1:.2f} seconds.")

    # Variant 2: explicit loop. Counters are reset first; otherwise every
    # publication would be counted twice in the saved file.
    reset_publication_stats()
    t1 = time.perf_counter()
    for index, paper in papers.iterrows():
        print(f"Progress: {index/len(papers)*100:.0f}%. Paper: {index:,d}/{len(papers):,d}.", end="\r")
        add_publication_stats(paper)
    t2 = time.perf_counter()
    print(f"For loop time: {t2-t1:.2f} seconds.")

    # Save. 'id' is the index at this point, so it is moved back to a column
    # first; writing with the index suppressed would otherwise drop the
    # author ids from the file.
    out_file = f'./data/nodes_{institution}_full_{date_today}.csv'
    authors_inst_df.reset_index().to_csv(out_file, index=False)
    print("Done.")
    print(f"Saved {out_file}.")
    print("")

## Create matrix of papers
This strategy creates 0/1 matrices in which each row is a paper and each column is an author. It is an appealing approach but unfortunately very expensive in terms of resources: the script takes too long to run and the outputs are 500 MB files. It is better to collect the results researcher by researcher instead.

In [None]:
date_today = '20220309'
institution_list = ['IGTP+', 'UPC_CIMNE', 'UB', 'UPF', 'UVic-UCC', 'UOC']

# For every institution group, build and save a paper x author matrix
# (rows: 'url_id', columns: orcid, values: occurrence counts, 0 when absent).
for institution in institution_list:
    print(f"Processing institution: {institution}.")
    # Read papers from institution. 'orcids' is stored as the string form of
    # a list; literal_eval only parses literals, whereas eval would execute
    # any code found in the file.
    papers_inst_df = pd.read_csv(f'./data/papers_{institution}_{date_today}.csv',
                                 converters={'orcids': literal_eval})
    # Independent copy, so the clean-up below does not write through a slice
    papers_orcids_df = papers_inst_df[['url_id', 'orcids']].copy()

    # Remove 'null' entries from every list of orcids that contains them
    # (not just from the first affected paper).
    mask = papers_orcids_df['orcids'].apply(lambda x: 'null' in x).astype(bool)
    for orcids in papers_orcids_df.loc[mask, 'orcids']:
        print(f"Replaced 'null' value in paper: {orcids}")
    papers_orcids_df['orcids'] = papers_orcids_df['orcids'].apply(
        lambda x: [orcid for orcid in x if orcid != 'null'])

    # Sort and index
    papers_orcids_df = papers_orcids_df.sort_values(by='url_id')
    papers_orcids_df = papers_orcids_df.set_index('url_id')

    # Convert list to long df: one row per (paper, orcid) pair
    print("Converting lists to long df.")
    papers_matrix = papers_orcids_df.explode('orcids').reset_index()
    papers_matrix['count'] = 1

    # Long to wide ('sum' by name; passing the builtin sum is deprecated)
    print("Converting long df to wide df.")
    papers_matrix = papers_matrix.pivot_table(index='url_id', columns='orcids', values='count', aggfunc='sum')

    # Replace NaNs
    papers_matrix = papers_matrix.fillna(0)

    # Sort orcids in columns
    papers_matrix = papers_matrix.reindex(sorted(papers_matrix.columns), axis=1)

    # Save
    out_file = f'data/papers_matrix_{institution}.csv'
    papers_matrix.to_csv(out_file)
    print(f"Saved '{out_file}'.")
    print("")

In [None]:
                    
                    
    # Create boolean matrix with papers
#     paper_bool_df = pd.DataFrame(columns=authors_index, index=papers_index)
    
#     for i, paper in enumerate(papers):
#         paper_bool_df.loc[i,:] = 0
#         for orcid in paper:
#             paper_bool_df.loc[i,orcid] = 1

#     paper_bool_df.to_csv(f"./data/paper_author_matrix_{institution}.csv", index=None)