# Save Data

This notebook post-processes the scraped data from Portal de la Recerca and saves filtered versions of the data, so that later steps do not have to repeat this time-consuming computation.

**NOTE: You only need to run this notebook once. Every run overwrites the data files that the following steps use as input.**

# Import modules

In [None]:
import numpy as np
import pandas as pd
from ast import literal_eval
from itertools import combinations
import random

# Save papers by institution

## Helper Functions

In [None]:
def convert_to_list(x):
    """Parse the text representation of a Python literal.

    Args:
        x: Value to parse, typically the text of a CSV cell such as
            ``"['a', 'b']"``.

    Returns:
        The evaluated literal (e.g. a list), or ``np.nan`` when ``x`` cannot
        be parsed as a literal.
    """
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError):
        # literal_eval raises ValueError for input that is not a literal
        # (including a float NaN coming from an empty cell) and SyntaxError
        # for malformed text such as a truncated list. Both cases mean
        # "no usable value", so they map to the same NaN marker.
        return np.nan

def has_1_author(row, df):
    """Tell whether a paper shares at least one author id with `df`.

    Args:
        row: Mapping-like object whose ``'orcids'`` entry holds the paper's
            author ids.
        df: Iterable of author ids to compare against.

    Returns:
        True when the two id collections overlap. False when they do not,
        or when building the sets raises TypeError (for example because the
        ``'orcids'`` entry is not iterable).
    """
    try:
        paper_ids = set(row['orcids'])
        reference_ids = set(df)
    except TypeError:
        return False
    return not paper_ids.isdisjoint(reference_ids)

def has_2_authors(row, df):
    """Tell whether a paper shares at least two distinct author ids with `df`.

    Args:
        row: Mapping-like object whose ``'orcids'`` entry holds the paper's
            author ids.
        df: Iterable of author ids to compare against.

    Returns:
        True when two or more distinct ids appear in both collections.
        False otherwise, including when building the sets raises TypeError
        (for example because the ``'orcids'`` entry is not iterable).
    """
    try:
        shared_ids = set(row['orcids']).intersection(set(df))
    except TypeError:
        return False
    return len(shared_ids) >= 2

## Combine scraped data into single file
(takes ~5 minutes and often hangs)

In [None]:
# Combine papers datasets that were downloaded in two batches
# papers0_df = pd.read_csv('./data/papers_0.csv')
# papers1_df = pd.read_csv('./data/papers_1.csv')
# papers_df = papers0_df.append(papers1_df)
# papers_df = papers_df.drop_duplicates()
# papers_df.to_csv('./data/papers.csv', index=False)

## Save papers and authors by institution

### Load papers and authors

In [None]:
# Read the authors (nodes) table
authors_df = pd.read_csv('./data/nodes.csv')
# Read the combined papers table
papers_df = pd.read_csv('./data/papers.csv')
# Parse each 'orcids' cell from its text form into a Python object;
# cells that cannot be parsed become NaN (see convert_to_list)
papers_df['orcids'] = papers_df['orcids'].apply(convert_to_list)

### Filter papers and authors

In [None]:
institution_list = ['IGTP', 'UPC', 'UB', 'UPF', 'UVic-UCC', 'UOC']


def _shares_2_authors(orcids, author_ids):
    """Return True when `orcids` has at least two distinct ids in `author_ids`.

    Args:
        orcids: Iterable of author ids of one paper (a non-iterable value
            such as NaN is tolerated).
        author_ids: Set of author ids to compare against.

    Returns:
        True when two or more distinct ids are shared; False otherwise,
        including when `orcids` is not iterable or holds unhashable items.
    """
    try:
        return len(author_ids.intersection(orcids)) > 1
    except TypeError:
        return False


for institution in institution_list:
    # Extract authors from institution
    print(f"Extracting authors from {institution}.")
    authors_inst_df = authors_df.loc[authors_df['institution'] == institution]
    authors_inst_df = authors_inst_df.rename(columns={'name': 'label'})
    authors_inst = authors_inst_df['id'].unique()
    # Save
    authors_inst_df.to_csv(f'./data/nodes_{institution}.csv', index=False)
    print(f"Saved './data/nodes_{institution}.csv'")

    # Extract papers with at least two authors from institution
    print(f"Extracting papers of researchers from {institution}.")
    # Build the id set once per institution instead of once per paper.
    authors_inst_ids = set(authors_inst)
    # Only the 'orcids' column is needed, so apply over that column rather
    # than over whole rows. The bool cast keeps the mask a valid boolean
    # indexer even when the papers table is empty.
    mask = papers_df['orcids'].apply(
        lambda orcids: _shares_2_authors(orcids, authors_inst_ids)
    ).astype(bool)
    papers_inst_df = papers_df[mask]
    # Save
    papers_inst_df.to_csv(f'./data/papers_{institution}.csv', index=False)
    print(f"Saved './data/papers_{institution}.csv'")

## Todo: Add columns to nodes

## Create papers matrix
This block is not necessary since its output is not used in the next steps.

In [None]:
for institution in institution_list:
    print(f"Institution: {institution}.")
    # Parse the 'orcids' column with literal_eval rather than eval:
    # literal_eval only accepts Python literals, so content in the CSV file
    # can never be executed as code.
    papers_inst_df = pd.read_csv(
        f'./data/papers_{institution}.csv',
        converters={'orcids': literal_eval},
    )
    # Get papers column with a clean 0..n-1 index
    papers = papers_inst_df['orcids'].reset_index(drop=True)

    # Sorted unique list of authors appearing in the papers. A set
    # comprehension avoids the quadratic list concatenation of
    # `papers.sum()` and also works when there are no papers at all.
    authors_index = sorted({orcid for paper in papers for orcid in paper})
    # Column position of every author in the matrix
    column_of = {orcid: col for col, orcid in enumerate(authors_index)}

    # Create the paper x author 0/1 matrix: entry (i, j) is 1 when author j
    # is listed on paper i. Filling a pre-allocated array is much cheaper
    # than assigning DataFrame cells one at a time.
    paper_matrix = np.zeros((len(papers), len(authors_index)), dtype=int)
    for i, paper in enumerate(papers):
        for orcid in paper:
            paper_matrix[i, column_of[orcid]] = 1

    paper_bool_df = pd.DataFrame(paper_matrix, columns=authors_index)
    paper_bool_df.to_csv(f"./data/paper_author_matrix_{institution}.csv", index=False)