# Post processing scraped data

This notebook processes the scraped data from Portal de la Reserca to create the Nodelist and Edgelist to plot in Gephi.

# Import modules

In [149]:
import numpy as np
import pandas as pd
from ast import literal_eval
from itertools import combinations

# Load data

## Get researchers of interest

In [12]:
# Read the full node list, then keep only the rows whose 'institution'
# column equals 'IGTP' and collect the distinct researcher ids.
res_df = pd.read_csv('./data/nodelist.csv')
is_igtp = res_df['institution'] == 'IGTP'
res_df_IGTP = res_df[is_igtp]
res_IGTP = res_df_IGTP.loc[:, 'id'].unique()

## Get edgelist

In [13]:
# Pre-process edgelist
# Presumably a one-off step, left commented out: it merges the two scraped
# paper files, drops duplicated rows and writes the result to
# ./data/papers.csv.
# NOTE(review): DataFrame.append was removed in pandas 2.0; if this cell is
# re-enabled, use pd.concat([papers0_df, papers1_df]) instead.
# papers0_df = pd.read_csv('./data/papers_0.csv')
# papers1_df = pd.read_csv('./data/papers_1.csv')
# papers_df = papers0_df.append(papers1_df)
# papers_df = papers_df.drop_duplicates()
# papers_df.to_csv('./data/papers.csv', index=False)

In [16]:
# Load edgelist
# papers_df = pd.read_csv('./data/papers.csv')

In [17]:
# Keep an untouched copy of the papers table so that re-running this cell
# restores `papers_df` to its freshly loaded state.
# The backup is read from disk the first time the cell runs in a kernel;
# previously it was never defined on a clean run, which raised NameError.
if 'papers_df_backup' not in globals():
    papers_df_backup = pd.read_csv('./data/papers.csv')
papers_df = papers_df_backup.copy()

# Calculate collaborations

## Helper Functions

In [18]:
def convert_to_list(x):
    """Parse the string representation of a Python literal.

    Parameters
    ----------
    x : object
        Value to parse, typically a string such as ``"['a', 'b']"``.

    Returns
    -------
    object
        The parsed literal, or ``np.nan`` when ``x`` cannot be parsed
        (malformed or truncated literal, or a non-string value such as a
        missing cell).
    """
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises ValueError for non-literal nodes and
        # non-string input, SyntaxError for text that is not valid Python,
        # and TypeError for e.g. unhashable dict keys or set members.
        return np.nan

def belongs_to_list(row, df):
    """Tell whether a row's 'orcids' share at least one element with `df`.

    Parameters
    ----------
    row : mapping
        Object indexable with the key 'orcids' (e.g. a DataFrame row).
    df : iterable
        Collection of identifiers to match against.

    Returns
    -------
    bool
        True when the two collections intersect; False when they do not,
        or when either one cannot be turned into a set (for instance a NaN
        in place of a list).
    """
    try:
        row_ids = set(row['orcids'])
        wanted_ids = set(df)
    except TypeError:
        return False
    return bool(row_ids & wanted_ids)

## Filter papers based on institution

In [18]:
# Convert strings to list of coauthors (unparseable cells become NaN)
papers_df['orcids'] = papers_df['orcids'].apply(convert_to_list)

# Identify papers with at least one author in the institution
mask = papers_df.apply(lambda x: belongs_to_list(x, res_IGTP), axis=1)

selected_df = papers_df[mask]

# Save papers of IGTP researchers
# selected_df.to_csv('./data/nodelist_IGTP.csv', index=None)

# Load papers. This replaces the in-memory selection above with the copy
# previously saved to disk.
# The 'orcids' column is parsed with literal_eval rather than eval so that
# the file content is never executed as arbitrary code.
selected_df = pd.read_csv(
    './data/nodelist_IGTP.csv',
    converters={'orcids': literal_eval},
)

# Test with n papers for debugging (max 2800)
# n = 100
# selected_df = selected_df[:n]

# Get papers column, re-indexed 0..len-1 so positions match matrix rows
papers = selected_df['orcids'].copy()
papers = papers.reset_index(drop=True)

## Create papers matrix

In [18]:
# Get the sorted list of unique authors across all papers.
# A set comprehension avoids the quadratic list concatenation of
# `papers.sum()` and also works when there are no papers at all.
authors_index = sorted({orcid for paper in papers for orcid in paper})

# Column position of every author in the matrix
author_col = {orcid: col for col, orcid in enumerate(authors_index)}

# Create papers matrix in numpy: one row per paper, one column per author,
# 1 where the author signed the paper and 0 elsewhere. An integer array is
# filled directly instead of an object-dtype DataFrame written cell by cell.
papers_mat = np.zeros((len(papers), len(authors_index)), dtype=np.int64)

for i, paper in enumerate(papers):
    for orcid in paper:
        papers_mat[i, author_col[orcid]] = 1

# Labelled view of the same data (rows: papers, columns: authors)
paper_bool_df = pd.DataFrame(papers_mat, columns=authors_index)

## Create vector of collaboration counts for coauthor combinations

In [167]:
# Build collaboration vector to store results: one slot per unordered pair
# of distinct authors, i.e. n*(n-1)/2 entries. Integer arithmetic keeps the
# length exact (no round trip through float division).
n_authors = len(authors_index)
collabs_length = n_authors * (n_authors - 1) // 2
collabs = np.zeros(collabs_length)

## Main loop: calculate collaborations

In [167]:
# Integer view of the papers matrix (rows: papers, columns: authors).
# The matrix is built with 0/1 entries, so for authors a and b the number of
# shared papers is the dot product of their columns. This gives the same
# counts as summing clip(col_a + col_b - 1, 0) over papers, without building
# an (n-i) x (n-i-1) selector matrix on every iteration.
papers_int = np.asarray(papers_mat, dtype=np.int64)

# Initialize writing position
start_pos = 0

# The last author has no later author to pair with, so n_authors - 1 steps
n_steps = n_authors - 1

for i in range(n_steps):
    # (i + 1) / n_steps reaches 100% on the last step and cannot divide by
    # zero (the previous i / (n_authors - 2) failed when n_authors == 2).
    print(f"Progress: {(i + 1) / n_steps * 100:.0f}%.", end="\r")

    # Collaborations of author i with every later author i+1 .. n-1
    collabs_author = papers_int[:, i] @ papers_int[:, i + 1:]

    # Store in collabs vector; pairs are laid out as (0,1), (0,2), ...,
    # (1,2), ... which matches itertools.combinations(authors_index, 2)
    end_pos = start_pos + n_authors - i - 1
    collabs[start_pos:end_pos] = collabs_author

    # Update start_pos for writing next loop
    start_pos = end_pos

Progress: 100%.

## Create Dataframe

In [None]:
# Pair every two distinct authors (same order as the collabs vector) and
# attach the collaboration count of each pair as the 'value' column.
author_combinations = list(combinations(authors_index, 2))
collabs_df = pd.DataFrame(
    author_combinations, columns=['source', 'target']
).assign(value=collabs)

## Save

In [None]:
# Write the edge list (source, target, value) built in `collabs_df`.
# The previous code referenced `links`, which is not defined anywhere in
# this notebook and raised NameError.
# NOTE(review): the row index is written as an extra first column; pass
# index=False if the consumer expects only source/target/value - confirm.
outfile = pd.DataFrame(collabs_df)
outfile.to_csv('./data/edgelist_IGTP.csv')