# Post processing scraped data

This notebook processes the scraped data from Portal de la Reserca to create the Nodelist and Edges to plot in Gephi.

# Import modules

In [14]:
import numpy as np
import pandas as pd
from ast import literal_eval
from itertools import combinations
import random

# Load data

## Get researchers

### Select institution

In [15]:
# Select institution
institution = 'IGTP'
# institution = 'UPC'
# institution = 'UB'
# institution = 'UPF'
# institution = 'UVic-UCC'
# institution = 'UOC'


### Get papers of institution

In [16]:
# Load authors from papers dataset
papers = pd.read_csv(f'./data/papers_{institution}.csv', converters = {'orcids': eval}, usecols=['orcids'])

# Get papers column
# authors = papers_inst['orcids'].copy()
papers = papers['orcids']

## Extract random sample

In [17]:
# threshold = 400
# papers = papers.sample(threshold, random_state=0)

## Create papers matrix

In [18]:
# Get unique list of authors from papers
authors_papers = list(set(papers.sum()))
authors_papers.sort()

# Get list of authors from institution
authors_inst = pd.read_csv(f'./data/nodelist_{institution}.csv', usecols=['id'])
authors_inst = authors_inst['id']
authors_inst = authors_inst.unique()
authors_inst.sort()

# Combine both
authors_index = list(set(authors_papers) & set(authors_inst))
authors_index.sort()

# Create boolean matrix with papers
paper_bool_df = pd.DataFrame(columns=authors_index, index=range(len(papers)))

for i, paper in enumerate(papers):
    paper_bool_df.loc[i,:] = 0
    for orcid in paper:
        if orcid in paper_bool_df.columns: # only count authors in institution
            paper_bool_df.loc[i,orcid] = 1
        
# Convert to numpy
papers_mat = paper_bool_df.to_numpy()

# Calculate collaborations

## Fast computation

1. create df to store all author combinations

2. loop through papers, order authors alphabetically, add all combinations to store_df

In [47]:
# 1. create df to store all combinations
author_combinations = combinations(authors_index,2)
collabs_df = pd.DataFrame(list(author_combinations), columns=['source', 'target'])
collabs_df['value'] = 0

collabs_df = collabs_df.set_index(['source', 'target'])

# 2. loop through papers
for paper in papers:
    paper = list(set(paper))
    paper.sort()
    author_pairs = combinations(paper, 2)
    for pair in author_pairs:
        try:
            collabs_df.loc[pair] += 1
        except:
            pass

collabs_df = collabs_df.reset_index()

## Check

In [52]:
# pairs with multiple joint publications
pos_pairs = collabs_df.loc[collabs_df['value'] > 0]

# Count number of errors
errors = 0
count = 0
# for idx, row in collabs_df.iterrows():
for idx, row in pos_pairs.iterrows():
    print(f"{count}/{len(pos_pairs)}", end="\r")
    sum=0
    for paper in papers:
        if row['source'] in paper and row['target'] in paper:
            sum += 1
                
    result = row['value'] == sum
    if not result:
        errors += 1
        
    count += 1
        
print('done')
    
errors

In [45]:
row = pos_pairs.iloc[2]
row = pos_pairs.loc[44791]
row

source    0000-0001-9016-0515
target    0000-0002-1436-5823
value                       2
Name: 44791, dtype: object

In [46]:
sum=0
for paper in papers:
    if row['source'] in paper and row['target'] in paper:
        print(row['source'])
        print(row['target'])
        print(paper)
        sum += 1
        
print(sum)
print(row['value'])

0000-0001-9016-0515
0000-0002-1436-5823
['0000-0001-9016-0515', '0000-0001-9975-407X', '0000-0002-0153-291X', '0000-0002-0548-5778', '0000-0002-1436-5823', '0000-0002-1436-5823', '0000-0002-5215-7363', '0000-0002-6396-1162', '0000-0003-3232-4598', '0000-0003-3982-7577']
1
2


In [92]:
pos_pairs

Unnamed: 0,source,target,value
679,0000-0001-5105-9836,0000-0002-1436-5823,2
1178,0000-0001-5119-2266,0000-0002-1196-4724,16
1457,0000-0001-5119-2266,0000-0003-2661-9525,41
1541,0000-0001-5143-4042,0000-0001-5915-5515,3
1611,0000-0001-5143-4042,0000-0001-8750-0195,7
...,...,...,...
129373,0000-0003-3932-788X,0000-0003-4427-9413,4
129375,0000-0003-3932-788X,0000-0003-4564-1963,2
129424,0000-0003-3982-7577,0000-0003-4126-2202,3
129623,0000-0003-4126-2202,0000-0003-4947-4648,10


In [54]:
collabs_df.loc[myindex]

value    0
Name: (0000-0001-5049-3673, 0000-0001-5105-9836), dtype: int64

In [23]:
i = 0
for paper in papers:
    paper.sort()
    if i > 5:
        break
    print(paper)
    print(paper)
    i += 1

['0000-0001-5445-4508', '0000-0001-5694-0921', '0000-0001-9531-961X', '0000-0002-0325-2233', '0000-0002-6403-3317', '0000-0002-7529-9399', '0000-0003-2593-833X', '0000-0003-3051-3926']
['0000-0001-5445-4508', '0000-0001-5694-0921', '0000-0001-9531-961X', '0000-0002-0325-2233', '0000-0002-6403-3317', '0000-0002-7529-9399', '0000-0003-2593-833X', '0000-0003-3051-3926']
['0000-0001-7888-0309', '0000-0001-8159-2455']
['0000-0001-7888-0309', '0000-0001-8159-2455']
['0000-0001-5709-6443', '0000-0001-6944-6383', '0000-0001-8865-8111', '0000-0002-1808-5968', '0000-0002-3725-0180', '0000-0002-4763-5252', '0000-0002-5601-4466', '0000-0002-6736-1535', '0000-0003-0493-5340', '0000-0003-3116-1352']
['0000-0001-5709-6443', '0000-0001-6944-6383', '0000-0001-8865-8111', '0000-0002-1808-5968', '0000-0002-3725-0180', '0000-0002-4763-5252', '0000-0002-5601-4466', '0000-0002-6736-1535', '0000-0003-0493-5340', '0000-0003-3116-1352']
['0000-0002-1968-9966', '0000-0002-5119-7929', '0000-0002-5272-0806']
['00

In [17]:
# collabs_df

## Alternative method

Main loop is inefficient, ends up with a large matrix with too much detail. not only you know how many collaborations but you are also storing in which exact paper.

Only thing we need is for a given paper

`[1 0 1 0 0 0 ]`

to add 1 into the right coordinate of a results list, in this case

`results[0][2] + = 1`

## Main loop: calculate collaborations

In [7]:
# Build collaboration vector to store results
n_authors = len(authors_index)
collabs_length = int(n_authors*(n_authors+1)/2 - n_authors) 
collabs = np.zeros(shape=(collabs_length))

# Store copy of papers_mat for iterative updating
papers_mat_i = papers_mat

In [None]:
print("Running main loop")
# Initialize writing position
start_pos = 0

for i in range(0, n_authors-1): #last author loop is unnecessary
    print(f"Progress: {i/(n_authors-2)*100:.0f}%. Researcher: {i}/{n_authors}.", end="\r", flush=True)
    
    # Initialize matrix
    
    print(f"Progress: {i/(n_authors-2)*100:.0f}%. Researcher: {i}/{n_authors}. Generating Identity matrix.", end="\r", flush=True)
    C = np.identity(n_authors-i)
    C = C[:,1:] # remove first column (self-referencing)
    C[0] = 1
    
    # Main inner product
    print(f"Progress: {i/(n_authors-2)*100:.0f}%. Researcher: {i}/{n_authors}. Inner product. P {papers_mat_i.shape} x C {C.shape}      ", end="\r", flush=True)
#     ones_mat = np.ones(shape=(1,papers_mat_i.shape[0]))
#     temp_mat = np.dot(ones_mat, papers_mat_i)
#     result = np.dot(temp_mat, C)
    result = np.dot(papers_mat_i, C)

    # Calculate number of collaborations
    print(f"Progress: {i/(n_authors-2)*100:.0f}%. Researcher: {i}/{n_authors}. Calculating collaborations.", end="\r", flush=True)
    result = result - 1
    result = result.clip(0)
    collabs_author = result.sum(axis=0)
#     collabs_author = result

    # Store in collabs vector
    end_pos = start_pos + n_authors - i - 1
    
    print(f"Progress: {i/(n_authors-2)*100:.0f}%. Researcher: {i}/{n_authors}. Storing results.           ", end="\r", flush=True)
    collabs[start_pos:end_pos] = collabs_author
    
    # Update start_pos for writing next loop
    start_pos = end_pos 
    
    # Remove first author from papers_mat for next loop
    print(f"Progress: {i/(n_authors-2)*100:.0f}%. Researcher: {i}/{n_authors}. Updating matrix next loop. ", end="\r", flush=True)
    papers_mat_i = papers_mat_i[:,1:]
    

Running main loop
Progress: 0%. Researcher: 0/510. Inner product. P (1937, 510) x C (510, 509)      

## Create Dataframe

In [10]:
author_combinations = combinations(authors_index,2)
collabs_df = pd.DataFrame(list(author_combinations), columns=['source', 'target'])
collabs_df['value']=collabs

In [12]:
collabs_df

Unnamed: 0,source,target,value
0,0000-0001-5049-3673,0000-0001-5105-9836,3.0
1,0000-0001-5049-3673,0000-0001-5119-2266,42.0
2,0000-0001-5049-3673,0000-0001-5143-4042,27.0
3,0000-0001-5049-3673,0000-0001-5144-2165,15.0
4,0000-0001-5049-3673,0000-0001-5154-0013,39.0
...,...,...,...
129790,0000-0003-4898-3424,0000-0003-4939-8411,6.0
129791,0000-0003-4898-3424,0000-0003-4947-4648,25.0
129792,0000-0003-4902-9739,0000-0003-4939-8411,10.0
129793,0000-0003-4902-9739,0000-0003-4947-4648,29.0


In [13]:
row = 0
source = collabs_df.loc[row, 'source']
# source = '0000-0002-6403-3317'
target = collabs_df.loc[row, 'target']
# target = '0000-0001-5694-0921'

sum = 0
for paper in papers:
    if source in paper:
        if target in paper:
            sum +=1

print(sum) 

0


In [43]:
papers

0       [0000-0002-6403-3317, 0000-0001-5694-0921, 000...
1              [0000-0001-8159-2455, 0000-0001-7888-0309]
2       [0000-0002-6736-1535, 0000-0002-4763-5252, 000...
3       [0000-0002-5272-0806, 0000-0002-5119-7929, 000...
4       [0000-0001-8306-5798, 0000-0002-9233-1776, 000...
                              ...                        
1932    [0000-0003-1713-7110, 0000-0003-1615-4592, 000...
1933           [0000-0003-1713-7110, 0000-0002-0295-1307]
1934    [0000-0003-1713-7110, 0000-0003-1615-4592, 000...
1935           [0000-0002-3064-1648, 0000-0001-9372-1007]
1936           [0000-0001-8159-2455, 0000-0001-7888-0309]
Name: orcids, Length: 1937, dtype: object

In [27]:
collabs_df

Unnamed: 0,source,target,value
0,0000-0001-5049-3673,0000-0001-5105-9836,2.0
1,0000-0001-5049-3673,0000-0001-5119-2266,41.0
2,0000-0001-5049-3673,0000-0001-5143-4042,26.0
3,0000-0001-5049-3673,0000-0001-5144-2165,14.0
4,0000-0001-5049-3673,0000-0001-5154-0013,38.0
...,...,...,...
129790,0000-0003-4898-3424,0000-0003-4939-8411,5.0
129791,0000-0003-4898-3424,0000-0003-4947-4648,24.0
129792,0000-0003-4902-9739,0000-0003-4939-8411,9.0
129793,0000-0003-4902-9739,0000-0003-4947-4648,28.0


## Save

In [None]:
collabs_df.to_csv(f'./data/edges_{institution}.csv')

## Helper functions

In [None]:
def convert_to_list(x):
    """Convert string column to list"""
    try:
        result = literal_eval(x)
    except ValueError:
        result = np.nan
    return result

def belongs_to_list(row, df):
    try:
        result = bool(set(row['orcids']) & set(df))
    except TypeError:
        result = False
    return result

## Define main function

In [None]:
# Load edges
papers_df = pd.read_csv('./data/papers.csv')
res_df = pd.read_csv('./data/nodelist.csv')

def calculate_collaborations(institution, papers_df, res_df, save=False, threshold=None):

    # Filter researchers
    res_df_inst = res_df.loc[res_df['institution'] == institution]
    res_inst = res_df_inst['id'].unique()

    # Convert strings to list of coauthors
    papers_df['orcids'] = papers_df['orcids'].apply(lambda x: convert_to_list(x))

    # Identify authors in institution
    institution_df = res_inst
    mask = papers_df.apply(lambda x: belongs_to_list(x, institution_df), axis=1)

    selected_df = papers_df[mask]
    
    # Test with n papers for debugging (max 2800)
    if threshold:
        selected_df = selected_df[:threshold]

    # Get papers column
    papers = selected_df['orcids'].copy()
    papers = papers.reset_index(drop=True)

    # Get unique list of authors from papers
    authors_index = list(set(papers.sum()))

    authors_index.sort()

    # Create boolean matrix with papers
    paper_bool_df = pd.DataFrame(columns=authors_index, index=range(len(papers)))

    for i, paper in enumerate(papers):
        paper_bool_df.loc[i,:] = 0
        for orcid in paper:
            paper_bool_df.loc[i,orcid] = 1

    # Create papers matrix in numpy
    papers_mat = paper_bool_df.to_numpy()

    # Build collaboration vector to store results
    n_authors = len(authors_index)
    collabs_length = int(n_authors*(n_authors+1)/2 - n_authors) 
    collabs = np.zeros(shape=(collabs_length))

    # Store copy of papers_mat for iterative updating
    papers_mat_i = papers_mat

    # Initialize writing position
    start_pos = 0

    for i in range(0, n_authors-1): #last author loop is unnecessary
        print(f"Progress: {i/(n_authors-2)*100:.0f}%.", end="\r")

        # Initialize matrix
        C = np.identity(n_authors-i)
        C = C[:,1:]
        C[0] = 1

        # Main inner product
        result = np.dot(papers_mat_i, C)

        # Calculate number of collaborations
        result = result - 1
        result = result.clip(0)
        collabs_author = result.sum(axis=0)

        # Store in collabs vector
        end_pos = start_pos + n_authors - i - 1

        collabs[start_pos:end_pos] = collabs_author

        # Update start_pos for writing next loop
        start_pos = end_pos 

        # Remove first author from papers_mat for next loop
        papers_mat_i = papers_mat_i[:,1:]

    author_combinations = combinations(authors_index,2)
    collabs_df = pd.DataFrame(list(author_combinations), columns=['source', 'target'])
    collabs_df['value']=collabs

    if save:
        collabs_df.to_csv(f'./data/edges_{institution}.csv')

## Execute

In [None]:
# Select institution
institution_list = ['IGTP']

for institution in institution_list:
    calculate_collaborations(institution, papers_df, res_df, save=True, threshold=100)