# Create edges

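This notebook processes the publications data of each institution and achieves the following:

- Creates the weighted edges between authors: one edge per pair of the institution's authors, whose weight is the number of papers the two authors share.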

# Import modules

In [1]:
import ast
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd

# Select institution

In [2]:
# Institutions to process; each name is interpolated into the input/output
# file names used by create_edgelist.
# An earlier label set was: 'IGTP', 'UPC', 'UB', 'UPF', 'UVic-UCC', 'UOC'.
institution_list = [
    'IGTP+',
    'UPC_CIMNE',
    'UB',
    'UPF',
    'UVic-UCC',
    'UOC',
]

# To work on a single institution instead, set e.g.
#   institution = 'IGTP+'
# (any of the names listed above).

# Create edgelist

In [3]:
def create_edgelist(institution, save=True):
    """Build the weighted co-authorship edge list for one institution.

    Reads ``./data/papers_{institution}_2plus_{date}.csv`` (column ``orcids``
    holding a list literal of author ids per paper) and
    ``./data/nodes_{institution}_{date}.csv`` (column ``id``). Only authors
    present in both files are kept. Every unordered pair of kept authors gets
    one row, including pairs that never collaborated (``Weight`` 0).

    Parameters
    ----------
    institution : str
        Institution label used in the input and output file names.
    save : bool, default True
        When true, write the result to
        ``./data/edges_{institution}_{date}.csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Source``, ``Target``, ``Weight``. ``Source`` sorts before
        ``Target`` in every row, and ``Weight`` is the number of papers whose
        author list contains both.
    """
    print(f"Institution: {institution}.")
    ## Get authors from papers

    # Set date for file versions
    date_today = '20220309'

    # Load papers with coauthors list.
    # ast.literal_eval replaces eval: it parses the stored list literal but
    # cannot execute arbitrary code embedded in the CSV.
    print(f"{institution} - Loading papers.")
    papers_df = pd.read_csv(
        f'./data/papers_{institution}_2plus_{date_today}.csv',
        converters={'orcids': ast.literal_eval},
    )
    papers = papers_df['orcids']

    # Unique authors over all papers. A set union avoids the quadratic list
    # concatenation of Series.sum() and also works when there are no papers.
    authors_papers = set().union(*papers)

    ## Get authors from institution

    print(f"{institution} - Loading nodes.")
    authors_inst_df = pd.read_csv(f'./data/nodes_{institution}_{date_today}.csv')
    authors_inst = set(authors_inst_df['id'].unique())

    ## Combine authors: keep only those present in both sources
    authors_index = sorted(authors_papers & authors_inst)

    ## Create df to store collaborations

    print(f"{institution} - Calculting combinations of authors.")
    # combinations() over a sorted list yields pairs with Source < Target,
    # already in the row order of the final table.
    all_pairs = list(combinations(authors_index, 2))
    collabs_df = pd.DataFrame(all_pairs, columns=['Source', 'Target'])

    # Calculate collaborations

    ## Main loop: count, per paper, every pair of kept authors
    print(f"{institution} - Main loop: counting collaborations.")
    members = set(authors_index)
    pair_counts = Counter()
    n_papers = len(papers)
    for i, paper in enumerate(papers):
        print(f"{institution} - Progress: {i/n_papers*100:.0f}%. ({i:,.0f}/{n_papers:,.0f}).", end="\r")

        # Deduplicate the paper's authors and drop those outside the index up
        # front, so no lookup can miss and no exception needs to be swallowed.
        coauthors = sorted(members.intersection(paper))
        pair_counts.update(combinations(coauthors, 2))

    # Counter returns 0 for pairs that never co-occurred.
    collabs_df['Weight'] = np.array(
        [pair_counts[pair] for pair in all_pairs], dtype='int64'
    )

    if save:
        ## Save
        outfile = f'./data/edges_{institution}_{date_today}.csv'
        collabs_df.to_csv(outfile, index=False)
        print(f"{institution} - Done. Saved '{outfile}'.")

    return collabs_df

# MAIN LOOP: Institution

In [4]:
# Build and save the edge list of every configured institution.
# After the loop, `institution` and `collabs_df` refer to the last one processed.
for institution in institution_list:
    collabs_df = create_edgelist(institution, save=True)

Institution: IGTP+.
IGTP+ - Loading papers.
IGTP+ - Loading nodes.
IGTP+ - Calculting combinations of authors.
IGTP+ - Main loop: counting collaborations.
IGTP+ - Done. Saved './data/edges_IGTP+_20220309.csv'.
Institution: UPC_CIMNE.
UPC_CIMNE - Loading papers.
UPC_CIMNE - Loading nodes.
UPC_CIMNE - Calculting combinations of authors.
UPC_CIMNE - Main loop: counting collaborations.
UPC_CIMNE - Done. Saved './data/edges_UPC_CIMNE_20220309.csv'.
Institution: UB.
UB - Loading papers.
UB - Loading nodes.
UB - Calculting combinations of authors.
UB - Main loop: counting collaborations.
UB - Done. Saved './data/edges_UB_20220309.csv'.
Institution: UPF.
UPF - Loading papers.
UPF - Loading nodes.
UPF - Calculting combinations of authors.
UPF - Main loop: counting collaborations.
UPF - Done. Saved './data/edges_UPF_20220309.csv'.
Institution: UVic-UCC.
UVic-UCC - Loading papers.
UVic-UCC - Loading nodes.
UVic-UCC - Calculting combinations of authors.
UVic-UCC - Main loop: counting collaboration

# EXTRA CODE

# Check results

In [None]:
# Sanity check: recount the joint papers of every pair with a positive weight
# and compare with the stored weight.
# `collabs_df` and `institution` are the values left by the main loop.

# `papers` is local to create_edgelist, so reload the matching papers file.
# The date stamp must match `date_today` inside create_edgelist.
papers = pd.read_csv(
    f'./data/papers_{institution}_2plus_20220309.csv',
    converters={'orcids': ast.literal_eval},
)['orcids']
# Sets make the membership tests below O(1) per author.
paper_sets = [set(paper) for paper in papers]

# Check for pairs with joint publications (the weight column is 'Weight')
pos_pairs = collabs_df.loc[collabs_df['Weight'] > 0]

# Count number of errors
errors = 0
for count, (idx, row) in enumerate(pos_pairs.iterrows()):
    print(f"{count}/{len(pos_pairs)}", end="\r")

    # Number of papers that list both authors of the pair
    n_joint = 0
    for paper in paper_sets:
        if row['Source'] in paper and row['Target'] in paper:
            n_joint += 1

    if row['Weight'] != n_joint:
        errors += 1

print(f"Done. Found {errors} errors.")