# Create edges

This notebook processes the publications data and creates the edges between authors.

Inputs:
- `f'../data/{date_today}/{date_today}_papers_{institution}_2plus.csv'`
- `f'../data/{date_today}/{date_today}_nodes_{institution}.csv'`

Output:
- `f'../data/{date_today}/{date_today}_edges_{institution}.csv'` (written only when `create_edgelist` is called with `save=True`)

# Import modules

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations

# Select institution

# Create edgelist

In [None]:
def create_edgelist(institution, date_today, save=False):
    """Build the weighted co-authorship edge list for one institution.

    Reads the papers file (column ``orcids``: one list of author ids per
    paper) and the nodes file (column ``id``) selected by ``institution`` and
    ``date_today``. For every pair of authors that appear in both files, it
    counts the number of papers the two authors share.

    Parameters
    ----------
    institution : str
        Institution label used in the file names and progress messages.
    date_today : str
        Date tag used for the data folder and as file-name prefix.
    save : bool, default False
        If True, write the edge list to
        ``../data/{date_today}/{date_today}_edges_{institution}.csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Source``, ``Target`` and ``Weight``, one row per author
        pair with at least one shared paper, ordered by (Source, Target)
        with Source < Target. Each row label is the position of the pair in
        the ordered enumeration of all pairs of indexed authors, so labels
        are generally not contiguous.
    """
    print(f"Institution: {institution}.")

    input_authors = f'../data/{date_today}/{date_today}_nodes_{institution}.csv'
    input_papers  = f'../data/{date_today}/{date_today}_papers_{institution}_2plus.csv'
    output        = f'../data/{date_today}/{date_today}_edges_{institution}.csv'

    ## Get authors from papers

    # Load papers with coauthors list
    print(f"{institution} - Loading papers.")
    # NOTE(review): `eval` executes whatever text is stored in the 'orcids'
    # column. If this CSV can ever come from an untrusted source, switch the
    # converter to `ast.literal_eval`.
    papers_df = pd.read_csv(input_papers, converters={'orcids': eval})
    papers = papers_df['orcids']

    # Unique authors over all papers (an empty papers file gives an empty set)
    authors_papers = {author for paper in papers for author in paper}

    ## Get authors from institution

    print(f"{institution} - Loading nodes.")
    authors_inst_df = pd.read_csv(input_authors)
    authors_inst = set(authors_inst_df['id'].dropna())

    ## Combine authors

    # Only authors present in both the papers and the institution's nodes
    authors_index = sorted(authors_papers & authors_inst)
    position = {author: i for i, author in enumerate(authors_index)}
    n_authors = len(authors_index)

    def pair_rank(source, target):
        # Position of (source, target) in combinations(authors_index, 2)
        i, j = position[source], position[target]
        return i * (2 * n_authors - i - 1) // 2 + (j - i - 1)

    print(f"{institution} - Calculting combinations of authors.")

    ## Main loop: count collaborations

    print(f"{institution} - Main loop: counting collaborations.")
    weights = {}
    n_papers = len(papers)
    for i, paper in enumerate(papers):
        print(f"{institution} - Progress: {i/n_papers*100:.0f}%. ({i:,.0f}/{n_papers:,.0f}).", end="\r")

        # Deduplicate the paper's authors and keep only indexed ones; sorting
        # makes every pair come out as (smaller id, larger id).
        members = sorted(set(paper) & position.keys())
        for pair in combinations(members, 2):
            weights[pair] = weights.get(pair, 0) + 1

    # Only pairs with at least one shared paper were ever stored, so there
    # are no zero weights to drop.
    ordered = sorted(weights.items())
    collabs_df = pd.DataFrame(
        {
            'Source': [pair[0] for pair, _ in ordered],
            'Target': [pair[1] for pair, _ in ordered],
            'Weight': [weight for _, weight in ordered],
        },
        index=pd.Index([pair_rank(*pair) for pair, _ in ordered], dtype='int64'),
    ).astype({'Weight': 'int64'})

    if save:
        ## Save
        collabs_df.to_csv(output, index=None)
        print(f"{institution} - Done. Saved '{output}'.")

    return collabs_df

# MAIN LOOP: Institution

In [None]:
# Run configuration: date tag used in the data paths, and the institutions to process
date_today = '20220422'
institution_list = ['UdL']

# Build the edge list of each institution in turn. `collabs_df` is rebound on
# every iteration, so after the loop it holds the result for the last institution.
for institution in institution_list:
    collabs_df = create_edgelist(institution, date_today)

In [None]:
# Display the edge list returned for the last institution processed by the loop
collabs_df

## Debug