# Collapse at research group level

In [None]:
import pandas as pd

In [None]:
institution_list = ['IGTP+', 'UPC_CIMNE', 'UB', 'UPF', 'UVic-UCC', 'UOC']


def select_grouped_authors(author_df):
    """Return the authors that list at least one research group.

    The result is an independent copy of the matching rows with an extra
    ``url_id`` column: the first entry of each author's ``groups`` list with
    its first character removed (presumably a leading '/' - TODO confirm).
    Only the first listed group is used; further memberships are ignored.
    """
    has_group = author_df['groups'].apply(len) > 0
    # .copy() makes the filtered frame independent, so adding a column below
    # does not trigger pandas' SettingWithCopyWarning.
    grouped = author_df[has_group].copy()
    grouped['url_id'] = grouped['groups'].apply(lambda x: x[0][1:])
    return grouped


def collapse_nodes(author_gp_df):
    """Build the group-level node list: one row per ``url_id``.

    Each attribute is the first non-null value found among the group's
    members (``GroupBy.first``); nothing is aggregated, so ``n_publications``
    is one member's value, not a group total.
    NOTE(review): confirm that is the intended meaning of n_publications.
    """
    nodes_df = author_gp_df.groupby('url_id').first().reset_index()
    nodes_df = nodes_df[['url_id', 'name', 'institution', 'institution_2',
                         'department', 'institution_group', 'n_publications']]
    return nodes_df.rename(columns={'url_id': 'id', 'name': 'label'})


def collapse_edges(edges_df, author_gp_df):
    """Build the group-level edge list from an author-level edge list.

    Edges whose ``Source`` or ``Target`` is not an author in ``author_gp_df``
    are dropped. Both endpoints are then replaced by the author's ``url_id``
    and ``Weight`` is summed per (Source, Target) pair. Pairs are ordered, so
    (A, B) and (B, A) stay separate rows, and edges between two members of
    the same group become self-loops.
    """
    # Exact duplicate (id, url_id) rows would multiply edges in the merges
    # below and inflate the summed weights, so remove them first.
    lookup = author_gp_df[['id', 'url_id']].drop_duplicates()

    # Vectorised membership test; also well-defined on an empty edge list.
    both_grouped = (edges_df['Source'].isin(lookup['id'])
                    & edges_df['Target'].isin(lookup['id']))
    edges_gp_df = edges_df[both_grouped]

    edges_gp_df = edges_gp_df.merge(
        lookup.rename(columns={'id': 'Source', 'url_id': 'Source_gp'}),
        how='left', on='Source')
    edges_gp_df = edges_gp_df.merge(
        lookup.rename(columns={'id': 'Target', 'url_id': 'Target_gp'}),
        how='left', on='Target')

    edges_gp_df = edges_gp_df[['Source_gp', 'Target_gp', 'Weight']]
    edges_gp_df.columns = ['Source', 'Target', 'Weight']
    return edges_gp_df.groupby(['Source', 'Target']).sum().reset_index()


# True when run as a notebook cell or a script; keeps the file I/O below from
# running if this code is ever imported.
if __name__ == "__main__":
    # Group names do not depend on the institution: read them once.
    group_df = pd.read_csv('data/groups.csv')
    group_df = group_df[['name', 'url_id']]

    for institution in institution_list:
        print(f"Institution: {institution}.")

        # Get nodes.
        # NOTE(security): `eval` executes whatever text is in the 'groups'
        # column. Only run this on trusted CSV files; consider
        # ast.literal_eval if the column holds plain list literals.
        author_df = pd.read_csv(f'data/nodes_{institution}_20220309.csv',
                                converters={'groups': eval})
        author_full = pd.read_csv(f'data/nodes_{institution}_full_20220309.csv',
                                  converters={'groups': eval})
        # Joined on row index: assumes both files list the same authors in
        # the same row order - TODO confirm.
        author_df = author_full.merge(author_df['id'],
                                      left_index=True, right_index=True)

        # Keep researchers with non-empty research groups and attach names.
        author_gp_df = select_grouped_authors(author_df)
        author_gp_df = author_gp_df.merge(group_df, how='left', on='url_id')

        # Create and save the group-level node list.
        nodes_df = collapse_nodes(author_gp_df)
        nodes_df.to_csv(f'data/group_nodes_{institution}.csv', index=False)

        # Create and save the group-level edge list.
        edges_df = pd.read_csv(f'data/edges_{institution}_20220309.csv')
        edges_gp_df = collapse_edges(edges_df, author_gp_df)
        edges_gp_df.to_csv(f'data/group_edges_{institution}.csv', index=False)
        print("Done.")