# Filter Groups linked to food and health

This notebook creates a list of unique group names for each institution, so that Gauthier can select the food- and health-related ones.

In [None]:
# Import modules
import pandas as pd

def add_group_names(institution, date_today, df_groups, save=False):
    """Attach group names to an institution's researcher table.

    Reads ``../data/{date_today}/{date_today}_nodes_{institution}.csv``,
    parses its ``groups`` column into Python lists of group ids, and adds a
    ``groups_names`` column holding the matching names from ``df_groups``.

    Args:
        institution: Institution label used in the input/output file names.
        date_today: Date string used as data sub-folder and file-name prefix.
        df_groups: DataFrame with at least ``url_id`` and ``name`` columns.
        save: When true, write the augmented researcher table
            (``..._nodes_{institution}_withgroups.csv``) and the group table
            (``..._groups_{institution}.csv``) into the same data folder.

    Returns:
        ``(df, groups_df)``: the researcher table with ``groups_names``, and
        one row per group id seen in this institution, left-joined with the
        remaining columns of ``df_groups``.
        Returns ``0`` (and saves nothing) when no researcher lists any group;
        this sentinel is kept for backward compatibility.

    Raises:
        IndexError: If a group id has no match in ``df_groups['url_id']``.
    """
    import ast  # local import: only needed to parse the ``groups`` column

    data_dir = f'../data/{date_today}'

    # Get researchers. ``ast.literal_eval`` replaces the original ``eval``:
    # it parses the stored list literals but cannot execute arbitrary code
    # that might be present in the CSV.
    df = pd.read_csv(
        f'{data_dir}/{date_today}_nodes_{institution}.csv',
        converters={'groups': ast.literal_eval},
    )

    # group id -> group name, for every group seen in this institution
    groups_dict = {}

    def get_group_names(lst):
        names = []
        for group_id in lst:
            # The first matching row wins; an unknown id raises IndexError.
            group_name = df_groups.loc[df_groups['url_id'] == group_id, 'name'].values[0]
            names.append(group_name)
            groups_dict[group_id] = group_name
        return names

    df['groups_names'] = df['groups'].apply(get_group_names)

    if not groups_dict:
        return 0

    # Build the per-institution group table and pull in the other columns
    # of df_groups; the duplicated name column from the right side is dropped.
    groups_inst = pd.DataFrame(list(groups_dict.items()), columns=['url_id', 'name'])
    groups_df = groups_inst.merge(df_groups, how='left', on='url_id', suffixes=('', '_y'))
    groups_df = groups_df.drop(columns=['name_y'])

    if save:
        df.to_csv(f'{data_dir}/{date_today}_nodes_{institution}_withgroups.csv', index=False)
        groups_df_filename = f'{data_dir}/{date_today}_groups_{institution}.csv'
        groups_df.to_csv(groups_df_filename, index=False)
        print(f"Saved: {groups_df_filename}")

    return df, groups_df

def get_unique_groups(df, institution, date_today, save=False):
    """Collect the distinct group names found in ``df['groups_names']``.

    Args:
        df: DataFrame whose ``groups_names`` column holds an iterable of
            group names per row.
        institution: Institution label used in the output file name.
        date_today: Date string used as data sub-folder and file-name prefix.
        save: When true, write the names (one per row) to
            ``../data/{date_today}/{date_today}_groups_unique_{institution}.csv``.

    Returns:
        A ``set`` of the unique group names (empty when there are none).
    """
    # Flatten with a set comprehension instead of ``Series.sum()``: summing
    # lists is quadratic, and on an empty column it yields 0, which made the
    # original ``set(...)`` call raise TypeError.
    unique_groups = {name for names in df['groups_names'] for name in names}
    if save:
        # Set iteration order is arbitrary; sort so the file is reproducible.
        unique_groups_df = pd.DataFrame(sorted(unique_groups, key=str))
        unique_groups_df.to_csv(f'../data/{date_today}/{date_today}_groups_unique_{institution}.csv', index=False)
    return unique_groups

In [None]:
# Load the table of all groups from the data folder named after the chosen date
date_today = '20220419'
df_groups = pd.read_csv(
    f'../data/{date_today}/{date_today}_group_data.csv'
)

In [None]:
# Group id = the URL with its first 30 characters removed
# (presumably a fixed-length site prefix; TODO confirm against the data).
df_groups['url_id'] = df_groups['url'].str.slice(start=30)

In [None]:
# Institutions whose researcher tables get group names attached
institution_list = [
    'UVic-UCC',
    'UB',
    'UPC_CIMNE',
    'UPF',
    'CRAG',
    'Agrotecnio',
    'URV',
    'UdL',
    'UdG',
]

# Process every institution and write the results to disk (return values unused)
for institution in institution_list:
    add_group_names(
        institution,
        date_today,
        df_groups,
        save=True,
    )

## Version of the above function to export to Snakemake scripts

In [None]:
# Import modules
import pandas as pd


def add_group_names(institution, date_today, df_groups, save=False):
    """Attach group names to an institution's researcher table (script version).

    Reads ``../data/{date_today}/{date_today}_nodes_{institution}.csv``,
    parses its ``groups`` column into Python lists of group ids, and adds a
    ``groups_names`` column with the matching group names.

    Args:
        institution: Institution label used in the input/output file names.
        date_today: Date string used as data sub-folder and file-name prefix.
        df_groups: Ignored. The group table is always reloaded from
            ``../data/{date_today}/{date_today}_group_data.csv``; the
            parameter is kept so existing callers keep working.
        save: When true, write the augmented researcher table to
            ``..._nodes_{institution}_withgroups.csv`` in the same folder.

    Returns:
        None. The only output is the file written when ``save`` is true.

    Raises:
        IndexError: If a group id has no match among the derived ``url_id``s.
    """
    import ast  # local import: only needed to parse the ``groups`` column

    data_dir = f'../data/{date_today}'

    # Get groups: always reloaded from disk, replacing the passed-in argument.
    df_groups = pd.read_csv(f'{data_dir}/{date_today}_group_data.csv')
    # Group id = the URL with its first 30 characters removed
    # (presumably a fixed-length site prefix; TODO confirm against the data).
    df_groups['url_id'] = df_groups['url'].str[30:]

    # Get researchers. ``ast.literal_eval`` replaces the original ``eval``:
    # it parses the stored list literals but cannot execute arbitrary code
    # that might be present in the CSV.
    df = pd.read_csv(
        f'{data_dir}/{date_today}_nodes_{institution}.csv',
        converters={'groups': ast.literal_eval},
    )

    def get_group_names(lst):
        # The first matching row wins; an unknown id raises IndexError.
        return [
            df_groups.loc[df_groups['url_id'] == group_id, 'name'].values[0]
            for group_id in lst
        ]

    df['groups_names'] = df['groups'].apply(get_group_names)

    if save:
        df.to_csv(f'{data_dir}/{date_today}_nodes_{institution}_withgroups.csv', index=False)

def get_unique_groups(df, institution, date_today, save=False):
    """Collect the distinct group names found in ``df['groups_names']``.

    Args:
        df: DataFrame whose ``groups_names`` column holds an iterable of
            group names per row.
        institution: Institution label used in the output file name.
        date_today: Date string used as data sub-folder and file-name prefix.
        save: When true, write the names (one per row) to
            ``../data/{date_today}/{date_today}_groups_unique_{institution}.csv``.

    Returns:
        A ``set`` of the unique group names (empty when there are none).
    """
    # Flatten with a set comprehension instead of ``Series.sum()``: summing
    # lists is quadratic, and on an empty column it yields 0, which made the
    # original ``set(...)`` call raise TypeError.
    unique_groups = {name for names in df['groups_names'] for name in names}
    if save:
        # Set iteration order is arbitrary; sort so the file is reproducible.
        unique_groups_df = pd.DataFrame(sorted(unique_groups, key=str))
        unique_groups_df.to_csv(f'../data/{date_today}/{date_today}_groups_unique_{institution}.csv', index=False)
    return unique_groups