# Clean Nodes

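This notebook combines the per-institution node CSVs, removes researchers from outside institutions, and attaches research-group labels so the result can be used in further analysis.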

In [230]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd

# Read and append files

In [231]:
# Snapshot folder that holds the per-institution CSV exports.
data_folder = Path("../data/20240317")
files_to_append = [
    "20240317_nodes_ICFO.csv",
    "20240317_nodes_IDIBELL.csv",
    "20240317_nodes_UPC_EETAC.csv",
]

# Read every file first and concatenate once. Growing a frame inside the loop
# re-copies all previously loaded rows on each pass and needs an empty seed
# DataFrame. Row labels from each file are kept as-is (no ignore_index).
df_full = pd.concat([pd.read_csv(data_folder / file) for file in files_to_append])

# Clean author files

In [232]:
def find_duplicates(value):
    """Flag a node whose affiliations include none of the home institutions.

    Despite its name, this does not look for duplicate rows: the result is
    stored in the ``outside_researcher`` column and used to drop researchers
    who do not belong to ICFO, IDIBELL or UPC.

    Parameters
    ----------
    value : str or missing
        Raw ``institution`` cell. Either a bare home-institution label
        (e.g. ``"UPC"``) or the string form of a Python list of labels
        (e.g. ``"['ICFO', 'MIT']"``).

    Returns
    -------
    bool or float
        ``False`` when the cell is missing/empty, is a bare home-institution
        label, or is a list containing at least one home institution;
        ``True`` when the list contains no home institution;
        ``np.nan`` when the cell parses to an empty list.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-empty cell is neither
        a bare home-institution label nor a valid Python literal.
    """
    home_institutions = ("ICFO", "IDIBELL", "UPC")

    # NaN is truthy, so a plain ``not value`` check lets missing cells through
    # to literal_eval, which then raises. Treat them like the empty value.
    is_missing = value is None or (isinstance(value, float) and np.isnan(value))
    if is_missing or not value:
        return False

    # Bare labels are not valid Python literals; handle all three home
    # institutions here, matching what fill_institution accepts.
    if value in home_institutions:
        return False

    result = ast.literal_eval(value)
    if not result:
        return np.nan
    return not any(element in home_institutions for element in result)

# Flag researchers whose affiliation list names none of the home institutions.
df_full["outside_researcher"] = df_full["institution"].apply(find_duplicates)

# Drop all outside researchers. Only an explicit True is removed; rows flagged
# False or NaN are kept. The .copy() makes the filtered result an independent
# frame, so adding or overwriting columns on it later does not raise
# SettingWithCopyWarning.
df_full = df_full[df_full["outside_researcher"] != True].copy()

def fill_institution(row):
    """Collapse a node's affiliation into a single institution label.

    Parameters
    ----------
    row : mapping
        A row exposing ``row["institution"]`` and ``row["institution_2"]``.
        ``institution_2`` is either a single institution name or the string
        form of a Python list of names.

    Returns
    -------
    object
        * ``row["institution"]`` unchanged when it is already one of
          ``"UPC"``, ``"ICFO"`` or ``"IDIBELL"``;
        * ``"ICFO"`` / ``"IDIBELL"`` when ``institution_2`` is, or contains,
          the corresponding full institution name;
        * otherwise the raw ``row["institution_2"]`` value (including a
          missing value).
    """
    icfo_name = "Fundació Institut de Ciències Fotòniques"
    idibell_name = "Institut d'Investigació Biomèdica de Bellvitge"

    institution = row["institution"]
    if institution in ("UPC", "ICFO", "IDIBELL"):
        return institution

    institution_2 = row["institution_2"]
    # Missing cells (NaN/None) cannot be parsed; hand them back untouched.
    if not isinstance(institution_2, str):
        return institution_2

    if institution_2 == icfo_name:
        return "ICFO"
    if institution_2 == idibell_name:
        return "IDIBELL"

    try:
        names = ast.literal_eval(institution_2)
    except (ValueError, SyntaxError):
        # A plain-text name that is not a Python literal: use the same
        # raw-value fallback as an unmatched list.
        return institution_2

    if isinstance(names, (list, tuple, set, str)):
        # IDIBELL is checked before ICFO, so it wins when both are listed.
        if idibell_name in names:
            return "IDIBELL"
        if icfo_name in names:
            return "ICFO"
    return institution_2

# Replace the raw affiliation cell with a single institution label per node.
# assign() builds a new frame instead of writing into df_full, which may be a
# filtered slice of the loaded data; this avoids SettingWithCopyWarning.
df_full = df_full.assign(institution=df_full.apply(fill_institution, axis=1))

# Add group labels

In [233]:
# Create groups dataset

# NOTE(review): the ICFO file used to be listed twice. The second read was
# redundant (its rows were removed again by drop_duplicates below), so it has
# been dropped. If a UPC_EETAC group file exists it was probably the intended
# third entry - TODO confirm and add it here.
groups_to_append = [
    "20240317_group_data_ICFO.csv",
    "20240317_group_data_IDIBELL.csv",
]

# Number of trailing URL characters used as the join key with the nodes table.
# NOTE(review): assumes every group URL ends in a 16-character stem - confirm.
GROUP_STEM_LENGTH = 16

# Read all files, concatenate once, then reduce to one (stem, name) pair per group.
df_groups = (
    pd.concat([pd.read_csv(data_folder / file) for file in groups_to_append])
    .assign(group_stem=lambda frame: frame["url"].str[-GROUP_STEM_LENGTH:])
    .loc[:, ["group_stem", "name"]]
    .rename(columns={"name": "group_name"})
    .drop_duplicates()
)

# Clean author df
def get_group(value):
    """Return the first group stem listed in a node's ``groups`` cell.

    Parameters
    ----------
    value : str or missing
        Raw ``groups`` cell: the string form of a Python list of group stems,
        e.g. ``"['/abc123', '/def456']"``.

    Returns
    -------
    str or float
        The first list entry with a single leading ``"/"`` removed, or
        ``np.nan`` when the cell is missing, empty, or parses to an empty list.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-empty cell is not a
        valid Python literal.
    """
    # Missing cells are NaN (or None), which literal_eval cannot parse.
    is_missing = value is None or (isinstance(value, float) and np.isnan(value))
    if is_missing or value == "":
        return np.nan

    result = ast.literal_eval(value)
    if not result:
        return np.nan

    group_stem = result[0]
    # startswith() is safe for an empty stem, where indexing [0] would raise
    # IndexError. Only one leading slash is removed.
    if group_stem.startswith("/"):
        group_stem = group_stem[1:]
    return group_stem

# Derive each node's group stem from its ``groups`` cell. assign() builds a new
# frame instead of writing into df_full, which may be a filtered slice of the
# loaded data; this avoids SettingWithCopyWarning.
df_full = df_full.assign(group_stem=df_full["groups"].apply(get_group))

# Merge: the left join keeps every node; nodes without a matching group stem
# get a missing group_name.
df_merge = df_full.merge(df_groups, how="left", on="group_stem")

# Save

In [234]:
# Write the cleaned, group-labelled nodes next to the inputs. index=False is the
# documented way to omit the row index (index=None only worked by being falsy).
df_merge.to_csv(data_folder / "nodes_all.csv", index=False)