# Sandbox

This notebook scrapes the authors of the Aeronautics School (EETAC), looks up each author's details (including their ORCID), and flags those authors in the UPC nodes table.

# Load modules

In [121]:
import subprocess
import json
import pandas as pd 
from pathlib import Path

# Get Aeronautics Researchers (EETAC)

In [51]:
# Run the helper script with the argument "EETAC" and parse its stdout as JSON.
response_EETAC = subprocess.run(
    ["bash", "fetch_futur_data.sh", "EETAC"], capture_output=True, text=True
)
try:
    output = json.loads(response_EETAC.stdout)
except json.JSONDecodeError as exc:
    # An empty or garbled stdout usually means the script itself failed, so
    # report its exit status and stderr instead of a bare JSON parsing error.
    raise RuntimeError(
        "fetch_futur_data.sh returned no valid JSON for 'EETAC' "
        f"(exit status {response_EETAC.returncode}): {response_EETAC.stderr.strip()}"
    ) from exc

# Each entry is a "##"-separated string: the first token is used as the
# author's name and the second as the author's id.
author_list = output["content"][0]["secondaryFields"]["organization_en_person"]

author_data = [
    {"name": author_tokens[0], "id": author_tokens[1]}
    for author_tokens in (author_string.split("##") for author_string in author_list)
]

# Get ORCID from each researcher

In [111]:
def _to_cell(value):
    """Collapse one secondary-field value into a single table cell.

    A one-element list/tuple is unwrapped to its element, a longer one is
    stored as its string representation, and an empty one becomes None.
    Any other value is returned unchanged.
    """
    if isinstance(value, (list, tuple)):
        if not value:
            return None
        return value[0] if len(value) == 1 else str(value)
    return value


# Collect one flat record per author and build the frame once at the end:
# concatenating inside the loop copies the whole frame on every iteration, and
# building a one-row frame from a dict of mixed scalars/lists fails when every
# value is scalar or when any list is empty.
author_records = []

for n, author in enumerate(author_data):
    print(n, author)

    response_author = subprocess.run(
        ["bash", "fetch_futur_data.sh", author['id']],
        capture_output=True,
        text=True)
    try:
        output = json.loads(response_author.stdout)
    except json.JSONDecodeError as exc:
        # Report which author failed plus the script's exit status and stderr.
        raise RuntimeError(
            f"fetch_futur_data.sh returned no valid JSON for author {author['id']} "
            f"(exit status {response_author.returncode}): {response_author.stderr.strip()}"
        ) from exc
    if not output['content']:
        # Nothing returned for this author: skip it.
        continue

    author_details = {
        key: _to_cell(value)
        for key, value in output['content'][0]['secondaryFields'].items()
    }

    # Identification fields come from the author list and take precedence over
    # any same-named secondary field.
    author_details['id'] = author['id']
    author_details['name'] = author['name']
    author_details['accessURL'] = str(output['content'][0]['accessURL'])

    author_records.append(author_details)

df_authors = pd.DataFrame(author_records)

# Rearrange output: identification columns first, everything else in its
# original order.
leading_cols = ["id", "name", "person_ca_orcid"]
new_col_order = leading_cols + [col for col in df_authors.columns if col not in leading_cols]
df_authors = df_authors[new_col_order]

# Save

In [117]:
# Write the author table without the row index (index takes a bool).
df_authors.to_csv("EETAC_authors.csv", index=False)

# Load UPC nodes

In [122]:
data_folder = Path("../data")

# Create set of ORCIDs from EETAC. Missing ORCIDs are dropped first: a NaN in
# the set could otherwise match nodes whose id is also missing.
orcids_EETAC = set(df_authors["person_ca_orcid"].dropna().unique())

# Get UPC nodes
df = pd.read_csv(data_folder / "nodes_UPC_ext.csv")

# Identify UPC authors from EETAC: 1 when the node id is one of the EETAC
# ORCIDs, 0 otherwise.
df["EETAC"] = df["id"].isin(orcids_EETAC).astype("int64")

In [144]:
# Set the department to "EETAC" for every node flagged as EETAC
is_eetac = df["EETAC"].eq(1)
df.loc[is_eetac, "department"] = "EETAC"

# Save

In [149]:
# Write the flagged node table into the data folder, without the row index.
nodes_eetac_path = data_folder / "nodes_UPC_EETAC.csv"
df.to_csv(path_or_buf=nodes_eetac_path, index=False)

# EXTRA CODE

In [None]:
# Deliberate stop: this cell always raises. Presumably it is a guard so that
# "Run All" halts here and the extra cells below only run when executed by hand.
raise Exception("STOP HERE")

# Filter UPC 

Filter the UPC data to single out the UPC Escuela de Ingeniería de Telecomunicación y Aeroespacial de Castelldefels.

In [3]:
data_folder = Path("../data/")

# Only the groups table is searched below, so it is the only file loaded.
df = pd.read_csv(data_folder / "groups.csv")
search_string = "Escuela de Ingeniería de Telecomunicación"

# regex=False matches the text literally; na=False treats rows with a missing
# name as non-matching, so the mask stays strictly boolean and can be used for
# indexing.
mask = df["name"].str.contains(search_string, regex=False, na=False)

# Display the matching groups
df[mask]