# Fetch Artists Bios

Automate fetching artist bios from ACM's Morphé.

In [1]:
import pandas as pd
import requests

In [2]:
from networks import load_acm_feather

In [3]:
df = load_acm_feather()

In [8]:
architects = df.Auteurs_cleaned.explode().sort_values().dropna()

In [None]:
architects = architects.str.replace(r'^\s*([^,]+),\s*(.+)$', r'\2 \1', regex=True)

In [19]:
architects.value_counts()

Auteurs_cleaned
Alphonse Laverrière       49
Georges Epitaux           42
Charles Thévenaz          42
Fonso Boschetti           29
Jean Béguin               29
                          ..
Yvonne Hausmann-Schmid     1
Georges Hayoz              1
Clément Heaton             1
Hebeisen                   1
Maruf Ünal                 1
Name: count, Length: 4014, dtype: int64

## Standardize to Morphe's format, make requests

In [25]:
type(architects)

pandas.core.series.Series

In [37]:
architects_df = pd.DataFrame(architects.unique(), columns=["Auteurs_cleaned"])

In [38]:
architects_df

Unnamed: 0,Auteurs_cleaned
0,AA 83
1,ACAU
2,N. Abali
3,Raphaël Abbet
4,Hans Ruedi Abbühl
...,...
4009,H von Weissenfluh
4010,Hans von der Mühll
4011,Erik zu Putlitz
4012,Gündüz Özdes


In [None]:
import os
import re
import requests         # pip install requests
import unidecode        # pip install unidecode
import pandas as pd

# Make sure this directory exists (creates if not present)
os.makedirs("Data/ACM/bios", exist_ok=True)

def normalize_name(raw_name):
    """
    1) Swap 'Last, First' → 'First Last' if comma is present.
    2) Remove accents.
    3) Lowercase.
    4) Convert spaces to dashes.
    """
    # If format is "Last, First", swap to "First Last"
    swapped = re.sub(r'^\s*([^,]+),\s*(.+)$', r'\2 \1', raw_name)
    
    # Remove accents
    no_accents = unidecode.unidecode(swapped)
    
    # Lowercase
    lowercased = no_accents.lower()
    
    # Trim whitespace
    stripped = lowercased.strip()
    
    # Replace one or more spaces with a single dash
    dashed = re.sub(r'\s+', '-', stripped)
    
    return dashed


# Create a new column to store whether we found a record or not
architects_df['RecordFound'] = False

for idx, row in architects_df.iterrows():
    raw_name = str(row['Auteurs_cleaned'])
    norm_name = normalize_name(raw_name)
    
    # Construct EAC-CPF URL
    url = f"https://morphe.epfl.ch/index.php/{norm_name};eac?sf_format=xml"        
    try:
        response = requests.get(url)
        response.raise_for_status()  # raises exception if not 2xx
        
        # Save the XML in Data/ACM/bios/<norm_name>-bio.xml
        file_path = f"Data/ACM/bios/{norm_name}-bio.xml"
        with open(file_path, "wb") as f:
            f.write(response.content)
        
        # Record success
        architects_df.at[idx, 'RecordFound'] = True
        print(f"[OK] Found valid record for '{raw_name}' → saved to {file_path}")
    
    except requests.RequestException as e:
        # Record failure
        architects_df.at[idx, 'RecordFound'] = False
        print(f"[ERROR] No valid record for '{raw_name}' at {url}: {e}")


[ERROR] No valid record for 'AA 83' at https://morphe.epfl.ch/index.php/aa-83;eac?sf_format=xml: 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/aa-83;eac?sf_format=xml
[ERROR] No valid record for 'ACAU' at https://morphe.epfl.ch/index.php/acau;eac?sf_format=xml: 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/acau;eac?sf_format=xml
[ERROR] No valid record for 'N. Abali' at https://morphe.epfl.ch/index.php/n.-abali;eac?sf_format=xml: 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/n.-abali;eac?sf_format=xml
[ERROR] No valid record for 'Raphaël Abbet' at https://morphe.epfl.ch/index.php/raphael-abbet;eac?sf_format=xml: 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/raphael-abbet;eac?sf_format=xml
[ERROR] No valid record for 'Hans Ruedi Abbühl' at https://morphe.epfl.ch/index.php/hans-ruedi-abbuhl;eac?sf_format=xml: 500 Server Error: Internal Server Err

In [45]:
import os
import re
import unidecode
import requests
import pandas as pd

# Make sure "Data/ACM/bios" directory exists
os.makedirs("Data/ACM/bios", exist_ok=True)

def normalize_name(raw_name):
    """
    1) Swap "Last, First" => "First Last" if a comma is present.
    2) Remove accents.
    3) Lowercase.
    4) Replace spaces with dashes.
    """
    swapped = re.sub(r'^\s*([^,]+),\s*(.+)$', r'\2 \1', raw_name)  # "Last, First" -> "First Last"
    no_accents = unidecode.unidecode(swapped)                      # e.g. "é" -> "e"
    lowercased = no_accents.lower().strip()
    dashed = re.sub(r'\s+', '-', lowercased)                       # spaces -> dashes
    return dashed

results = []

for original_name in architects_df["Auteurs_cleaned"]:
    norm_name = normalize_name(original_name)
    filename = f"Data/ACM/bios/{norm_name}-bio.xml"

    # If file already exists, mark bio_found = True and skip the request
    if os.path.exists(filename):
        print(f"[SKIP] File already exists for '{original_name}' => {filename}")
        bio_found = True
    else:
        # Attempt the standard URL first
        url_1 = f"https://morphe.epfl.ch/index.php/{norm_name};eac?sf_format=xml"
        try:
            response = requests.get(url_1, timeout=10)
            response.raise_for_status()
            with open(filename, "wb") as f:
                f.write(response.content)
            bio_found = True
            print(f"[OK] Found standard record for '{original_name}' => {url_1}")
        except requests.exceptions.RequestException as e:
            # Attempt the "-2" URL
            url_2 = f"https://morphe.epfl.ch/index.php/{norm_name}-2;eac?sf_format=xml"
            try:
                response = requests.get(url_2, timeout=10)
                response.raise_for_status()
                with open(filename, "wb") as f:
                    f.write(response.content)
                bio_found = True
                print(f"[OK] Found '-2' record for '{original_name}' => {url_2}")
            except requests.exceptions.RequestException as e2:
                # Not found at either URL
                bio_found = False
                print(f"[ERROR] No record found for '{original_name}'. Tried:\n"
                      f"   - {url_1}\n"
                      f"   - {url_2}\n"
                      f"Exception(s):\n"
                      f"   - {e}\n"
                      f"   - {e2}")

    results.append({
        "Auteurs_cleaned": original_name,
        "normalized_name": norm_name,
        "bio_found": bio_found
    })

# Convert list of dicts to DataFrame
results_df = pd.DataFrame(results)

# Display or save results if desired
print("\n--- Results DataFrame Preview ---")
print(results_df.head(10))


[ERROR] No record found for 'AA 83'. Tried:
   - https://morphe.epfl.ch/index.php/aa-83;eac?sf_format=xml
   - https://morphe.epfl.ch/index.php/aa-83-2;eac?sf_format=xml
Exception(s):
   - 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/aa-83;eac?sf_format=xml
   - 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/aa-83-2;eac?sf_format=xml
[ERROR] No record found for 'ACAU'. Tried:
   - https://morphe.epfl.ch/index.php/acau;eac?sf_format=xml
   - https://morphe.epfl.ch/index.php/acau-2;eac?sf_format=xml
Exception(s):
   - 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/acau;eac?sf_format=xml
   - 500 Server Error: Internal Server Error for url: https://morphe.epfl.ch/index.php/acau-2;eac?sf_format=xml
[ERROR] No record found for 'N. Abali'. Tried:
   - https://morphe.epfl.ch/index.php/n.-abali;eac?sf_format=xml
   - https://morphe.epfl.ch/index.php/n.-abali-2;eac?sf_format=xml
Exception(

In [None]:
import pandas as pd

# 1. Calculate how often each name appears
counts = architects.value_counts()

# 2. Determine the cutoff for the top 5% by frequency
top_5pct_cutoff = counts.quantile(0.95)

# 3. Identify those architects whose frequency is >= the 90th percentile
top_5pct_architects = counts[counts >= top_5pct_cutoff].index

# 4. Filter the results for those in the top 5% but missing a bio
missing_in_top_5 = results_df[
    (results_df['bio_found'] == False) &
    (results_df['Auteurs_cleaned'].isin(top_5pct_architects))
]

# 5. Display or otherwise handle the architects in the top 5% with no bio found
print("Architects in the top 5% whose EAC-CPF was not found:")
print(missing_in_top_5)
missing_in_top_5.to_csv("Data/ACM/missing_top_5_percent.csv")


Architects in the top 10% whose EAC-CPF was not found:
                                        Auteurs_cleaned  \
121   Ateliers de constructions mécaniques de Vevey ...   
160                                     Pierre Baechler   
199                                      Robert R Barro   
220                                        Henry Baudin   
223                                         Robert Baum   
...                                                 ...   
3895                                      Eugène Yonner   
3918                                   Hansjörg Zentner   
3949                                 Conrad SA Zschokke   
3956                                    Michel Zufferey   
3977                                   Ed. & Cie Züblin   

                                        normalized_name  bio_found  
121   ateliers-de-constructions-mecaniques-de-vevey-...      False  
160                                     pierre-baechler      False  
199                          

In [50]:
# 235 auto-found.

# Now, I manually go through the top 5% of occuring people who are missing from bio
top_5pct_with_links = pd.read_csv("Data/ACM/missing_top_5_percent.csv")

In [51]:
for link in top_5pct_with_links.new_link:
    print(link)

N
https://morphe.epfl.ch/index.php/pierre-bechler;eac?sf_format=xml
N
N
N
N
N
https://morphe.epfl.ch/index.php/charles-eric-andre-billaud;eac?sf_format=xml
https://morphe.epfl.ch/index.php/charles-eric-andre-billaud;eac?sf_format=xml
https://morphe.epfl.ch/index.php/charles-eric-andre-billaud;eac?sf_format=xml
N
N
N
N
N
N
N
N
N
N
N
N
N
N
https://morphe.epfl.ch/index.php/j-s-buffat;eac?sf_format=xml
N
N
N
N
N
https://morphe.epfl.ch/index.php/charles-francois-chamorel-garnier;eac?sf_format=xml
https://morphe.epfl.ch/index.php/chappuis;eac?sf_format=xml
https://morphe.epfl.ch/index.php/chappuis;eac?sf_format=xml
N
N
N
N
https://morphe.epfl.ch/index.php/marc-henry-collomb;eac?sf_format=xml
N
N
N
https://morphe.epfl.ch/index.php/pierre-debrot;eac?sf_format=xml
N
N
N
N
N
https://morphe.epfl.ch/index.php/paul-dubois;eac?sf_format=xml
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
N
https://morphe.epfl.ch/index.php/f-huguenin;eac?sf_format=xml
N
N
N
N
N
N
N
N
N
N
N
N
N
N
https://morph