In [None]:
import json
import os
import sys
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Make the project's `src` folder (two levels above the working directory)
# importable so that the path constants can be loaded from `config.paths`.
project_src = Path.cwd().resolve().parents[1] / 'src'
src = str(project_src)
sys.path.append(src)
from config.paths import DATA, EXPERIMENTS, METADATA


In [None]:
# Folders under METADATA/acquisition: one holds the per-museum URL lists,
# the other receives the TSV files produced by this notebook.
acquisition_dir = os.path.join(METADATA, 'acquisition')
musees_listes = os.path.join(acquisition_dir, 'musees_listes')
musees_tsv = os.path.join(acquisition_dir, 'musees_tsv')

# Base POP

In [None]:
# Input list of POP notice URLs and the TSV file the scraped records go to.
liste_path, output_tsv = (
    os.path.join(musees_listes, 'pop.txt'),
    os.path.join(musees_tsv, 'base_pop.tsv'),
)


In [None]:
# Read the POP notice URLs, one per line. The mode must be passed before the
# `encoding` keyword (a positional argument after a keyword one is a SyntaxError).
# Each line is stripped so the trailing newline does not end up in the URL,
# and blank lines are ignored.
with open(liste_path, 'r', encoding='UTF-8') as f:
    liste_urls = [line.strip() for line in f if line.strip()]

In [None]:
def get_notice(soup):
    """Return the ``notice`` dictionary embedded in a parsed POP page.

    The page is expected to carry its data as JSON inside the
    ``<script id="__NEXT_DATA__">`` tag, under ``props.pageProps.notice``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the page (any object exposing a compatible ``find``).

    Returns
    -------
    dict
        The notice, or an empty dict when the script tag, its content, or any
        level of the ``props -> pageProps -> notice`` path is missing or null.

    Raises
    ------
    json.JSONDecodeError
        If the script tag content is not valid JSON.
    """
    script_tag = soup.find('script', id='__NEXT_DATA__')
    # Pages without the embedded payload yield an empty notice
    # instead of an AttributeError / TypeError.
    if script_tag is None or not script_tag.string:
        return {}

    data = json.loads(script_tag.string)

    # `or {}` also covers keys that are present but hold JSON null.
    node = data if isinstance(data, dict) else {}
    for key in ("props", "pageProps", "notice"):
        node = node.get(key) or {}
        if not isinstance(node, dict):
            return {}
    return node


In [None]:


def process_pop_list(url_list, keep_image_urls=False, timeout=30):
    """Scrape a list of POP notice pages into a DataFrame.

    Parameters
    ----------
    url_list : iterable of str
        Notice URLs. Surrounding whitespace (e.g. the newline left by
        ``readlines``) is stripped and blank entries are skipped.
    keep_image_urls : bool, default False
        When True, an extra ``image_urls`` column holds the image sources found
        on the page. By default the frame only has the twelve standard columns.
    timeout : float, default 30
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per URL. Fields absent from the notice hold the string 'Nan'.
        ``Fichiers_img`` is a list of generated file names, one per image found.
    """
    columns = ['Titre', 'Auteur', 'Date', 'Description', 'Technique', 'Lien',
               'Fichiers_img', 'Institution', 'Ville', 'Num_inventaire', 'Qualite_img', 'Bibliographie']
    out_columns = columns + ['image_urls'] if keep_image_urls else columns

    # Loop-invariant: built once instead of on every iteration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
    }

    def as_text(value):
        """Coerce a notice field to str so string methods can be applied to it."""
        if value is None:
            return 'Nan'
        if isinstance(value, (list, tuple)):
            return ';'.join(str(item) for item in value)
        return str(value)

    records = []
    for raw_url in url_list:
        url_pop = raw_url.strip()
        if not url_pop:
            continue

        result = requests.get(url_pop, headers=headers, timeout=timeout)
        soup = BeautifulSoup(result.content, 'html.parser')
        notice = get_notice(soup)

        row = dict.fromkeys(columns, 'Nan')
        row['Titre'] = notice.get('TITR') or notice.get('APPL') or 'Nan'
        row['Auteur'] = notice.get('AUTR', 'Nan')
        row['Date'] = notice.get('MILL', 'Nan')
        row['Description'] = notice.get("DESC", 'Nan')
        row['Technique'] = notice.get("TECH", 'Nan')
        row['Lien'] = url_pop
        row['Institution'] = as_text(notice.get("NOMOFF", 'Nan'))
        row['Ville'] = notice.get("VILLE_M", 'Nan')
        row['Bibliographie'] = notice.get("BIBL", 'Nan')

        # Only the first inventory number is kept when several are ';'-separated.
        row['Num_inventaire'] = as_text(notice.get("INV", 'Nan')).split(';')[0]

        # Skip <img> tags without a src attribute instead of raising KeyError.
        image_urls = [img.get("src") for img in soup.select(".swiper-slide img") if img.get("src")]

        # File names follow <institution>_<inventory number>-<NN>.jpg, with
        # spaces and dots replaced so the name stays file-system friendly.
        if row['Institution'] != 'Nan':
            inst = row['Institution'].replace(' ', '')
        else:
            inst = 'UnknownInstitution'
        if row['Num_inventaire'] != 'Nan':
            inv_num = row['Num_inventaire'].replace('.', '--').replace(' ', '__')
        else:
            inv_num = 'UnknownInvNum'
        row['Fichiers_img'] = [
            f'{inst}_{inv_num}-{position:02d}.jpg'
            for position in range(1, len(image_urls) + 1)
        ]

        if keep_image_urls:
            row['image_urls'] = image_urls
        records.append(row)

    # Build the frame once from the collected rows rather than growing it row by row.
    return pd.DataFrame(records, columns=out_columns)

In [None]:
# Scrape every POP notice from the URL list, then export the records as TSV.
# URLs are stripped here so a trailing newline never reaches the request or the `Lien` column.
df_corpus = process_pop_list([url.strip() for url in liste_urls if url.strip()])
df_corpus.to_csv(output_tsv, encoding='UTF-8', sep='\t', index=False)

# Victoria and Albert

In [None]:
# Input list of V&A item URLs and the TSV file the collected records go to.
liste_va, output_tsv = (
    os.path.join(musees_listes, 'va.txt'),
    os.path.join(musees_tsv, 'VA.tsv'),
)

In [None]:
# Read the V&A item URLs, one per line. The mode must be passed before the
# `encoding` keyword (a positional argument after a keyword one is a SyntaxError).
with open(liste_va, 'r', encoding='UTF-8') as f:
    liste_liens = f.readlines()

In [None]:
# Remove surrounding whitespace (including the newline) from each link, then display the list.
liste_liens = list(map(str.strip, liste_liens))
liste_liens

['https://collections.vam.ac.uk/item/O160903/',
 'https://collections.vam.ac.uk/item/O282160/',
 'https://collections.vam.ac.uk/item/O276333/',
 'https://collections.vam.ac.uk/item/O175503/',
 'https://collections.vam.ac.uk/item/O122941/',
 'https://collections.vam.ac.uk/item/O84929/',
 'https://collections.vam.ac.uk/item/O282141/',
 'https://collections.vam.ac.uk/item/O122993/',
 'https://collections.vam.ac.uk/item/O175496/',
 'https://collections.vam.ac.uk/item/O308219/',
 'https://collections.vam.ac.uk/item/O332154/',
 'https://collections.vam.ac.uk/item/O332155/',
 'https://collections.vam.ac.uk/item/O150300/',
 'https://collections.vam.ac.uk/item/O281915/',
 'https://collections.vam.ac.uk/item/O282011/',
 'https://collections.vam.ac.uk/item/O161558/',
 'https://collections.vam.ac.uk/item/O77990/',
 'https://collections.vam.ac.uk/item/O77759/',
 'https://collections.vam.ac.uk/item/O11149/',
 'https://collections.vam.ac.uk/item/O150353/',
 'https://collections.vam.ac.uk/item/O150300

In [None]:
def get_manifest(manifest_url, timeout=30):
    """Download a JSON document and return it decoded.

    Parameters
    ----------
    manifest_url : str
        URL of the JSON resource.
    timeout : float, default 30
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    The decoded JSON body (a dict for the API objects used in this notebook).

    Raises
    ------
    requests.HTTPError
        If the server answers with any status other than 200. An exception
        names the failing URL and can be caught by the caller, whereas
        ``exit()`` would terminate the session.
    """
    result = requests.get(manifest_url, timeout=timeout)
    if result.status_code != 200:
        raise requests.HTTPError(
            f"Erreur {result.status_code} for {manifest_url}", response=result
        )
    return result.json()

In [None]:
def get_va_data(link):
    """Fetch one V&A API object and flatten the fields used by the corpus.

    Parameters
    ----------
    link : str
        URL of the object's JSON description, passed to ``get_manifest``.

    Returns
    -------
    dict
        Keys, in this order: ``date``, ``description``, ``image_url``,
        ``mus_id``, ``num_inv``, ``artistMakerPerson``,
        ``artistMakerOrganisations``, ``technique``. Every value is None when
        the information is absent, except ``technique``, which is a (possibly
        empty) list of category labels.
    """
    def first_text(entries, field):
        """Return ``entries[0][field]['text']``, or None when any level is missing."""
        if not entries:
            return None
        return (entries[0].get(field) or {}).get('text')

    json_item = get_manifest(link)
    record = json_item.get('record') or {}

    # The thumbnail URL carries a '!100,100' size token; swapping it for 'full'
    # requests the full-size image. Objects without a thumbnail keep None.
    images = (json_item.get('meta') or {}).get('images') or {}
    primary_thumbnail = images.get('_primary_thumbnail')
    image_url = primary_thumbnail.replace('!100,100', 'full') if primary_thumbnail else None

    # Categories lacking a 'text' label are skipped rather than raising KeyError.
    technique = [entry['text'] for entry in (record.get('categories') or []) if 'text' in entry]

    return {
        # Only the first production date / maker is kept, as before; an empty
        # list now gives None instead of an IndexError (date) or [] (makers).
        'date': first_text(record.get('productionDates'), 'date'),
        'description': record.get('objectType'),
        'image_url': image_url,
        'mus_id': record.get('systemNumber'),
        'num_inv': record.get('accessionNumber'),
        'artistMakerPerson': first_text(record.get('artistMakerPerson'), 'name'),
        'artistMakerOrganisations': first_text(record.get('artistMakerOrganisations'), 'name'),
        'technique': technique,
    }


In [None]:
# Query the V&A API for every catalogue link and export the records as TSV.
liste_dico = []
for lien in liste_liens:
    # The public item page and the API object share the same identifier.
    json_link = lien.replace('https://collections.vam.ac.uk/item/', 'https://api.vam.ac.uk/v2/object/')
    print(json_link)
    # The former `image_url` / `object_id` locals were never used, and building
    # `object_id` raised AttributeError whenever `num_inv` was None.
    liste_dico.append(get_va_data(json_link))

df = pd.DataFrame(liste_dico)
df.to_csv(output_tsv, encoding='utf-8', sep='\t', index=False)


https://api.vam.ac.uk/v2/object/O160903/
https://api.vam.ac.uk/v2/object/O282160/
https://api.vam.ac.uk/v2/object/O276333/
https://api.vam.ac.uk/v2/object/O175503/
https://api.vam.ac.uk/v2/object/O122941/
https://api.vam.ac.uk/v2/object/O84929/
https://api.vam.ac.uk/v2/object/O282141/
https://api.vam.ac.uk/v2/object/O122993/
https://api.vam.ac.uk/v2/object/O175496/
https://api.vam.ac.uk/v2/object/O308219/
https://api.vam.ac.uk/v2/object/O332154/
https://api.vam.ac.uk/v2/object/O332155/
https://api.vam.ac.uk/v2/object/O150300/
https://api.vam.ac.uk/v2/object/O281915/
https://api.vam.ac.uk/v2/object/O282011/
https://api.vam.ac.uk/v2/object/O161558/
https://api.vam.ac.uk/v2/object/O77990/
https://api.vam.ac.uk/v2/object/O77759/
https://api.vam.ac.uk/v2/object/O11149/
https://api.vam.ac.uk/v2/object/O150353/
https://api.vam.ac.uk/v2/object/O150300/
https://api.vam.ac.uk/v2/object/O1104561/
https://api.vam.ac.uk/v2/object/O150352/
https://api.vam.ac.uk/v2/object/O1104615/
https://api.vam.ac

# RMN

In [None]:
# Input list of RMN object URLs and the TSV file the scraped records go to.
liste_rmn, output_tsv = (
    os.path.join(musees_listes, 'rmn.txt'),
    os.path.join(musees_tsv, 'RMN.tsv'),
)

In [None]:
def extract_data_rmn(object_url, delay=3, timeout=30):
    """Scrape the metadata of one RMN object page.

    Parameters
    ----------
    object_url : str
        URL of the object page.
    delay : float, default 3
        Seconds to sleep after the page has been parsed, to throttle
        successive calls. Pass 0 to disable.
    timeout : float, default 30
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    dict
        Keys ``titre``, ``auteur``, ``date``, ``technique``, ``lien``,
        ``system_id``, ``localisation``, ``num_inventaire`` and ``image_url``.
        Fields not found on the page are None.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }

    r = requests.get(object_url, headers=headers, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def get_text_by_class(class_name):
        """Return the whitespace-normalised text of the first matching <div>, or None."""
        el = soup.find('div', class_=class_name)
        if el:
            return ' '.join(el.get_text(strip=True).split())
        return None

    auteur = get_text_by_class("metafield metafield-247_062")
    num_inventaire = get_text_by_class("metafield metafield-247_240")
    localisation = get_text_by_class("metafield metafield-247_055")

    # Date: use the primary date field; otherwise fall back to the end date,
    # prefixed with "av.".
    date = get_text_by_class("metafield metafield-2_055")
    if not date:
        end_date = get_text_by_class("metafield metafield-241_016")
        if end_date:
            date = f"av. {end_date}"

    # Technique: every <span> of the field, joined with commas.
    tech_el = soup.find('div', class_="metafield metafield-247_065")
    technique = None
    if tech_el:
        technique = ', '.join(span.get_text(strip=True) for span in tech_el.find_all('span'))

    # Title priority: dedicated headline field, then <title> (text before the
    # first '|'), then the first <h1> of the page.
    titre = None
    h1_meta = soup.find("h1", class_="metafield metafield-2_204 pm_headline previewmeta-headline")
    if h1_meta:
        span = h1_meta.find("span", class_="metadata-value")
        if span:
            titre = span.get_text(strip=True)
    if not titre and soup.title:
        titre = soup.title.get_text().split('|')[0].strip()
    if not titre:
        h1 = soup.find('h1')
        if h1:
            titre = h1.get_text(strip=True)

    # Main image from the og:image meta tag; `.get` tolerates a tag without `content`.
    og_img = soup.find("meta", property="og:image")
    main_image_url = og_img.get("content") if og_img else None

    if delay:
        time.sleep(delay)

    return {
        'titre': titre,
        'auteur': auteur,
        'date': date,
        'technique': technique,
        'lien': object_url,
        # Last path segment of the URL; a trailing '/' no longer gives an empty id.
        'system_id': object_url.rstrip('/').split('/')[-1],
        'localisation': localisation,
        'num_inventaire': num_inventaire,
        'image_url': main_image_url
    }

In [None]:
# Load the RMN object URLs as raw lines (line endings are kept at this stage).
with open(liste_rmn, mode='r', encoding='UTF-8') as f:
    urls = list(f)

In [None]:
# Remove surrounding whitespace (including the newline) from each URL, then display the list.
urls = list(map(str.strip, urls))
urls

['https://images.grandpalaisrmn.fr/ark:/36255/15-612680',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612665',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612694',
 'https://images.grandpalaisrmn.fr/ark:/36255/10-545668',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612679',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612695',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612640',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612662',
 'https://images.grandpalaisrmn.fr/ark:/36255/09-509814',
 'https://images.grandpalaisrmn.fr/ark:/36255/10-545667',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612647',
 'https://images.grandpalaisrmn.fr/ark:/36255/94-055751',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612666',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612661',
 'https://images.grandpalaisrmn.fr/ark:/36255/15-612646',
 'https://images.grandpalaisrmn.fr/ark:/36255/10-545666',
 'https://images.grandpalaisrmn.fr/ark:/36255/05-520991',
 'https://imag

In [None]:
# `unique_links` was not defined anywhere in the notebook: build it from `urls`,
# dropping blanks and duplicates while keeping the original order, so that each
# object page is requested only once.
unique_links = list(dict.fromkeys(url.strip() for url in urls if url.strip()))
data_rmn = [extract_data_rmn(url) for url in unique_links]

In [None]:

# Export the RMN records as TSV. The destination is `output_tsv`, defined at the
# top of the RMN section; `output_dir` does not exist in this notebook.
df_rmn = pd.DataFrame(data_rmn)
df_rmn.to_csv(output_tsv, encoding='UTF-8', sep='\t', index=False)
