In [1]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import itertools

In [2]:
BASE_URL = 'https://www.louvre.fr'

In [3]:
current_appendix = '/en/selections'

In [4]:
form_url = lambda appendix: f'{BASE_URL}{appendix}'

In [5]:
def get_next_appendix(page):
    arrow = page.find('a', attrs={'class':'next-page'})
    if arrow is None:
        return arrow
    return arrow['href']

In [6]:
def parse_selected_works(page):
    uls = page.findAll('ul', attrs={'class':'list-items-1'})
    lis = [x.findAll('li') for x in uls]
    lis = list(itertools.chain(*lis))
    return {x.find('h2').text : x.find('a')['href'] for x in lis}

In [7]:
def scrape_collections(current_appendix):
    works_collections = {}

    while current_appendix is not None:
        url = form_url(current_appendix)
        page = urllib.request.urlopen(url)
        page_soup = BeautifulSoup(page, 'html.parser')
        works_collections = {**works_collections, **parse_selected_works(page_soup)}
        
        current_appendix = get_next_appendix(page_soup)
    
    return works_collections

In [8]:
collections = scrape_collections(current_appendix)

In [9]:
def parse_artifacts(page):
    links = page_soup.findAll('a', attrs={'class':'roll-4'})
    result = {}
    for l in links:
        image_link = form_url(l.find('img')['src'])
        item_link = l['href']
        name = l.find('img')['alt']
        result[name] = [item_link, image_link]
    return result

In [10]:
for key, item in collections.items():
    current_appendix = str(item)
    collections[key] = []
    while current_appendix is not None:
        url = form_url(current_appendix)
        print(url)
        page = urllib.request.urlopen(url)
        page_soup = BeautifulSoup(page, 'html.parser')
        
        collections[key].append(parse_artifacts(page_soup))
        

        current_appendix = get_next_appendix(page_soup)

https://www.louvre.fr/en/selections/masterpieces
https://www.louvre.fr/en/selections/masterpieces?page=1
https://www.louvre.fr/en/selections/masterpieces?page=2
https://www.louvre.fr/en/selections/french-revolution
https://www.louvre.fr/en/selections/napoleon-1769-1821
https://www.louvre.fr/en/selections/napoleon-1769-1821?page=1
https://www.louvre.fr/en/selections/louis-xiv-1638-1715
https://www.louvre.fr/en/selections/major-events-history
https://www.louvre.fr/en/selections/major-events-history?page=1
https://www.louvre.fr/en/selections/travel
https://www.louvre.fr/en/selections/travel?page=1
https://www.louvre.fr/en/selections/art-portraiture
https://www.louvre.fr/en/selections/art-portraiture?page=1
https://www.louvre.fr/en/selections/landscapes
https://www.louvre.fr/en/selections/landscapes?page=1
https://www.louvre.fr/en/selections/jewelry
https://www.louvre.fr/en/selections/jewelry?page=1
https://www.louvre.fr/en/selections/music
https://www.louvre.fr/en/selections/music?page=1


In [11]:
item_cols = ['collection', 'artifact_name', 
             'artifact_link', 'image_link', 
             'text', 'creation_info',
             'physical_characteristics', 'acquired_by', 'category']
dataframe = pd.DataFrame(columns=item_cols)

In [12]:
def scrape_artifact_page(appendix):
    url = form_url(appendix)
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    
    result = {}
    
    text = soup.find('div', attrs={'class':'col-desc'}).text
    result['text'] = text
    
    category = soup.find('p', attrs={'class':'department'}).text
    result['category'] = category
    
    infos = soup.find('div', attrs={'class':'box-cartel'}).findAll('li')
    result['artifact_link'] = url
    result['creation_info'] = infos[0].text
    result['physical_characteristics'] = infos[1].text
    result['acquired_by'] = infos[2].text
    
    return result

In [13]:
for collection, items in collections.items():
    print(f"Scraping {collection} collection")
    for item in items:
        for name, links in item.items():
            row = {}
            row['collection'] = collection
            row['artifact_name'] = name
            row['image_link'] = links[1]
            artifact_info = scrape_artifact_page(links[0])
            dataframe = dataframe.append({**artifact_info, **row}, ignore_index=True)

Scraping Masterpieces collection
Scraping The French Revolution collection
Scraping Napoleon (1769-1821) collection
Scraping Louis XIV (1638-1715) collection
Scraping Major Events in History collection
Scraping Travel... collection
Scraping The Art of Portraiture collection
Scraping Landscapes collection
Scraping Jewelry collection
Scraping Music collection
Scraping Animals collection
Scraping Time collection
Scraping Heroes collection
Scraping Children collection
Scraping Saints collection
Scraping Let There be Light! collection
Scraping Mirror, Mirror on the Wall! collection
Scraping At the Seaside… collection
Scraping Hunting and Fishing collection
Scraping Writing collection
Scraping Beauty collection
Scraping City Defenses collection
Scraping Remarkable Enigmas collection
Scraping Gastronomy collection
Scraping Smiles collection
Scraping Leisure & Celebration collection
Scraping Game’s On! collection
Scraping Blue collection
Scraping Kings, Queens and Emperors collection


In [14]:
SAVING_PATH = 'raw_data/louvre_artifacts.csv'

In [15]:
dataframe.head()

Unnamed: 0,collection,artifact_name,artifact_link,image_link,text,creation_info,physical_characteristics,acquired_by,category
0,Masterpieces,"Aphrodite, known as the ""Venus de Milo""",https://www.louvre.fr/en/oeuvre-notices/aphrod...,https://www.louvre.frhttps://www.louvre.fr/sit...,\nThis graceful statue of a goddess has intrig...,"\n\n\nAphrodite, known as the ""Venus de Milo""\...",\n\nH. 2.02 m\n,\n\n Gift of the marquis de Rivière t...,"\nGreek, Etruscan, and Roman Antiquities\nHell..."
1,Masterpieces,"The ""Regent"" Diamond",https://www.louvre.fr/en/oeuvre-notices/diamon...,https://www.louvre.frhttps://www.louvre.fr/sit...,"\n\nDiscovered in 1698 in Golconda, India, thi...","\n\n\nThe ""Regent"" Diamond\n\n\n\n",\n140.64 metric carats\n\n,\n\n Former Crown Diamonds collection...,\nDecorative Arts\n18th century: rococo
2,Masterpieces,Frise des archers,https://www.louvre.fr/en/oeuvre-notices/frieze...,https://www.louvre.frhttps://www.louvre.fr/sit...,\nThis decorative frieze of polychrome glazed ...,\n\n\nFrise des archers\nEpoque achéménideRègn...,\nBriques siliceuses à glaçure\nH. 4.75 m; W. ...,"\n\n Mission Dieulafoy, 1885 - 1886\t...",\nNear Eastern Antiquities\nIran
3,Masterpieces,"Horse Restrained by a Groom, called ""Horse of ...",https://www.louvre.fr/en/oeuvre-notices/horses...,https://www.louvre.frhttps://www.louvre.fr/sit...,\nThese two large marble sculptures representi...,"\nGuillaume COUSTOU \n\t\t (Lyon, 1677 - Par...",\nCarrara marble\nH. 3.40 m; W. 2.84 m; D. 1.2...,\n\n\n\nM.R. 1802\n,"\nSculptures\nFrance, 17th and 18th centuries ..."
4,Masterpieces,"July 28. Liberty Leading the People (July 28, ...",https://www.louvre.fr/en/oeuvre-notices/july-2...,https://www.louvre.frhttps://www.louvre.fr/sit...,"\nThe Paris uprising of July 27, 28, and 29, 1...",\nEugène DELACROIX \n\t\t (Charenton-Saint-M...,\n\nH. 2.60 m; W. 3.25 m\n,"\n\n Acquired in 1983\t\t\t\t\t, ...",\nPaintings\nFrench painting


In [16]:
dataframe.to_csv(SAVING_PATH)

In [17]:
len(dataframe)

450