In [2]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import itertools

In [3]:
BASE_URL = 'https://www.louvre.fr'

In [4]:
current_appendix = '/en/selections'

In [5]:
form_url = lambda appendix: f'{BASE_URL}{appendix}'

In [6]:
def get_next_appendix(page):
    arrow = page.find('a', attrs={'class':'next-page'})
    if arrow is None:
        return arrow
    return arrow['href']

In [7]:
def parse_selected_works(page):
    uls = page.findAll('ul', attrs={'class':'list-items-1'})
    lis = [x.findAll('li') for x in uls]
    lis = list(itertools.chain(*lis))
    return {x.find('h2').text : x.find('a')['href'] for x in lis}

In [8]:
def scrape_collections(current_appendix):
    works_collections = {}

    while current_appendix is not None:
        url = form_url(current_appendix)
        page = urllib.request.urlopen(url)
        page_soup = BeautifulSoup(page, 'html.parser')
        works_collections = {**works_collections, **parse_selected_works(page_soup)}
        
        current_appendix = get_next_appendix(page_soup)
    
    return works_collections

In [9]:
collections = scrape_collections(current_appendix)

In [10]:
def parse_artifacts(page):
    links = page_soup.findAll('a', attrs={'class':'roll-4'})
    result = {}
    for l in links:
        image_link = l.find('img')['src']
        item_link = l['href']
        name = l.find('img')['alt']
        result[name] = [item_link, image_link]
    return result

In [11]:
for key, item in collections.items():
    current_appendix = str(item)
    collections[key] = []
    while current_appendix is not None:
        url = form_url(current_appendix)
        print(url)
        page = urllib.request.urlopen(url)
        page_soup = BeautifulSoup(page, 'html.parser')
        
        collections[key].append(parse_artifacts(page_soup))
        
        current_appendix = get_next_appendix(page_soup)

https://www.louvre.fr/en/selections/masterpieces
https://www.louvre.fr/en/selections/masterpieces?page=1
https://www.louvre.fr/en/selections/masterpieces?page=2
https://www.louvre.fr/en/selections/french-revolution
https://www.louvre.fr/en/selections/napoleon-1769-1821
https://www.louvre.fr/en/selections/napoleon-1769-1821?page=1
https://www.louvre.fr/en/selections/louis-xiv-1638-1715
https://www.louvre.fr/en/selections/major-events-history
https://www.louvre.fr/en/selections/major-events-history?page=1
https://www.louvre.fr/en/selections/travel
https://www.louvre.fr/en/selections/travel?page=1
https://www.louvre.fr/en/selections/art-portraiture
https://www.louvre.fr/en/selections/art-portraiture?page=1
https://www.louvre.fr/en/selections/landscapes
https://www.louvre.fr/en/selections/landscapes?page=1
https://www.louvre.fr/en/selections/jewelry
https://www.louvre.fr/en/selections/jewelry?page=1
https://www.louvre.fr/en/selections/music
https://www.louvre.fr/en/selections/music?page=1


{'Masterpieces': [{'Aphrodite, known as the "Venus de Milo"': ['/en/oeuvre-notices/aphrodite-known-venus-de-milo?selection=44909',
    'https://www.louvre.fr/sites/default/files/imagecache/140x87/medias/medias_images/images/louvre-aphrodite-dite-venus-milo_0.jpg?1527823809'],
   'The "Regent" Diamond': ['/en/oeuvre-notices/diamond-known-regent?selection=44909',
    'https://www.louvre.fr/sites/default/files/imagecache/140x87/medias/medias_images/images/louvre-diamant-dit-quotle-regentquot.jpg?1527828413'],
   'Frise des archers': ['/en/oeuvre-notices/frieze-archers?selection=44909',
    'https://www.louvre.fr/sites/default/files/imagecache/140x87/medias/medias_images/images/louvre-frise-des-archers_1.jpg?1527823475'],
   'Horse Restrained by a Groom, called "Horse of Marly"': ['/en/oeuvre-notices/horses-restrained-grooms-known-marly-horses?selection=44909',
    'https://www.louvre.fr/sites/default/files/imagecache/140x87/medias/medias_images/images/louvre-cheval-retenu-par-palefrenier.

In [14]:
pd.DataFrame.from_dict(collections, orient='index', columns=['collection', 'name','image_source', 'page_source'])

Unnamed: 0,collection,name,image_source,page_source
Masterpieces,"{'Aphrodite, known as the ""Venus de Milo""': ['...",{'Portrait of the Artist Holding a Thistle': [...,{'The Winged Victory of Samothrace': ['/en/oeu...,
The French Revolution,{'Portrait bust of Nathalie de Laborde (1774-1...,,,
Napoleon (1769-1821),{'Athénienne de Napoléon Ier': ['/en/oeuvre-no...,{'L'Impératrice Joséphine (1763 - 1814)': ['/e...,,
Louis XIV (1638-1715),{'Equestrian Portrait of Pierre Séguier (1588-...,,,
Major Events in History,"{'Alexander Entering Babylon, or The Triumph o...","{'The Barricade, Rue de la Mortellerie, June 1...",,
Travel...,{'Carriage on the Beach at Scheveningen': ['/e...,{'Arrival of a Stagecoach in the Cour des Mess...,,
The Art of Portraiture,{'Equestrian statue: Charlemagne or Charles th...,{'Portrait of Alexander the Great (356-323 BC)...,,
Landscapes,{'Landscape Inspired by the View of Frascati (...,{'The Fall of Phaeton': ['/en/oeuvre-notices/t...,,
Jewelry,{'Piece of jewellery composed of beads': ['/en...,{'Pendentif au nom du roi Osorkon II : la fami...,,
Music,{'Harpe triangulaire': ['/en/oeuvre-notices/an...,{'Sainte Cécile à l'orgue entourée d'anges': [...,,


In [15]:
def flatten_dict(d, prefix='__'):
    def items():
        # A clojure for recursively extracting dict like values
        for key, value in d.items():
            if isinstance(value, dict):
                for sub_key, sub_value in flatten_dict(value).items():
                    # Key name should imply nested origin of the dict,
                    # so we use a default prefix of __ instead of _ or .
                    yield key + prefix + sub_key, sub_value
            else:
                yield key, value
    return dict(items())

In [None]:
flatten_dict(col)