In [1]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.request import urlretrieve
import os.path

In [27]:
def get_soup(url):
    
    req = requests.get(url)

    html_doc = req.text

    soup = BeautifulSoup(html_doc)
    
    return soup

In [161]:
def painting_downloader(soup, painting_name):
    
    get_img_wrapper = soup.find('img', {'itemprop':'image'})
    
    img_url = get_img_wrapper.get('src')

    extension = img_url.split('.')[-1]
    output_name = painting_name.replace(' ','-')
    output_name = output_name.lower()
    filename = output_name + '.' + extension
    
    if os.path.isfile(filename):
        return print("A file with the name '{}' already exists.".format(filename))
    else :      
        urlretrieve(img_url, filename)
        return print("Painting downloaded as '{}'".format(filename))

In [158]:
def get_info_from_soup(soup):

    get_article = soup.find('article')
    
    info = []
    
    ######## Creator
    
    creator = get_article.find('span', {'itemprop':'name'})
    if creator is not None:
        creator = re.sub('<.*?>', "", str(creator))
        creator = re.sub('\n', "", str(creator)).strip()
    info = info + [creator]
    
    ######## Name
    
    name = get_article.find('h3')
    if name is not None:
        name = re.sub('<.*?>', "", str(name))
        name = re.sub('\xa0', " ", str(name))
        name = re.sub('\n', "", str(name)).strip()
    info = info + [name]
    
    ######## Date

    date = get_article.find('span', {'itemprop':'dateCreated'})
    if date is not None:
        date = re.sub('<.*?>', "", str(date))
        date = re.sub('\n', "", str(date)).strip()
    info = info + [date]
    
    ######## Place
    
    place = get_article.find('span', {'itemprop':'locationCreated'})
    if place is not None:
        place = re.sub('<.*?>', "", str(place))
        place = re.sub('\n', "", str(place)).strip()
    info = info + [place]
    
    ######## Style
    
    style = get_article.find_all('li')
    
    for li in style:
        if 'Style' in str(li):
            style = li
            style = re.sub('<.*?>', "", str(style))
            style = re.sub('\n', "", str(style))
            style = style.split(':')[1].strip()
            break
        else:
            style = None
        
    info = info + [style]
    
    ######## Genre
    
    genre = get_article.find('span', {'itemprop':'genre'})
    if genre is not None:
        genre = re.sub('<.*?>', "", str(genre))
        genre = re.sub('\n', "", str(genre)).strip()
    info = info + [genre]
    
    ######## Media
    
    media = get_article.find_all('li')
    
    for li in media:
        if 'Media' in str(li):
            media = li
            media = re.sub('<.*?>', "", str(media))
            media = re.sub('\n', "", str(media))
            media = media.split(':')[1].strip()
            break
        else:
            media = None
        
    info = info + [media]
    
    ######## Location
    
    location = get_article.find_all('li')
    
    for li in location:
        if 'Location' in str(li):
            location = li
            location = re.sub('<.*?>', "", str(location))
            location = re.sub('\n', "", str(location))
            location = location.split(':')[1].strip()
            break
        else:
            location = None
        
    info = info + [location]
    
    ######## Dimensions
    
    dimensions = get_article.find_all('li')
    
    for li in dimensions:
        if 'Dimensions' in str(li):
            dimensions = li
            dimensions = re.sub('<.*?>', "", str(dimensions))
            dimensions = re.sub('\n', "", str(dimensions))
            dimensions = dimensions.split(':')[1].strip()
            break
        else:
            dimensions = None
        
    info = info + [dimensions]

    return info

In [164]:
# url_rand = 'https://www.wikiart.org/en/App/Painting/random'
# soup_rand = get_soup(url_rand)
name = get_info_from_soup(soup_rand)[1]
name
painting_downloader(soup_rand,name)

Painting downloaded as 'no-kakinomoto-hitomaro.jpg'


In [82]:
url_mona = 'https://www.wikiart.org/en/leonardo-da-vinci/mona-lisa'
soup_mona = get_soup(url_mona)

# url_apelles = 'https://www.wikiart.org/en/sandro-botticelli/calumny-of-apelles'
# soup_apelles = get_soup(url_apelles)

# url = 'https://www.wikiart.org/en/leonardo-da-vinci/the-virgin-of-the-rocks'
# soup_virgin = get_soup(url)

['Leonardo da Vinci', 'Mona Lisa', '1504', 'Florence', 'High Renaissance', 'portrait', 'oil,panel', 'Louvre, Paris, France', '53 x 77 cm']


In [29]:
print(get_info_from_soup(soup_mona))
print(get_info_from_soup(soup_apelles))
print(get_info_from_soup(soup_virgin))

['Leonardo da Vinci', 'Mona Lisa', '1504', 'Florence', 'High Renaissance', 'portrait', 'oil,panel', 'Louvre, Paris, France', '53 x 77 cm']
['Sandro Botticelli', 'Calumny of Apelles', '1495', None, 'Early Renaissance', 'literary painting', 'oil,panel,tempera', 'Uffizi Gallery, Florence, Italy', '91 x 62 cm']
['Leonardo da Vinci', 'The Virgin of the Rocks', '1505', 'Florence', 'High Renaissance', 'religious painting', 'oil,panel', 'National Gallery, London, UK', '189.5 x 120 cm']


In [92]:
N = 100
keys = ['creator', 'name', 'year', 'place', 'style', 'genre', 'media', 'location', 'dimension']
paintings_info_list = [[0 for x in keys] for y in range(N)]

url_rand = 'https://www.wikiart.org/en/App/Painting/random'
for i in range(N):
    soup_rand = get_soup(url_rand)
    paintings_info_list[i] = get_info_from_soup(soup_rand)

df = pd.DataFrame(paintings_info_list, columns=keys)

In [153]:
df['creator'].value_counts()


Roger Weik                   8
Gustave Dore                 3
Richard Jack                 3
Vincent van Gogh             2
Nicholas Roerich             2
Wojciech Siudmak             2
Yamamura Toyonari            1
Sigmar Polke                 1
Max Gubler                   1
Leon Berkowitz               1
William Merritt Chase        1
Giuseppe Arcimboldo          1
Tess Jaray                   1
James Charles                1
Julian Schnabel              1
Antonio Carneiro             1
William Hart                 1
Mary Pratt                   1
Paolo Veronese               1
Vladimir Borovikovsky        1
Andre Lanskoy                1
Dante Gabriel Rossetti       1
Giacomo Balla                1
Raoul Ubac                   1
Vasile Kazar                 1
Claude Lorrain               1
Tea Jorjadze                 1
Boris Kustodiev              1
Henri de Toulouse-Lautrec    1
Karl Schrag                  1
                            ..
Aleksandr Deyneka            1
Ernst Lu

In [152]:
df.groupby('creator').filter(lambda x: x['name'].count() >= 2)

Unnamed: 0,creator,name,year,place,style,genre,media,location,dimension
1,Gustave Dore,The Embarkation of the Souls,,,Romanticism,illustration,,,
2,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,
11,Wojciech Siudmak,Three Graces,,,Fantastic Realism,symbolic painting,,,
30,Richard Jack,The Swedish Dyehouse,,,Impressionism,genre painting,,,
35,Gustave Dore,David Shows Saul How He Spared His Life,,,Romanticism,religious painting,,,
39,Vincent van Gogh,Stooping Woman in Landscape,1883.0,,Realism,sketch and study,"ink,paper","Van Gogh Museum, Amsterdam, Netherlands",
42,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,
48,Richard Jack,The Swedish Dyehouse,,,Impressionism,genre painting,,,
62,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,
64,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,


In [139]:
repeated.filter(lambda x: x['creator'].count() >= 2)

pandas.core.series.Series