<h1>Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [19]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.request import urlretrieve
import os.path
import pandas as pd

# Getting a painting's info

In [2]:
def get_soup(url, timeout=30):
    """Fetch ``url`` over HTTP and return the response body as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` has no
        default timeout, so without this a stalled connection blocks forever.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed HTML document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is not
        silently scraped as if it were a painting page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ between machines.
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
def get_creator_name(article):
    """Return the text of the first ``<span itemprop="name">`` in ``article``.

    The tag is rendered to a string, markup is removed with a non-greedy
    tag regex, newlines are dropped and the result is stripped.
    Returns None when no such span is found.
    """
    tag = article.find('span', {'itemprop': 'name'})
    if tag is None:
        return None

    without_markup = re.sub('<.*?>', "", str(tag))
    return re.sub('\n', "", without_markup).strip()

In [4]:
def get_painting_name(article):
    """Return the text of the first ``<h3>`` in ``article``, or None if there is none.

    Markup is removed, non-breaking spaces become ordinary spaces, newlines
    are dropped and surrounding whitespace is stripped.
    """
    heading = article.find('h3')
    if heading is None:
        return None

    text = str(heading)
    # Apply the three clean-up substitutions in the same order every time:
    # tags first, then non-breaking spaces, then line breaks.
    for pattern, replacement in (('<.*?>', ""), ('\xa0', " "), ('\n', "")):
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [5]:
def get_painting_date(article):
    """Return the text of the first ``<span itemprop="dateCreated">``, or None.

    The value is returned as a stripped string with markup and newlines
    removed; no date parsing is attempted.
    """
    date_span = article.find('span', {'itemprop': 'dateCreated'})
    if date_span is None:
        return date_span

    plain = re.sub('<.*?>', "", str(date_span))
    plain = re.sub('\n', "", plain)
    return plain.strip()

In [6]:
def get_painting_place(article):
    """Return the text of the first ``<span itemprop="locationCreated">``, or None.

    Tags and newlines are removed from the rendered span and the remaining
    text is stripped of surrounding whitespace.
    """
    found = article.find('span', {'itemprop': 'locationCreated'})
    # Nothing to clean when the page does not carry this span.
    return (
        None
        if found is None
        else re.sub('\n', "", re.sub('<.*?>', "", str(found))).strip()
    )

In [7]:
def get_painting_style(article):
    """Return the value of the first ``<li>`` in ``article`` that mentions 'Style'.

    Each ``<li>`` is rendered to a string; the first one containing the text
    'Style' and shaped like ``label: value`` supplies the result. Markup and
    newlines are removed and the value is stripped.

    Returns
    -------
    str or None
        The text after the first colon, or None when no ``<li>`` qualifies
        (including when ``article`` has no ``<li>`` at all).
    """
    for li in article.find_all('li'):
        markup = str(li)
        if 'Style' not in markup:
            continue

        text = re.sub('<.*?>', "", markup)
        text = re.sub('\n', "", text)

        # Split on the first colon only, so a value that itself contains ':'
        # is returned whole instead of being cut at the second colon.
        _label, colon, value = text.partition(':')
        if not colon:
            # Not a "label: value" item; keep looking rather than failing.
            continue
        return value.strip()

    # Reached for an empty <li> list as well, which must not leak the
    # (empty) find_all result back to the caller.
    return None

In [8]:
def get_painting_genre(article):
    """Return the text of the first ``<span itemprop="genre">``, or None if absent.

    The span is rendered to a string, tags and newlines are removed, and
    the result is stripped.
    """
    genre_span = article.find('span', {'itemprop': 'genre'})
    if genre_span is not None:
        rendered = str(genre_span)
        tagless = re.sub('<.*?>', "", rendered)
        return re.sub('\n', "", tagless).strip()
    return None

In [9]:
def get_painting_media(article):
    """Return the value of the first ``<li>`` in ``article`` that mentions 'Media'.

    Each ``<li>`` is rendered to a string; the first one containing the text
    'Media' and shaped like ``label: value`` supplies the result. Markup and
    newlines are removed and the value is stripped.

    Returns
    -------
    str or None
        The text after the first colon, or None when no ``<li>`` qualifies
        (including when ``article`` has no ``<li>`` at all).
    """
    for li in article.find_all('li'):
        markup = str(li)
        if 'Media' not in markup:
            continue

        text = re.sub('<.*?>', "", markup)
        text = re.sub('\n', "", text)

        # Only the first colon separates label from value; anything after it,
        # further colons included, belongs to the value.
        _label, colon, value = text.partition(':')
        if not colon:
            # No "label: value" shape here; try the next item instead of raising.
            continue
        return value.strip()

    # Also covers an article with no <li>, where the empty find_all result
    # must not be returned in place of None.
    return None

In [10]:
def get_painting_current_location(article):
    """Return the value of the first ``<li>`` in ``article`` that mentions 'Location'.

    Each ``<li>`` is rendered to a string; the first one containing the text
    'Location' and shaped like ``label: value`` supplies the result. Markup
    and newlines are removed and the value is stripped.

    Returns
    -------
    str or None
        The text after the first colon, or None when no ``<li>`` qualifies
        (including when ``article`` has no ``<li>`` at all).
    """
    for li in article.find_all('li'):
        markup = str(li)
        if 'Location' not in markup:
            continue

        text = re.sub('<.*?>', "", markup)
        text = re.sub('\n', "", text)

        # Keep everything after the first colon: a place name may contain
        # a colon of its own and must not be truncated there.
        _label, colon, value = text.partition(':')
        if not colon:
            # Item mentions 'Location' but has no value part; move on.
            continue
        return value.strip()

    # No matching item, or no <li> at all: report "unknown" as None rather
    # than handing back the empty find_all result.
    return None

In [11]:
def get_painting_dimensions(article):
    """Return the value of the first ``<li>`` in ``article`` that mentions 'Dimensions'.

    Each ``<li>`` is rendered to a string; the first one containing the text
    'Dimensions' and shaped like ``label: value`` supplies the result. Markup
    and newlines are removed and the value is stripped. The value is returned
    as found on the page; it is not parsed into numbers.

    Returns
    -------
    str or None
        The text after the first colon, or None when no ``<li>`` qualifies
        (including when ``article`` has no ``<li>`` at all).
    """
    for li in article.find_all('li'):
        markup = str(li)
        if 'Dimensions' not in markup:
            continue

        text = re.sub('<.*?>', "", markup)
        text = re.sub('\n', "", text)

        # Partition at the first colon so any later colon stays in the value.
        _label, colon, value = text.partition(':')
        if not colon:
            # Without a colon there is no value to extract from this item.
            continue
        return value.strip()

    # Falls through for an empty <li> list too, so the caller always gets
    # either a string or None.
    return None

In [12]:
def get_info_from_soup(soup):
    """Collect every painting field from the first ``<article>`` of ``soup``.

    Returns a list in this fixed order:
    ``[creator, name, date, place, style, genre, media, location, dimensions]``.
    Each entry is whatever the corresponding ``get_*`` extractor returns.
    """
    article = soup.find('article')

    # The order of this tuple defines the order of the returned list.
    extractors = (
        get_creator_name,
        get_painting_name,
        get_painting_date,
        get_painting_place,
        get_painting_style,
        get_painting_genre,
        get_painting_media,
        get_painting_current_location,
        get_painting_dimensions,
    )
    return [extract(article) for extract in extractors]

# Downloading the painting

In [13]:
def painting_downloader(soup, painting_name, save_dir='paintings'):
    """Download the image referenced by ``soup`` into ``save_dir``.

    The image URL is the ``src`` of the first ``<img itemprop="image">``.
    The file name is ``painting_name`` with spaces and path separators
    replaced by '-', lower-cased, followed by '.' and whatever comes after
    the last '.' of the image URL.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed painting page.
    painting_name : str
        Title used to build the file name.
    save_dir : str, optional
        Target directory; created on demand. Defaults to 'paintings'.

    Returns
    -------
    None
        The outcome is reported with ``print``. An existing file with the
        same name is left untouched and nothing is downloaded.
    """
    img_tag = soup.find('img', {'itemprop': 'image'})
    img_url = img_tag.get('src')

    extension = img_url.split('.')[-1]
    # A '/' or '\' inside the title would be interpreted as a directory
    # boundary in the save path, so neutralise them along with spaces.
    output_name = re.sub(r'[\\/]', '-', painting_name.replace(' ', '-')).lower()
    filename = output_name + '.' + extension
    save_path = os.path.join(save_dir, filename)

    if os.path.isfile(save_path):
        print("A file with the name '{}' already exists.".format(filename))
        return

    # urlretrieve does not create missing directories; without this the
    # first download on a fresh checkout fails.
    os.makedirs(save_dir, exist_ok=True)
    urlretrieve(img_url, save_path)
    print("Painting downloaded as '{}'".format(filename))

In [14]:
# Smoke test: fetch the page behind the "random" URL, extract its fields
# and download its image.
url_rand = 'https://www.wikiart.org/en/App/Painting/random'
soup_rand = get_soup(url_rand)
info = get_info_from_soup(soup_rand)
# info[1] is the painting name (second entry of the list built by
# get_info_from_soup). It is None when the page has no <h3>, in which case
# painting_downloader fails on painting_name.replace.
painting_downloader(soup_rand,info[1])

Painting downloaded as 'head.jpg'


# Looking at a known painting

In [16]:
# Scrape one fixed page so the extracted fields can be checked by eye.
url_mona = 'https://www.wikiart.org/en/leonardo-da-vinci/mona-lisa'
soup_mona = get_soup(url_mona)
info = get_info_from_soup(soup_mona)
# Disabled: image download and two further sample pages.
# painting_downloader(soup_mona,info[1])
# url_apelles = 'https://www.wikiart.org/en/sandro-botticelli/calumny-of-apelles'
# soup_apelles = get_soup(url_apelles)

# url = 'https://www.wikiart.org/en/leonardo-da-vinci/the-virgin-of-the-rocks'
# soup_virgin = get_soup(url)

In [17]:
# Show the extracted fields in list order:
# creator, name, date, place, style, genre, media, location, dimensions.
# Note this re-runs the extraction on soup_mona rather than reusing `info`.
print(get_info_from_soup(soup_mona))
# print(get_info_from_soup(soup_apelles))
# print(get_info_from_soup(soup_virgin))

['Leonardo da Vinci', 'Mona Lisa', '1504', 'Florence', 'High Renaissance', 'portrait', 'oil,panel', 'Louvre, Paris, France', '53 x 77 cm']


In [20]:
# Number of pages to fetch from the "random" URL; one HTTP request each.
N = 10
# Column names, in the same order as the list returned by get_info_from_soup.
keys = ['creator', 'name', 'year', 'place', 'style', 'genre', 'media', 'location', 'dimension']

url_rand = 'https://www.wikiart.org/en/App/Painting/random'

# Grow the list as rows arrive instead of pre-filling it with zero rows:
# the list then only ever holds real scraped records, never placeholders.
paintings_info_list = []
for i in range(N):
    soup_rand = get_soup(url_rand)
    paintings_info_list.append(get_info_from_soup(soup_rand))

df = pd.DataFrame(paintings_info_list, columns=keys)

In [21]:
# Display the collected sample: one row per fetched page, columns as in `keys`.
df

Unnamed: 0,creator,name,year,place,style,genre,media,location,dimension
0,Fra Angelico,Visitation,1434.0,,Early Renaissance,religious painting,"panel,tempera",,
1,Hilly van Eerten,"'The gate again' - lithography print art, 1997...",1997.0,Amsterdam,Abstract Art,abstract,"lithography,paper",,48 x 30 cm
2,Jamie Wyeth,Wolf Dog,1976.0,,Contemporary Realism,animal painting,,,
3,Vasily Polenov,The boy in Nazareth. Jews in Tabor.,1882.0,,Realism,sketch and study,,,
4,Richard Jack,The Swedish Dyehouse,,,Impressionism,genre painting,,,
5,Wilhelm Trubner,Pomona,1898.0,,"Realism,Art Nouveau (Modern)",nude painting (nu),,,
6,Martiros Sarian,Yerevan,1923.0,,Expressionism,sketch and study,,,
7,Joshua Reynolds,The Children of Edward Hollen Cruttenden,,,Rococo,portrait,,,
8,Robert Qualters,Mary Shaw,1980.0,,Neo-Expressionism,interior,,,
9,Eugene Boudin,High seas,1875.0,,Impressionism,marina,,,


In [152]:
# Keep only the rows whose creator has at least two non-null painting names.
df.groupby('creator').filter(
    lambda works: works['name'].notna().sum() >= 2
)

Unnamed: 0,creator,name,year,place,style,genre,media,location,dimension
1,Gustave Dore,The Embarkation of the Souls,,,Romanticism,illustration,,,
2,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,
11,Wojciech Siudmak,Three Graces,,,Fantastic Realism,symbolic painting,,,
30,Richard Jack,The Swedish Dyehouse,,,Impressionism,genre painting,,,
35,Gustave Dore,David Shows Saul How He Spared His Life,,,Romanticism,religious painting,,,
39,Vincent van Gogh,Stooping Woman in Landscape,1883.0,,Realism,sketch and study,"ink,paper","Van Gogh Museum, Amsterdam, Netherlands",
42,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,
48,Richard Jack,The Swedish Dyehouse,,,Impressionism,genre painting,,,
62,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,
64,Roger Weik,"""Infusion"" 2016",2016.0,Los Angeles,Abstract Expressionism,abstract,,,


# Creating a dataset