In [1]:
import html
import os
import re
import shutil
import subprocess

import pandas as pd

In [2]:
#! ls ../data/WikiArt-Emotions/

# Load file

In [3]:
art_info = pd.read_csv("../data/WikiArt-Emotions/WikiArt-info.tsv", sep='\t')

# Clean html text

In [4]:
def clean_html_text(df):
    """
    Decode HTML escape sequences in the 'Title' and 'Artist' columns.

    html.unescape() is applied to every string value of both columns so that
    entities such as ``&amp;`` or ``&#39;`` are converted back to the
    characters they stand for. Values that are not strings (for example a
    missing title stored as NaN/None) are left untouched instead of raising.

    The DataFrame is modified in place and also returned, so both
    ``clean_html_text(df)`` and ``df = clean_html_text(df)`` work.
    """
    for column in ('Title', 'Artist'):
        # html.unescape() only accepts str; pass anything else through as-is.
        df[column] = df[column].apply(
            lambda value: html.unescape(value) if isinstance(value, str) else value
        )
    return df

In [5]:
clean_html_text (art_info)

Unnamed: 0,ID,Category,Artist,Title,Year,Image URL,Painting Info URL,Artist Info URL
0,58c6237dedc2c9c7dc0de1ae,Impressionism,Charles Courtney Curran,In the Luxembourg Garden,1889,https://uploads3.wikiart.org/00123/images/char...,https://www.wikiart.org/en/charles-courtney-cu...,https://www.wikiart.org/en/charles-courtney-cu...
1,577280dfedc2cb3880f28e76,Neo-Expressionism,Keith Haring,The Marriage of Heaven and Hell,1984,https://uploads1.wikiart.org/images/keith-hari...,https://www.wikiart.org/en/keith-haring/the-ma...,https://www.wikiart.org/en/keith-haring
2,57727f2dedc2cb3880ed5fa9,Post-Impressionism,Jozsef Rippl-Ronai,Uncle Piacsek in front of the Black Sideboard,1906,https://uploads3.wikiart.org/images/j-zsef-rip...,https://www.wikiart.org/en/jozsef-rippl-ronai/...,https://www.wikiart.org/en/jozsef-rippl-ronai
3,58d1240cedc2c94f900fc610,Cubism,Vadym Meller,Monk. For the Play 'Mazeppa',1920,https://uploads2.wikiart.org/00124/images/vady...,https://www.wikiart.org/en/vadym-meller/monk-f...,https://www.wikiart.org/en/vadym-meller
4,57727de7edc2cb3880e91f26,Romanticism,David Wilkie,The Defence of Saragoça,1828,https://uploads6.wikiart.org/images/david-wilk...,https://www.wikiart.org/en/david-wilkie/the-de...,https://www.wikiart.org/en/david-wilkie
...,...,...,...,...,...,...,...,...
4114,577287aeedc2cb388007fba9,Color Field Painting,Rupprecht Geiger,OE 260,1957,https://uploads7.wikiart.org/images/rupprecht-...,https://www.wikiart.org/en/rupprecht-geiger/oe...,https://www.wikiart.org/en/rupprecht-geiger
4115,57728001edc2cb3880efddcf,Surrealism,Oscar Dominguez,Máquina de coser electro-sexual,1934,https://uploads4.wikiart.org/images/oscar-domi...,https://www.wikiart.org/en/oscar-dominguez/m-q...,https://www.wikiart.org/en/oscar-dominguez
4116,57728ac7edc2cb3880123cc2,Neo-Expressionism,Georg Baselitz,Female Nude on a Kitchen Chair,1979,https://uploads2.wikiart.org/images/georg-base...,https://www.wikiart.org/en/georg-baselitz/fema...,https://www.wikiart.org/en/georg-baselitz
4117,57728412edc2cb3880fc9ff4,Expressionism,Marie Laurencin,Apollinaire and His Friends,1909,https://uploads0.wikiart.org/images/marie-laur...,https://www.wikiart.org/en/marie-laurencin/apo...,https://www.wikiart.org/en/marie-laurencin


# Clean category

In [6]:
art_info.Category.value_counts()

Category
Realism                                        200
Northern Renaissance                           200
Baroque                                        200
Impressionism                                  199
Pop Art                                        199
Rococo                                         199
Neo-Expressionism                              198
Art Informel                                   197
Post-Impressionism                             196
Surrealism                                     194
Minimalism                                     193
Lyrical Abstraction                            192
Abstract Expressionism                         191
Romanticism                                    191
Expressionism                                  191
Neoclassicism                                  191
Color Field Painting                           188
Abstract Art                                   187
Cubism                                         183
Magic Realism         

In [7]:
# Ordered relabelling rules used by clean_category().
# Each entry is (conditions, new_category): `conditions` maps a column name to
# the exact value a row must hold in that column (all conditions must match),
# and `new_category` is written to the row's "Category".
# The comment above each group names the multi-label category the rules of
# that group were written to resolve.
# NOTE(review): rules that match on 'Title' alone relabel every row carrying
# that title, whatever its artist or current category - confirm that none of
# these titles is shared by unrelated paintings.
_CATEGORY_RULES = (
    # Neoclassicism,Romanticism
    ({'Title': 'Malvine, Dying in the Arms of Fingal'}, 'Neoclassicism'),
    ({'Title': 'Charlotte Rothsch, Baroness Anselm De Rothschild'}, 'Romanticism'),
    ({'Title': 'O Milagre de Ourique'}, 'Neoclassicism'),
    ({'Title': 'Mademoiselle Lange as Venus'}, 'Neoclassicism'),
    ({'Title': 'Pygmalion et Galatée'}, 'Neoclassicism'),
    ({'Title': 'Retrato equestre de João V de Portugal'}, 'Neoclassicism'),
    ({'Title': 'The Worship of the Mages'}, 'Neoclassicism'),
    ({'Title': 'Napoleon I in Coronation robes'}, 'Romanticism'),

    # Cubism,Expressionism
    ({'Title': 'Harvest'}, 'Cubism'),
    ({'Title': 'Orfeu nos Infernos'}, 'Cubism'),
    ({'Title': 'Portret Van Elizabeth Sergejevna Potehinoj'}, 'Cubism'),
    ({'Title': 'The Street Enters the House'}, 'Cubism'),
    ({'Title': 'Landscape with a sail'}, 'Expressionism'),
    ({'Title': 'Orfeu nos Infernos (detail)'}, 'Cubism'),
    ({'Title': 'Nude', 'Artist': 'Lajos Tihanyi'}, 'Expressionism'),
    ({'Title': 'Yard in Crimea'}, 'Expressionism'),

    # Abstract Art,Cubism
    ({'Title': 'Composition monumentale'}, 'Abstract Art'),
    ({'Title': 'Himmel'}, 'Abstract Art'),
    ({'Title': 'Femme à la blouse jaune'}, 'Cubism'),
    ({'Title': 'Lucky Strike'}, 'Abstract Art'),
    ({'Title': 'Chess Players III'}, 'Cubism'),
    ({'Title': 'Composition I (Still life)'}, 'Abstract Art'),
    ({'Title': 'In the Hold'}, 'Abstract Art'),

    # Color Field Painting,Lyrical Abstraction
    ({'Category': 'Color Field Painting,Lyrical Abstraction'}, 'Color Field Painting'),

    # Abstract Art,Surrealism
    ({'Title': 'Hot Air Balloon'}, 'Abstract Art'),
    ({'Title': 'Modality Series, Spring Awakening 854A'}, 'Surrealism'),
    ({'Title': 'Dux et Comes I'}, 'Abstract Art'),

    # Abstract Expressionism,Minimalism
    ({'Category': 'Abstract Expressionism,Minimalism'}, 'Abstract Expressionism'),

    # Color Field Painting,Minimalism
    ({'Title': '586/69 (Gerundetes Rot)'}, 'Color Field Painting'),
    ({'Title': 'Untitled', 'Artist': 'Mark Rothko'}, 'Color Field Painting'),
    ({'Title': 'The Wild'}, 'Minimalism'),

    # Art Informel,Magic Realism
    ({'Category': 'Art Informel,Magic Realism'}, 'Magic Realism'),

    # Abstract Expressionism,Lyrical Abstraction
    ({'Category': 'Abstract Expressionism,Lyrical Abstraction'}, 'Lyrical Abstraction'),

    # Abstract Expressionism,Color Field Painting
    ({'Title': 'Tanabata'}, 'Color Field Painting'),
    ({'Title': 'Reefs'}, 'Abstract Expressionism'),

    # 1_element: multi-label categories mapped wholesale to one label
    ({'Category': 'Neo-Expressionism,Pop Art'}, 'Neo-Expressionism'),
    ({'Category': 'Magic Realism,Neoclassicism'}, 'Magic Realism'),
    ({'Category': 'Art Informel,Minimalism'}, 'Art Informel'),
    ({'Category': 'Abstract Expressionism,Surrealism'}, 'Surrealism'),
    ({'Category': 'Magic Realism,Surrealism'}, 'Surrealism'),
    ({'Category': 'Rococo,Romanticism'}, 'Romanticism'),
    ({'Category': 'Cubism,Post-Impressionism'}, 'Cubism'),
    ({'Category': 'Expressionism,Post-Impressionism'}, 'Expressionism'),
    ({'Category': 'Abstract Art,Post-Impressionism'}, 'Abstract Art'),
    ({'Category': 'Abstract Art,Color Field Painting'}, 'Abstract Art'),
    ({'Category': 'Impressionism,Post-Impressionism'}, 'Post-Impressionism'),
    ({'Category': 'Abstract Art,Abstract Expressionism'}, 'Abstract Expressionism'),
    ({'Category': 'Magic Realism,Neo-Expressionism'}, 'Magic Realism'),
    ({'Category': 'Cubism,Surrealism'}, 'Surrealism'),

    # Hyphenated spellings of the single-label names. These must come last:
    # several rules above produce the spaced spelling (e.g. 'Abstract Art').
    ({'Category': 'Northern Renaissance'}, 'Northern-Renaissance'),
    ({'Category': 'Pop Art'}, 'Pop-Art'),
    ({'Category': 'Art Informel '}, 'Art-Informel'),  # variant with a trailing space
    ({'Category': 'Art Informel'}, 'Art-Informel'),
    ({'Category': 'Abstract Art'}, 'Abstract-Art'),
    ({'Category': 'Abstract Expressionism'}, 'Abstract-Expressionism'),
    ({'Category': 'Lyrical Abstraction'}, 'Lyrical-Abstraction'),
    ({'Category': 'Magic Realism'}, 'Magic-Realism'),
    ({'Category': 'Early Renaissance'}, 'Early-Renaissance'),
    ({'Category': 'High Renaissance'}, 'High-Renaissance'),
    ({'Category': 'Color Field Painting'}, 'Color-Field-Painting'),
)


def clean_category(df):
    """
    Clean and update the "Category" column.

    The rules in ``_CATEGORY_RULES`` are applied one after another, in order:

    1. selected paintings (matched by exact 'Title', sometimes together with
       'Artist') and whole multi-label categories such as
       'Cubism,Surrealism' are assigned a single category;
    2. category names containing spaces are rewritten with hyphens
       ('Pop Art' -> 'Pop-Art').

    Rows that match no rule keep their current category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Title', 'Artist' and 'Category'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    for conditions, new_category in _CATEGORY_RULES:
        # Combine the per-column equality tests into one boolean row mask.
        mask = None
        for column, expected in conditions.items():
            matches = df[column] == expected
            mask = matches if mask is None else (mask & matches)
        df.loc[mask, 'Category'] = new_category

    return df

In [8]:
art_info = clean_category(art_info)

In [9]:
art_info.Category.value_counts()

Category
Baroque                   200
Realism                   200
Northern-Renaissance      200
Impressionism             199
Rococo                    199
Neo-Expressionism         199
Pop-Art                   199
Surrealism                198
Art-Informel              198
Color-Field-Painting      198
Neoclassicism             197
Post-Impressionism        197
Abstract-Art              196
Abstract-Expressionism    196
Expressionism             195
Romanticism               194
Lyrical-Abstraction       194
Minimalism                193
Cubism                    191
Magic-Realism             152
Early-Renaissance         119
High-Renaissance          105
Name: count, dtype: int64

# Get images from URL

In [10]:
def get_images_from_URL(df):
    """
    Download the image of every row in ``df`` into per-category folders.

    For each row, the file at ``"Image URL"`` is fetched with the ``curl``
    command-line tool and written to
    ``../images/WikiArt/<Category>/<ID>.jpg``; the category folder is created
    when it does not exist yet.

    curl is started directly with an argument list (no shell), so characters
    such as ``&``, ``;`` or spaces in a URL or path cannot break or alter the
    command.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "ID", "Category" and "Image URL".

    Returns
    -------
    list
        The "Image URL" values that could not be downloaded: rows whose URL or
        category is not a string, rows whose folder could not be created, and
        rows for which curl could not be started or exited with an error
        (``--fail`` makes HTTP error responses count as errors).
    """
    folder_base_path = "../images/WikiArt/"
    failed_urls = []

    for image_id, category, image_url in zip(df["ID"], df["Category"], df["Image URL"]):
        # Missing values (NaN/None) cannot be turned into a path or a request.
        if not isinstance(image_url, str) or not isinstance(category, str):
            failed_urls.append(image_url)
            continue

        folder_path = os.path.join(folder_base_path, category)
        target_path = os.path.join(folder_path, f"{image_id}.jpg")
        # Parentheses are sent percent-encoded, as before.
        link = image_url.replace('(', '%28').replace(')', '%29')

        try:
            # create folder
            os.makedirs(folder_path, exist_ok=True)

            # download and save image; check=True turns a non-zero curl exit
            # status into CalledProcessError so the failure is recorded.
            subprocess.run(
                ["curl", "--fail", "--output", target_path, "--url", link],
                check=True,
            )
        except (OSError, subprocess.CalledProcessError):
            # OSError covers both a folder that cannot be created and a
            # missing curl executable.
            failed_urls.append(image_url)

    # Spoken completion notice; only attempted where a `say` command exists.
    if shutil.which("say"):
        os.system("say im done")

    return failed_urls


In [11]:
art_info.shape

(4119, 8)

In [12]:
#get_images_from_URL(art_info)