In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [79]:
artists = pd.read_parquet('https://kuleuven-datathon-2023.s3.eu-central-1.amazonaws.com/data/Artist.parquet.gzip')
artists.head()

Unnamed: 0,id,name,url,summary,picture,birthplace,deathplace,birthdate,deathdate,cause_of_death
0,0,Vincent Van Gogh,http://wikigallery.org/wiki/artist36933/Vincen...,Vincent Willem van Gogh (Dutch: [ˈvɪnsɛnt ˈʋɪl...,0.0,0.0,342.0,1853-03-30,1890-07-29,Gunshot wound
1,1,Pierre Auguste Renoir,http://wikigallery.org/wiki/artist39254/Pierre...,Pierre-Auguste Renoir (French: [pjɛʁ oɡyst ʁən...,1.0,1.0,343.0,1841-02-25,1919-12-03,
2,2,Claude Oscar Monet,http://wikigallery.org/wiki/artist39249/Claude...,"Oscar-Claude Monet (UK: , US: , French: [klod ...",2.0,2.0,344.0,1840-11-14,1926-12-05,
3,3,Fernando Botero,http://wikigallery.org/wiki/artist37052/Fernan...,Fernando Botero Angulo (born 19 April 1932) is...,3.0,3.0,,1932-04-19,,
4,4,Jean-Léon Gérôme,http://wikigallery.org/wiki/artist46453/Jean-L...,Jean-Léon Gérôme (11 May 1824 – 10 January 190...,4.0,4.0,2.0,1824-05-11,1904-01-10,


<h3>Artist</h3>

The 'Artist' table is a central component of the dataset and provides information on famous painters and their careers. Next you'll find some basic info about this part of the dataset. But first, let's do some cleaning.

In [80]:
# find all rows with non-alphabetic characters in the 'name' column
non_alpha = artists[artists['name'].str.contains('[^a-zA-Z\séèêëôöîïçñ\-]')]
print(non_alpha['name'])
print(artists.shape)
print(non_alpha.shape)

46                 Bartlett, William Henry
81                (after) Hieronymus Bosch
83           (after) Pietro Antonio Rotari
94                    (after) William Etty
107         (after) Philippe De Champaigne
                      ...                 
565               (after) Andrea Del Sarto
567                  Christian F. Schwerdt
578               Pieter Pietersz. Lastman
594    Hendrick van and Brueghel, J. Balen
608                     Ercole de' Roberti
Name: name, Length: 79, dtype: object
(616, 10)
(79, 10)


In [81]:
# remove all the '(after)' from the 'name' column for all rows
artists['name'] = artists['name'].str.replace(r'\(after\)', '')
non_alpha = artists[artists['name'].str.contains('[^a-zA-Z\séèêëôöîïçñ\-]')]
print(non_alpha['name'])
print(artists.shape)
print(non_alpha.shape)

46                             Bartlett, William Henry
112                              Dyck, Sir Anthony van
120                 (Giovanni Antonio Canal) Canaletto
121                                    Arthur W. Perry
128                          Tivadar Kosztka Csontváry
141    Girolamo Francesco Maria Mazzola (Parmigianino)
152                    Edouard (Jean-Edouard) Vuillard
157                           Paolo Veronese (Caliari)
160                                           J. Haier
162                        Jacopo Tintoretto (Robusti)
174                  William (Turner of Oxford) Turner
180                        Correggio (Antonio Allegri)
191                          Millais, Sir John Everett
206                                         H. Pittard
213                                    Henry W. Hansen
216                    Sebastiano Del Piombo (Luciani)
236                                  Catherine M. Wood
242                      Bernardo Bellotto (Canaletto)
243       

  


In [82]:
# remove all the ',' from the fields in the 'name' column for all rows
artists['name'] = artists['name'].str.replace(r'\,', '')
non_alpha = artists[artists['name'].str.contains('[^a-zA-Z\séèêëôöîïçñ\-]')]
print(non_alpha['name'])
print(artists.shape)
print(non_alpha.shape)

120                 (Giovanni Antonio Canal) Canaletto
121                                    Arthur W. Perry
128                          Tivadar Kosztka Csontváry
141    Girolamo Francesco Maria Mazzola (Parmigianino)
152                    Edouard (Jean-Edouard) Vuillard
157                           Paolo Veronese (Caliari)
160                                           J. Haier
162                        Jacopo Tintoretto (Robusti)
174                  William (Turner of Oxford) Turner
180                        Correggio (Antonio Allegri)
206                                         H. Pittard
213                                    Henry W. Hansen
216                    Sebastiano Del Piombo (Luciani)
236                                  Catherine M. Wood
242                      Bernardo Bellotto (Canaletto)
243                           Theofilos (Hadjimichail)
259                    Iulii Iul'evich (Julius) Klever
263                    Henri-Jules-Jean Geoffroy (Geo)
273       

  


<p>Enough cleaning for now. Let's fill some missing data</p>

In [83]:
# Select all artists that have death date as NaN
artists_alive = artists[artists['deathdate'].isna()]
print("Potentially alive: " + str(artists_alive.shape))
# Select all artists that have 'deathdate' different from NaN
artists_dead = artists[artists['deathdate'].notna()]
print("Potentially dead: " + str(artists_dead.shape))

Potentially alive: (346, 10)
Potentially dead: (270, 10)


In [84]:
artists_with_birthdate = artists[artists['birthdate'].notna()]
print("artists_with_birthdate: " + str(artists_with_birthdate.shape))

artists_without_birthdate = artists[artists['birthdate'].isna()]
print("artists_without_birthdate: " + str(artists_without_birthdate.shape))

artists_without_birthdate["name"].head()

artists_with_birthdate: (225, 10)
artists_without_birthdate: (391, 10)


5                Paul Cezanne
7         John Singer Sargent
10                    Raphael
11    Michelangelo Buonarroti
16          Peter Paul Rubens
Name: name, dtype: object

In [85]:
artists_without_birthdate_or_deathdate = artists[
    artists['birthdate'].isna() | artists['deathdate'].isna()]

print("artists_without_birthdate_or_deathdate: " + str(artists_without_birthdate_or_deathdate.shape))

artists_without_birthdate_or_deathdate: (400, 10)


It seems most of the artists in this dataset don't have a birthdate or a deathdate. Some may still be alive but can we fill in the missing birthdates?

In [86]:
# get the name of the first 10 artists without birthdate
artists_without_birthdate_or_deathdate[['name','birthdate','deathdate']].head(10)

Unnamed: 0,name,birthdate,deathdate
3,Fernando Botero,1932-04-19,
5,Paul Cezanne,,
7,John Singer Sargent,,
10,Raphael,,
11,Michelangelo Buonarroti,,1564-02-18
16,Peter Paul Rubens,,
22,Tiziano Vecellio,,1576-08-27
25,Albrecht Durer,,
28,Jacques Louis David,,
30,Giotto Di Bondone,,1337-01-08


In [10]:
import scraper_functions as sf
# # Example usage
# convert the artists_without_birthdate_and_deathdate["name"] to a list
a = artists_without_birthdate["name"].tolist()

#wikipedia_links, artists_not_found = sf.get_wikipedia_links(a)

IntProgress(value=0, max=391)

In [11]:
print("nr Artists potentially found on wikipedia: " + str(len(wikipedia_links)))
print("nr Artists not found on wikipedia: " + str(len(artists_not_found)))

nr Artists potentially found on wikipedia: 377
nr Artists not found on wikipedia: 15


In [12]:
print(artists_not_found)

['Oene Romkes De Jongh', 'Marianne Preindelsberger Stokes', 'Auguste Joseph Marie De Mersseman', 'Axel Birkhammer', 'Albert Lambron Des Pilitieres', "Iulii Iul'evich (Julius) Klever", 'Jose Llaneces', 'Pieter II Peetersz', 'August Theodor Schoefft', 'Ladislas Wladislaw von Czachorski', 'Emile Eisman Semenovsky', 'Klavdiy Vasilievich Lebedev', 'Alexei Alexeivich Harlamoff', 'Frederick Marianus Kruseman', 'Jan Baptiste de Landtsheer']


In [13]:
# for each element in dictionary, write the key and value to a csv file
with open("artists_without_birthdate_and_deathdate.txt", "w") as file:
    file.write("name,link"+"\n")
    for key, value in wikipedia_links.items():
            file.write(key+","+value+"\n")


In [15]:
#names_and_dates = sf.get_dates_from_wikipedia(wikipedia_links)

IntProgress(value=0, max=377)

In [18]:
print(len(names_and_dates))

377


In [46]:
# for each element in dictionary, write the key and value to a csv file
with open("artists_birthdate_deathdate.txt", "w") as file:
    file.write("name,birthdate,deathdate"+"\n")
    for key, value in names_and_dates.items():
            file.write(key+","+str(value['birth_year'])+","+str(value['death_year'])+"\n")

In [87]:
df = pd.read_csv('artists_birthdate_deathdate.txt', delimiter = ",", encoding ="ANSI")
df.head(10)

Unnamed: 0,name,birthdate,deathdate
0,Paul Cezanne,1839,1906
1,John Singer Sargent,1856,1925
2,Raphael,1483,1520
3,Michelangelo Buonarroti,1475,1564
4,Peter Paul Rubens,1577,1640
5,Tiziano Vecellio,1488,1576
6,Albrecht Durer,1471,1528
7,Jacques Louis David,1748,1825
8,Giotto Di Bondone,1267,1337
9,Franz Marc,1880,1916


In [88]:
# for the artist names in df, check if the same artist name is in artists and check if the birthdate is NaN replace the birthdate with the birthdate from df. If the deathdate is NaN, replace the deathdate with the deathdate from df

for index, row in df.iterrows():
    # locate the row in artists
    row_index = artists[artists['name'] == row['name']].index[0]
    # print artist[['name','birthdate','deathdate']] at row_index
    # print(artists[['name','birthdate','deathdate']].loc[row_index])
    # if birthdate is None at row_index, replace with birthdate from df if not None
    if pd.isnull(artists['birthdate'].loc[row_index]):
        artists['birthdate'].loc[row_index] = df['birthdate'].loc[index]
    # if deathdate is None at row_index, replace with deathdate from df if not None
    if pd.isnull(artists['deathdate'].loc[row_index]):
        artists['deathdate'].loc[row_index] = df['deathdate'].loc[index]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [92]:
for index, row in artists.iterrows():
    # check if birthdate is a string and does not contain numeric characters with regex
    if isinstance(row['birthdate'], str) and not any(char.isdigit() for char in row['birthdate']):
        artists.loc[index, 'birthdate'] = np.nan
    # check if deathdate is a string and does not contain numeric characters with regex
    if isinstance(row['deathdate'], str) and not any(char.isdigit() for char in row['deathdate']):
        artists.loc[index, 'deathdate'] = np.nan

# Select all artists that have death date as NaN
artists_alive = artists[artists['deathdate'].isna()]
print("Potentially alive: " + str(artists_alive.shape))
# Select all artists that have 'deathdate' different from NaN
artists_dead = artists[artists['deathdate'].notna()]
print("Potentially dead: " + str(artists_dead.shape))


Potentially alive: (268, 10)
Potentially dead: (348, 10)


In [97]:
# copy the artists dataframe to a new dataframe
artists_clean = artists.copy()
# replace all entries that don't have numbers with NaN
artists_clean['deathdate'] = artists_clean['deathdate'].apply(lambda x: np.nan if re.sub('[0-9]+', '', str(x)) else x)

In [98]:
# Select all artists that have death date as NaN
artists_alive = artists_clean[artists_clean['deathdate'].isna()]
print("Potentially alive: " + str(artists_alive.shape))
# Select all artists that have 'deathdate' different from NaN
artists_dead = artists_clean[artists_clean['deathdate'].notna()]
print("Potentially dead: " + str(artists_dead.shape))

Potentially alive: (268, 10)
Potentially dead: (348, 10)


In [109]:
# convert artists_clean to csv
artists_clean.to_csv('artists_clean.csv', index=False)