# WikiArt: Data Processing and EDA

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
wikiart_original = pd.read_csv('/Users/annielou/Desktop/General Assembly/unit4-project-capstone/wikiart_scraped.csv')

In [3]:
wikiart_original.describe()

Unnamed: 0,Style,Artwork,Artist,Date,Link
count,124170,124170,124170,124170,124170
unique,217,92050,3052,2959,116667
top,Romanticism,Untitled,Giovanni Battista Piranesi,XIX-XX cent.,https://uploads.wikiart.org/Content/wiki/img/l...
freq,3600,4794,1352,5713,54


In [4]:
wikiart_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124170 entries, 0 to 124169
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Style    124170 non-null  object
 1   Artwork  124170 non-null  object
 2   Artist   124170 non-null  object
 3   Date     124170 non-null  object
 4   Link     124170 non-null  object
dtypes: object(5)
memory usage: 4.7+ MB


In [5]:
wikiart_original.head(10)

Unnamed: 0,Style,Artwork,Artist,Date,Link
0,Early-Dynastic,Narmer Palette,Ancient Egypt,3050 BC,https://uploads3.wikiart.org/00265/images/anci...
1,Early-Dynastic,Box Inlay with a Geometric Pattern,Ancient Egypt,3100-2900 BC,https://uploads2.wikiart.org/00244/images/anci...
2,Old-Kingdom,Khafre Enthroned,Ancient Egypt,2570 BC,https://uploads2.wikiart.org/00305/images/anci...
3,Middle-Kingdom,Stele of the Serpent King (Stela of Djet),Ancient Egypt,3000 BC,https://uploads7.wikiart.org/00305/images/anci...
4,Middle-Kingdom,"Laden Donkeys and Ploughing, Tomb of Djar",Ancient Egypt,2060-2010 BC,https://uploads8.wikiart.org/00244/images/anci...
5,Middle-Kingdom,"Women Preparing Food, Tomb of Djari",Ancient Egypt,2060-2010 BC,https://uploads2.wikiart.org/00244/images/anci...
6,Middle-Kingdom,Stela of a Man,Ancient Egypt,2030-1981 BC,https://uploads0.wikiart.org/00244/images/anci...
7,Middle-Kingdom,Relief Fragment Showing a Pile of Offerings an...,Ancient Egypt,2000-1981 BC,https://uploads6.wikiart.org/00244/images/anci...
8,Middle-Kingdom,"Stela of Intef and His Wife, Dedetamun",Ancient Egypt,1981-1802 BC,https://uploads5.wikiart.org/00244/images/anci...
9,Middle-Kingdom,Scarab with the Name of Amenemhat III and a Hi...,Ancient Egypt,1859-1749 BC,https://uploads2.wikiart.org/00244/images/anci...


In [6]:
wikiart_original['Style'].value_counts()

Romanticism                          3600
Impressionism                        3600
Baroque                              3600
Neoclassicism                        3600
Realism                              3600
                                     ... 
Graffiti-Art                            1
Early-Christian                         1
nd-Intermediate-Period                  1
New-media-art                           1
Stroganov-school-of-icon-painting       1
Name: Style, Length: 217, dtype: int64

In [7]:
wikiart_original['Link'][0] #Check if link is in the right format and accessible

'https://uploads3.wikiart.org/00265/images/ancient-egyptian-painting/narmer-palette.jpg'

## Cleaning the Date column

In [8]:
wikiart_original['Date'].value_counts()

XIX-XX cent.       5713
XX cent.           5669
XVIII-XIX cent.    2418
XVIII cent.        1766
XX-XXI cent.       1526
                   ... 
1742-1743             1
1740-1743             1
1738-1740             1
1735-1740             1
3050 BC               1
Name: Date, Length: 2959, dtype: int64

In [9]:
wikiart_original[wikiart_original['Date'] == '?']

# I did a quick search online and it looks like these works done by Denis Peterson were completed around year 2007. Let's replace these ? values with 2007.
# TAKI 183 was active during late 1960s and early 1970s. Let's replace these ? values with 1970.

Unnamed: 0,Style,Artwork,Artist,Date,Link
112096,Hyper-Realism,Apocalypse,Denis Peterson,?,https://uploads4.wikiart.org/images/denis-pete...
112097,Hyper-Realism,Can't Lose What You Never Had,Denis Peterson,?,https://uploads5.wikiart.org/images/denis-pete...
112098,Hyper-Realism,Cardboard Dreams,Denis Peterson,?,https://uploads1.wikiart.org/images/denis-pete...
112099,Hyper-Realism,Dust to Dust,Denis Peterson,?,https://uploads6.wikiart.org/images/denis-pete...
112100,Hyper-Realism,Neon Lights,Denis Peterson,?,https://uploads5.wikiart.org/images/denis-pete...
112101,Hyper-Realism,Not Again,Denis Peterson,?,https://uploads8.wikiart.org/images/denis-pete...
112102,Hyper-Realism,The Wall,Denis Peterson,?,https://uploads2.wikiart.org/images/denis-pete...
112103,Hyper-Realism,"Tombstone Hand, Graveyard Mind",Denis Peterson,?,https://uploads5.wikiart.org/images/denis-pete...
112104,Hyper-Realism,Toothbrush and a Comb,Denis Peterson,?,https://uploads6.wikiart.org/images/denis-pete...
112105,Hyper-Realism,Vortex,Denis Peterson,?,https://uploads0.wikiart.org/images/denis-pete...


In [10]:
wikiart = wikiart_original.copy()
wikiart.loc[112096:112105, 'Date'] = np.repeat('2007',10)
wikiart.loc[116620:116626, 'Date'] = np.repeat('1970',7)
wikiart[wikiart['Artwork'] == 'Metro Tag'] # Verify

Unnamed: 0,Style,Artwork,Artist,Date,Link
116620,Street-art,Metro Tag,TAKI 183,1970,https://uploads6.wikiart.org/images/taki-183/m...


It looks like some values in the Dates column need to be changed for analysis purposes, such as "VII-VIII cent.", "1427-1400 BC", "35 BC", "1503-1519", "100 BC - 100 AD".

To break it down, we need to:

1. Change Roman Numerals to Numbers: For values like "VII-VIII cent.", I need to convert Roman numerals to Arabic numbers.
2. Handle Date Ranges: For ranges such as "1427-1400 BC", I will compute the average of the two numbers and replace the range with that average.
3. Handle BC Dates: For dates with "BC", I will convert the number into its negative form.

## Step 1: Convert Roman Numerals to Numbers

In [11]:
def roman_to_number(s):
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    num = 0
    prev_value = 0
    for i in s:
        if roman_numerals[i] > prev_value:
            num += roman_numerals[i] - 2 * prev_value
        else:
            num += roman_numerals[i]
        prev_value = roman_numerals[i]
    return num

## Step 2: Handle Date Ranges

In [12]:
def handle_date_range(date_str):
    if '-' in date_str:
        start, end = date_str.split('-')
        start, end = int(start), int(end)
        return (start + end) // 2
    return int(date_str)

## Step 3: Handle BC Dates

In [13]:
def handle_bc(date_str):
    if 'BC' in date_str:
        return -int(date_str.replace('BC', '').strip())
    return int(date_str)

## Step 4: Applying the Changes

In [14]:
def process_date(date_str):
    # Handle mixed AD and BC dates with ranges, like "100 BC-100 AD"
    if 'AD' in date_str and 'BC' in date_str:
        start, end = date_str.split('-')
        start = -int(start.replace('BC', '').strip())  # Convert BC to negative
        end = int(end.replace('AD', '').strip())
        return (start + end) // 2
    
    # Handle BC dates with ranges, like "3100-2900 BC"
    elif 'BC' in date_str and '-' in date_str:
        date_without_bc = date_str.replace('BC', '').strip()
        avg_date = handle_date_range(date_without_bc)
        return -avg_date  # Convert to negative since it's BC
    
    # Handle BC dates without ranges
    elif 'BC' in date_str:
        return handle_bc(date_str)
    
    # Handle singular roman numeral dates like "XX cent."
    elif 'cent.' in date_str and '-' not in date_str:
        century_value = roman_to_number(date_str.replace('cent.', '').strip()) * 100
        # For values like "XX cent.", use 2023 to calculate midpoint, so we can avoid having 2050 as our artwork date
        if century_value == 2100:
            century_value = 2023
        return century_value - 50  # Assuming middle of century
    
    # Handle roman numeral date ranges like "VII-VIII cent."
    elif 'cent.' in date_str:
        start, end = date_str.replace('cent.', '').split('-')
        start, end = roman_to_number(start.strip()) * 100, roman_to_number(end.strip()) * 100
        # For values like "XX - XXI cent.", use 2023 to calculate midpoint, so we can avoid having 2050 as our artwork date
        if end == 2100:
            end = 2023
        return (start + end) // 2
    
    # Handle date ranges without BC
    elif '-' in date_str:
        return handle_date_range(date_str)
    
    # For all other dates
    else:
        return int(date_str.strip())

# Apply the final transformations
wikiart['Date'] = wikiart['Date'].apply(process_date)

# Display the first few rows of the modified data
wikiart.head()

Unnamed: 0,Style,Artwork,Artist,Date,Link
0,Early-Dynastic,Narmer Palette,Ancient Egypt,-3050,https://uploads3.wikiart.org/00265/images/anci...
1,Early-Dynastic,Box Inlay with a Geometric Pattern,Ancient Egypt,-3000,https://uploads2.wikiart.org/00244/images/anci...
2,Old-Kingdom,Khafre Enthroned,Ancient Egypt,-2570,https://uploads2.wikiart.org/00305/images/anci...
3,Middle-Kingdom,Stele of the Serpent King (Stela of Djet),Ancient Egypt,-3000,https://uploads7.wikiart.org/00305/images/anci...
4,Middle-Kingdom,"Laden Donkeys and Ploughing, Tomb of Djar",Ancient Egypt,-2035,https://uploads8.wikiart.org/00244/images/anci...


## Verification

In [15]:
wikiart['Date'].describe()

count    124170.000000
mean       1856.379222
std         231.414707
min       -3050.000000
25%        1850.000000
50%        1919.000000
75%        1962.000000
max        2021.000000
Name: Date, dtype: float64

In [16]:
wikiart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124170 entries, 0 to 124169
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Style    124170 non-null  object
 1   Artwork  124170 non-null  object
 2   Artist   124170 non-null  object
 3   Date     124170 non-null  int64 
 4   Link     124170 non-null  object
dtypes: int64(1), object(4)
memory usage: 4.7+ MB


In [17]:
wikiart_original[wikiart_original['Artwork'] == 'Transfiguration, wall painting']

Unnamed: 0,Style,Artwork,Artist,Date,Link
1256,Byzantine,"Transfiguration, wall painting",Jerzy Nowosielski,XX-XXI cent.,https://uploads0.wikiart.org/00128/images/jerz...


In [18]:
wikiart[wikiart['Artwork'] == 'Transfiguration, wall painting']

Unnamed: 0,Style,Artwork,Artist,Date,Link
1256,Byzantine,"Transfiguration, wall painting",Jerzy Nowosielski,2011,https://uploads0.wikiart.org/00128/images/jerz...


In [19]:
wikiart.to_csv('wikiart_cleaned.csv') # Save Cleaned Data

## Popularity of Art Styles

### 1. The 20 most popular art styles of all times

In [20]:
style = wikiart[['Style','Artwork']].groupby(['Style']).count().sort_values(by='Artwork', ascending = False)
style.reset_index(inplace = True)
style.head(20)
# Looks like the max number of a certain style in this dataset is 3600. Why? Might have to do with how the data was scraped from WikiArt?

Unnamed: 0,Style,Artwork
0,Art-Nouveau-(Modern),3600
1,Symbolism,3600
2,Baroque,3600
3,Realism,3600
4,Romanticism,3600
5,Neoclassicism,3600
6,Expressionism,3600
7,Post-Impressionism,3600
8,Surrealism,3600
9,Naïve-Art-(Primitivism),3600


Let's add columns Earliest Artwork Date, Latest Artwork Date, and Average Date, which might be helpful for analysis

In [21]:
earliest = wikiart.groupby('Style').min().reset_index()[['Style','Date']]
latest = wikiart.groupby('Style').max().reset_index()[['Style','Date']]
average = wikiart.groupby('Style').mean().reset_index()[['Style','Date']]
style_merged = style.merge(earliest, on = 'Style').merge(latest, on = 'Style').merge(average, on = 'Style').sort_values(by = 'Date_x')
style_merged = style_merged.rename(columns={'Date_x':'Earliest Year','Date_y': 'Latest Year', 'Date':'Average Year'})
style_merged['Average Year'] = style_merged['Average Year'].astype(int)
style_merged

  average = wikiart.groupby('Style').mean().reset_index()[['Style','Date']]


Unnamed: 0,Style,Artwork,Earliest Year,Latest Year,Average Year
209,Early-Dynastic,2,-3050,-3000,-3025
198,Middle-Kingdom,9,-3000,-1721,-2025
214,Old-Kingdom,1,-2570,-2570,-2570
216,nd-Intermediate-Period,1,-1550,-1550,-1550
116,New-Kingdom,114,-1504,-1128,-1400
...,...,...,...,...,...
146,Queer-art,57,2001,2020,2011
212,Graffiti-Art,1,2002,2002,2002
195,Ero-guro,11,2011,2011,2011
213,New-media-art,1,2013,2013,2013


### 2. 20 Most Prolific Artists of All Times

In [22]:
wikiart[['Artist','Artwork']].groupby(['Artist']).count().sort_values(by='Artwork', ascending = False).head(20)

Unnamed: 0_level_0,Artwork
Artist,Unnamed: 1_level_1
Giovanni Battista Piranesi,1352
Vincent van Gogh,1101
Pablo Picasso,992
Alfred Freddy Krupa,923
Albrecht Durer,845
Marc Chagall,834
Salvador Dali,730
Claude Monet,692
Rembrandt,655
Nicholas Roerich,546


## Trends in Art Over Time

### 1. Trend in Number of Artworks Over Time

In [23]:
fig = px.histogram(wikiart,x='Date',nbins=int((2023 + 3050) / 20))
fig.show()


**There seems to be very large number of artworks produced between 1900 - 1999 in this dataset. There could be a number of reasons**
1) the way the data was scraped
2) it is harder to access and preserve artworks produced earlier than 1900
3) in the 20th century, changes in art forms and concepts have made art more accessible to the general public (i.e. the invention of cameras, rising popularity of graffiti art as well as how modern art appreciates using ready-made objects to create artworks)

In [24]:
fig = px.bar(style_merged, x = 'Artwork', y = 'Style', animation_frame='Average Year',title='Trends in Art Styles Over Time')
fig.show()