In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xlrd
warnings.filterwarnings('ignore')

# Further Pre Processing - Standardize Currency - Medium - Country

In [2]:
#import clean data
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,low_estimate,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold
0,Huang Binhong,05/30/2017,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1
1,Huang Binhong,05/30/2017,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1
3,Huang Binhong,05/30/2017,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1


## Standardize Currency

In [3]:
#how many unique currencies are there?
df['currency'].nunique()

28

In [4]:
#What are the most common
df['currency'].value_counts().head(10)

USD    28954
GBP    14062
EUR     4560
HKD     3045
CHF      322
SEK      199
CNY      167
AUD      139
CAD      119
NOK       64
Name: currency, dtype: int64

In [5]:
# Are there any rows with null currency values?
df[df['currency'].isnull()]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,low_estimate,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold
12275,Adolph Gottlieb,05/13/2008,Christies,The United States: New York Rockefeller Center,22,painting,United States of America,,,0.0,...,,oil on canvas,6537000.0,1.0,0.0,Cool Blast,70.00,1.29,6300.00,1
12291,Adolph Gottlieb,05/14/2008,Christies,The United States: New York Rockefeller Center,146,paper,United States of America,1946-01-01 00:00:00,,1.0,...,,gouache watercolor and graphite on paper,145000.0,1.0,0.0,Mood Indigo,19.49,1.31,496.41,1
12320,Adolph Gottlieb,05/22/2007,Bonhams & Butterfields,The United States: California San Francisco,60,paper,United States of America,1949-01-01 00:00:00,,1.0,...,,gouache on paper,70000.0,1.0,0.0,Untitled,19.25,1.33,491.07,1
12337,Adriaen Coorte,03/28/2014,Hampel Fine Art Auctioneers,Germany: Munich,1097,paper,,,,0.0,...,,oil on paper,145000.0,1.0,0.0,STILL LIFE WITH SEA SHELLS ON A STONE PLATE,8.07,0.70,45.43,1
12350,Adriana Varejão,09/22/2011,Bolsa De Arte,Brasil: Rio de Janeiro,44,painting,Brazil,2001-01-01 00:00:00,,1.0,...,,oil on canvas,1011236.0,1.0,0.0,Macau Wall #5,39.37,1.00,1550.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53831,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,311,sculpture,Taiwan,1994-01-01 00:00:00,,1.0,...,,bronze sculpture,907500.0,1.0,0.0,Taiji Series - Push Hand,12.99,1.85,312.02,1
53834,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,278,sculpture,Taiwan,1991-01-01 00:00:00,,1.0,...,,wood sculpture,1327500.0,1.0,0.0,Taiji Series - Single Whip,15.75,0.70,173.56,1
53838,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,313,sculpture,Taiwan,1991-01-01 00:00:00,,1.0,...,,sculpture,1927500.0,1.0,0.0,Taiji Series - Single Whip,23.62,0.53,297.61,1
53843,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,281,sculpture,Taiwan,1995-01-01 00:00:00,,1.0,...,,sculpture wood,3007500.0,1.0,0.0,Taiji Series - Single Whip,27.56,1.00,759.55,1


In [6]:
# First, let's see what the most common auction locations are for rows with null currency values
df[df['currency'].isnull()].auction_location.value_counts().head(10)

 Bern                                              363
 The United States: New York Rockefeller Center    338
 United Kingdom: London New Bond Street            219
 Germany: Munich                                   208
 China: Hong Kong                                  134
 France: Paris                                      69
 Germany: Hamburg                                   57
 Germany: Cologne                                   50
 United Kingdom: London King Street                 50
 Hong Kong                                          44
Name: auction_location, dtype: int64

In [7]:
# Lets create a temporary column with the auction location country
df['auction_location_country'] = df['auction_location'].str.split(':').str[0]
#Clean any spaces
df['auction_location_country'] = df['auction_location_country'].str.strip()

# See how it looks
df[df['currency'].isnull()].auction_location_country.value_counts().head(10)

The United States           500
Germany                     364
Bern                        363
United Kingdom              321
China                       135
France                       88
United States of America     50
Hong Kong                    46
Ireland                      35
Italy                        25
Name: auction_location_country, dtype: int64

In [8]:
# See how this package works
import pycountry

country = pycountry.countries.get(name='Norway')
currency = pycountry.currencies.get(numeric=country.numeric)

currency.alpha_3

'NOK'

In [9]:
# Iterate through all the rows with null currency
for index, row in df[df['currency'].isnull()].iterrows():
    # Get the country name from the auction_location column
    auction_location_country = row['auction_location_country']

    # Try getting the country object from the pycountry package, if error, skip
    try:
        country = pycountry.countries.get(name=auction_location_country)
        # Get the currency object from the pycountry package
        currency = pycountry.currencies.get(numeric=country.numeric)
        # Set the currency column to the alpha_3 code of the currency object
        df.loc[index, 'currency'] = currency.alpha_3
    except:
        pass

# Check that we have no more null currency values
df[df['currency'].isnull()]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,auction_location_country
12275,Adolph Gottlieb,05/13/2008,Christies,The United States: New York Rockefeller Center,22,painting,United States of America,,,0.0,...,oil on canvas,6537000.0,1.0,0.0,Cool Blast,70.00,1.29,6300.00,1,The United States
12291,Adolph Gottlieb,05/14/2008,Christies,The United States: New York Rockefeller Center,146,paper,United States of America,1946-01-01 00:00:00,,1.0,...,gouache watercolor and graphite on paper,145000.0,1.0,0.0,Mood Indigo,19.49,1.31,496.41,1,The United States
12320,Adolph Gottlieb,05/22/2007,Bonhams & Butterfields,The United States: California San Francisco,60,paper,United States of America,1949-01-01 00:00:00,,1.0,...,gouache on paper,70000.0,1.0,0.0,Untitled,19.25,1.33,491.07,1,The United States
12337,Adriaen Coorte,03/28/2014,Hampel Fine Art Auctioneers,Germany: Munich,1097,paper,,,,0.0,...,oil on paper,145000.0,1.0,0.0,STILL LIFE WITH SEA SHELLS ON A STONE PLATE,8.07,0.70,45.43,1,Germany
12350,Adriana Varejão,09/22/2011,Bolsa De Arte,Brasil: Rio de Janeiro,44,painting,Brazil,2001-01-01 00:00:00,,1.0,...,oil on canvas,1011236.0,1.0,0.0,Macau Wall #5,39.37,1.00,1550.00,1,Brasil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52841,Zhang Daqian,12/11/2010,Lempertz,Germany: Cologne,72,paper,China,1944-01-01 00:00:00,,1.0,...,ink on paper,204000.0,1.0,0.0,Drei Gelehrte beim Brettspiel an einem bizarre...,21.77,2.06,977.04,1,Germany
53328,Zhang Xiaogang,11/20/2006,Bonhams & Butterfields,The United States: California San Francisco,6148,painting,China,2005-01-01 00:00:00,,1.0,...,oil on canvas,180000.0,1.0,0.0,Amnesia and Memory,20.00,1.20,480.40,1,The United States
53335,Zhang Xiaogang,11/20/2006,Bonhams & Butterfields,The United States: California San Francisco,6147,painting,China,2001-01-01 00:00:00,,1.0,...,oil on canvas,150000.0,1.0,0.0,Baby No. 19,15.75,1.25,311.22,1,The United States
53406,Zhang Xiaogang,05/14/2008,Christies,The United States: New York Rockefeller Center,329,painting,China,2000-01-01 00:00:00,,1.0,...,oil on canvas,993000.0,1.0,0.0,Bloodline Series (Two Comrades),39.25,0.80,1232.45,1,The United States


In [10]:
# That didn't help much, only 500 rows were recovred, let's just create a manual dictionary for the rest
currency_dict = {
    'The United States': 'USD',
    'Germany': 'EUR',
    'Bern': 'CHF',
    'France': 'EUR',
    'United States of America': 'USD',
    'Ireland': 'EUR',
    'Italy': 'EUR',
    'Czech Republic': 'EUR',
    'Belgium': 'EUR',
    'Italia': 'EUR',
    'Brasil': 'BRL',
    'Spain': 'EUR',
    'Finland': 'EUR',
    'Lefèvre': 'EUR',
    'Austria': 'EUR',
    'Netherlands': 'EUR',
    'The Netherlands': 'EUR',
}

# Iterate through all the rows with null currency
for index, row in df[df['currency'].isnull()].iterrows():
    # Get the country name from the auction_location column
    auction_location_country = row['auction_location_country']

    # If the country is in the dictionary, set the currency column to the value
    if auction_location_country in currency_dict:
        df.loc[index, 'currency'] = currency_dict[auction_location_country]

# Check that we have no more null currency values
df[df['currency'].isnull()]


Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,auction_location_country
17717,Bernard Buffet,12/06/2006,Poette,Castor,76,painting,France,,,0.0,...,oil on canvas,145000.0,1.0,0.0,New York-Subway,39.37,1.65,2557.48,1,Castor
17944,Brett Whiteley,07/31/2006,Lawson,Menzies,343,painting,Australia,1964-01-01 00:00:00,,1.0,...,painting,204000.0,0.0,0.0,Cheetah in Rillington Place,72.05,1.13,5843.25,1,Menzies
17994,Brett Whiteley,11/27/2013,Deutscher,Menzies,19,painting,Australia,1973-01-01 00:00:00,,1.0,...,oil on plywood,228000.0,1.0,0.0,TO REPEAT WITHOUT REPEATING,34.65,1.01,1207.21,1,Menzies
30331,Ivan Konstantinovich Aivazovsky,04/13/2005,Chochon,Barré & Allardi,36,painting,Russia,,,0.0,...,oil on canvas mounted on carton,80000.0,0.0,0.0,Temple en Grèce,10.24,0.62,64.51,1,Barré & Allardi
31154,Jean Baptiste Greuze,11/17/2004,Delorme,Collin du Bocage,78,painting,France,1759-01-01 00:00:00,,1.0,...,oil on canvas,80000.0,1.0,0.0,"Portrait de César Gabriel, Duc de Choiseul Pra...",28.54,1.26,1022.59,1,Collin du Bocage
34518,Jusepe De Ribera,12/04/2008,Kahn,Dumousset,210,paper,Spain,,,0.0,...,"pen brown washed, on handmade",105000.0,0.0,0.0,Die letzte Fahrt,6.69,1.38,61.88,1,Dumousset
35458,Konstantin Egorovich Makovsky,05/19/2005,Gelos Antiques and Auction House,Russia: Moscow,35,painting,Russia,,,0.0,...,oil on canvas,700000.0,0.0,0.0,The Ceremony of Kissing,84.45,0.67,4787.47,1,Russia
36596,Lucas Cranach I,09/04/2008,Auktionshaus,Weidler,410,painting,Germany,,,0.0,...,tempera,112000.0,0.0,0.0,Apoll und Diana in Landschaft,20.08,1.44,580.31,1,Weidler
37408,Lyonel Charles Feininger,05/06/2005,Peter Kiefer • Buch,und Kunstauktionen,4962,painting,United States of America,1946-01-01 00:00:00,,1.0,...,oil on canvas,150000.0,1.0,0.0,"Yacht race - ""Racing Skerry Cruisers""",25.0,0.52,324.75,1,und Kunstauktionen
38093,Marc Chagall,12/14/2012,Hôtel des Ventes de Monte,Carlo,36,paper,France,,,0.0,...,ink,270000.0,0.0,0.0,L&#146;écuyère au double profil,19.69,1.33,515.48,1,Carlo


In [11]:
# Okay, we went down to 12 rows with null currency values, let's just drop them
df = df.dropna(subset=['currency'])

#Drop the temporary column
df = df.drop('auction_location_country', axis=1)

# Check that we have no more null currency values
df[df['currency'].isnull()]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,low_estimate,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold


In [12]:
#Try out converter package
from currency_converter import CurrencyConverter
c = CurrencyConverter(fallback_on_missing_rate=True, fallback_on_wrong_date=True)
c.convert(100, 'EUR', 'USD')

97.06

In [13]:
#Will have to convert based on the date of the auction
from datetime import date
c.convert(100, 'EUR', 'USD', date=date(2000, 1, 28))

98.48

In [15]:
# Clean the cells
""" df['sales_price'] = df['sales_price'].str.replace('[a-zA-Z]', '')
df['sales_price'] = df['sales_price'].str.replace(',', '')
df['sales_price'] = df['sales_price'].str.replace('$', '')
df['sales_price'] = df['sales_price'].str.replace(' ', '')
df['sales_price'] = df['sales_price'].str.replace('[', '')
df['sales_price'] = df['sales_price'].str.replace(']', '') """

# If the cell is empty, fill it with 0
df['sales_price'] = df['sales_price'].fillna(0)
df['sales_price'] = df['sales_price'].replace('', 0)

# Convert to float
df['sales_price'] = df['sales_price'].astype(float)

In [355]:
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,height,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width
0,Binhong Huang,05/30/2017,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,33.66,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93
1,Binhong Huang,05/30/2017,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,13.58,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),
3,Binhong Huang,05/30/2017,Christies,Hong Kong,1367,paper,,,USD,0.0,...,40.94,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,39.53,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82


In [16]:
# add an column of nan called 'sales_price_usd'
df['sales_price_usd'] = np.nan
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd
0,Huang Binhong,05/30/2017,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,
1,Huang Binhong,05/30/2017,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,
3,Huang Binhong,05/30/2017,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,


In [17]:
# Apparently GRD, ESP, TWD, YUM, UAH are not supported by the currency converter package
# How many rows have these currencies?
df[df['currency'].isin(['GRD', 'ESP', 'TWD', 'YUM', 'UAH', 'VEB', 'AED'])]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd
13456,Albrecht Dürer,12/02/2008,Christies,United Kingdom: London King Street,20,sculpture,Germany,,GRD,0.0,...,engraving,145250.0,0.0,0.0,"Adam and Eve (B., M., Holl. 1; S.M.S. 39)",7.56,1.30,74.39,1,
13488,Albrecht Dürer,12/02/2008,Christies,United Kingdom: London King Street,21,sculpture,Germany,,GRD,0.0,...,engraving,82850.0,0.0,0.0,"Saint Jerome in his Study (B. 60; M., Holl. 59...",7.36,1.32,71.32,1,
14494,André Derain,05/20/2011,Christies,France: Paris,53,paper,France,1904-01-01 00:00:00,ESP,1.0,...,oil on paper,85000.0,1.0,0.0,Nature morte au pot bleu,13.11,1.25,214.22,1,
15087,Andy Warhol,04/30/2008,Christies,United Arab Emirates: Dubai,133,paper,United States of America,1977-01-01 00:00:00,AED,1.0,...,screenprint on curtis rag paper,121000.0,1.0,0.0,Farah Dibah Pahlavi,45.00,0.78,1575.00,1,
15661,Andy Warhol,04/30/2008,Christies,United Arab Emirates: Dubai,134,paper,United States of America,1978-01-01 00:00:00,AED,1.0,...,screenprint on curtis rag paper,157000.0,1.0,0.0,Mohammed Reza Shah Pahlavi (Shah of Iran),34.84,1.29,1570.59,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53769,Zhu (ju Ming) Ming,06/01/2014,Ravenel Art Group,Taiwan: Taipei,199,sculpture,Taiwan,1982-01-01 00:00:00,TWD,1.0,...,sculpture on wood,3600000.0,0.0,0.0,Taichi Series – Underarm Strike,5.91,1.93,67.49,1,
53771,Zhu (ju Ming) Ming,06/01/2014,Ravenel Art Group,Taiwan: Taipei,201,sculpture,Taiwan,1991-01-01 00:00:00,TWD,1.0,...,bronze,3120000.0,0.0,0.0,Taichi Series,12.20,1.52,225.70,1,
53780,Zhu (ju Ming) Ming,12/05/2010,Ravenel Art Group,Taiwan: Taipei,156,sculpture,Taiwan,1992-01-01 00:00:00,TWD,1.0,...,sculpture,4800000.0,0.0,0.0,Taichi Series,14.57,1.40,298.25,1,
53794,Zhu (ju Ming) Ming,06/05/2011,Ravenel Art Group,Taiwan: Taipei,149,sculpture,Taiwan,1992-01-01 00:00:00,TWD,1.0,...,sculpture on wood,10200000.0,0.0,0.0,Taichi Series,12.60,1.56,248.09,1,


In [18]:
#81 Total, not so bad but let's keep it in mind
# Drop these rows
df = df[~df['currency'].isin(['GRD', 'ESP', 'TWD', 'YUM', 'UAH', 'VEB', 'AED'])]

In [19]:
#loop through the rows and convert the sales price to usd using an aribitrairy date -> today's
for index, row in df.iterrows():
    if row['currency'] != 'USD':     #only if the currency is NOT USD already
        df.loc[index, 'sales_price_usd'] = c.convert(row['sales_price'], row['currency'], 'USD', date=date(2022, 10, 7))
    else:
        df.loc[index, 'sales_price_usd'] = row['sales_price']

#check the new column
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd
0,Huang Binhong,05/30/2017,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0
1,Huang Binhong,05/30/2017,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0
3,Huang Binhong,05/30/2017,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0


In [20]:
# Now let's try to convert using the date of the auction

# First we need to convert the auction date column to datetime

#Some issues
#Looks like some rows have a range of dates for when the auction was held: Like 04/01/2020-04/16/2020

#Lets see which ones have this problem
df[df['auction_date'].str.contains('-')]

# Change the date for all these rows, keep only the first day of the range
df['auction_date'] = df['auction_date'].str.split('-').str[0]
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd
0,Huang Binhong,05/30/2017,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0
1,Huang Binhong,05/30/2017,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0
3,Huang Binhong,05/30/2017,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0


In [21]:
# Now convert to datetime
df['auction_date'] = pd.to_datetime(df['auction_date'])

In [22]:
#loop through the rows and convert the sales price to usd using the date of the auction

c = CurrencyConverter(fallback_on_missing_rate=True, fallback_on_wrong_date=True)
for index, row in df.iterrows():
    if row['currency'] != 'USD':     #only the currency is NOT USD already
        df.loc[index, 'sales_price_usd'] = c.convert(row['sales_price'], row['currency'], 'USD', date=row['auction_date'])
    else:
        df.loc[index, 'sales_price_usd'] = row['sales_price']

#There were some errors here, mainly because:
# 1) the currency converter package doesn't have the currency for that specific date
# 2) the currency converter package doesn't have the currency for that year, sometimes it only has it for the last 10 years

# This was solved by using the fallback_on_missing_rate and fallback_on_wrong_date parameters
# What these do is that they will do linear interpolation when possible, and use the closest known rate otherwise
# I guess that's good enough

#check the new column
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,medium,sales_price,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd
0,Huang Binhong,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0
1,Huang Binhong,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0
3,Huang Binhong,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0


In [23]:
# Now we should do the same for the low_estimate and high_estimate columns

# How many rows have null values in the estimate columns?
df['low_estimate'].isnull().sum()
df['high_estimate'].isnull().sum()

2084

In [24]:
# Drop the rows with null values in the estimate columns
df = df.dropna(subset=['low_estimate'])
df = df.dropna(subset=['high_estimate'])

In [25]:
# Create empty columns for the low and high estimates in USD
df['low_estimate_usd'] = np.nan
df['high_estimate_usd'] = np.nan

In [26]:
# Now let's try to convert using the date of the auction

#loop through the rows and convert the low_estimate to usd using the date of the auction
for index, row in df.iterrows():
    if row['currency'] != 'USD':     #only the currency is NOT USD already
        df.loc[index, 'low_estimate_usd'] = c.convert(row['low_estimate'], row['currency'], 'USD', date=row['auction_date'])
    else:
        df.loc[index, 'low_estimate_usd'] = row['low_estimate']

#loop through the rows and convert the high_estimate to usd using the date of the auction
for index, row in df.iterrows():
    if row['currency'] != 'USD':     #only the currency is NOT USD already
        df.loc[index, 'high_estimate_usd'] = c.convert(row['high_estimate'], row['currency'], 'USD', date=row['auction_date'])
    else: 
        df.loc[index, 'high_estimate_usd'] = row['high_estimate']


#check the new column
df.head()


Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
0,Huang Binhong,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0,38490.0,64150.0
1,Huang Binhong,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0,38490.0,64150.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0,15400.0,23090.0
3,Huang Binhong,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0,320750.0,449050.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0,10260.0,12830.0


In [27]:
# Export this new dataframe to a csv file to explore other stuff
df.to_csv('cleaned_data_with_usd.csv', index=False)

# Standardize Medium

In [29]:
# Let's see how many there are
df['medium'].nunique()

8239

In [35]:
# Let's explore the medium column
df.groupby('medium').count()

# There are a lot of different mediums, are we going to try to classify them all?
# NLP? 
# Keep key words only? How many categories?

Unnamed: 0_level_0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
medium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
",gold leaf, collage and canvas / board",1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"132 drawn paper receipts of variable size pinned to acrylic on panel, diptych / canvas",1,1,1,1,1,1,0,1,1,0,...,0,0,1,0,0,0,1,1,1,1
18-carat yellow gold with multi-colored enamel brooch/pendant,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
23 carat repousse-cisele gold,2,2,2,2,2,2,0,2,2,2,...,2,2,2,2,2,2,2,2,2,2
23 carat repoussé-ciselé gold,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wooden frames and cotton strings,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
wooden stool,1,1,1,1,1,1,0,1,1,0,...,0,0,1,1,1,1,1,1,1,1
woodwith original box and wooden certificate of authenticity,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
woven canvas,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [36]:
#Okay I think you worked on category
df.groupby('category').count()

Unnamed: 0_level_0,artist,auction_date,auction_house,auction_location,auction_lot,country,created,currency,dated,edition,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
installation,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,1,1
mixed media,1042,1042,1042,1042,1042,451,855,1042,973,57,...,973,973,1042,972,971,972,1042,1042,1042,1042
other,62,62,62,62,62,14,40,62,61,10,...,61,61,62,53,53,53,62,62,62,62
painting,30361,30361,30361,30361,30358,27031,17802,30361,30202,10,...,30202,30202,30361,30128,30104,30128,30361,30361,30361,30361
paper,13827,13827,13827,13827,13827,10960,7381,13827,13510,3,...,13510,13510,13827,13292,13288,13292,13827,13827,13827,13827
photograph,117,117,117,117,117,117,57,117,117,1,...,117,117,117,117,117,117,117,117,117,117
print,1086,1086,1086,1086,1086,1064,475,1086,1084,28,...,1084,1084,1086,1083,1083,1083,1086,1086,1086,1086
sculpture,3767,3767,3767,3767,3766,3438,1560,3767,3715,312,...,3715,3715,3767,3695,3682,3695,3767,3767,3767,3767
unknown,915,915,915,915,915,882,194,915,908,1,...,908,908,915,897,892,897,915,915,915,915


# Country 

In [31]:
# How many cells without country?
df[df['country'].isnull()]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,2.295000e+04,1.540000e+04,2.309000e+04
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,0.0,Villagers in the Woods,18.82,2.10,743.95,1,1.224000e+04,1.026000e+04,1.283000e+04
5,Bangda Xu,2017-05-30,Christies,Hong Kong,1369,paper,,,USD,0.0,...,1.0,0.0,Scholar Boating,16.93,2.02,579.85,1,1.836000e+04,6.420000e+03,8.980000e+03
6,Hufan Wu,2017-05-30,Christies,Hong Kong,1370,paper,,,USD,0.0,...,1.0,0.0,Autumn Landscape,26.50,0.81,568.69,0,0.000000e+00,3.849000e+04,6.415000e+04
7,Shixuan Zheng,2017-05-30,Christies,Hong Kong,1371,paper,,1951-01-01 00:00:00,USD,0.0,...,1.0,0.0,Four Beauties (4),,,,1,9.487000e+04,1.026000e+04,1.540000e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51040,Willem Van Tetrode (attribued To),2014-01-30,Sothebys,The United States: New York,116,sculpture,,,USD,0.0,...,0.0,0.0,SAMSON SLAYING THE PHILISTINE,0.00,inf,0.00,1,3.301000e+06,8.000000e+05,1.200000e+06
51253,William Scrots (school),2012-07-04,Sothebys,United Kingdom: London,10,painting,,,GBP,0.0,...,0.0,0.0,PORTRAIT OF EDWARD VI (1537-1553),27.95,1.32,1034.43,1,2.779166e+06,7.818725e+05,1.094622e+06
53518,Zhou Chen,2012-11-26,Christies,China: Hong Kong,818,paper,,,HKD,0.0,...,0.0,0.0,Fishing in a Quiet Lake,21.65,0.33,155.88,1,1.109666e+05,3.870929e+04,5.161239e+04
53519,Zhou Chen,2011-11-28,Christies,China: Hong Kong,1702,paper,,,HKD,0.0,...,0.0,0.0,Pavilions by the River,18.74,0.35,123.87,1,1.565251e+05,2.565986e+04,3.848978e+04


In [34]:
# So, 12920 rows without country, how many artists?
df[df['country'].isnull()].groupby('artist').size().sort_values(ascending=False).head(20)

# 1928 artists, hopefully only a few of them make up most of the rows

artist
Keran Li              38
Zhen Wang             37
Chunya Zhou           37
Tay Bak Koi           36
Shanshen Yang         33
Atsuko Tanaka         33
Henri Mege            32
Pang Jiun             32
Shifa Cheng           30
Xiaogang Zhang        30
Youren Yu             30
Cui Ruzhuo            30
Shiryu Morita         29
On Kawara             29
Dayu Wu               28
Dan Liu               28
Zikai Feng            28
AY T Joe Christine    27
Dan Colen             27
Sanghwa Chung         27
dtype: int64