In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xlrd
warnings.filterwarnings('ignore')

# Further Preprocessing: Standardize Currency, Medium and Country

In [145]:
# Load the cleaned dataset from disk and preview the first rows
clean_data_path = 'cleaned_data.csv'
df = pd.read_csv(clean_data_path)
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,height,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width
0,Binhong Huang,05/30/2017,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,33.66,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93
1,Binhong Huang,05/30/2017,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,13.58,64150.0,1.0,38490.0,watercolor and ink / paper,,1.0,0.0,Conversations in the Mountain,10.43
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),
3,Binhong Huang,05/30/2017,Christies,Hong Kong,1367,paper,,,USD,0.0,...,40.94,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,39.53,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82


## Standardize Currency

In [146]:
# Number of distinct currency codes (missing values are not counted)
currency_codes = df['currency']
currency_codes.nunique()

28

In [147]:
# The ten most common currencies, by number of rows
currency_counts = df['currency'].value_counts()
currency_counts.head(10)

USD    28958
GBP    14062
EUR     4560
HKD     3045
CHF      322
SEK      199
CNY      167
AUD      139
CAD      119
NOK       64
Name: currency, dtype: int64

In [148]:
# Show the rows whose currency value is missing
currency_missing = df['currency'].isna()
df[currency_missing]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,height,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width
12279,Adolph Gottlieb,05/13/2008,Christies,The United States: New York Rockefeller Center,22,,United States of America,,,0.0,...,90.00,,0.0,,Oil on canvas,6537000,1.0,0.0,Cool Blast,70.00
12295,Adolph Gottlieb,05/14/2008,Christies,The United States: New York Rockefeller Center,146,,United States of America,1946-01-01 00:00:00,,1.0,...,25.47,,0.0,,Gouache watercolor and graphite on paper,145000,1.0,0.0,Mood Indigo,19.49
12324,Adolph Gottlieb,05/22/2007,Bonhams & Butterfields,The United States: California San Francisco,60,,United States of America,1949-01-01 00:00:00,,1.0,...,25.51,,0.0,,Gouache on paper,70000,1.0,0.0,Untitled,19.25
12341,Adriaen Coorte,03/28/2014,Hampel Fine Art Auctioneers,Germany: Munich,1097,,,,,0.0,...,5.63,,0.0,,Oil on paper,145000,1.0,0.0,STILL LIFE WITH SEA SHELLS ON A STONE PLATE,8.07
12354,Adriana Varejão,09/22/2011,Bolsa De Arte,Brasil: Rio de Janeiro,44,,Brazil,2001-01-01 00:00:00,,1.0,...,39.37,,0.0,,Oil on canvas,1011236,1.0,0.0,Macau Wall #5,39.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53835,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,311,,Taiwan,1994-01-01 00:00:00,,1.0,...,24.02,,0.0,,Bronze sculpture,907500,1.0,0.0,Taiji Series - Push Hand,12.99
53838,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,278,,Taiwan,1991-01-01 00:00:00,,1.0,...,11.02,,0.0,,Wood sculpture,1327500,1.0,0.0,Taiji Series - Single Whip,15.75
53842,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,313,,Taiwan,1991-01-01 00:00:00,,1.0,...,12.60,,1.0,,Sculpture,1927500,1.0,0.0,Taiji Series - Single Whip,23.62
53847,Zhu (ju Ming) Ming,05/25/2008,Christies,China: Hong Kong,281,,Taiwan,1995-01-01 00:00:00,,1.0,...,27.56,,0.0,,Sculpture wood,3007500,1.0,0.0,Taiji Series - Single Whip,27.56


In [149]:
# Keep only the rows with a known currency; the others are dropped for now,
# just to keep things simple
df = df[df['currency'].notna()]

In [254]:
# Try out the currency_converter package with a simple EUR -> USD conversion.
# Both fallback options are switched on when the converter is built.
from currency_converter import CurrencyConverter

c = CurrencyConverter(
    fallback_on_missing_rate=True,
    fallback_on_wrong_date=True,
)
c.convert(100, 'EUR', 'USD')

97.06

In [252]:
# Prices will have to be converted with the rate of the auction date,
# so check that a historical date can be passed to the converter
from datetime import date

historical_day = date(2000, 1, 28)
c.convert(100, 'EUR', 'USD', date=historical_day)

98.48

In [152]:
# Convert the sales_price column to float.
#
# Only the entries that are text are cleaned. Going through `.str` on the whole
# column would turn entries that are already numeric into NaN (and then 0), and
# it fails outright when the column is already float, e.g. when this cell is
# run a second time.
price_raw = df['sales_price']
is_text = price_raw.map(lambda value: isinstance(value, str))

price_clean = price_raw.astype(object)
if is_text.any():
    # A single pass strips letters, commas, dollar signs, spaces and square
    # brackets. `regex=True` is explicit because `Series.str.replace` treats
    # the pattern as a literal string by default since pandas 2.0.
    price_clean[is_text] = price_raw[is_text].str.replace(
        r'[a-zA-Z,$ \[\]]', '', regex=True
    )

# Cells that are empty or missing are filled with 0 (same as before).
# Text that still cannot be parsed as a number raises instead of being hidden.
price_clean = price_clean.replace('', np.nan)
df['sales_price'] = pd.to_numeric(price_clean).fillna(0).astype(float)

In [153]:
# Preview the first five rows after the price clean-up
df.head(n=5)

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,height,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width
0,Binhong Huang,05/30/2017,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,33.66,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93
1,Binhong Huang,05/30/2017,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,13.58,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),
3,Binhong Huang,05/30/2017,Christies,Hong Kong,1367,paper,,,USD,0.0,...,40.94,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,39.53,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82


In [154]:
# Add a column of NaN called 'sales_price_usd'; it is filled in further down
empty_usd_prices = pd.Series(np.nan, index=df.index)
df['sales_price_usd'] = empty_usd_prices
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
0,Binhong Huang,05/30/2017,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,
1,Binhong Huang,05/30/2017,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,
3,Binhong Huang,05/30/2017,Christies,Hong Kong,1367,paper,,,USD,0.0,...,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,


In [155]:
# Apparently GRD, ESP, TWD, YUM, UAH and VEB are not supported by the
# currency converter package. Which rows use one of these currencies?
unsupported_currencies = ['GRD', 'ESP', 'TWD', 'YUM', 'UAH', 'VEB']
uses_unsupported = df['currency'].isin(unsupported_currencies)
df[uses_unsupported]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
13460,Albrecht Dürer,12/02/2008,Christies,United Kingdom: London King Street,20,,Germany,,GRD,0.0,...,80000.0,0.0,60000.0,Engraving,145250.0,0.0,0.0,"Adam and Eve (B., M., Holl. 1; S.M.S. 39)",7.56,
13492,Albrecht Dürer,12/02/2008,Christies,United Kingdom: London King Street,21,,Germany,,GRD,0.0,...,50000.0,0.0,30000.0,Engraving,82850.0,0.0,0.0,"Saint Jerome in his Study (B. 60; M., Holl. 59...",7.36,
14498,André Derain,05/20/2011,Christies,France: Paris,53,,France,1904-01-01 00:00:00,ESP,1.0,...,70000.0,0.0,50000.0,Oil on paper,85000.0,1.0,0.0,Nature morte au pot bleu,13.11,
18054,Bridget Riley,06/04/2004,Christies,United Kingdom: London King Street,113,,England,1981-01-01 00:00:00,GRD,1.0,...,80000.0,0.0,60000.0,Oil on linen,122850.0,1.0,0.0,Bright Day,56.97,
18173,Cai Guo Qiang,06/02/2013,Ravenel Art Group,Taiwan: Taipei,732,,China,,TWD,0.0,...,32000000.0,0.0,24000000.0,Ink on paper,21600000.0,0.0,0.0,Dragon Cypress,158.46,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53773,Zhu (ju Ming) Ming,06/01/2014,Ravenel Art Group,Taiwan: Taipei,199,,Taiwan,1982-01-01 00:00:00,TWD,1.0,...,4000000.0,0.0,2800000.0,Sculpture on wood,0.0,0.0,0.0,Taichi Series – Underarm Strike,5.91,
53775,Zhu (ju Ming) Ming,06/01/2014,Ravenel Art Group,Taiwan: Taipei,201,,Taiwan,1991-01-01 00:00:00,TWD,1.0,...,4000000.0,0.0,2800000.0,Bronze,0.0,0.0,0.0,Taichi Series,12.20,
53784,Zhu (ju Ming) Ming,12/05/2010,Ravenel Art Group,Taiwan: Taipei,156,,Taiwan,1992-01-01 00:00:00,TWD,1.0,...,6000000.0,0.0,4800000.0,Sculpture,0.0,0.0,0.0,Taichi Series,14.57,
53798,Zhu (ju Ming) Ming,06/05/2011,Ravenel Art Group,Taiwan: Taipei,149,,Taiwan,1992-01-01 00:00:00,TWD,1.0,...,8000000.0,0.0,6000000.0,Sculpture on wood,0.0,0.0,0.0,Taichi Series,12.60,


In [156]:
# 75 rows in total: not so bad, but let's keep it in mind.
# Drop the rows priced in a currency the converter does not support.
in_unsupported_currency = df['currency'].isin(['GRD', 'ESP', 'TWD', 'YUM', 'UAH', 'VEB'])
df = df[~in_unsupported_currency]

In [236]:
# Convert the sales price to USD using one arbitrary, fixed date for the rate.
fixed_rate_date = date(2021, 1, 28)
non_usd = df['currency'] != 'USD'

# Prices that are already in USD are copied across unchanged. Skipping those
# rows (as the row-by-row loop did) leaves their 'sales_price_usd' as NaN.
df['sales_price_usd'] = df['sales_price'].astype(float)

if non_usd.any():
    # One assignment for all non-USD rows instead of one `.loc` write per row
    df.loc[non_usd, 'sales_price_usd'] = [
        c.convert(amount, currency, 'USD', date=fixed_rate_date)
        for amount, currency in zip(
            df.loc[non_usd, 'sales_price'],
            df.loc[non_usd, 'currency'],
        )
    ]

# Check the new column
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
0,Binhong Huang,2017-05-30,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,45900.0
1,Binhong Huang,2017-05-30,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,0.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,22950.0
3,Binhong Huang,2017-05-30,Christies,Hong Kong,1367,paper,,,USD,0.0,...,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,433330.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,12240.0


In [223]:
# Now let's try to convert using the date of the auction.
# First the auction_date column has to be converted to datetime.
#
# Some rows hold a range of dates for when the auction was held,
# like 04/01/2020-04/16/2020. For those rows only the first day is kept.
if not pd.api.types.is_datetime64_any_dtype(df['auction_date']):
    # Skipped when the column is already datetime (cell run a second time):
    # the `.str` accessor does not work on a datetime column.
    has_date_range = df['auction_date'].str.contains('-', regex=False, na=False)

    # Report how many rows have this problem; a bare expression in the middle
    # of a cell is evaluated but never displayed.
    print(f"{has_date_range.sum()} rows hold a date range")

    df.loc[has_date_range, 'auction_date'] = (
        df.loc[has_date_range, 'auction_date'].str.split('-').str[0].str.strip()
    )

df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
0,Binhong Huang,05/30/2017,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,45900.0
1,Binhong Huang,05/30/2017,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,0.0
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,22950.0
3,Binhong Huang,05/30/2017,Christies,Hong Kong,1367,paper,,,USD,0.0,...,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,433330.0
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,12240.0


In [225]:
# Parse the auction dates and store them back as datetimes
parsed_auction_dates = pd.to_datetime(df['auction_date'])
df['auction_date'] = parsed_auction_dates

In [256]:
# Convert the sales price to USD using the exchange rate of the auction date.
c = CurrencyConverter(fallback_on_missing_rate=True, fallback_on_wrong_date=True)

non_usd = df['currency'] != 'USD'

# Prices that are already in USD are copied across unchanged. Skipping those
# rows (as the row-by-row loop did) leaves their 'sales_price_usd' as NaN.
df['sales_price_usd'] = df['sales_price'].astype(float)

if non_usd.any():
    to_convert = df.loc[non_usd, ['sales_price', 'currency', 'auction_date']]
    # One assignment for all non-USD rows instead of one `.loc` write per row
    df.loc[non_usd, 'sales_price_usd'] = [
        c.convert(amount, currency, 'USD', date=sale_date)
        for amount, currency, sale_date in zip(
            to_convert['sales_price'],
            to_convert['currency'],
            to_convert['auction_date'],
        )
    ]

# There were some errors here, mainly because:
# 1) the currency converter package doesn't have a rate for that specific date
# 2) the currency converter package doesn't have a rate for that year; sometimes
#    it only has rates for the last 10 years
#
# This was solved with the fallback_on_missing_rate and fallback_on_wrong_date
# parameters: they do linear interpolation when possible, and use the closest
# known rate otherwise. I guess that's good enough.

# Check the new column
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
0,Binhong Huang,2017-05-30,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,45900.0
1,Binhong Huang,2017-05-30,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,0.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,22950.0
3,Binhong Huang,2017-05-30,Christies,Hong Kong,1367,paper,,,USD,0.0,...,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,433330.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,12240.0


In [283]:
# Write the frame with the USD prices to a CSV file (without the row index)
# so that other things can be explored from it
usd_output_path = 'cleaned_data_with_usd.csv'
df.to_csv(usd_output_path, index=False)

## Standardize Medium

In [160]:
# Number of distinct medium descriptions (missing values are not counted)
medium_labels = df['medium']
medium_labels.nunique()

8368

In [161]:
# Explore the medium column: for each medium, count the non-null entries
# in every other column.
#
# There are a lot of different mediums. Open questions:
# - are we going to try to classify them all?
# - NLP?
# - keep key words only? How many categories?
rows_by_medium = df.groupby('medium')
rows_by_medium.count()

Unnamed: 0_level_0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,height,high_estimate,inscribed,low_estimate,sales_price,signed,stamped,title,width,sales_price_usd
medium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
",gold leaf, collage and canvas / board",1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"132 drawn paper receipts of variable size pinned to acrylic on panel, diptych / canvas",1,1,1,1,1,1,0,1,1,0,...,0,1,0,1,1,0,0,1,0,1
18-carat yellow gold with multi-colored enamel brooch/pendant,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
23 carat repousse-cisele gold,2,2,2,2,2,2,0,2,2,2,...,2,2,2,2,2,2,2,2,2,2
23 carat repoussé-ciselé gold,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wooden frames and cotton strings,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
wooden stool,1,1,1,1,1,1,0,1,1,0,...,1,1,0,1,1,0,0,1,1,1
woodwith original box and wooden certificate of authenticity,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
woven canvas,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Country

In [170]:
# Which rows have no country?
country_missing = df['country'].isna()
df[country_missing]

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
0,Binhong Huang,05/30/2017,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,45900.0
1,Binhong Huang,05/30/2017,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,0.0
2,Yun Tang,05/30/2017,Christies,Hong Kong,1366,paper,,,USD,0.0,...,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,22950.0
3,Binhong Huang,05/30/2017,Christies,Hong Kong,1367,paper,,,USD,0.0,...,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,433330.0
4,Yun Tang,05/30/2017,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,12240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51044,Willem Van Tetrode (attribued To),01/30/2014,Sothebys,The United States: New York,116,,,,USD,0.0,...,1200000.0,0.0,800000.0,Bronze,0.0,0.0,0.0,SAMSON SLAYING THE PHILISTINE,0.00,0.0
51257,William Scrots (school),07/04/2012,Sothebys,United Kingdom: London,10,,,,GBP,0.0,...,700000.0,0.0,500000.0,Oil on panel,0.0,0.0,0.0,PORTRAIT OF EDWARD VI (1537-1553),27.95,0.0
53522,Zhou Chen,11/26/2012,Christies,China: Hong Kong,818,,,,HKD,0.0,...,400000.0,0.0,300000.0,Ink on paper,0.0,0.0,0.0,Fishing in a Quiet Lake,21.65,0.0
53523,Zhou Chen,11/28/2011,Christies,China: Hong Kong,1702,,,,HKD,0.0,...,300000.0,0.0,200000.0,Ink on paper,0.0,0.0,0.0,Pavilions by the River,18.74,0.0


In [275]:
# So, 12920 rows without country. How many artists do they belong to?
# (1928 artists; hopefully only a few of them make up most of the rows)
rows_without_country = df[df['country'].isna()]
rows_per_artist = rows_without_country.groupby('artist').size()
rows_per_artist.sort_values(ascending=False)

artist
Ru Pu                       248
Daqian Zhang                245
Yayoi Kusama                245
Wou-Ki Zao                  210
Le Pho                      209
                           ... 
Luo Erchun                    1
Luo Quanmu                    1
Luo Song                      1
Luo Wang                      1
ou & Zhu Haonian & Mulan      1
Length: 1928, dtype: int64

In [281]:
# How could the missing country be recovered?
# One idea: look the artist up on Wikipedia by name.
# Try it with a single artist for now.
import wikipedia

artist_name = "Yayoi Kusama"

wikipedia.set_lang("en")
wikipedia.search(artist_name)  # okay, she can be found on wikipedia

page = wikipedia.page(artist_name)
page.content[:1000]  # show the first 1000 characters

'Yayoi Kusama (草間 彌生, Kusama Yayoi, born 22 March 1929) is a Japanese contemporary artist who works primarily in sculpture and installation, but is also active in painting, performance, video art, fashion, poetry, fiction, and other arts. Her work is based in conceptual art and shows some attributes of feminism, minimalism, surrealism, Art Brut, pop art, and abstract expressionism, and is infused with autobiographical, psychological, and sexual content. She has been acknowledged as one of the most important living artists to come out of Japan.Kusama was raised in Matsumoto, and trained at the Kyoto City University of Arts in a traditional Japanese painting style called nihonga. Kusama was inspired, however, by American Abstract impressionism. She moved to New York City in 1958 and was a part of the New York avant-garde scene throughout the 1960s, especially in the pop-art movement. Embracing the rise of the hippie counterculture of the late 1960s, she came to public attention when she 

In [282]:
# How to extract the country from the page content?
# Try locating the phrase 'born in' (the country would hopefully follow it).
# str.find gives -1 when the phrase does not occur in the text.
#
# uffff man, this looks like a lot of work, let's try to find a better way
phrase = 'born in'
page.content.find(phrase)

-1