# Further Cleaning for clustering

## Initial imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xlrd
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

Import data

In [2]:
df = pd.read_csv('cleaned_data_with_usd.csv')
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
0,Huang Binhong,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0,38490.0,64150.0
1,Huang Binhong,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0,38490.0,64150.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0,15400.0,23090.0
3,Huang Binhong,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0,320750.0,449050.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0,10260.0,12830.0


In [3]:
# strip white space from auction_house column values
df['auction_house'] = df['auction_house'].str.strip()

## Clean the data for clustering

In [6]:
#Define feature to use for now
df.columns

Index(['artist', 'auction_date', 'auction_house', 'auction_location',
       'auction_lot', 'category', 'country', 'created', 'currency', 'dated',
       'edition', 'foundry', 'height', 'high_estimate', 'inscribed',
       'low_estimate', 'medium', 'sales_price', 'signed', 'stamped', 'title',
       'width', 'aspect_ratio', 'area', 'sold', 'sales_price_usd',
       'low_estimate_usd', 'high_estimate_usd'],
      dtype='object')

In [7]:
# drop sales_price column
df = df.drop(['sales_price'], axis=1)

# rename sales_price_usd column to sales_price
df = df.rename(columns={'sales_price_usd': 'sales_price'})

In [8]:
# print all non numeric columns
for col in df.columns:
    if df[col].dtype == 'object':
        print(col)
        

artist
auction_date
auction_house
auction_location
auction_lot
category
country
created
currency
edition
foundry
medium
title


### One-hot encoding top artists

In [9]:
def top_values(df, col, n):
    
    # print how many time each artist appears in the dataset
    counts = df[col].value_counts()

    # how many artists appear more than 200 times
    uniques = df[col].value_counts()[df[col].value_counts() >= n].count()

    # number of row with artist appearing more than 200 times
    n_rows = df[df[col].isin(df[col].value_counts()[df[col].value_counts() >= n].index)].shape[0]

    return counts, uniques, n_rows

In [10]:
top_values(df, 'artist', 150)

(Andy Warhol              1489
 Pablo Ruiz Picasso       1265
 Zhang Daqian              806
 Marc Chagall              681
 Pierre Auguste Renoir     575
                          ... 
 Guofang Jiang               1
 Genpei Akasegawa            1
 Shu Tanaka                  1
 Yoshida Katsuro             1
 Yanobe Kenji                1
 Name: artist, Length: 2499, dtype: int64,
 67,
 20472)

In [11]:
# rename artists that appear between 50 and 150 times as 'second_tier'
df.loc[df['artist'].isin(df['artist'].value_counts()[df['artist'].value_counts() >= 50].index) & df['artist'].isin(df['artist'].value_counts()[df['artist'].value_counts() < 150].index), 'artist'] = 'artist_second_tier'

# rename artists that appear less than 50 times as 'third_tier'
df.loc[df['artist'].isin(df['artist'].value_counts()[df['artist'].value_counts() < 50].index), 'artist'] = 'artist_third_tier'

In [12]:
top_values(df, 'artist', 150)

(artist_second_tier    15699
 artist_third_tier     15508
 Andy Warhol            1489
 Pablo Ruiz Picasso     1265
 Zhang Daqian            806
                       ...  
 Ting Walasse            161
 Alfred Sisley           156
 Alighiero Boetti        153
 Chu Teh-Chun            153
 Anselm Kiefer           150
 Name: artist, Length: 69, dtype: int64,
 69,
 51679)

In [13]:
def dummy(df,col,n):
    # drop all rows with artist appearing less than n times
    df = df[df[col].isin(df[col].value_counts()[df[col].value_counts() > n].index)]

    # pd.get_dummies for artist which appears more than n times
    df = pd.concat([df, pd.get_dummies(df[col])], axis=1)
    df.drop(col, axis=1, inplace=True)
    return df

In [14]:
df = dummy(df, 'artist', 150)

In [15]:
df.head()

Unnamed: 0,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,edition,...,Wu Guanzhong,Xu Beihong,Yayoi Kusama,Yoshitomo Nara,Zao Wou-Ki,Zhang Daqian,Zhang Xiaogang,Zhu (ju Ming) Ming,artist_second_tier,artist_third_tier
0,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,,...,0,0,0,0,0,0,0,0,1,0
1,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,,...,0,0,0,0,0,0,0,0,1,0
2,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,,...,0,0,0,0,0,0,0,0,0,1
3,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,,...,0,0,0,0,0,0,0,0,1,0
4,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,,...,0,0,0,0,0,0,0,0,0,1


### One-hot encoding for countries

In [16]:
# count of df['country] 
top_values(df, 'country',250)

(United States of America    9274
 France                      8255
 China                       6329
 England                     2913
 Italy                       2570
 Spain                       2441
 Germany                     2331
 Japan                       1334
 Russia                      1329
 Netherlands                 1131
 Switzerland                  735
 India                        610
 Belgium                      466
 Vietnam                      430
 Indonesia                    386
 Austria                      335
 Taiwan                       321
 Colombia                     273
 USA                          257
 Canada                       228
 Scotland                     203
 Ireland                      185
 Philippines                  185
 Flemish                      182
 Venezuela                    182
 Mexico                       157
 Chile                        155
 Norway                       147
 South Africa                 126
 Sweden       

In [17]:
# if country appears less than 250 times add it to lst 
lst = df['country'].value_counts()[df['country'].value_counts() < 250].index

# replace all countries in lst with 'other_country'
df.loc[df['country'].isin(lst), 'country'] = 'other_country'

In [18]:
df = dummy(df, 'country', 250)

In [19]:
df.head()

Unnamed: 0,auction_date,auction_house,auction_location,auction_lot,category,created,currency,dated,edition,foundry,...,Japan,Netherlands,Russia,Spain,Switzerland,Taiwan,USA,United States of America,Vietnam,other_country
0,2017-05-30,Christies,Hong Kong,1364,paper,1947-01-01 00:00:00,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
1,2017-05-30,Christies,Hong Kong,1365,paper,1990-01-01 00:00:00,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
3,2017-05-30,Christies,Hong Kong,1367,paper,,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
10,2017-05-30,Christies,Hong Kong,1374,paper,1976-01-01 00:00:00,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
11,2017-05-30,Christies,Hong Kong,1375,paper,,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0


### One-hot encoding top auction-houses

In [20]:
top_values(df, 'auction_house', 200)

(Christies                             21976
 Sothebys                              17187
 Phillips                               1308
 Artcurial                               323
 Villa Grisebach Auktionen               271
                                       ...  
 Glerum Auctioneers                        1
 Philippe Kaczorowski                      1
 Keno Auctions                             1
 Lawrences Auctioneers of Crewkerne        1
 Lehr Berlin                               1
 Name: auction_house, Length: 198, dtype: int64,
 6,
 41282)

In [21]:
# if country appears less than 200 times add it to lst 
lst = df['auction_house'].value_counts()[df['auction_house'].value_counts() < 200].index

# replace all countries in lst with 'other_country'
df.loc[df['auction_house'].isin(lst), 'auction_house'] = 'other_auction_house'

In [22]:
df = dummy(df, 'auction_house', 200)

In [23]:
df.head()

Unnamed: 0,auction_date,auction_location,auction_lot,category,created,currency,dated,edition,foundry,height,...,United States of America,Vietnam,other_country,Artcurial,Christies,Dorotheum,Phillips,Sothebys,Villa Grisebach Auktionen,other_auction_house
0,2017-05-30,Hong Kong,1364,paper,1947-01-01 00:00:00,USD,0.0,,,33.66,...,0,0,0,0,1,0,0,0,0,0
1,2017-05-30,Hong Kong,1365,paper,1990-01-01 00:00:00,USD,0.0,,,13.58,...,0,0,0,0,1,0,0,0,0,0
3,2017-05-30,Hong Kong,1367,paper,,USD,0.0,,,40.94,...,0,0,0,0,1,0,0,0,0,0
10,2017-05-30,Hong Kong,1374,paper,1976-01-01 00:00:00,USD,0.0,,,13.19,...,0,0,0,0,1,0,0,0,0,0
11,2017-05-30,Hong Kong,1375,paper,,USD,0.0,,,9.45,...,0,0,0,0,1,0,0,0,0,0


### One-hot encoding top category

In [24]:
top_values(df, 'category', 2)

(painting        26938
 paper           10936
 sculpture        3433
 print            1056
 unknown           876
 mixed media       444
 photograph        111
 other              14
 installation        1
 Name: category, dtype: int64,
 8,
 43808)

In [25]:
df = dummy(df, 'category', 2)

In [26]:
df.head()

Unnamed: 0,auction_date,auction_location,auction_lot,created,currency,dated,edition,foundry,height,high_estimate,...,Villa Grisebach Auktionen,other_auction_house,mixed media,other,painting,paper,photograph,print,sculpture,unknown
0,2017-05-30,Hong Kong,1364,1947-01-01 00:00:00,USD,0.0,,,33.66,64150.0,...,0,0,0,0,0,1,0,0,0,0
1,2017-05-30,Hong Kong,1365,1990-01-01 00:00:00,USD,0.0,,,13.58,64150.0,...,0,0,0,0,0,1,0,0,0,0
3,2017-05-30,Hong Kong,1367,,USD,0.0,,,40.94,449050.0,...,0,0,0,0,0,1,0,0,0,0
10,2017-05-30,Hong Kong,1374,1976-01-01 00:00:00,USD,0.0,,,13.19,64150.0,...,0,0,0,0,0,1,0,0,0,0
11,2017-05-30,Hong Kong,1375,,USD,0.0,,,9.45,32080.0,...,0,0,0,0,0,1,0,0,0,0


### Create a year variable

In [27]:
#  create a new column with the year of the auction and drop the date column
df['auction_year'] = df['auction_date'].str[:4]
# convert to int
df['auction_year'] = df['auction_year'].astype(int)

df.drop('auction_date', axis=1, inplace=True)

In [28]:
# how many nan in created column
df['created'].isna().sum()

21054

Too many Nan, lets drop it for now

In [29]:
df.drop('created', axis=1, inplace=True)

In [30]:
df.shape

(43808, 125)

### Get rid of non-numeric columns

In [31]:
# drop all columns that are not numeric
df = df.select_dtypes(exclude='object')

In [32]:
df.shape

(43808, 118)

In [33]:
# reset index
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,dated,height,high_estimate,inscribed,low_estimate,signed,stamped,width,aspect_ratio,area,...,other_auction_house,mixed media,other,painting,paper,photograph,print,sculpture,unknown,auction_year
0,0.0,33.66,64150.0,1.0,38490.0,1.0,0.0,16.93,1.99,569.86,...,0,0,0,0,1,0,0,0,0,2017
1,0.0,13.58,64150.0,1.0,38490.0,1.0,0.0,10.43,1.3,141.64,...,0,0,0,0,1,0,0,0,0,2017
2,0.0,40.94,449050.0,1.0,320750.0,1.0,0.0,15.55,2.63,636.62,...,0,0,0,0,1,0,0,0,0,2017
3,0.0,13.19,64150.0,1.0,38490.0,1.0,0.0,16.73,0.79,220.67,...,0,0,0,0,1,0,0,0,0,2017
4,0.0,9.45,32080.0,1.0,19250.0,1.0,0.0,10.63,0.89,100.45,...,0,0,0,0,1,0,0,0,0,2017


### Dropping all Infinity values

In [34]:
# how many inf in the dataset
df.isin([np.inf, -np.inf]).sum().sum()

1613

In [35]:
# replace inf with nan
df = df.replace([np.inf, -np.inf], np.nan)

### Imputing and Dropping Nans

In [36]:
# how many NaN in each column
df.isna().sum()

dated            165
height           286
high_estimate      0
inscribed        165
low_estimate       0
                ... 
photograph         0
print              0
sculpture          0
unknown            0
auction_year       0
Length: 118, dtype: int64

In [37]:
# how many Nan in total
df.isna().sum().sum()

3467

In [38]:
#  impute missing values with IAI.ImputationLearner
# imputer = iai.ImputationLearner(method = 'opt_knn')
# df = imputer.fit_transform(df)

In [39]:
#  drop all rows with NaN
df.dropna(inplace=True)

In [40]:
df.shape

(41762, 118)

In [41]:
df.head()

Unnamed: 0,dated,height,high_estimate,inscribed,low_estimate,signed,stamped,width,aspect_ratio,area,...,other_auction_house,mixed media,other,painting,paper,photograph,print,sculpture,unknown,auction_year
0,0.0,33.66,64150.0,1.0,38490.0,1.0,0.0,16.93,1.99,569.86,...,0,0,0,0,1,0,0,0,0,2017
1,0.0,13.58,64150.0,1.0,38490.0,1.0,0.0,10.43,1.3,141.64,...,0,0,0,0,1,0,0,0,0,2017
2,0.0,40.94,449050.0,1.0,320750.0,1.0,0.0,15.55,2.63,636.62,...,0,0,0,0,1,0,0,0,0,2017
3,0.0,13.19,64150.0,1.0,38490.0,1.0,0.0,16.73,0.79,220.67,...,0,0,0,0,1,0,0,0,0,2017
4,0.0,9.45,32080.0,1.0,19250.0,1.0,0.0,10.63,0.89,100.45,...,0,0,0,0,1,0,0,0,0,2017


### Drop auction_year

In [42]:
# drop auction_year column as we are doing prediction, not useful
df.drop('auction_year', axis=1, inplace=True)

In [43]:
df.shape

(41762, 117)

### Drop the high and low estimates

In [44]:
# drop high_estimate and low_estimate 
df.drop(['high_estimate', 'low_estimate'], axis=1, inplace=True)

# rename high_estimate_usd and low_estimate_usd to high_estimate and low_estimate
df = df.rename(columns={'high_estimate_usd': 'high_estimate', 'low_estimate_usd': 'low_estimate'})

### Drop rows if high < low

In [45]:
# drop df[df['high_estimate'] < df['low_estimate']]

df = df[df['high_estimate'] > df['low_estimate']]
df

Unnamed: 0,dated,height,inscribed,signed,stamped,width,aspect_ratio,area,sold,sales_price,...,Villa Grisebach Auktionen,other_auction_house,mixed media,other,painting,paper,photograph,print,sculpture,unknown
0,0.0,33.66,1.0,1.0,0.0,16.93,1.99,569.86,1,45900.000000,...,0,0,0,0,0,1,0,0,0,0
1,0.0,13.58,1.0,1.0,0.0,10.43,1.30,141.64,0,0.000000,...,0,0,0,0,0,1,0,0,0,0
2,0.0,40.94,1.0,1.0,0.0,15.55,2.63,636.62,1,433330.000000,...,0,0,0,0,0,1,0,0,0,0
3,0.0,13.19,1.0,1.0,0.0,16.73,0.79,220.67,0,0.000000,...,0,0,0,0,0,1,0,0,0,0
4,0.0,9.45,1.0,1.0,0.0,10.63,0.89,100.45,1,58140.000000,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43803,1.0,24.41,0.0,1.0,0.0,18.11,1.35,442.07,1,116663.178284,...,0,0,0,0,0,0,0,0,1,0
43804,1.0,32.28,0.0,0.0,0.0,24.41,1.32,787.95,1,169743.898449,...,0,0,0,0,0,0,0,0,1,0
43805,0.0,33.86,0.0,1.0,0.0,29.53,1.15,999.89,1,110699.282522,...,0,0,0,0,0,0,0,0,1,0
43806,0.0,10.71,0.0,0.0,0.0,22.36,0.48,239.48,1,126213.413808,...,0,0,0,0,0,0,0,0,1,0


### Drop "other" from category

In [266]:
# drop rows where "other" is 1 (meaning the painting is not in any of the categories)
df = df[df['other'] != 1]

# drop other column
df.drop('other', axis=1, inplace=True)

In [267]:
# rename 'unkown' column to 'other_category'
df = df.rename(columns={'unknown': 'other_category'})

In [268]:
# df.columns as dataframe
# print all rows
pd.set_option('display.max_rows', None)
df_cols = pd.DataFrame(df.columns)
df_cols

Unnamed: 0,0
0,dated
1,height
2,inscribed
3,signed
4,stamped
5,width
6,aspect_ratio
7,area
8,sold
9,sales_price


In [269]:
df.shape

(41751, 114)

### Export to csv

In [270]:
# export cleaned data to csv
df.to_csv('clustering_data.csv', index=False) 