# Further Cleaning for clustering

## Initial imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xlrd
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

## Import data

In [2]:
# Load 'cleaned_data_with_usd.csv' (path is relative to the working directory)
# into `df`, then preview the first five rows.
df = pd.read_csv('cleaned_data_with_usd.csv')
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
0,Huang Binhong,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0,38490.0,64150.0
1,Huang Binhong,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0,38490.0,64150.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0,15400.0,23090.0
3,Huang Binhong,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0,320750.0,449050.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0,10260.0,12830.0


In [3]:
# strip white space from auction_house column values
df['auction_house'] = df['auction_house'].str.strip()

## Clean the data for clustering

In [4]:
# List every column currently in `df` (the candidate features);
# nothing is selected or modified in this cell.
df.columns

Index(['artist', 'auction_date', 'auction_house', 'auction_location',
       'auction_lot', 'category', 'country', 'created', 'currency', 'dated',
       'edition', 'foundry', 'height', 'high_estimate', 'inscribed',
       'low_estimate', 'medium', 'sales_price', 'signed', 'stamped', 'title',
       'width', 'aspect_ratio', 'area', 'sold', 'sales_price_usd',
       'low_estimate_usd', 'high_estimate_usd'],
      dtype='object')

In [5]:
# Discard the existing `sales_price` column and let `sales_price_usd`
# take over the `sales_price` name.
df = (
    df.drop(columns='sales_price')
      .rename(columns={'sales_price_usd': 'sales_price'})
)

In [6]:
# Print the name of every column whose dtype is `object` (i.e. non-numeric)
for name, dtype in df.dtypes.items():
    if dtype == 'object':
        print(name)
        

artist
auction_date
auction_house
auction_location
auction_lot
category
country
created
currency
edition
foundry
medium
title


### One-hot encoding top artists

In [7]:
def top_values(df, col, n):
    """Summarise how often each value of ``col`` occurs and how many clear the threshold ``n``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    col : str
        Name of the column whose values are counted.
    n : int
        Frequency threshold. A value counts as "frequent" only when it
        occurs strictly more than ``n`` times.

    Returns
    -------
    counts : pandas.Series
        ``df[col].value_counts()``: occurrences per distinct value
        (missing values are not counted by ``value_counts`` by default).
    uniques : int
        Number of distinct values occurring more than ``n`` times.
    n_rows : int
        Number of rows whose value is one of those frequent values.
    """
    # Count once and reuse the result instead of recomputing value_counts()
    # for every derived statistic.
    counts = df[col].value_counts()

    # Values that appear strictly more than n times
    frequent = counts[counts > n]
    uniques = frequent.count()

    # Rows holding one of the frequent values (missing values never match)
    n_rows = int(df[col].isin(frequent.index).sum())

    return counts, uniques, n_rows

In [8]:
top_values(df, 'artist', 150)

(Andy Warhol              1489
 Pablo Ruiz Picasso       1265
 Zhang Daqian              806
 Marc Chagall              681
 Pierre Auguste Renoir     575
                          ... 
 Guofang Jiang               1
 Genpei Akasegawa            1
 Shu Tanaka                  1
 Yoshida Katsuro             1
 Yanobe Kenji                1
 Name: artist, Length: 2499, dtype: int64,
 66,
 20322)

In [9]:
def dummy(df, col, n):
    """Keep rows whose ``col`` value is frequent and one-hot encode that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified, a new frame is returned.
    col : str
        Name of the categorical column to encode.
    n : int
        Frequency threshold. Rows are kept only when their ``col`` value
        occurs strictly more than ``n`` times, so values occurring exactly
        ``n`` times (and missing values) are dropped as well.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with ``col`` removed and one 0/1 indicator column
        appended per remaining distinct value. Indicator columns are named
        after the raw values (no prefix), so they can collide with existing
        column names.
    """
    # Count once and derive the set of frequent values from it
    counts = df[col].value_counts()
    frequent = counts[counts > n].index

    # Drop every row whose value is not frequent enough
    kept = df[df[col].isin(frequent)]

    # Pin the indicator dtype so the columns are 0/1 integers regardless of
    # the pandas version (newer versions default to bool).
    indicators = pd.get_dummies(kept[col], dtype='uint8')

    encoded = pd.concat([kept, indicators], axis=1)
    return encoded.drop(columns=col)

In [10]:
df = dummy(df, 'artist', 150)

In [11]:
df.head()

Unnamed: 0,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,edition,...,Wassily Kandinsky,Willem De Kooning,Wu Guanzhong,Xu Beihong,Yayoi Kusama,Yoshitomo Nara,Zao Wou-Ki,Zhang Daqian,Zhang Xiaogang,Zhu (ju Ming) Ming
10,2017-05-30,Christies,Hong Kong,1374,paper,China,1976-01-01 00:00:00,USD,0.0,,...,0,0,0,0,0,0,0,1,0,0
11,2017-05-30,Christies,Hong Kong,1375,paper,China,,USD,0.0,,...,0,0,0,0,0,0,0,1,0,0
12,2017-05-30,Christies,Hong Kong,1376,paper,China,,USD,0.0,,...,0,0,0,0,0,0,0,1,0,0
13,2017-05-30,Christies,Hong Kong,1377,paper,China,1946-01-01 00:00:00,USD,0.0,,...,0,0,0,0,0,0,0,1,0,0
16,2017-05-30,Christies,Hong Kong,1382,paper,China,1975-01-01 00:00:00,USD,0.0,,...,0,0,0,0,0,0,0,1,0,0


### One-hot encoding for countries

In [16]:
# Collapse rare countries into a single 'other' bucket: any country that does
# not occur more than 400 times is replaced by 'other'. Missing countries also
# become 'other', because value_counts() never lists them as frequent.
# The frequent-country index is computed once up front rather than once per row.
country_counts = df['country'].value_counts()
frequent_countries = country_counts[country_counts > 400].index
df['country'] = df['country'].where(df['country'].isin(frequent_countries), 'other')

In [17]:
# Value counts of df['country'], plus how many distinct values and how many
# rows exceed the 400-occurrence threshold
top_values(df, 'country',400)

(France                      4740
 United States of America    3801
 China                       3120
 Spain                       1993
 other                       1801
 England                     1000
 Italy                        929
 Japan                        912
 Germany                      882
 Russia                       725
 India                        419
 Name: country, dtype: int64,
 11,
 20322)

In [18]:
df = dummy(df, 'country', 400)

In [19]:
df.head()

Unnamed: 0,auction_date,auction_house,auction_location,auction_lot,category,created,currency,dated,edition,foundry,...,England,France,Germany,India,Italy,Japan,Russia,Spain,United States of America,other
10,2017-05-30,Christies,Hong Kong,1374,paper,1976-01-01 00:00:00,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
11,2017-05-30,Christies,Hong Kong,1375,paper,,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
12,2017-05-30,Christies,Hong Kong,1376,paper,,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
13,2017-05-30,Christies,Hong Kong,1377,paper,1946-01-01 00:00:00,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0
16,2017-05-30,Christies,Hong Kong,1382,paper,1975-01-01 00:00:00,USD,0.0,,,...,0,0,0,0,0,0,0,0,0,0


### One-hot encoding top auction-houses

In [20]:
top_values(df, 'auction_house', 100)

(Christies                    10321
 Sothebys                      7995
 Phillips                       520
 Artcurial                      174
 Villa Grisebach Auktionen      120
                              ...  
 Philippe Kaczorowski             1
 Grogan & Company                 1
 Heffel Gallery Limited           1
 Gorringes                        1
 EUROP AUCTION                    1
 Name: auction_house, Length: 129, dtype: int64,
 5,
 19130)

In [21]:
df = dummy(df, 'auction_house', 100)

In [22]:
df.head()

Unnamed: 0,auction_date,auction_location,auction_lot,category,created,currency,dated,edition,foundry,height,...,Japan,Russia,Spain,United States of America,other,Artcurial,Christies,Phillips,Sothebys,Villa Grisebach Auktionen
10,2017-05-30,Hong Kong,1374,paper,1976-01-01 00:00:00,USD,0.0,,,13.19,...,0,0,0,0,0,0,1,0,0,0
11,2017-05-30,Hong Kong,1375,paper,,USD,0.0,,,9.45,...,0,0,0,0,0,0,1,0,0,0
12,2017-05-30,Hong Kong,1376,paper,,USD,0.0,,,55.91,...,0,0,0,0,0,0,1,0,0,0
13,2017-05-30,Hong Kong,1377,paper,1946-01-01 00:00:00,USD,0.0,,,41.73,...,0,0,0,0,0,0,1,0,0,0
16,2017-05-30,Hong Kong,1382,paper,1975-01-01 00:00:00,USD,0.0,,,47.24,...,0,0,0,0,0,0,1,0,0,0


### One-hot encoding top categories

In [23]:
top_values(df, 'category', 150)

(painting       10473
 paper           5972
 sculpture       1279
 print            670
 unknown          326
 mixed media      178
 photograph        57
 other              6
 Name: category, dtype: int64,
 6,
 18898)

In [24]:
df = dummy(df, 'category', 150)

In [25]:
df.head()

Unnamed: 0,auction_date,auction_location,auction_lot,created,currency,dated,edition,foundry,height,high_estimate,...,Christies,Phillips,Sothebys,Villa Grisebach Auktionen,mixed media,painting,paper,print,sculpture,unknown
10,2017-05-30,Hong Kong,1374,1976-01-01 00:00:00,USD,0.0,,,13.19,64150.0,...,1,0,0,0,0,0,1,0,0,0
11,2017-05-30,Hong Kong,1375,,USD,0.0,,,9.45,32080.0,...,1,0,0,0,0,0,1,0,0,0
12,2017-05-30,Hong Kong,1376,,USD,0.0,,,55.91,2566000.0,...,1,0,0,0,0,0,1,0,0,0
13,2017-05-30,Hong Kong,1377,1946-01-01 00:00:00,USD,0.0,,,41.73,641500.0,...,1,0,0,0,0,0,1,0,0,0
16,2017-05-30,Hong Kong,1382,1975-01-01 00:00:00,USD,0.0,,,47.24,769800.0,...,1,0,0,0,0,0,1,0,0,0


### Create a year variable

In [26]:
# Take the first four characters of the auction date string as the year,
# store it as an integer column, then discard the original date column.
year_text = df['auction_date'].str.slice(0, 4)
df['auction_year'] = year_text.astype(int)

df.drop(columns='auction_date', inplace=True)

In [27]:
# how many nan in created column
df['created'].isna().sum()

8547

Too many NaNs in `created`, so let's drop the column for now.

In [28]:
df.drop('created', axis=1, inplace=True)

### Get rid of non-numeric columns

In [29]:
# Print the name of every column that still has dtype `object`
for name, dtype in df.dtypes.items():
    if dtype == 'object':
        print(name)

auction_location
auction_lot
currency
edition
foundry
medium
title


In [30]:
# Collect the names of all object-dtype columns, then remove them in one call
object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
df.drop(columns=object_columns, inplace=True)

In [31]:
# reset index
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,dated,height,high_estimate,inscribed,low_estimate,signed,stamped,width,aspect_ratio,area,...,Phillips,Sothebys,Villa Grisebach Auktionen,mixed media,painting,paper,print,sculpture,unknown,auction_year
0,0.0,13.19,64150.0,1.0,38490.0,1.0,0.0,16.73,0.79,220.67,...,0,0,0,0,0,1,0,0,0,2017
1,0.0,9.45,32080.0,1.0,19250.0,1.0,0.0,10.63,0.89,100.45,...,0,0,0,0,0,1,0,0,0,2017
2,0.0,55.91,2566000.0,1.0,1924500.0,1.0,0.0,28.54,1.96,1595.67,...,0,0,0,0,0,1,0,0,0,2017
3,0.0,41.73,641500.0,1.0,513200.0,1.0,0.0,25.39,1.64,1059.52,...,0,0,0,0,0,1,0,0,0,2017
4,0.0,47.24,769800.0,1.0,513200.0,1.0,0.0,22.36,2.11,1056.29,...,0,0,0,0,0,1,0,0,0,2017


### Dropping all Infinity values

In [32]:
# how many inf in the dataset
df.isin([np.inf, -np.inf]).sum().sum()

747

In [33]:
# Turn positive and negative infinity into NaN so they are handled together
# with the other missing values.
infinities = [np.inf, -np.inf]
df = df.replace(to_replace=infinities, value=np.nan)

### Imputing and Dropping NaNs

In [34]:
# how many NaN in each column
df.isna().sum()

dated             61
height           133
high_estimate      0
inscribed         61
low_estimate       0
                ... 
paper              0
print              0
sculpture          0
unknown            0
auction_year       0
Length: 103, dtype: int64

In [35]:
# how many Nan in total
df.isna().sum().sum()

1546

In [36]:
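# Imputation with IAI's ImputationLearner (method 'opt_knn') is currently disabled:
# `iai` is not imported in this notebook, so rows containing NaN are dropped in the
# next cell instead.
# imputer = iai.ImputationLearner(method = 'opt_knn')
# df = imputer.fit_transform(df)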

In [37]:
# Remove every row that contains at least one NaN
df.dropna(axis=0, how='any', inplace=True)

In [38]:
df.shape

(17964, 103)

In [39]:
df.head()

Unnamed: 0,dated,height,high_estimate,inscribed,low_estimate,signed,stamped,width,aspect_ratio,area,...,Phillips,Sothebys,Villa Grisebach Auktionen,mixed media,painting,paper,print,sculpture,unknown,auction_year
0,0.0,13.19,64150.0,1.0,38490.0,1.0,0.0,16.73,0.79,220.67,...,0,0,0,0,0,1,0,0,0,2017
1,0.0,9.45,32080.0,1.0,19250.0,1.0,0.0,10.63,0.89,100.45,...,0,0,0,0,0,1,0,0,0,2017
2,0.0,55.91,2566000.0,1.0,1924500.0,1.0,0.0,28.54,1.96,1595.67,...,0,0,0,0,0,1,0,0,0,2017
3,0.0,41.73,641500.0,1.0,513200.0,1.0,0.0,25.39,1.64,1059.52,...,0,0,0,0,0,1,0,0,0,2017
4,0.0,47.24,769800.0,1.0,513200.0,1.0,0.0,22.36,2.11,1056.29,...,0,0,0,0,0,1,0,0,0,2017


### Drop auction_year

In [50]:
# Remove auction_year from the exported feature set.
# NOTE(review): the original rationale was "doing prediction, not useful" -
# confirm this still applies to the clustering use case.
df.drop(columns=['auction_year'], inplace=True)

In [53]:
df.shape

(17964, 102)

### Export to csv

In [52]:
# Write the final frame to 'clustering_data.csv' in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('clustering_data.csv', index=False)