In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xlrd
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier


In [45]:
# Import the data with USD currency only
df = pd.read_csv('cleaned_data_with_usd.csv')
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
0,Huang Binhong,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0,38490.0,64150.0
1,Huang Binhong,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0,38490.0,64150.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0,15400.0,23090.0
3,Huang Binhong,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0,320750.0,449050.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0,10260.0,12830.0


In [46]:
# First stage could be logistic regression to predict if the piece is sold or not
# Add target variable column, if the sales_price_usd is 0, then the piece is not sold
#df['sold'] = np.where(df['sales_price_usd'] == 0, 0, 1)

# You addded this is cleaning file so not needed anymore

df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,signed,stamped,title,width,aspect_ratio,area,sold,sales_price_usd,low_estimate_usd,high_estimate_usd
0,Huang Binhong,2017-05-30,Christies,Hong Kong,1364,paper,China,1947-01-01 00:00:00,USD,0.0,...,1.0,0.0,Misty Landscape,16.93,1.99,569.86,1,45900.0,38490.0,64150.0
1,Huang Binhong,2017-05-30,Christies,Hong Kong,1365,paper,China,1990-01-01 00:00:00,USD,0.0,...,1.0,0.0,Conversations in the Mountain,10.43,1.3,141.64,0,0.0,38490.0,64150.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,,,1,22950.0,15400.0,23090.0
3,Huang Binhong,2017-05-30,Christies,Hong Kong,1367,paper,China,,USD,0.0,...,1.0,0.0,Retreat in the Mountains,15.55,2.63,636.62,1,433330.0,320750.0,449050.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,0.0,Villagers in the Woods,18.82,2.1,743.95,1,12240.0,10260.0,12830.0


In [47]:
#Define feature to use for now
df.columns

Index(['artist', 'auction_date', 'auction_house', 'auction_location',
       'auction_lot', 'category', 'country', 'created', 'currency', 'dated',
       'edition', 'foundry', 'height', 'high_estimate', 'inscribed',
       'low_estimate', 'medium', 'sales_price', 'signed', 'stamped', 'title',
       'width', 'aspect_ratio', 'area', 'sold', 'sales_price_usd',
       'low_estimate_usd', 'high_estimate_usd'],
      dtype='object')

In [48]:
features_to_use= [
       'dated', 'height', 'high_estimate_usd', 'inscribed',
       'low_estimate_usd', 'signed', 'stamped',
       'width', 'auction_house', 'category', 'country', 'aspect_ratio', 'area']

In [49]:
#Splitting the dataframe into X and y
y=df['sold'] #Define target variable and remove from dataframe
X=df[features_to_use] #Features

In [50]:
X.head()

Unnamed: 0,dated,height,high_estimate_usd,inscribed,low_estimate_usd,signed,stamped,width,auction_house,category,country,aspect_ratio,area
0,0.0,33.66,64150.0,1.0,38490.0,1.0,0.0,16.93,Christies,paper,China,1.99,569.86
1,0.0,13.58,64150.0,1.0,38490.0,1.0,0.0,10.43,Christies,paper,China,1.3,141.64
2,0.0,,23090.0,1.0,15400.0,1.0,0.0,,Christies,paper,,,
3,0.0,40.94,449050.0,1.0,320750.0,1.0,0.0,15.55,Christies,paper,China,2.63,636.62
4,0.0,39.53,12830.0,1.0,10260.0,1.0,0.0,18.82,Christies,paper,,2.1,743.95


In [51]:
# How many countries are there?
X['country'].nunique()

45

In [52]:
#How many categories are there?
X['category'].nunique()

9

In [53]:
#How many auction houses are there?
X['auction_house'].nunique()

200

In [54]:
#What are the most common auction houses?
X['auction_house'].value_counts()

Sothebys                              17361
Christies                             16858
Christies                             12216
Phillips                               1379
Artcurial                               323
                                      ...  
Galerie & Auktionshaus Hassfuther         1
Hodgins Art Auctions Ltd                  1
Donnington Priory Salerooms               1
Oger                                      1
Lehr Berlin                               1
Name: auction_house, Length: 200, dtype: int64

In [55]:
#Why is christies two times? they should be one, maybe there is a space in the name?
X['auction_house'] = np.where(X['auction_house'].str.contains('Christies'), 'Christies', X['auction_house'])

X['auction_house'].value_counts().head(10)

Christies                     29074
Sothebys                      17361
Phillips                       1379
Artcurial                       323
Villa Grisebach Auktionen       274
Dorotheum                       223
Bonhams                         167
Lempertz                        165
Tajan                           146
China Guardian Auctions         137
Name: auction_house, dtype: int64

In [56]:
#We will decide what to do with country later, but for now we can hot encode category and auction house

#Hot encode category
X = pd.get_dummies(X, columns=['category'], drop_first=True)

#Hot encode auction house but only keep the top 10, the rest mark as "other"
top_10_auction_houses = X['auction_house'].value_counts().head(10).index
X['auction_house'] = np.where(X['auction_house'].isin(top_10_auction_houses), X['auction_house'], 'other')
X = pd.get_dummies(X, columns=['auction_house'], drop_first=True)

#And drop country for now
X = X.drop('country', axis=1)

X.head()

Unnamed: 0,dated,height,high_estimate_usd,inscribed,low_estimate_usd,signed,stamped,width,aspect_ratio,area,...,auction_house_Bonhams,auction_house_China Guardian Auctions,auction_house_Christies,auction_house_Dorotheum,auction_house_Lempertz,auction_house_Phillips,auction_house_Sothebys,auction_house_Tajan,auction_house_Villa Grisebach Auktionen,auction_house_other
0,0.0,33.66,64150.0,1.0,38490.0,1.0,0.0,16.93,1.99,569.86,...,0,0,1,0,0,0,0,0,0,0
1,0.0,13.58,64150.0,1.0,38490.0,1.0,0.0,10.43,1.3,141.64,...,0,0,1,0,0,0,0,0,0,0
2,0.0,,23090.0,1.0,15400.0,1.0,0.0,,,,...,0,0,1,0,0,0,0,0,0,0
3,0.0,40.94,449050.0,1.0,320750.0,1.0,0.0,15.55,2.63,636.62,...,0,0,1,0,0,0,0,0,0,0
4,0.0,39.53,12830.0,1.0,10260.0,1.0,0.0,18.82,2.1,743.95,...,0,0,1,0,0,0,0,0,0,0


In [57]:
#Splitting the data into train, validation, and test sets
randomState=15095

X_train, X_rem, y_train, y_rem = train_test_split( X, y, test_size=0.20, random_state=randomState, stratify=y)
X_val, X_test, y_val, y_test = train_test_split( X_rem, y_rem, test_size=0.50, random_state=randomState, stratify=y_rem)

In [58]:
X_train

Unnamed: 0,dated,height,high_estimate_usd,inscribed,low_estimate_usd,signed,stamped,width,aspect_ratio,area,...,auction_house_Bonhams,auction_house_China Guardian Auctions,auction_house_Christies,auction_house_Dorotheum,auction_house_Lempertz,auction_house_Phillips,auction_house_Sothebys,auction_house_Tajan,auction_house_Villa Grisebach Auktionen,auction_house_other
2178,1.0,59.45,8.967000e+04,0.0,6.405000e+04,1.0,0.0,39.37,1.51,2340.55,...,0,0,1,0,0,0,0,0,0,0
7532,0.0,27.95,1.025000e+04,0.0,5.120000e+03,1.0,0.0,14.17,1.97,396.05,...,0,0,1,0,0,0,0,0,0,0
49804,1.0,118.11,3.229000e+05,0.0,2.324880e+05,1.0,0.0,70.87,1.67,8370.46,...,0,0,0,0,0,0,1,0,0,0
25715,1.0,12.20,2.160094e+05,0.0,1.440063e+05,1.0,0.0,25.59,0.48,312.20,...,0,0,0,0,0,0,1,0,0,0
1939,0.0,26.97,3.202500e+05,0.0,1.921500e+05,1.0,0.0,26.77,1.01,721.99,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38092,1.0,53.74,1.500000e+05,0.0,1.000000e+05,0.0,0.0,43.50,1.24,2337.69,...,0,0,0,0,0,0,1,0,0,0
26321,0.0,70.87,1.139022e+06,1.0,8.135870e+05,1.0,0.0,51.18,1.38,3627.13,...,0,0,0,0,0,0,1,0,0,0
27010,0.0,34.06,1.396234e+07,0.0,1.047176e+07,0.0,0.0,54.53,0.62,1857.29,...,0,0,0,0,0,0,1,0,0,0
4697,1.0,13.11,5.152000e+04,0.0,3.864000e+04,1.0,0.0,9.53,1.38,124.94,...,0,0,1,0,0,0,0,0,0,0


In [59]:
# Create model
from sklearn.ensemble import HistGradientBoostingClassifier

xgb = HistGradientBoostingClassifier(max_depth=4, random_state=randomState)

# Train model
xgb.fit(X_train, y_train.astype('int')) # XGBoost need the target variable to be binary in integer format

# Predict with test set
y_pred = xgb.predict(X_val)

# Calculate accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred)

# Calculate confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, y_pred)

#Print the accuracy score and confusion matrix
print('Accuracy: ', accuracy_score(y_val, y_pred))
print('Confusion Matrix: ', confusion_matrix(y_val, y_pred))


Accuracy:  0.951625386996904
Confusion Matrix:  [[   0  250]
 [   0 4918]]


In [60]:
# That is refreshing, what about now predicting the sales price?
# First, let's try to predict the sales price with the same features as before

#But keep only the rows where the piece is sold
df_sold = df[df['sold'] == 1]

X=df_sold[features_to_use] 
y=df_sold['sales_price_usd'] #Define target variable

###########
#Doing the same cleaning and hot encoding as before
X['auction_house'] = np.where(X['auction_house'].str.contains('Christies'), 'Christies', X['auction_house'])
#Hot encode category
X = pd.get_dummies(X, columns=['category'], drop_first=True)
#Hot encode auction house but only keep the top 10, the rest mark as "other"
top_10_auction_houses = X['auction_house'].value_counts().head(10).index
X['auction_house'] = np.where(X['auction_house'].isin(top_10_auction_houses), X['auction_house'], 'other')
X = pd.get_dummies(X, columns=['auction_house'], drop_first=True)
#And drop country for now
X = X.drop('country', axis=1)
###########

#Splitting the data into train, validation, and test sets
randomState=15095

X_train, X_rem, y_train, y_rem = train_test_split( X, y, test_size=0.20, random_state=randomState)
X_val, X_test, y_val, y_test = train_test_split( X_rem, y_rem, test_size=0.50, random_state=randomState)

# Create model
from sklearn.ensemble import HistGradientBoostingRegressor

xgb = HistGradientBoostingRegressor(max_depth=4, random_state=randomState)

# Train model
xgb.fit(X_train, y_train.astype('int')) # XGBoost need the target variable to be binary in integer format

# Predict with test set
y_pred = xgb.predict(X_val)

# Calculate accuracy
from sklearn.metrics import mean_squared_error
mean_squared_error(y_val, y_pred)

# Calculate confusion matrix
from sklearn.metrics import r2_score
r2_score(y_val, y_pred)

print('MSE: ', mean_squared_error(y_val, y_pred))
print('R2: ', r2_score(y_val, y_pred))



MSE:  1873936783127.3267
R2:  0.758818615695875
