In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xlrd
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score


In [37]:
# Load the cleaned auction records (per the file name, prices are in USD only)
csv_path = 'cleaned_data_with_usd.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,high_estimate,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd
0,Binhong Huang,2017-05-30,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,45900.0
1,Binhong Huang,2017-05-30,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,64150.0,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,0.0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,23090.0,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,22950.0
3,Binhong Huang,2017-05-30,Christies,Hong Kong,1367,paper,,,USD,0.0,...,449050.0,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,433330.0
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,12830.0,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,12240.0


In [38]:
# Stage 1 target for a binary classifier: did the lot sell?
# An unsold lot is recorded with sales_price_usd == 0. Requiring a strictly
# positive price (instead of testing `== 0`) also keeps rows with a missing
# price out of the "sold" class: NaN == 0 is False, so the equality test
# would have labelled them as sold. Negative prices, if any, count as unsold.
df['sold'] = df['sales_price_usd'].gt(0).astype(int)

df.head()

Unnamed: 0,artist,auction_date,auction_house,auction_location,auction_lot,category,country,created,currency,dated,...,inscribed,low_estimate,medium,sales_price,signed,stamped,title,width,sales_price_usd,sold
0,Binhong Huang,2017-05-30,Christies,Hong Kong,1364,paper,,1947-01-01 00:00:00,USD,0.0,...,1.0,38490.0,watercolor and ink / paper,45900.0,1.0,0.0,Misty Landscape,16.93,45900.0,1
1,Binhong Huang,2017-05-30,Christies,Hong Kong,1365,paper,,1990-01-01 00:00:00,USD,0.0,...,1.0,38490.0,watercolor and ink / paper,0.0,1.0,0.0,Conversations in the Mountain,10.43,0.0,0
2,Yun Tang,2017-05-30,Christies,Hong Kong,1366,paper,,,USD,0.0,...,1.0,15400.0,watercolor and ink / paper,22950.0,1.0,0.0,Peony and Insects / Peony and Butterfly (2),,22950.0,1
3,Binhong Huang,2017-05-30,Christies,Hong Kong,1367,paper,,,USD,0.0,...,1.0,320750.0,watercolor and ink / paper,433330.0,1.0,0.0,Retreat in the Mountains,15.55,433330.0,1
4,Yun Tang,2017-05-30,Christies,Hong Kong,1368,paper,,1940-01-01 00:00:00,USD,0.0,...,1.0,10260.0,watercolor and ink / paper,12240.0,1.0,0.0,Villagers in the Woods,18.82,12240.0,1


In [39]:
# List the available columns before choosing which ones to use as features
df.columns

Index(['artist', 'auction_date', 'auction_house', 'auction_location',
       'auction_lot', 'category', 'country', 'created', 'currency', 'dated',
       'edition', 'foundry', 'height', 'high_estimate', 'inscribed',
       'low_estimate', 'medium', 'sales_price', 'signed', 'stamped', 'title',
       'width', 'sales_price_usd', 'sold'],
      dtype='object')

In [46]:
# Columns fed to the models; the list order is the column order of the
# feature matrix built from it.
features_to_use = [
    'dated',
    'height',
    'high_estimate',
    'inscribed',
    'low_estimate',
    'signed',
    'stamped',
    'width',
]

In [47]:
# Build the model inputs for the sold / not-sold classifier.
# Both are selections from df; df itself is left unchanged.
X = df.loc[:, features_to_use]  # feature matrix
y = df.loc[:, 'sold']           # target: 1 = sold, 0 = not sold

In [48]:
# 80/10/10 train / validation / test split, stratified on the target
randomState = 15095

# First hold out 20% of the rows, preserving the class balance of y ...
X_train, X_rem, y_train, y_rem = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=randomState,
    stratify=y,
)
# ... then cut the hold-out in half to get the validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=0.50,
    random_state=randomState,
    stratify=y_rem,
)

In [49]:
# Preview the training feature matrix
X_train

Unnamed: 0,dated,height,high_estimate,inscribed,low_estimate,signed,stamped,width
49331,0.0,25.00,700000.0,0.0,500000.0,1.0,0.0,27.01
37504,0.0,20.08,180000.0,1.0,140000.0,0.0,0.0,20.00
45442,0.0,86.50,150000.0,0.0,100000.0,0.0,0.0,55.00
25546,0.0,58.00,400000.0,0.0,300000.0,1.0,0.0,66.00
40679,0.0,8.86,80000.0,0.0,60000.0,0.0,0.0,12.13
...,...,...,...,...,...,...,...,...
50616,1.0,23.74,120000.0,0.0,80000.0,1.0,0.0,0.00
43973,1.0,24.02,60000.0,0.0,40000.0,1.0,0.0,17.91
51386,1.0,50.98,110000.0,0.0,90000.0,1.0,0.0,43.01
15516,1.0,50.20,1500000.0,0.0,1000000.0,0.0,0.0,40.28


In [52]:
# Stage 1 model: gradient-boosted classifier for sold / not sold.
# NOTE: despite the variable name, `xgb` is scikit-learn's
# HistGradientBoostingClassifier, not XGBoost. The name is kept so that any
# other cell referring to `xgb` keeps working.
xgb = HistGradientBoostingClassifier(max_depth=4, random_state=randomState)

# Fit on the training split. y_train is the 0/1 integer `sold` column, so no
# dtype cast is needed before fitting.
xgb.fit(X_train, y_train)

# Predict on the validation split (the test split is left untouched here)
y_pred = xgb.predict(X_val)

# Compute each metric once and reuse the result when printing
val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)

# Print the accuracy score and confusion matrix
print('Accuracy: ', val_accuracy)
print('Confusion Matrix: ', val_confusion)


Accuracy:  0.6140011603171533
Confusion Matrix:  [[ 988 1266]
 [ 730 2187]]


In [63]:
# Stage 2: predict the sale price (USD) of lots that actually sold, using the
# same features as the sold / not-sold classifier.
# NOTE: this cell rebinds X, y, the train/val/test splits, xgb and y_pred that
# were defined for the classification stage.

# Keep only the sold lots. Rows with a missing price are dropped explicitly:
# they cannot be used as a regression target.
df_sold = df[df['sold'] == 1].dropna(subset=['sales_price_usd'])

X = df_sold[features_to_use]
y = df_sold['sales_price_usd']  # continuous regression target

# 80/10/10 train / validation / test split. No stratification for a continuous
# target; randomState is the seed defined with the first split.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, test_size=0.20, random_state=randomState
)
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.50, random_state=randomState
)

# Gradient-boosted regressor (scikit-learn, not XGBoost; name kept for
# compatibility with other cells)
xgb = HistGradientBoostingRegressor(max_depth=4, random_state=randomState)

# Fit on the prices as recorded. The target is NOT cast to int: that would
# truncate fractional prices and is not required for regression.
xgb.fit(X_train, y_train)

# Predict on the validation split
y_pred = xgb.predict(X_val)

# Validation metrics, computed once: mean squared error (in USD^2) and R^2
val_mse = mean_squared_error(y_val, y_pred)
val_r2 = r2_score(y_val, y_pred)

print('MSE: ', val_mse)
print('R2: ', val_r2)



MSE:  3545651703784.3047
R2:  0.455414163897797
