In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [27]:
train = pd.read_csv("/kaggle/input/royal-diamond-store/train.csv")
test = pd.read_csv("/kaggle/input/royal-diamond-store/test.csv")

In [28]:
train.head(3)

Unnamed: 0,SampleID,carat,cut,color,clarity,depth,table,price,x,y,z
0,19498,1.21,Ideal,H,VVS2,61.3,57.0,8131,6.92,6.87,4.23
1,31230,0.31,Ideal,E,VS2,62.0,56.0,756,4.38,4.36,2.71
2,22312,1.21,Ideal,E,VS1,62.4,57.0,10351,6.75,6.83,4.24


In [29]:
cut_list = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_list = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_list = ['FL', 'IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1', 'I2', 'I3']

cut_class2ord, color_class2ord, clarity_class2ord = dict(), dict(), dict()
for i, v in enumerate(cut_list):
    cut_class2ord[v] = i
for i, v in enumerate(color_list):
    color_class2ord[v] = i
for i, v in enumerate(clarity_list):
    clarity_class2ord[v] = i

In [30]:
def process_df(df):
    df['cut'] = df['cut'].map(cut_class2ord.get)
    df['color'] = df['color'].map(color_class2ord.get)
    df['clarity'] = df['clarity'].map(clarity_class2ord.get)
    df['volume'] = df['x'] * df['y'] * df['z']
    df['proportion'] = df['depth'] / df['table']
    if 'price' in df.columns:
        df['price'] = np.log10(df['price']*100+1)
    return df

train = process_df(train)
test = process_df(test)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import Pool

features = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume', 'proportion']
features_to_scale = ['carat', 'depth', 'table', 'x', 'y', 'z', 'volume', 'proportion']

X, y = train[features], train['price']
X_test = test[features]

scaler = StandardScaler()
scaler.fit(X[features_to_scale])
X[features_to_scale] = scaler.transform(X[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])


X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.2)

train_pool = Pool(X_train, y_train)
valid_pool = Pool(X_valid, y_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[features_to_scale] = scaler.transform(X[features_to_scale])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])


In [50]:
from catboost import CatBoostRegressor

params = {
    'iterations': 20000,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'metric_period': 1000,
    'max_depth': 6,
}

model = CatBoostRegressor(**params)

model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.3701508	test: 0.3706890	best: 0.3706890 (0)	total: 7.02ms	remaining: 2m 20s
1000:	learn: 0.0273246	test: 0.0285359	best: 0.0285359 (1000)	total: 5.94s	remaining: 1m 52s
2000:	learn: 0.0250933	test: 0.0271326	best: 0.0271326 (2000)	total: 11.8s	remaining: 1m 46s
3000:	learn: 0.0240303	test: 0.0265995	best: 0.0265995 (3000)	total: 17.7s	remaining: 1m 40s
4000:	learn: 0.0233009	test: 0.0263497	best: 0.0263497 (4000)	total: 23.4s	remaining: 1m 33s
5000:	learn: 0.0228102	test: 0.0261837	best: 0.0261837 (5000)	total: 29.1s	remaining: 1m 27s
6000:	learn: 0.0224331	test: 0.0260671	best: 0.0260671 (6000)	total: 34.9s	remaining: 1m 21s
7000:	learn: 0.0220938	test: 0.0259819	best: 0.0259819 (7000)	total: 40.7s	remaining: 1m 15s
8000:	learn: 0.0218254	test: 0.0259153	best: 0.0259153 (8000)	total: 46.4s	remaining: 1m 9s
9000:	learn: 0.0216065	test: 0.0258789	best: 0.0258789 (9000)	total: 52.2s	remaining: 1m 3s
10000:	learn: 0.0213980	test: 0.0258412	best: 0.0258412 (10000)	total: 57.9s	

<catboost.core.CatBoostRegressor at 0x7bc907a42c90>

In [51]:
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_valid)
y_pred_orig = (10**y_pred-1) / 100
y_valid_orig = (10**y_valid-1) / 100

mae = mean_absolute_error(y_valid_orig, y_pred_orig)
print(f"Score: {mae:.5f}")

Score: 252.45175


In [52]:
y_pred = model.predict(X_test)
y_pred = (10**y_pred-1) / 100

subm = []

for i in range(len(test)):
    for sid in range(1, 5):
        answer = y_pred[i]
        if sid==1:
            if test['carat'][i] < 0.5:
                answer = 'Light'
            elif 0.5 <= test['carat'][i] < 1.5:
                answer = 'Medium'
            else:
                answer = 'Heavy'
        elif sid==2:
            answer = test['proportion'][i]
        elif sid==3:
            answer = test['volume'][i]
        subm.append({
            'subtaskID': sid,
            'datapointID': test['SampleID'][i],
            'answer': answer
        })

subm = pd.DataFrame(subm)

subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1389,Light
1,2,1389,1.108929
2,3,1389,39.2236
3,4,1389,540.074989
4,1,50053,Medium


In [53]:
subm.to_csv("submission.csv", index=False)