In [None]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import lightgbm as lgb

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Load seaborn's example "diamonds" dataset into a DataFrame.
diamonds = sns.load_dataset(name="diamonds")

# Preview the first five rows.
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output x, y and z have a minimum of 0.0, and the
# maxima of y (58.9) and z (31.8) sit far above their 75th percentiles, which
# suggests a few invalid or outlier records that are not cleaned before modelling.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [None]:
# Target is the price column; the features are every other column.
y = diamonds['price']
X = diamonds.drop(columns='price')

In [None]:
# Names of all non-numeric feature columns.
cats = X.select_dtypes(exclude = np.number).columns.to_list()

# Cast those columns to pandas' category dtype in a single astype call.
X = X.astype({col: 'category' for col in cats})

In [None]:
# Check the feature dtypes: cut, color and clarity should be `category`, because the
# XGBoost DMatrix objects built below are created with enable_categorical=True.
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [None]:
# Hold out 25% of the rows (scikit-learn's default share, spelled out here) for
# evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# XGBoost

In [None]:
# Wrap both splits in XGBoost's DMatrix container. enable_categorical=True lets
# XGBoost consume the pandas `category` columns directly.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [None]:
# Baseline: squared-error regression with the histogram tree method,
# trained for a fixed number of boosting rounds.
n = 100
params = dict(objective='reg:squarederror', tree_method='hist')

xgb_model = xgb.train(params, dtrain_reg, num_boost_round=n)


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Predict prices for the held-out rows with the baseline booster.
preds = xgb_model.predict(data=dtest_reg)


In [None]:
# Evaluate the baseline XGBoost model on the held-out split.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print(rmse, ' ', r2)

552.8613060974551   0.9802323585387969


# Let's try to achieve better results

In [None]:
# Datasets whose metric is reported after each boosting round, with display names.
# NOTE(review): the "validation" entry is the held-out test split (dtest_reg), so
# any round count chosen from it is tuned on the same data used for the final score.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

In [None]:
# Same configuration as the baseline, but log train/validation RMSE every 10 rounds.
xgb_model_1 = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=10,
)

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[10]	train-rmse:548.36512	validation-rmse:592.03160
[20]	train-rmse:491.09887	validation-rmse:558.53485
[30]	train-rmse:469.58201	validation-rmse:555.51015
[40]	train-rmse:454.32953	validation-rmse:554.45666
[50]	train-rmse:438.68033	validation-rmse:554.13365
[60]	train-rmse:425.38361	validation-rmse:551.57888
[70]	train-rmse:414.71115	validation-rmse:549.26109
[80]	train-rmse:405.41008	validation-rmse:549.03952
[90]	train-rmse:391.04269	validation-rmse:551.87206
[99]	train-rmse:383.48826	validation-rmse:552.86131


In [None]:
# Allow far more rounds than needed and let early stopping choose the cut-off:
# if the validation loss doesn't improve for 50 consecutive rounds, XGBoost
# automatically stops the training. Progress is logged every 50 rounds.
n = 10000

xgb_model_2 = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50,
)

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[50]	train-rmse:438.68033	validation-rmse:554.13365
[100]	train-rmse:381.96310	validation-rmse:553.73941
[129]	train-rmse:357.32858	validation-rmse:552.90303


In [None]:
# 5-fold cross-validation on the training matrix, stopping once the
# cross-validated metric has not improved for 20 rounds. The result holds the
# per-round mean and std of the train and test RMSE.
results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    nfold=5,
    num_boost_round=n,
    early_stopping_rounds=20,
)

In [None]:
# First ten boosting rounds of the cross-validation history
# (mean and std of train/test RMSE per round).
results.head(10)

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.224552,9.424846,2876.318793,36.995997
1,2088.350837,7.595382,2093.063623,25.351925
2,1552.629638,4.97414,1560.552731,19.550836
3,1185.994963,4.133544,1198.669943,14.648669
4,943.402904,4.757288,962.349383,11.724038
5,786.841146,4.264646,809.901753,9.642402
6,686.705114,3.653191,714.706753,7.950444
7,624.883276,3.915354,655.965053,8.900742
8,585.729338,3.647987,621.533835,9.175694
9,560.959594,4.043157,600.941918,9.311007


In [None]:
# Lowest mean test-fold RMSE over all boosting rounds in the CV history.
best_rmse = results.loc[:, 'test-rmse-mean'].min()

best_rmse

550.2735543625861

# LightGBM

In [None]:
# LightGBM training parameters.
params_lgb = {
    'task': 'train',
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 10,
    # Was misspelled 'learnnig_rage'; the misspelled key is not a LightGBM parameter,
    # so the intended 0.05 was never applied and the default learning rate was used.
    'learning_rate': 0.05,
    # A list (not a set) keeps the metric order stable across kernel restarts.
    'metric': ['l2', 'l1'],
    # Silence LightGBM's log output.
    'verbose': -1
}

In [None]:
# LightGBM dataset wrappers. The evaluation set references the training set so
# that both are constructed consistently.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
# Train with early stopping on the evaluation set.
# Without an explicit num_boost_round LightGBM caps training at its default of
# 100 rounds, so the early-stopping callback never triggers (the recorded run
# reported "Did not meet early stopping"). Use the same generous round budget as
# the XGBoost early-stopping run and let the callback choose the cut-off.
model_lgb = lgb.train(
    params_lgb,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l1: 304.065	valid_0's l2: 331256


In [None]:
# Score the LightGBM model on the held-out split.
predict_lgb = model_lgb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously swapped. MSE is symmetric, so the value is unchanged, but the call now
# matches the XGBoost evaluation and stays correct if the metric is replaced.
mse = mean_squared_error(y_test, predict_lgb)
# NOTE: this rebinds `rmse`, replacing the XGBoost value computed earlier.
rmse = mse**0.5

print(f'MSE = {mse}, RMSE = {rmse}')

MSE = 331255.72676143044, RMSE = 575.548196731977
