# Features

    · id: row identifier used to match each prediction to its sample (present only in the test and sample-submission files)
    
    · price: price in USD
    
    · carat: weight of the diamond
    
    · cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
    
    · color: diamond colour, from J (worst) to D (best)
    
    · clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
    
    · x: length in mm
    
    · y: width in mm
    
    · z: depth in mm
    
    · depth: total depth percentage = 100 * z / mean(x, y) = 200 * z / (x + y) (range 43–79)
    
    · table: width of the top of the diamond relative to its widest point, in percent (range 43–95)


In [1]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score


In [2]:
# Load the training data from the CSV file in the working directory.
diamond=pd.read_csv('diamonds_train.csv')
# Preview the first rows.
diamond.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [3]:
#diamond=diamond[(diamond[['x','y','z']]!=0).all(axis=1)]

In [4]:
# Separate the regression target (price) from the predictor columns.
y = diamond['price']
X = diamond.drop('price', axis=1)

In [5]:
#scaler=StandardScaler()

#X[['carat']]=scaler.fit_transform(X[['carat']])
#X.head()

In [6]:
# Ordinal encodings: each grade maps to its position in the ordered list below,
# with 0 assigned to the first (lowest) grade.
clarity = {
    grade: rank
    for rank, grade in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'])
}
cut = {
    grade: rank
    for rank, grade in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'])
}
color = {
    grade: rank
    for rank, grade in enumerate(['J', 'I', 'H', 'G', 'F', 'E', 'D'])
}

In [7]:
def labeling(s, dic):
    """Return the code that ``dic`` stores for the label ``s``.

    Raises ``KeyError`` when ``s`` is not a key of ``dic``.
    """
    code = dic[s]
    return code

In [8]:
# Replace each categorical grade with its ordinal code; the mapping is handed
# to labeling() as its second positional argument.
X['clarity'] = X['clarity'].apply(labeling, args=(clarity,))
X['cut'] = X['cut'].apply(labeling, args=(cut,))
X['color'] = X['color'].apply(labeling, args=(color,))

In [9]:
# Preview the first rows of the predictor frame.
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.21,3,0,3,62.4,58.0,6.83,6.79,4.25
1,0.32,2,2,3,63.0,57.0,4.35,4.38,2.75
2,0.71,0,3,4,65.5,55.0,5.62,5.53,3.65
3,0.41,1,6,2,63.8,56.0,4.68,4.72,3.0
4,1.02,4,3,2,60.5,59.0,6.55,6.51,3.95


In [10]:
# Discard the `table` column, then preview what is left.
X = X.drop('table', axis=1)
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,x,y,z
0,1.21,3,0,3,62.4,6.83,6.79,4.25
1,0.32,2,2,3,63.0,4.35,4.38,2.75
2,0.71,0,3,4,65.5,5.62,5.53,3.65
3,0.41,1,6,2,63.8,4.68,4.72,3.0
4,1.02,4,3,2,60.5,6.55,6.51,3.95


In [11]:
# Engineered features:
#   Vol - product of the three dimensions x, y and z
#   Sum - carat squared plus twice the clarity code plus the color code
X['Vol'] = X['x'] * X['y'] * X['z']
X['Sum'] = X['carat'] ** 2 + 2 * X['clarity'] + X['color']
X.head()
# Show the rows whose computed volume is exactly zero.
X[X['Vol'] == 0]

Unnamed: 0,carat,cut,color,clarity,depth,x,y,z,Vol,Sum
1606,1.01,3,4,1,59.2,6.5,6.47,0.0,0.0,7.0201
3945,2.02,3,2,3,62.7,8.02,7.95,0.0,0.0,12.0804
6465,0.71,1,4,1,64.1,0.0,0.0,0.0,0.0,6.5041
13839,2.8,1,3,1,63.8,8.9,8.85,0.0,0.0,12.84
14815,1.07,4,4,1,61.6,0.0,6.62,0.0,0.0,7.1449
14891,2.18,3,2,1,59.4,8.49,8.45,0.0,0.0,8.7524
16425,2.2,3,2,2,61.2,8.42,8.37,0.0,0.0,10.84
19856,1.0,3,3,1,59.1,6.55,6.48,0.0,0.0,6.0
21602,1.15,4,3,3,59.2,6.88,6.83,0.0,0.0,10.3225
24795,1.1,3,3,1,63.0,6.5,6.47,0.0,0.0,6.21


In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# LightGBM regressor trained with early stopping monitored on the held-out split.
# NOTE(review): the held-out split drives early stopping here and is also the
# split the RMSE is reported on, so that RMSE is optimistic; a separate
# validation split would avoid the leak.
gbm = lgb.LGBMRegressor(num_leaves=100,
                        learning_rate=0.05,
                        n_estimators=200)
# fit() no longer accepts `early_stopping_rounds` / `verbose` in LightGBM >= 4.0;
# the early_stopping callback is the supported way to request the same behavior
# (stop after 50 rounds without improvement, without printing progress).
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l2',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=200, n_jobs=-1, num_leaves=100, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [14]:
# Predict on the held-out split with the best boosting iteration and report
# the root of the mean squared error.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
mse = mean_squared_error(y_test, y_pred)
print('RMSE:', mse ** 0.5)

RMSE: 517.2546738536256


# Gridsearch

In [15]:
def grid(x_tr, y_tr, x_te, y_te, model, param, cv=5):
    """Grid-search ``model`` over ``param`` and print a held-out evaluation.

    Parameters
    ----------
    x_tr, y_tr : training features and target used for the cross-validated search.
    x_te, y_te : held-out features and target used only for the final report.
    model : estimator instance handed to ``GridSearchCV``.
    param : parameter grid (dict or list of dicts) handed to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 5).

    Prints the held-out score of the best estimator, the best parameters, the
    best mean cross-validation score, the held-out R2 and RMSE, and the elapsed
    wall-clock time. Returns ``None``.
    """
    start = time.time()
    # `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
    # and raises TypeError on current releases.
    search = GridSearchCV(model, param, cv=cv, return_train_score=True, n_jobs=-1)
    search.fit(x_tr, y_tr)

    # "Best score" below is the best estimator's score on the held-out data.
    print('Best score : {:.2f}'.format(search.score(x_te, y_te)))
    print('Best parameters: {}'.format(search.best_params_))
    print('Best score cross-val: {:.2f}'.format(search.best_score_))

    # With the default refit=True, best_estimator_ has already been refitted on
    # the whole of (x_tr, y_tr), so it can be used for prediction directly.
    best_m = search.best_estimator_
    y_pred = best_m.predict(x_te)
    print('R2 score : {}'.format(r2_score(y_te, y_pred)))
    print('RMSE: {}'.format(mean_squared_error(y_te, y_pred) ** 0.5))

    print("Time : {:.3f} seconds".format(time.time() - start))

In [None]:
# Hyper-parameter grid. Only learning_rate, n_estimators and num_leaves vary;
# the single-value entries pin the remaining settings explicitly.
# class_weight stays at None: LightGBM documents it as a classification-only
# option, so searching 'balanced' for a regressor only doubles the fits.
# `silent` is not listed: it is no longer an estimator parameter in
# LightGBM >= 4.0.
params = {
    'boosting_type': ['gbdt'],
    'class_weight': [None],
    'colsample_bytree': [1.0],
    'importance_type': ['split'],
    'learning_rate': [0.049, 0.05, 0.051],
    'min_child_samples': [20],
    'min_child_weight': [0.001],
    'min_split_gain': [0.0],
    'n_estimators': [135, 140, 145],
    'num_leaves': [80, 90, 100],
    'objective': [None],
    'random_state': [None],
    'reg_alpha': [0.0],
    'reg_lambda': [0.0],
    'subsample': [1.0],
    'subsample_for_bin': [200000],
    'subsample_freq': [0],
}

modelo = lgb.LGBMRegressor()

# Run the search on the training split and report on the held-out split.
grid(X_train, y_train, X_test, y_test, modelo, params)

# Submit

In [22]:
def submit(modelo, X, y):
    """Fit ``modelo`` on ``(X, y)`` and predict prices for ``diamonds_test.csv``.

    The test file is read from the working directory and preprocessed here:
    clarity, cut and color are replaced by ordinal codes, the ``id`` and
    ``table`` columns are removed, and the ``Vol`` and ``Sum`` features are added.

    Parameters
    ----------
    modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    X : training features.
    y : training target.

    Returns
    -------
    pandas.DataFrame with the columns ``id`` (copied from the test file) and
    ``price`` (the model's predictions).

    Raises
    ------
    KeyError
        If the test file contains a clarity, cut or color value with no code.
    """
    encodings = {
        'clarity': {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3,
                    'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7},
        'cut': {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4},
        'color': {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6},
    }

    df = pd.read_csv('diamonds_test.csv')
    # Named `result` so the local does not shadow this function's own name.
    result = pd.DataFrame()
    result['id'] = df['id']
    df = df.drop(columns=['id', 'table'])

    for column, mapping in encodings.items():
        df[column] = df[column].apply(labeling, args=(mapping,))
    df['Vol'] = df['x'] * df['y'] * df['z']
    df['Sum'] = df['carat'] ** 2 + 2 * df['clarity'] + df['color']

    # When both frames hold exactly the same columns, present the test columns
    # in the training order so features cannot be silently misaligned
    # (presumably the estimator consumes columns by position - verify).
    if (isinstance(X, pd.DataFrame)
            and len(X.columns) == len(df.columns)
            and set(X.columns) == set(df.columns)):
        df = df[list(X.columns)]

    modelo.fit(X, y)
    result['price'] = modelo.predict(df)
    return result

In [None]:
# LightGBM regressor built with the library's default hyper-parameters.
# NOTE(review): this does not reuse the settings explored in the earlier
# tuning cells - confirm that defaults are intended for the submission.
modelo_lgbm=lgb.LGBMRegressor()

In [23]:
# Fit on the full training frame (X, y) and collect the test-set predictions.
res=submit(modelo_lgbm, X, y)

In [24]:
# Preview the first rows of the submission frame.
res.head()

Unnamed: 0,id,price
0,0,2879.927509
1,1,5637.900232
2,2,9474.501612
3,3,3981.760573
4,4,1664.969906


In [None]:
#res.to_csv('submit_lgbm.csv', index=False)