In [1]:
import time
import numpy as np
import pandas as pd
import xgboost as xgb

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample

from functools import partial

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=FutureWarning)

In [2]:
diamond=pd.read_csv('diamonds_train.csv')
diamond.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [3]:
X=diamond.drop(columns=['price'])
y=diamond.price

In [4]:
clarity={'I1':0, 'SI2':1, 'SI1':2, 'VS2':3, 'VS1':4, 'VVS2':5, 'VVS1':6, 'IF':7}
cut={'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4}
color={'J':0, 'I':1, 'H':2, 'G':3, 'F':4, 'E':5, 'D':6}

In [5]:
def labeling(s, dic):
    return dic[s]

In [6]:
X.clarity=X.clarity.apply(lambda x: labeling(x, clarity))
X.cut=X.cut.apply(lambda x: labeling(x, cut))
X.color=X.color.apply(lambda x: labeling(x, color))

In [7]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.21,3,0,3,62.4,58.0,6.83,6.79,4.25
1,0.32,2,2,3,63.0,57.0,4.35,4.38,2.75
2,0.71,0,3,4,65.5,55.0,5.62,5.53,3.65
3,0.41,1,6,2,63.8,56.0,4.68,4.72,3.0
4,1.02,4,3,2,60.5,59.0,6.55,6.51,3.95


In [8]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
space ={'n_estimators': hp.quniform('n_estimators', 25, 1000, 25),
        'learning_rate': hp.uniform('learning_rate', 0.0001, 1.0),
        'max_depth': hp.quniform('x_max_depth', 4, 16, 1),
        'min_child_weight': hp.quniform ('x_min_child', 1, 10, 1),
        'subsample': hp.uniform ('x_subsample', 0.7, 1),
        'gamma' : hp.uniform ('x_gamma', 0.1,0.5),
        'reg_lambda' : hp.uniform ('x_reg_lambda', 0,1)
    }

In [19]:
def objetivo(space):
    #print(space)
    clf = xgb.XGBRegressor(n_estimators =int(space['n_estimators']),
                           learning_rate = space['learning_rate'],
                           max_depth = int(space['max_depth']),
                           min_child_weight = space['min_child_weight'],
                           subsample = space['subsample'],
                           gamma = space['gamma'],
                           reg_lambda = space['reg_lambda'],
                           objective='reg:squarederror')

    eval_set=[(X_train, y_train), ( X_test, y_test)]

    clf.fit(X_train, y_train,
            eval_set=eval_set, eval_metric="rmse", verbose=False)

    y_pred = clf.predict(X_train)
    rmse = mean_squared_error(y_train, y_pred)**(0.5)
    #print ("SCORE:", np.sqrt(mse_scr))
    #change the metric if you like
    return {'loss':rmse, 'status': STATUS_OK }

In [20]:
trials_reg = Trials()
best = fmin(fn=objetivo,
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials_reg)

100%|██████████| 10/10 [14:03<00:00, 113.09s/it, best loss: 8.118200853213798]


In [21]:
print (best)

{'learning_rate': 0.534250791343454, 'n_estimators': 800.0, 'x_gamma': 0.28808016302157613, 'x_max_depth': 15.0, 'x_min_child': 3.0, 'x_reg_lambda': 0.4301922356405691, 'x_subsample': 0.9302525172870761}


In [None]:
modelo=xgb.XGBRegressor(n_estimators=int(best['n_estimators']), 
                        x_gamma=best['x_gamma'],
                        learning_rate=best['learning_rate'],
                        x_max_depth= best['x_max_depth'], 
                        x_min_child= best['x_min_child'], 
                        x_reg_lambda=best['x_reg_lambda'], 
                        x_subsample= best['x_subsample'],
                        objective='reg:squarederror')

In [None]:
def submit(modelo, X, y):
    
    clarity={'I1':0, 'SI2':1, 'SI1':2, 'VS2':3, 'VS1':4, 'VVS2':5, 'VVS1':6, 'IF':7}
    cut={'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4}
    color={'J':0, 'I':1, 'H':2, 'G':3, 'F':4, 'E':5, 'D':6}
    
    df=pd.read_csv('diamonds_test.csv')
    submit=pd.DataFrame()
    submit['id']=df.id
    df=df.drop(columns='id')
    
    df.clarity=df.clarity.apply(lambda x: labeling(x, clarity))
    df.cut=df.cut.apply(lambda x: labeling(x, cut))
    df.color=df.color.apply(lambda x: labeling(x, color))
    df=df.drop(columns=['table'])


    
    modelo.fit(X, y)
    y_pred=modelo.predict(df)
    
    submit['price']=y_pred
    return submit

In [None]:
res=submit(modelo, X, y)

In [None]:
res.head()

In [None]:
#res.to_csv('submit_xgb_bayes.csv', index=False)