In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import sklearn.metrics as metrics
import math
import re

In [2]:
# Read the three CSV inputs: sample submission, test set and train set.
sample_submission, test, train = (
    pd.read_csv(csv_name)
    for csv_name in ("sample_submission.csv", "test.csv", "train.csv")
)
# The c_* copies are the frames modified further down; `test` and `train`
# keep the data exactly as it was read.
c_test, c_train = test.copy(), train.copy()

In [3]:
# Preview the first rows of the training copy.
c_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,0,1.01,Very Good,E,SI2,60.0,60.0,4540,6.57,6.49,3.92
1,1,1.1,Premium,H,VS2,62.5,58.0,5729,6.59,6.54,4.1
2,2,1.5,Good,E,SI2,61.5,65.0,6300,7.21,7.17,4.42
3,3,1.53,Premium,E,SI1,61.3,59.0,12968,7.4,7.35,4.52
4,4,0.84,Fair,D,SI2,64.5,60.0,2167,5.92,5.84,3.79


In [4]:
# Preview the first rows of the test copy.
c_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.24,Ideal,G,VVS1,62.1,56.0,3.97,4.0,2.47
1,1,1.21,Very Good,F,VS2,62.9,54.0,6.78,6.82,4.28
2,2,0.5,Fair,E,SI1,61.7,68.0,5.09,5.03,3.12
3,3,0.5,Ideal,D,SI2,62.8,56.0,5.06,5.03,3.17
4,4,1.55,Ideal,E,SI2,62.3,55.0,7.44,7.37,4.61


In [5]:
# Flag each row with its origin (1 = train file, 0 = test file) so the
# stacked frame can be separated again after the shared preprocessing.
c_train['train'], c_test['train'] = 1, 0
# Stack train on top of test; each part keeps its own row labels.
df = pd.concat(objs=[c_train, c_test], axis=0, sort=False)

In [6]:
# Share of missing values per column, expressed as a percentage.
NAN = (
    (df.isna().mean() * 100)
    .rename_axis("column_name")
    .reset_index(name="percentage")
)

In [7]:
# Keep only the columns where more than half of the values are missing,
# then show them from most to least affected.
NAN = NAN.loc[NAN["percentage"].gt(50)]
NAN.sort_values(by="percentage", ascending=False)

Unnamed: 0,column_name,percentage


In [8]:
# Split the combined frame into its object-typed columns and everything else.
object_columns_df, numerical_columns_df = (
    df.select_dtypes(include='object'),
    df.select_dtypes(exclude='object'),
)

In [9]:
# List the dtype of every column in the object-typed subset.
object_columns_df.dtypes

cut        object
color      object
clarity    object
dtype: object

In [10]:
# List the dtype of every column in the non-object subset.
numerical_columns_df.dtypes

id         int64
carat    float64
depth    float64
table    float64
price    float64
x        float64
y        float64
z        float64
train      int64
dtype: object

In [11]:
# Ordinal encoding: each grade list runs from lowest to highest code, and a
# label's position in the list (starting at 1) is the integer it is mapped to.
# Series.map turns any label that is missing from the dictionary into NaN.
cut_rank = {
    label: rank
    for rank, label in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], start=1
    )
}
color_rank = {
    label: rank
    for rank, label in enumerate(['J', 'I', 'H', 'G', 'F', 'E', 'D'], start=1)
}
clarity_rank = {
    label: rank
    for rank, label in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1
    )
}

df['cut'] = df['cut'].map(cut_rank)
df['color'] = df['color'].map(color_rank)
df['clarity'] = df['clarity'].map(clarity_rank)

In [12]:
# One-hot encode whichever columns are still object-typed at this point.
rest_object_columns = df.select_dtypes(include='object')
df = pd.get_dummies(data=df, columns=rest_object_columns.columns)

In [13]:
# Preview the first rows of the combined frame after encoding.
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z,train
0,0,1.01,3,6,2,60.0,60.0,4540.0,6.57,6.49,3.92,1
1,1,1.1,4,3,4,62.5,58.0,5729.0,6.59,6.54,4.1,1
2,2,1.5,2,6,2,61.5,65.0,6300.0,7.21,7.17,4.42,1
3,3,1.53,4,6,3,61.3,59.0,12968.0,7.4,7.35,4.52,1
4,4,0.84,1,7,2,64.5,60.0,2167.0,5.92,5.84,3.79,1


In [14]:
# Plain-text dump of the combined frame.
# NOTE(review): ending the cell with a bare `df` (or `df.head()`) would use the
# notebook's table rendering instead of the wrapped text output.
print(df)

          id  carat  cut  color  clarity  depth  table    price     x     y  \
0          0   1.01    3      6        2   60.0   60.0   4540.0  6.57  6.49   
1          1   1.10    4      3        4   62.5   58.0   5729.0  6.59  6.54   
2          2   1.50    2      6        2   61.5   65.0   6300.0  7.21  7.17   
3          3   1.53    4      6        3   61.3   59.0  12968.0  7.40  7.35   
4          4   0.84    1      7        2   64.5   60.0   2167.0  5.92  5.84   
...      ...    ...  ...    ...      ...    ...    ...      ...   ...   ...   
10784  10784   0.53    5      5        4   62.3   55.0      NaN  5.20  5.17   
10785  10785   0.41    4      4        3   61.3   60.0      NaN  4.75  4.81   
10786  10786   0.51    4      4        4   62.4   58.0      NaN  5.11  5.12   
10787  10787   1.08    2      1        2   63.2   59.0      NaN  6.40  6.57   
10788  10788   0.30    5      6        6   60.9   57.0      NaN  4.32  4.35   

          z  train  
0      3.92      1  
1      4.

In [15]:
# Alias, not a copy: df_final and df refer to the same DataFrame object.
df_final = df

In [21]:
# NOTE(review): the 'id' column is kept, so it reaches the models as a feature.
# A drop of 'id' used to sit here but was commented out; confirm that keeping
# it is intended.

# Rows flagged train == 1 form the training frame; the flag itself is removed.
df_train = df_final.loc[df_final['train'].eq(1)].drop(columns=['train'])

# Rows flagged train == 0 form the test frame; 'price' is removed along with
# the flag.
df_test = df_final.loc[df_final['train'].eq(0)].drop(columns=['price', 'train'])

In [24]:
# Separate the label column from the feature columns.
target = df_train.loc[:, 'price']
df_train = df_train.drop(columns=['price'])

In [25]:
# Hold out 33% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_train,
    target,
    test_size=0.33,
    random_state=0,
)

In [26]:
# Gradient-boosted tree regressors used for the price prediction.

# XGBoost: 2400 depth-4 trees at learning rate 0.01, with row subsampling,
# per-tree column subsampling and L1/L2 regularisation.
# The deprecated `silent` and `nthread` arguments (both previously passed as
# None) are omitted; `verbosity` and `n_jobs` are their replacements.
xgb = XGBRegressor(
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=0,
    importance_type='gain',
    learning_rate=0.01,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=2400,
    n_jobs=1,
    # 'reg:linear' is the deprecated name of the squared-error objective;
    # 'reg:squarederror' is its current name.
    objective='reg:squarederror',
    reg_alpha=0.6,
    reg_lambda=0.6,
    scale_pos_weight=1,
    subsample=0.8,
    verbosity=1,
)


# LightGBM: 12000 four-leaf trees at learning rate 0.01, with bagging applied
# every 5 iterations (seeded) and a 40% feature fraction.
lgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=12000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.4,
)

In [27]:
# Train both regressors on the training part of the split.
xgb.fit(X=x_train, y=y_train)
# No eval_set is supplied in this call; eval_metric is passed through as given.
lgbm.fit(X=x_train, y=y_train, eval_metric='rmse')



LGBMRegressor(bagging_fraction=0.75, bagging_freq=5, bagging_seed=7,
              feature_fraction=0.4, learning_rate=0.01, max_bin=200,
              n_estimators=12000, num_leaves=4, objective='regression')

In [28]:
# Score the held-out rows with each fitted model:
# predict1 holds the XGBoost output, predict the LightGBM output.
predict1 = xgb.predict(X=x_test)
predict = lgbm.predict(X=x_test)

In [29]:
# RMSE on the held-out rows. Both lines share the same label: the first is
# computed from predict1 (XGBoost), the second from predict (LightGBM).
rmse_xgb = math.sqrt(metrics.mean_squared_error(y_test, predict1))
rmse_lgbm = math.sqrt(metrics.mean_squared_error(y_test, predict))
print(f'Root Mean Square Error test = {rmse_xgb}')
print(f'Root Mean Square Error test = {rmse_lgbm}')

Root Mean Square Error test = 548.559068432399
Root Mean Square Error test = 583.2603336331485


In [30]:
# Refit both models on every labelled row before predicting the test frame.
xgb.fit(X=df_train, y=target)
lgbm.fit(X=df_train, y=target, eval_metric='rmse')



LGBMRegressor(bagging_fraction=0.75, bagging_freq=5, bagging_seed=7,
              feature_fraction=0.4, learning_rate=0.01, max_bin=200,
              n_estimators=12000, num_leaves=4, objective='regression')

In [31]:
# Blend weights for the two models' test-set predictions.
XGB_WEIGHT = 0.45
LGBM_WEIGHT = 0.55

predict4 = lgbm.predict(X=df_test)
predict3 = xgb.predict(X=df_test)
# Weighted sum of the XGBoost (predict3) and LightGBM (predict4) outputs.
predict_y = XGB_WEIGHT * predict3 + LGBM_WEIGHT * predict4

In [32]:
# Pair the ids from the test file as read with the blended prices, then
# write the result to CSV without the row index.
submission = test[["id"]].assign(price=predict_y)
submission.to_csv('submission.csv', index=False)