In [2]:
# Import packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [3]:
sample_submission = pd.read_csv("data/sample_submission.csv")
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")
#Creating a copy of the train and test datasets

c_train = train.copy()
c_test = test.copy()

In [4]:
# drop the index column
c_train = c_train.loc[:, ~c_train.columns.str.contains('^Unnamed')]
c_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.01,Very Good,E,SI2,60.0,60.0,4540,6.57,6.49,3.92
1,1.1,Premium,H,VS2,62.5,58.0,5729,6.59,6.54,4.1
2,1.5,Good,E,SI2,61.5,65.0,6300,7.21,7.17,4.42
3,1.53,Premium,E,SI1,61.3,59.0,12968,7.4,7.35,4.52
4,0.84,Fair,D,SI2,64.5,60.0,2167,5.92,5.84,3.79


In [5]:
c_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43154 entries, 0 to 43153
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43154 non-null  float64
 1   cut      43154 non-null  object 
 2   color    43154 non-null  object 
 3   clarity  43154 non-null  object 
 4   depth    43154 non-null  float64
 5   table    43154 non-null  float64
 6   price    43154 non-null  int64  
 7   x        43154 non-null  float64
 8   y        43154 non-null  float64
 9   z        43154 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.3+ MB


In [6]:
c_train.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,43154.0,43154.0,43154.0,43154.0,43154.0,43154.0,43154.0
mean,0.799047,61.742925,57.45901,3946.777054,5.733798,5.737574,3.539338
std,0.475214,1.42841,2.227191,3998.657385,1.123004,1.150325,0.696203
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,953.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2406.5,5.7,5.71,3.53
75%,1.04,62.5,59.0,5367.0,6.54,6.54,4.04
max,4.5,79.0,79.0,18823.0,10.23,58.9,8.06


In [7]:
# recover z
# fix = c_train[:][c_train['z'] == 0]
# fix['z'] = fix['depth']/100*(fix['x']+fix['y'])/2

# c_train[:][c_train['z'] == 0] = fix


In [8]:
c_train.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,43154.0,43154.0,43154.0,43154.0,43154.0,43154.0,43154.0
mean,0.799047,61.742925,57.45901,3946.777054,5.733798,5.737574,3.539338
std,0.475214,1.42841,2.227191,3998.657385,1.123004,1.150325,0.696203
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,953.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2406.5,5.7,5.71,3.53
75%,1.04,62.5,59.0,5367.0,6.54,6.54,4.04
max,4.5,79.0,79.0,18823.0,10.23,58.9,8.06


In [9]:
# create a new variable vol and replace x,y,z
c_train["vol"] = c_train["x"]*c_train["y"]*c_train["z"]
c_train = c_train.drop(["x","y","z"], axis="columns")

In [10]:
c_train.describe()

Unnamed: 0,carat,depth,table,price,vol
count,43154.0,43154.0,43154.0,43154.0,43154.0
mean,0.799047,61.742925,57.45901,3946.777054,130.037308
std,0.475214,1.42841,2.227191,3998.657385,78.79922
min,0.2,43.0,43.0,326.0,0.0
25%,0.4,61.0,56.0,953.0,65.205336
50%,0.7,61.8,57.0,2406.5,114.811404
75%,1.04,62.5,59.0,5367.0,171.0852
max,4.5,79.0,79.0,18823.0,3840.59806


In [11]:
# create dummy variables for categorical variables
dummies = pd.get_dummies(c_train[['cut', 'color','clarity']], drop_first=True) 
dummies.head()

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [12]:
c_train = pd.concat([c_train, dummies], axis="columns")
c_train = c_train.drop(['cut', 'color','clarity'], axis="columns")
c_train.head()

Unnamed: 0,carat,depth,table,price,vol,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1.01,60.0,60.0,4540,167.146056,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0
1,1.1,62.5,58.0,5729,176.70426,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,1.5,61.5,65.0,6300,228.494994,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,1.53,61.3,59.0,12968,245.8428,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,0.84,64.5,60.0,2167,131.030912,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [13]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [14]:
X = c_train.drop(["price"], axis="columns")
y = c_train["price"]

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor

 # create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  
# fit the regressor with x and y data
regressor.fit(X, y)  

preds = regressor.predict(X_test)  # test the output by changing values

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 237.383281


In [16]:
pred_train = regressor.predict(X_train)
mae = metrics.mean_absolute_error(y_train, pred_train)
mse = metrics.mean_squared_error(y_train, pred_train)
rmse = np.sqrt(mse) # or mse**(0.5)  
r2 = metrics.r2_score(y_train,pred_train)

print("Results of sklearn.metrics (training):")
print("MAE:",mae)
print("MSE:", mse)
print("RMSE:", rmse)

mae = metrics.mean_absolute_error(y_test, preds)
mse = metrics.mean_squared_error(y_test, preds)
rmse = np.sqrt(mse) # or mse**(0.5)  
r2 = metrics.r2_score(y_test,preds)

print("Results of sklearn.metrics (training):")
print("MAE:",mae)
print("MSE:", mse)
print("RMSE:", rmse)

Results of sklearn.metrics (training):
MAE: 114.5885017439385
MSE: 52211.479763705
RMSE: 228.4983145751955
Results of sklearn.metrics (training):
MAE: 118.12684698634057
MSE: 56350.822187682425
RMSE: 237.38328118821346


In [17]:
# data cleaning
c_test = test.copy()

c_test = c_test.drop(['id'], axis="columns")
c_test["vol"] = c_test["x"]*c_test["y"]*c_test["z"]
c_test = c_test.drop(["x","y","z"], axis="columns")
dummies = pd.get_dummies(c_test[['cut', 'color','clarity']], drop_first=True) 
c_test = pd.concat([c_test, dummies], axis="columns")
c_test = c_test.drop(['cut', 'color','clarity'], axis="columns")
c_test.head()

# prefit for the submission
test = pd.read_csv("data/test.csv")

output = regressor.predict(c_test)

In [19]:
import pandas as pd

df = pd.DataFrame(output, columns=["price"])
df.to_csv("submission.csv")