In [1]:
import pandas as pd 
import numpy as np


In [2]:
df = pd.read_csv('diamonds_cleaned_data.csv', )

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
3,4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
4,5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48


In [4]:
df.drop('Unnamed: 0', axis=1, inplace=True)

- nominal columns = ['color', 'clarity']
- ordinal column = ['cut']
- numerical columns = ['carat', 'depth','table','price','x','y','z']

In [5]:
X = df.drop('price', axis=1)
y = df['price']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

print('Shape of X_train: ',X_train.shape)
print('Shape of X_test: ',X_test.shape)
print('Shape of y_train: ',y_train.shape)
print('Shape of y_test: ',y_test.shape)

Shape of X_train:  (34818, 9)
Shape of X_test:  (11607, 9)
Shape of y_train:  (34818,)
Shape of y_test:  (11607,)


In [7]:

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler,FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



# One-hot encoding on ['color', 'clarity'] columns
ohe_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first'))
])

# Ordinal encoding on 'cut' column
ordinal_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder())
])

# Apply np.log on the numerical columns
numerical_transformer = Pipeline(steps=[
    ('log_transform', FunctionTransformer(np.log)),
    ('scaler', StandardScaler())
])

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', ohe_transformer, ['color', 'clarity']),
        ('ordinal', ordinal_transformer, ['cut']),
        ('numerical', numerical_transformer, ['carat', 'depth','table','x','y','z'])
    ])

# Apply the transformations
X_train_processed = preprocessor.fit_transform(X_train)

# Retrieve the feature names after transformation
ohe_feature_names = preprocessor.named_transformers_['onehot'].named_steps['encoder'].get_feature_names(['color', 'clarity'])
feature_names = np.concatenate([ohe_feature_names, ['cut'], ['carat', 'depth','table','x','y','z']])

# Create a dataframe with transformed features
X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)



In [8]:
X_train_processed.shape

(34818, 20)

In [9]:
pd.set_option('display.max_columns', None)
X_train_processed.head()

Unnamed: 0,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,cut,carat,depth,table,x,y,z
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,-0.193376,-1.21096,-0.105155,-0.126118,-0.100898,-0.228997
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,-0.94342,0.008029,0.391035,-0.970413,-0.920605,-0.943292
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.262919,-1.116266,-0.105155,0.298579,0.354118,0.229158
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.368061,0.928389,-1.124201,0.318511,0.344167,0.421544
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.539073,-0.17783,0.391035,0.523705,0.48195,0.484258


In [10]:
np.round(X_train_processed.describe(),3)

Unnamed: 0,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,cut,carat,depth,table,x,y,z
count,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0,34818.0
mean,0.188,0.178,0.212,0.151,0.094,0.047,0.035,0.242,0.155,0.156,0.231,0.073,0.098,2.627,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
std,0.391,0.383,0.409,0.358,0.291,0.212,0.183,0.428,0.362,0.363,0.421,0.261,0.298,0.942,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.107,-2.65,-2.724,-2.157,-2.255,-4.959
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.943,-0.551,-0.61,-0.933,-0.933,-0.943
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.061,0.101,-0.105,-0.051,-0.047,-0.072
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.957,0.654,0.879,0.922,0.922,0.948
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,2.249,2.549,2.976,2.37,2.407,2.355


# X_test encoding and scaling

In [11]:
# Apply the transformations to X_test
X_test_processed = preprocessor.transform(X_test)

# Create a dataframe with transformed features
X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)


In [12]:
pd.set_option('display.max_columns', None)
X_test_processed.head() 

Unnamed: 0,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,cut,carat,depth,table,x,y,z
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.180041,-1.116266,1.82984,0.21815,0.1723,0.080489
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.97524,-1.021728,-0.610129,1.072408,1.028891,0.948247
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.393458,1.292436,-0.610129,0.368038,0.314208,0.468644
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.31622,-1.400819,0.878744,0.338373,0.393754,0.229158
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,-1.103408,-0.645128,-0.610129,-1.058324,-1.03337,-1.105778


In [13]:
np.round(X_test_processed.describe(),3) 

Unnamed: 0,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,cut,carat,depth,table,x,y,z
count,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0,11607.0
mean,0.187,0.177,0.213,0.148,0.096,0.048,0.035,0.242,0.157,0.156,0.229,0.072,0.098,2.615,0.001,0.011,-0.011,0.001,0.001,0.002
std,0.39,0.382,0.409,0.355,0.294,0.214,0.184,0.429,0.364,0.363,0.42,0.258,0.298,0.946,0.99,1.007,0.993,0.991,0.99,0.989
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.015,-2.65,-2.724,-2.006,-2.01,-2.192
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.943,-0.551,-0.61,-0.908,-0.921,-0.943
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.029,0.101,-0.105,-0.02,-0.015,-0.038
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.957,0.654,0.879,0.913,0.913,0.934
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,2.249,2.549,2.931,2.398,2.329,2.333


In [14]:
from sklearn.metrics import r2_score, mean_absolute_error

In [15]:
from sklearn.ensemble import RandomForestRegressor
rfr_st = RandomForestRegressor()

rfr_st.fit(X_train_processed, y_train)
rfr_st_pred = rfr_st.predict(X_test_processed)
print("r2 score for standard sclaed data : ", r2_score(y_test, rfr_st_pred))
print("mean absolute error for standard scaled data : ", mean_absolute_error(y_test,rfr_st_pred)) 

r2 score for standard sclaed data :  0.9777838260399345
mean absolute error for standard scaled data :  206.08135999007172


In [18]:
import pickle
with open('model_rfr.pkl', 'wb') as file:
    pickle.dump(rfr_st, file) 