In [50]:
import pandas as pd
import numpy as np

In [51]:
data=pd.read_csv("./data/cubic_zirconia.csv")

In [52]:
data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [53]:
data.drop(columns=['Unnamed: 0'], inplace=True)

In [54]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [55]:
X=data.drop(labels=['price'],axis=1)

In [56]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70
2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65
...,...,...,...,...,...,...,...,...,...
26962,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09
26963,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
26964,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17
26965,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60


In [57]:
y=data[['price']]

In [58]:
y

Unnamed: 0,price
0,499
1,984
2,6289
3,1082
4,779
...,...
26962,5408
26963,1114
26964,1656
26965,682


In [59]:
X.select_dtypes(include=['object'])

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI1
1,Premium,G,IF
2,Very Good,E,VVS2
3,Ideal,F,VS1
4,Ideal,F,VVS1
...,...,...,...
26962,Premium,G,SI1
26963,Ideal,H,IF
26964,Premium,E,VS2
26965,Very Good,F,VVS2


In [60]:
categorical_cols=X.select_dtypes(include=['object']).columns

In [61]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [62]:
numerical_cols=X.select_dtypes(exclude=['object']).columns

In [63]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [64]:
X.select_dtypes(exclude=['object'])

Unnamed: 0,carat,depth,table,x,y,z
0,0.30,62.1,58.0,4.27,4.29,2.66
1,0.33,60.8,58.0,4.42,4.46,2.70
2,0.90,62.2,60.0,6.04,6.12,3.78
3,0.42,61.6,56.0,4.82,4.80,2.96
4,0.31,60.4,59.0,4.35,4.43,2.65
...,...,...,...,...,...,...
26962,1.11,62.3,58.0,6.61,6.52,4.09
26963,0.33,61.9,55.0,4.44,4.42,2.74
26964,0.51,61.7,58.0,5.12,5.15,3.17
26965,0.27,61.8,56.0,4.19,4.20,2.60


In [65]:
# Define the custom ranking for each ordinal variable

In [98]:
cut_categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories=['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories=['SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2', 'IF', 'I1']

In [99]:
data['clarity'].unique()

array(['SI1', 'IF', 'VVS2', 'VS1', 'VVS1', 'VS2', 'SI2', 'I1'],
      dtype=object)

In [100]:
from sklearn.impute import SimpleImputer ## Handle Missing Values
from sklearn.preprocessing import OrdinalEncoder ## For ordinal encodings
from sklearn.preprocessing import StandardScaler ## Handling Feature Scaling
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [101]:
# numerical_pipeline

num_pipeline=Pipeline(
      
      steps=[
            
            ('imputer',SimpleImputer()),
            ('scaler',StandardScaler())
            
      ]
      
)

In [102]:
# categorical_pipeline

cat_pipeline=Pipeline(
      
      steps=[
            
            ('imputer',SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder',OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]))
            
            
      ]
)

In [103]:
# transfer the column

preprocessor=ColumnTransformer(
      
      [
            ('num_pipeline',num_pipeline,numerical_cols),
            ('cat_pipeline',cat_pipeline,categorical_cols)
      ]
)

In [104]:
# we can create a pipeline for the model to perform in a single short so we not to do it manually

In [105]:
# if we are not split the data and perform the transformer on the entire data then there be the 
# chance of data leakage so we need to split the data into train and test set

In [109]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [108]:
preprocessor

In [111]:
preprocessor.fit(X_train)

In [80]:
X_train[categorical_cols].nunique()
X_train[categorical_cols].dtypes


cut        object
color      object
clarity    object
dtype: object

In [112]:
preprocessor.fit_transform(X_train)

array([[-6.27970010e-01, -5.10431330e-15, -2.43676866e+00, ...,
         2.00000000e+00,  3.00000000e+00,  0.00000000e+00],
       [ 1.54029220e+00, -5.37637591e-01,  1.13527083e+00, ...,
         3.00000000e+00,  3.00000000e+00,  2.00000000e+00],
       [ 4.24584460e-01, -3.22127121e-01, -1.54375879e+00, ...,
         2.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-8.59531993e-01, -5.10431330e-15, -6.50748916e-01, ...,
         4.00000000e+00,  3.00000000e+00,  4.00000000e+00],
       [-9.85838530e-01, -4.65800768e-01, -6.50748916e-01, ...,
         4.00000000e+00,  6.00000000e+00,  0.00000000e+00],
       [-1.02794071e+00,  4.68077937e-01, -1.54375879e+00, ...,
         4.00000000e+00,  1.00000000e+00,  3.00000000e+00]])

In [113]:
preprocessor.transform(X_train)

array([[-6.27970010e-01, -5.10431330e-15, -2.43676866e+00, ...,
         2.00000000e+00,  3.00000000e+00,  0.00000000e+00],
       [ 1.54029220e+00, -5.37637591e-01,  1.13527083e+00, ...,
         3.00000000e+00,  3.00000000e+00,  2.00000000e+00],
       [ 4.24584460e-01, -3.22127121e-01, -1.54375879e+00, ...,
         2.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-8.59531993e-01, -5.10431330e-15, -6.50748916e-01, ...,
         4.00000000e+00,  3.00000000e+00,  4.00000000e+00],
       [-9.85838530e-01, -4.65800768e-01, -6.50748916e-01, ...,
         4.00000000e+00,  6.00000000e+00,  0.00000000e+00],
       [-1.02794071e+00,  4.68077937e-01, -1.54375879e+00, ...,
         4.00000000e+00,  1.00000000e+00,  3.00000000e+00]])

In [115]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [117]:
data.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984


In [118]:

X_train=pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [119]:

X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.635095,-0.896822,1.135271,0.817008,0.839490,0.706565,2.0,2.0,0.0
1,-0.817430,1.258283,-1.097254,-0.925991,-0.898669,-0.787663,1.0,1.0,1.0
2,-1.154247,-1.399679,0.688766,-1.477347,-1.472083,-1.577880,3.0,4.0,4.0
3,1.519241,0.755425,-0.650749,1.412829,1.403944,1.511150,4.0,4.0,5.0
4,-0.796379,0.252567,-1.097254,-0.845955,-0.826993,-0.802030,4.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...
5389,-0.564817,0.539915,-1.543759,-0.481348,-0.468609,-0.414106,4.0,3.0,5.0
5390,0.971913,0.539915,-0.650749,1.021543,1.072440,1.108858,2.0,4.0,0.0
5391,-0.627970,0.324404,1.135271,-0.516920,-0.611963,-0.529046,3.0,0.0,3.0
5392,-0.185897,-1.615190,0.242261,0.078901,0.122723,-0.083651,2.0,2.0,3.0


In [120]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.627970,-5.104313e-15,-2.436769,-0.561384,-0.522367,-0.485944,2.0,3.0,0.0
1,1.540292,-5.376376e-01,1.135271,1.510650,1.493540,1.410577,3.0,3.0,2.0
2,0.424584,-3.221271e-01,-1.543759,0.568008,0.588622,0.534155,2.0,0.0,0.0
3,1.645548,-5.376376e-01,0.242261,1.626257,1.574177,1.511150,3.0,6.0,5.0
4,-0.164846,3.705700e-02,-0.650749,-0.010027,0.006249,0.002554,4.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...
21568,-0.817430,-6.813112e-01,-0.204244,-0.854848,-0.835952,-0.902603,4.0,2.0,2.0
21569,-0.985839,-1.471516e+00,0.688766,-1.077169,-1.077861,-1.204323,2.0,4.0,2.0
21570,-0.859532,-5.104313e-15,-0.650749,-0.952669,-0.925548,-0.916971,4.0,3.0,4.0
21571,-0.985839,-4.658008e-01,-0.650749,-1.103848,-1.059942,-1.118117,4.0,6.0,0.0


In [121]:
# model training

In [123]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [124]:
# Train Multiple Models
models={
      'LinearRegression':LinearRegression(),
      'Lasso':Lasso(),
      'Ridge':Ridge(),
      'ElasticNet':ElasticNet()
}

In [125]:
trained_model_list=[]
model_list=[]
r2_list=[]

In [135]:
import numpy as np
def evaluate_model(true,predicted):
      mae=mean_absolute_error(true,predicted)
      mse=mean_squared_error(true,predicted)
      rmse=np.sqrt(mean_squared_error(true,predicted))
      r2 = r2_score(true, predicted)
      return mae,rmse,r2_score,mse

In [139]:
for i in range(len(list(models))):
      model=list(models.values())[i]
      model.fit(X_train,y_train)
      
      y_pred=model.predict(X_test)
      
      mse,mae,rmse,r2_square=evaluate_model(y_test,y_pred)
      
      print(list(models.keys())[i])
      model_list.append(list(models.keys())[i])
      
      print('Model training performance')
      print('RMSE:',rmse)
      print('MAE',mae)
      print('r2score',r2_square*100)
      print('MSE',mse)
      
      r2_list.append(r2_square)
      
      print('='*35)
      print('\n')
      

LinearRegression
Model training performance
RMSE: <function r2_score at 0x7819f4dc74c0>
MAE 2281.4728950864724
r2score 520511857.1014249
MSE 890.0633925305499


Lasso
Model training performance
RMSE: <function r2_score at 0x7819f4dc74c0>
MAE 1904.835762196554
r2score 362839928.09429264
MSE 881.3237681720406


Ridge
Model training performance
RMSE: <function r2_score at 0x7819f4dc74c0>
MAE 2231.445751893076
r2score 497935014.36416537
MSE 889.0359545356306


ElasticNet
Model training performance
RMSE: <function r2_score at 0x7819f4dc74c0>
MAE 1843.5372121147554
r2score 339862945.24518454
MSE 1151.229852371325




  model = cd_fast.enet_coordinate_descent(


In [126]:
list(models)

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [127]:
models.keys()

dict_keys(['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet'])

In [128]:
models.values()

dict_values([LinearRegression(), Lasso(), Ridge(), ElasticNet()])

In [140]:
trained_model_list

[]

In [141]:
model_list

['LinearRegression',
 'LinearRegression',
 'LinearRegression',
 'LinearRegression',
 'LinearRegression',
 'Lasso',
 'Ridge',
 'ElasticNet',
 'LinearRegression',
 'Lasso',
 'Ridge',
 'ElasticNet']

In [142]:
r2_list

[5205118.571014249,
 3628399.2809429266,
 4979350.1436416535,
 3398629.452451845,
 5205118.571014249,
 3628399.2809429266,
 4979350.1436416535,
 3398629.452451845]

In [143]:
len(r2_list)

8

In [144]:
len(model_list)

12