## Model Training

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('data/gemstone.csv')

In [4]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
df.drop(labels=['id'],axis=1)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [6]:
## Independent and dependent features

X = df.drop(labels=['price'],axis=1)
Y = df[['price']]

In [7]:
# Define which columns should be ordinal-encoded and which shold be scaled

categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [8]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [9]:
from sklearn.impute import SimpleImputer  ## Handling missing values
from sklearn.preprocessing import StandardScaler  ## Handling Feature scaling (to reach to the global minima, heaving heigher value need to feature scale down to reach to the global minima)

from sklearn.preprocessing import OrdinalEncoder  ## Ordinal Encoding (where ever categorical feature heaving rankes we have to use ordinal feature)


## pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



In [10]:
## Numerical Pipeline

num_pipeline = Pipeline(

    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]

)

## Categorical Pipeline
cat_pipeline=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
) 

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [11]:
## Train test Split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.30, random_state = 42)

In [12]:

X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [13]:
X_train.head()

Unnamed: 0,num_pipeline__id,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.524302,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,-0.025886,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.574708,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-1.633133,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.916887,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [14]:
## Model Training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [15]:
regression = LinearRegression()
regression.fit(X_train,y_train)

In [16]:
regression.coef_

array([[ 1.63146350e+00,  6.43302035e+03, -1.32347885e+02,
        -7.04957943e+01, -1.70138318e+03, -4.94177319e+02,
        -7.63579111e+01,  6.87980226e+01, -4.64679404e+02,
         6.52106050e+02]])

In [17]:
regression.intercept_

array([3976.8787389])

In [18]:
import numpy as np

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [21]:
## Train multiple model

models= {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}
trained_model_list = []
model_list= []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    ## Make Prediction
    y_pred = model.predict(X_test)

    mae ,rmse, r2_square = evaluate_model(y_test, y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    trained_model_list.append(model)

    print('Model Training Performance')
    print("RMSE" , rmse)
    print("MAE" , mae)
    print("R2 Sqaure", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE 1014.6337606682047
MAE 675.0760335549242
R2 Sqaure 93.62901674122803


Lasso
Model Training Performance
RMSE 1014.6602348152879
MAE 676.2416203426912
R2 Sqaure 93.62868426944394


Ridge
Model Training Performance
RMSE 1014.6384101190757
MAE 675.1080421715081
R2 Sqaure 93.6289583523932


ElasticNet
Model Training Performance
RMSE 1533.369192209356
MAE 1060.9479633593678
R2 Sqaure 85.44938623256384


