## Model Training

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw gemstone dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/gemstone.csv")
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# 'id' is a row identifier, not a predictive feature, so drop it.
df = df.drop(columns=['id'])

In [4]:
## Independent and Dependent features

# Features: every column except the target.
X = df.drop(columns=['price'])
# Target: kept as a single-column DataFrame (not a Series).
y = df.loc[:, ['price']]

In [5]:
# Split the feature columns by dtype: object columns go through ordinal
# encoding, every other column goes through the numeric pipeline.

numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [6]:
# Custom ranking for each ordinal variable. The position of a value in its
# list is the integer code the ordinal encoder will assign to it.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'J',
    'I',
    'H',
    'G',
    'F',
    'E',
    'D',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]



In [7]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler ## Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder ## Feature Engineering (Ordinal Encoding)

## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
## Numerical Pipeline

# Numeric features: fill missing values with the column median, then
# standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

## Categorical Pipeline

# OrdinalEncoder pairs categories[i] with the i-th input column by position.
# Build that list from a name -> ranking mapping, in the same order as
# `categorical_cols`, so the encoder no longer depends on the CSV happening
# to list the columns as cut, color, clarity.
ordinal_rankings = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}
unranked_cols = [col for col in categorical_cols if col not in ordinal_rankings]
if unranked_cols:
    # Fail here with a clear message rather than with a shape mismatch at fit time.
    raise ValueError(
        f"No ordinal ranking defined for categorical column(s): {unranked_cols}"
    )
ordered_categories = [ordinal_rankings[col] for col in categorical_cols]

# Categorical features: fill missing values with the most frequent value,
# map each category to its rank, then standardise the resulting codes.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder', OrdinalEncoder(categories=ordered_categories)),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns through its own pipeline; the transformer
# names become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)


In [9]:
## Train test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)


In [10]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to both splits.
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.transform(x_test)

# Feature names are only available once the preprocessor has been fitted.
feature_names = preprocessor.get_feature_names_out()
x_train = pd.DataFrame(x_train_transformed, columns=feature_names)
x_test = pd.DataFrame(x_test_transformed, columns=feature_names)

In [11]:
# Preview the preprocessed training features.
x_train.head(n=5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.542206,2.112795,-1.684145,-0.455961,-0.544885,-0.354514,-2.146783,0.933319,0.686007
1,-0.153187,0.447103,-1.163003,0.021706,0.053798,0.080569,0.87371,0.317606,-1.313723
2,1.575782,-2.143974,0.921565,1.652982,1.595862,1.356812,-1.139952,0.317606,-0.647147
3,1.683842,0.447103,-0.641861,1.562856,1.550507,1.588856,0.87371,-1.529534,2.685738
4,-0.844775,1.094872,0.400423,-0.987703,-0.96215,-0.876613,-0.133121,1.549032,0.686007


In [12]:
## Model training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [13]:
# Baseline model: ordinary least squares fitted on the preprocessed
# training features.
regression = LinearRegression()
regression.fit(X=x_train, y=y_train)

In [14]:
# Fitted coefficients of the baseline model, one per preprocessed feature
# column (same order as the columns of x_train).
regression.coef_

array([[ 6428.85149938,  -130.08138227,   -69.46824678, -1789.29749321,
         -407.91015365,   -80.28571049,    71.50422628,   462.36992849,
          650.51163215]])

In [16]:
# Fitted intercept term of the baseline model.
regression.intercept_

array([3968.61718478])

In [17]:
def evaluate_model(true, predicted):
    """Compute the standard regression metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, aligned with ``true``.

    Returns:
        A ``(mae, mse, rmse, r2)`` tuple: mean absolute error, mean squared
        error, root mean squared error and R2 score, in that order.
    """
    abs_err = mean_absolute_error(true, predicted)
    sq_err = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above.
    root_sq_err = np.sqrt(sq_err)
    r2 = r2_score(true, predicted)
    return abs_err, sq_err, root_sq_err, r2

In [31]:
## Train multiple models

# Candidate linear regressors, each with its default hyperparameters.
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "ElasticNet" : ElasticNet()
}

model_list = []  # model names, in evaluation order
r2_list = []     # R2 score of each model, aligned with model_list

for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Make predictions on the held-out split
    y_pred = model.predict(x_test)

    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: the metrics below are computed on x_test / y_test even though the
    # heading says "Training"; the text is kept so it matches recorded outputs.
    print('Model Training Performance')
    print("RMSE", rmse)
    print("MAE", mae)
    print("MSE", mse)
    print("R2 score", r2_square*100)

    # Store the computed score (the original appended the r2_score function
    # itself, leaving r2_list full of function objects).
    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE 1015.7752655619722
MAE 675.5483623703187
MSE 1031799.3901274953
R2 score 93.67464279030942


Lasso
Model Training Performance
RMSE 1015.9362973179491
MAE 676.7001806379913
MSE 1032126.5602081043
R2 score 93.6726371023356


Ridge
Model Training Performance
RMSE 1015.7758609839723
MAE 675.5744982013186
MSE 1031800.5997577303
R2 score 93.67463537477553


ElasticNet
Model Training Performance
RMSE 1539.5381615531726
MAE 1063.9940993761822
MSE 2370177.7508785226
R2 score 85.46982963140233


