## Model Training

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("./data/gemstone.csv")
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
data = data.drop('id', axis = 1)

In [4]:
# Independent and Dependent features
X = data.drop('price',axis= 1)
Y = data['price']

In [5]:
# devide features into ordinal encoded and which should be scaled
numerical_columns = X.columns[X.dtypes!='object']
categorical_columns = X.columns[X.dtypes=='object']
#categorical_cols = X.select_dtypes(include = 'object').columns     #one more way

In [44]:
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [6]:
#define th custom ranking for each ordinal variable
cut_map = ["Fair","Good", "Very Good", "Premium","Ideal"]
color_map = ['D','E','F','G','H','I','J']
clarity_map = ['I1', 'SI2','SI1','VS2', 'VS1','VVS2', 'VVS1', 'IF']

In [7]:
from sklearn.impute import SimpleImputer #Handling the missing values
from sklearn.preprocessing import StandardScaler #Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal Encoding

##for pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [45]:
## Numerical  Pipeline
num_pipeline = Pipeline(
    steps =[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

## categorical Pipeline
cat_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_map,color_map,clarity_map])),
        ('scalar',StandardScaler()) #if one hot encoding is used then no need to do scaling for cat_features
        
    ]

)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [46]:
## Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=26)

In [47]:
preprocessor.fit_transform(X_train)

array([[-1.03827974, -0.57236494, -0.63884359, ...,  0.87258537,
        -1.54901656,  0.68207718],
       [ 0.49517234,  0.81519874,  0.40173415, ..., -0.13387709,
         0.29799615,  1.3481299 ],
       [-0.58472349, -0.10984371, -0.11855472, ...,  0.87258537,
        -1.54901656, -0.65002824],
       ...,
       [-0.17436308,  0.90770299, -0.63884359, ..., -1.14033954,
         0.29799615,  0.01602447],
       [-1.05987766, -0.10984371, -1.15913246, ...,  0.87258537,
        -1.54901656, -0.65002824],
       [-0.82230058,  0.53768601,  0.40173415, ..., -0.13387709,
         0.29799615,  0.01602447]])

In [64]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.feature_names_in_)
X_test = pd.DataFrame(preprocessor.transform(X_test),columns = preprocessor.feature_names_in_)

In [65]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-1.03828,-0.572365,-0.638844,-1.211573,-1.205666,-1.236871,0.872585,-1.549017,0.682077
1,0.495172,0.815199,0.401734,0.670746,0.643586,0.73238,-0.133877,0.297996,1.34813
2,-0.584723,-0.109844,-0.118555,-0.518087,-0.498599,-0.512882,0.872585,-1.549017,-0.650028
3,0.473574,0.352678,-0.638844,0.661739,0.589196,0.674461,0.872585,0.913667,-1.316081
4,0.538368,0.445182,0.401734,0.688758,0.643586,0.70342,-0.133877,-0.317675,1.34813


In [66]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [67]:
regression = LinearRegression()
regression.fit(X_train, y_train)

LinearRegression()

In [68]:
regression.coef_

array([ 6432.96133053,  -131.99237194,   -67.38414897, -1800.0500053 ,
        -402.79179145,   -72.81395309,    70.97192567,  -466.67896913,
         651.19187544])

In [69]:
regression.intercept_

3970.075630438145

In [71]:
import numpy as np
def model_eval(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [77]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    #make predictions
    y_pred = model.predict(X_test)
    
    mae, rmse, r2_square = model_eval(y_test, y_pred)
    
    r2_list.append(r2_square)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model Training Performanance')
    print('RMSE',rmse)
    print('MAE',mae)
    print('R2 score', r2_square)
    
    print('='*40)
    print('\n')
    

LinearRegression
Model Training Performanance
RMSE 1013.9535184211724
MAE 676.2994474991389
R2 score 0.9367432571017835


Lasso
Model Training Performanance
RMSE 1014.1054352668878
MAE 677.441935854006
R2 score 0.9367243006416023


Ridge
Model Training Performanance
RMSE 1013.9537459677163
MAE 676.3304157770117
R2 score 0.9367432287102359


Elasticnet
Model Training Performanance
RMSE 1536.3418219433838
MAE 1065.391409510199
R2 score 0.8547733099881201




In [76]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']