## **Model Training**

In [24]:
# import packages and libraries 
import numpy as np
import pandas as pd 

# import scikit-learn packages
# handliing missing values
from sklearn.impute import SimpleImputer
# feature scaling and ordinal encoding 
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# train test split
from sklearn.model_selection import train_test_split

# model training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# model evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


#### Load the data

In [25]:
diamond_df = pd.read_csv('data/gemstone.csv')
diamond_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [26]:
# drop 'id' column
diamond_df.drop(columns=['id'], axis=1,  inplace=True)

#### Features and Target

In [27]:
# features (independent variables)
X = diamond_df.drop(columns=['price'], axis=1)

# target (independent)
Y = diamond_df['price']

In [28]:
# feature variable 5 rows and shape of the dataframe
print(X.head())
print(X.shape)

   carat        cut color clarity  depth  table     x     y     z
0   1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
1   2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
2   0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
3   0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
4   1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
(193573, 9)


In [29]:
# target variable 
print(Y.head())
print(Y.shape)

0    13619
1    13387
2     2772
3      666
4    14453
Name: price, dtype: int64
(193573,)


#### Pre processing

In [30]:
# numerical and categorical variables in features
num_variables = X.select_dtypes(exclude='object').columns
cat_variables = X.select_dtypes(include='object').columns

# print the columns 
print('Numerical variables: ', num_variables)
print('Categorical Variables: ', cat_variables)

Numerical variables:  Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Categorical Variables:  Index(['cut', 'color', 'clarity'], dtype='object')


In [31]:
# categories in categorical variables
for col in cat_variables:
    print('Variable : ', col)
    print(diamond_df[col].unique())
    print('========')


Variable :  cut
['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
Variable :  color
['F' 'J' 'G' 'E' 'D' 'H' 'I']
Variable :  clarity
['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


In [32]:
# ranking of ordinal variable 
cut_cat = ['Fair', 'Good', 'Very Good','Premium', 'Ideal']
color_cat = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_cat = [ 'I1',  'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

#### Numerical pipeline

In [33]:
# handling missing values and feature scaling
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

#### Categorical Pipeline

In [34]:
# handling missing values, feature scaling and Encoding(ordinal)
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_cat, color_cat, clarity_cat])),
        ('scaler', StandardScaler())
    ]
)

#### Preprocesser 

In [35]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_variables),
    ('cat_pipeline', cat_pipeline, cat_variables)
])

#### Train-Test split 

In [36]:
# split data into train and test set with 30% of test data and random_state=340
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=340)

In [37]:
# use previously defined 'preprocessor'  ('ColumnTransformer' object) to transform train  and test features and convert transformed data into dataframe 
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.fit_transform(X_test), columns=preprocessor.get_feature_names_out())

In [38]:
X_train.head() 

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.54011,-0.295898,-0.639471,0.753312,0.735678,0.709496,0.874573,-1.548963,-0.648619
1,-1.039524,-1.220575,1.963937,-1.222377,-1.207431,-1.275319,-1.135988,-0.932425,2.017294
2,0.561749,1.276052,0.922574,0.654077,0.681199,0.811655,-2.141268,-1.548963,-1.315097
3,0.475194,1.36852,-0.639471,0.572884,0.590399,0.709496,-1.135988,0.917187,-0.648619
4,-0.541831,-0.665769,-0.118789,-0.428493,-0.390235,-0.472637,0.874573,0.917187,-0.648619


#### Model Training

In [39]:
# linear regression
LR = LinearRegression()
LR.fit(X_train, Y_train)

In [40]:
# intercept and co-efficients 
print('Co-efficients')
print(LR.coef_)
print('========')
print('Intercept')
print(LR.intercept_)

Co-efficients
[ 6422.72487254   -92.93264099   -68.89057562 -1445.16159454
  -337.47815834  -494.36433877    71.28026614  -462.35991364
   651.44868598]
Intercept
3965.4521442646173


In [41]:
# model evaluation 
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    R2 = r2_score(true, predicted)
    return mae, mse, rmse, R2

In [44]:
# train multiple algorithms 
models = {
    'LinearRegression' : LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForestReg' : RandomForestRegressor(n_estimators=100, random_state=30),
}

model_list = []
r2_list = []
accuracy = {}

for model_name, model in models.items():

    model.fit(X_train, Y_train)

    # make prediction
    y_pred = model.predict(X_test)
    mae, mse, rmse, R2 = evaluate_model(Y_test, y_pred)
    print(model_name)
    model_list.append(model_name)
    
    print('Model Accuracy performance :')
    print('MAE :', mae)
    print('RMSE :', rmse)
    print('R2_score :', R2*100)
    accuracy[model_name] = round(R2, 4)
    r2_list.append(R2)
    print('='*25)
    print('\n')

LinearRegression
Model Accuracy performance :
MAE : 678.0756208975442
RMSE : 1024.349357640398
R2_score : 93.59271232463297


Lasso
Model Accuracy performance :
MAE : 678.719219935326
RMSE : 1022.5452407298237
R2_score : 93.61526189033043


Ridge
Model Accuracy performance :
MAE : 678.108082299904
RMSE : 1024.37206182451
R2_score : 93.59242829292032


ElasticNet
Model Accuracy performance :
MAE : 1068.0137124199332
RMSE : 1555.0570687968518
R2_score : 85.23373398117417


RandomForestReg
Model Accuracy performance :
MAE : 313.111027018936
RMSE : 615.494528721602
R2_score : 97.68672950787543




In [45]:
accuracy

{'LinearRegression': 0.9359,
 'Lasso': 0.9362,
 'Ridge': 0.9359,
 'ElasticNet': 0.8523,
 'RandomForestReg': 0.9769}

From the above result we can conclude `RandomForestRegressor` is better fit.