In [6]:
import pandas as pd

In [7]:
# Load the gemstone table from disk and preview its first rows.
gemstone_csv = '../code/data/gemstone.csv'
gemstone = pd.read_csv(gemstone_csv)
gemstone.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [8]:
# Remove the 'Unnamed: 0' column (presumably a row counter saved with the CSV --
# confirm against the data source). Rebinding the name instead of mutating with
# inplace=True, together with errors='ignore', keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
gemstone = gemstone.drop(columns=['Unnamed: 0'], errors='ignore')

In [75]:
# Separate the predictors from the regression target.
target_cols = ['price']
x = gemstone.drop(columns=target_cols)
y = gemstone[target_cols]  # list selection keeps y as a one-column DataFrame

In [76]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
object_dtype = 'O'
categorical_cols = x.select_dtypes(include=object_dtype).columns
numerical_cols = x.select_dtypes(exclude=object_dtype).columns

In [77]:
# Explicit category orderings handed to the OrdinalEncoder further down; the
# position of a label inside its list is the integer it gets encoded to.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [78]:
from sklearn.impute import SimpleImputer # Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding

In [79]:
# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [80]:
# Numerical Pipeline
# Fill missing values with the column median, then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical Pipeline
# Fill missing values with the most frequent label, map each label to its rank
# in the explicit ordering, then standardize the resulting integers.
#
# OrdinalEncoder pairs the i-th entry of `categories` with the i-th input
# column, so the orderings are looked up by column name instead of being listed
# in a hard-coded order. This keeps the encoder correct even if the column
# order of the source data changes; an unexpected categorical column fails
# loudly with a KeyError here rather than being encoded against the wrong list.
category_orders = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}

cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(
            categories=[category_orders[col] for col in categorical_cols])),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns through its matching pipeline.
column_pipelines = [
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [81]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The seed is fixed so the split, and
# every metric computed from it, is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [82]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split, so no test-set statistics leak into the
# imputers/scalers.
# The transformed arrays are wrapped back into DataFrames that keep the original
# row index; without `index=` they would get a fresh 0..n-1 index and no longer
# line up by label with y_train / y_test.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
    index=x_train.index,
)

x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
    index=x_test.index,
)

In [83]:
# Model Training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [84]:
# Baseline model: ordinary least squares fitted on the preprocessed training split.
regression = LinearRegression()
regression.fit(X=x_train, y=y_train)

In [85]:
# Display the coefficients learned by the fitted baseline linear regression.
regression.coef_

array([[ 5186.06288105,  -107.76919449,   -51.72550089, -2695.80757544,
         1733.85967975,   -27.38572298,   144.13559936,  -568.88829888,
          818.29807167]])

In [86]:
# Display the intercept term learned by the fitted baseline linear regression.
regression.intercept_

array([3945.0170587])

### Automating the Model Training process

In [87]:
import numpy as np
def evaluate_model(true, predicted):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, aligned row-for-row with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as an intermediate for the RMSE, so it is computed
    # exactly once here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [88]:
# Training Multiple Models
# Candidate linear regressors, all with their default hyper-parameters.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet()
}

# Parallel lists: model_list[k] is the name of the model whose test-set R2
# score is r2_list[k].
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Make Predictions
    # Metrics below are computed on the held-out test split, not the training data.
    y_pred = model.predict(x_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')
    print(f'R2 Score: {r2_square}')

    r2_list.append(r2_square)
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1497.8136836930128
MAE: 818.8155790792854
R2 Score: 0.8590298755263736


Lasso
Model Training Performance
RMSE: 1320.3274128514627
MAE: 814.3764605558664
R2 Score: 0.8904594856140777


Ridge
Model Training Performance
RMSE: 1476.3286090137285
MAE: 818.464162390759
R2 Score: 0.8630451025514674


Elasticnet
Model Training Performance
RMSE: 1659.273894209894
MAE: 1085.3043224405344
R2 Score: 0.8269993813411054




  model = cd_fast.enet_coordinate_descent(
