In [224]:
import pandas as pd

In [211]:
# Read the raw gemstone table from disk and preview its first rows
GEMSTONE_CSV = "data/gemstone.csv"
df = pd.read_csv(GEMSTONE_CSV)
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [212]:
# Drop the "id" column before modelling.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it after "id" is already gone no longer raises KeyError.
df = df.drop(columns=["id"], errors="ignore")

In [213]:
## Separate the regression target ("price") from the predictor columns
target_col = "price"
y = df[target_col]
X = df.drop(columns=[target_col])

In [214]:
# Group the feature names by dtype: object-typed columns are treated as categorical,
# every other column as numerical
object_features = X.select_dtypes(include=["object"])
non_object_features = X.select_dtypes(exclude=["object"])
categorical_cols = object_features.columns
numerical_cols = non_object_features.columns

In [215]:
# Explicit category orderings for an ordinal encoding of cut, color and clarity
# (presumably listed from lowest to highest grade — confirm against the data dictionary)
cut_category_order = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_category_order = list("JIHGFED")
clarity_category_order = "I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF".split()

In [216]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [217]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numerical pipeline: fill missing numeric values with the column mean.
# NOTE(review): there is no scaling step; the penalised linear models (Lasso, Ridge,
# ElasticNet) are sensitive to feature scale — consider adding StandardScaler here.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Categorical pipeline: fill missing labels with the most frequent one, then one-hot encode
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehotencoder", OneHotEncoder()),
    ]
)

# Apply each pipeline to its own column group and concatenate the results.
# OneHotEncoder emits sparse output by default, and ColumnTransformer returns a sparse
# matrix whenever the overall density falls below sparse_threshold (default 0.3).
# sparse_threshold=0 always yields a dense array, so the result can be wrapped safely
# in pd.DataFrame(..., columns=...).
preprocessor = ColumnTransformer(
    [
        ("num_col", num_pipeline, numerical_cols),
        ("cat_col", cat_pipeline, categorical_cols),
    ],
    sparse_threshold=0,
)

In [218]:
### This pipeline was shared by Krish, but I prefer one-hot encoding, so it is commented out.

# num_pipeline = Pipeline(
#     steps=[
#     ("imputer", SimpleImputer(strategy="mean")), 
#     ("scaler", StandardScaler())
#     ])

# cat_pipeline = Pipeline (
#     steps=[
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("ordinal", OrdinalEncoder(categories=[cut_category_order, color_category_order, clarity_category_order])),
#     ("scaler", StandardScaler())
#     ]
# )

# preprocessor = ColumnTransformer([
#     ("num_pipeline", num_pipeline, numerical_cols),
#     ('cat_pipeline', cat_pipeline, categorical_cols)
# ])

In [219]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [220]:
# Fit the preprocessor on the training split only, then apply the fitted transform to both splits.
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Feature names are only available after fitting; compute them once and reuse for both frames
feature_names = preprocessor.get_feature_names_out()

# Keep the original row labels so the transformed features stay aligned with y_train / y_test
# (without index=, the new frames get a fresh RangeIndex that no longer matches the targets)
X_train = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)

In [221]:
# Preview the first five rows of the transformed training features
X_train.head(n=5)

Unnamed: 0,num_col__carat,num_col__depth,num_col__table,num_col__x,num_col__y,num_col__z,cat_col__cut_Fair,cat_col__cut_Good,cat_col__cut_Ideal,cat_col__cut_Premium,...,cat_col__color_I,cat_col__color_J,cat_col__clarity_I1,cat_col__clarity_IF,cat_col__clarity_SI1,cat_col__clarity_SI2,cat_col__clarity_VS1,cat_col__clarity_VS2,cat_col__clarity_VVS1,cat_col__clarity_VVS2
0,0.32,61.6,58.0,4.38,4.41,2.71,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.2,62.6,57.0,6.81,6.76,4.25,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.5,62.2,55.0,7.3,7.26,4.53,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.67,61.9,59.0,7.65,7.61,4.71,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,63.7,60.0,6.34,6.3,4.02,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [222]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted target values, in the same order as ``true``.

    Returns:
        A tuple ``(mae, mse, rmse, r2_square)``: mean absolute error, mean squared
        error, root mean squared error, and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above; no need to score twice
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [223]:
# Train multiple models

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Candidate linear regressors, keyed by the short label used in the printed report
models = {
    "lr": LinearRegression(),
    "ls": Lasso(),
    "ri": Ridge(),
    "el": ElasticNet()
}

# One single-item list per model, each holding {estimator: test-set R^2}.
# NOTE(review): keying by the label (e.g. "lr") would make look-ups easier; the shape is
# kept as-is in case a later cell depends on it — confirm before changing.
r2_list = []

# Iterate over (label, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every access
for name, estimator in models.items():
    # Training
    model = estimator.fit(X_train, y_train)
    # Make Predictions
    y_pred = model.predict(X_test)
    # Evaluation on the held-out split
    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)
    r2_list.append([{estimator: r2_square}])
    print(f"""{name}->
    mse:{mse}
    mae: {mae}
    rmse: {rmse}
    r2 square: {r2_square}
    """)
    print("="*34)
print(r2_list)

lr->
    mse:892422.6213186507
    mae: 623.3302789616428
    rmse: 944.6812273559005
    r2 square: 0.9447743779374466
    
ls->
    mse:893501.5908891577
    mae: 621.5538302351229
    rmse: 945.2521308567136
    r2 square: 0.9447076082654386
    
ri->
    mse:892422.0774329444
    mae: 623.0237030419833
    rmse: 944.6809394885368
    r2 square: 0.9447744115946239
    
el->
    mse:3023870.770285766
    mae: 1203.3091247910272
    rmse: 1738.9280520728182
    r2 square: 0.8128743710249626
    
[[{LinearRegression(): 0.9447743779374466}], [{Lasso(): 0.9447076082654386}], [{Ridge(): 0.9447744115946239}], [{ElasticNet(): 0.8128743710249626}]]
