In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data\gemstone.csv")

In [3]:
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
data = data.drop(labels=["id"],axis = 1)

In [7]:
X = data.iloc[:,:-1]
y = data['price']

In [6]:
y

0         13619.0
1         13387.0
2          2772.0
3           666.0
4         14453.0
           ...   
193568     1130.0
193569     2874.0
193570     3036.0
193571      681.0
193572     2258.0
Name: price, Length: 193573, dtype: float64

In [8]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((154858, 9), (38715, 9), (154858,), (38715,))

In [12]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
83475,0.32,Premium,E,SI1,61.6,58.0,4.38,4.41,2.71
160324,1.2,Premium,F,VS2,62.6,57.0,6.81,6.76,4.25
101740,1.5,Ideal,I,VS2,62.2,55.0,7.3,7.26,4.53
180341,1.67,Premium,I,SI2,61.9,59.0,7.65,7.61,4.71
48480,1.0,Good,H,VS2,63.7,60.0,6.34,6.3,4.02


In [9]:
cat_col = X.select_dtypes(include='object').columns
num_col = X.select_dtypes(exclude='object').columns
cat_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [10]:
cut_val = ['Ideal','Premium', 'Very Good', 'Good', 'Fair']
clarity_val = ['I1','SI2','SI1','VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_val = ['D','E','F','G','H','I','J']

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

num_Pipeline = Pipeline(
    steps = [
        ("imputer",SimpleImputer(strategy = 'median')),
        ("StandardScaler",StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps = [
        ("imputer",SimpleImputer(strategy = "most_frequent")),
        ("encoding",OrdinalEncoder(categories = [cut_val,color_val,clarity_val])),
        ("StandardScaler",StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ("num_transformer",num_Pipeline,num_col),
    ("cat_transformer",cat_pipeline,cat_col)
                ]
)

In [14]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
83475,0.32,Premium,E,SI1,61.6,58.0,4.38,4.41,2.71
160324,1.2,Premium,F,VS2,62.6,57.0,6.81,6.76,4.25
101740,1.5,Ideal,I,VS2,62.2,55.0,7.3,7.26,4.53
180341,1.67,Premium,I,SI2,61.9,59.0,7.65,7.61,4.71
48480,1.0,Good,H,VS2,63.7,60.0,6.34,6.3,4.02


In [None]:
X_train = preprocessor.fit_transform(X_train,y_train)

In [None]:
X_train.shape

In [15]:
new_X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
new_X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [16]:
new_X_train.head()

Unnamed: 0,num_transformer__carat,num_transformer__depth,num_transformer__table,num_transformer__x,num_transformer__y,num_transformer__z,cat_transformer__cut,cat_transformer__color,cat_transformer__clarity
0,-1.016395,-0.204317,0.402608,-1.202472,-1.187395,-1.194148,0.132842,-0.936018,-0.64895
1,0.882396,0.720758,-0.118536,0.985177,0.941823,1.036109,0.132842,-0.320002,0.017052
2,1.529711,0.350728,-1.160823,1.426308,1.394848,1.441611,-0.872563,1.528047,0.017052
3,1.896523,0.073206,0.923751,1.741402,1.711965,1.70229,0.132842,1.528047,-1.314953
4,0.450852,1.73834,1.444895,0.562052,0.52504,0.703019,2.143651,0.912031,0.017052


In [17]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [26]:
models = [LinearRegression,Lasso,Ridge,ElasticNet]
def model_trainer(models): 
    val = {}
    for model in models:
        ML_model = model()
        ML_model.fit(new_X_train,y_train)
        y_pred = ML_model.predict(new_X_test)
        r2_score1 = r2_score(y_test,y_pred)
        mean_absolute_error1 = mean_absolute_error(y_test,y_pred)
        mean_squared_error1 = mean_squared_error(y_test,y_pred)
        metrics = [r2_score1,mean_absolute_error1,mean_squared_error1]
        val[model] = metrics
    
    return val
    

    

In [58]:
def Model_E(val = model_trainer(models)):
    m = 0.0
    for v in val:
        model_name = str(v)
        m_name = model_name.split(".")[-1]
        print(f"Model Name : {m_name}")
        Accuracy = val[v][0]
        print(f"Model Accuracy : {val[v][0]}")
        print(f"MSE : {val[v][2]}")
        print(f"MAE : {val[v][1]}","\n")
        m = float(max(m,Accuracy))
        if m <= Accuracy:
            best_model = m_name
            best_accuracy = Accuracy
    print(f"Best Model is : {best_model} with {best_accuracy} Accuracy")


In [59]:
Model_E()

Model Name : LinearRegression'>
Model Accuracy : 0.9372975155452961
MSE : 1013245.5452810804
MAE : 671.5856392794417 

Model Name : Lasso'>
Model Accuracy : 0.9372637996466201
MSE : 1013790.3798986507
MAE : 672.8634885961555 

Model Name : Ridge'>
Model Accuracy : 0.9372968620700983
MSE : 1013256.1051630179
MAE : 671.6137412324589 

Model Name : ElasticNet'>
Model Accuracy : 0.8540772247198525
MSE : 2358050.136186741
MAE : 1063.3297735460817 

Best Model is : LinearRegression'> with 0.9372975155452961 Accuracy
