In [13]:
import pandas as pd
# Source CSV for the gemstone price dataset (fetched over HTTPS on every run).
GEMSTONE_CSV_URL = 'https://raw.githubusercontent.com/krishnaik06/FSDSRegression/main/notebooks/data/gemstone.csv'
df = pd.read_csv(GEMSTONE_CSV_URL)

In [14]:
# Work on a 5,000-row random subset of the full frame. The fixed seed makes
# Restart & Run All draw the same rows, so downstream outputs are reproducible.
data = df.sample(n=5000, random_state=42)

In [15]:
# Summarise the sampled frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 41321 to 75543
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       5000 non-null   int64  
 1   carat    5000 non-null   float64
 2   cut      5000 non-null   object 
 3   color    5000 non-null   object 
 4   clarity  5000 non-null   object 
 5   depth    5000 non-null   float64
 6   table    5000 non-null   float64
 7   x        5000 non-null   float64
 8   y        5000 non-null   float64
 9   z        5000 non-null   float64
 10  price    5000 non-null   int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 468.8+ KB


In [5]:
# Preview the first five rows of the sampled frame.
data.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
78293,78293,1.51,Ideal,F,VS2,62.4,57.0,7.35,7.3,4.57,12748
155707,155707,0.34,Premium,F,SI2,59.3,58.0,4.59,4.55,2.71,574
122702,122702,0.33,Premium,F,SI2,62.3,60.0,4.42,4.37,2.74,557
35854,35854,0.9,Good,G,SI1,64.0,59.0,6.02,6.07,3.86,3715
110826,110826,1.0,Ideal,I,SI1,61.6,56.0,6.44,6.41,3.95,4312


In [16]:
# 'id' is only a row identifier, so drop it before modelling.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been removed no longer raises KeyError.
data = data.drop(columns=['id'], errors='ignore')

In [17]:
# Confirm the 'id' column is gone by previewing the first three rows.
data.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
41321,1.57,Ideal,F,SI1,61.6,56.0,7.45,7.51,4.61,13112
64615,0.7,Premium,F,VS1,61.1,59.0,5.74,5.68,3.48,2819
169471,1.0,Very Good,H,VS1,62.7,57.0,6.34,6.39,3.99,5978


In [18]:
## Split into independent features (X) and the dependent target (Y)
X = data.drop(columns=['price'])
Y = data[['price']]  # double brackets keep Y as a one-column DataFrame

In [19]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [20]:
# Display the names of the object-dtype (categorical) feature columns.
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [21]:
# Display the names of the non-object (numerical) feature columns.
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [21]:
# Show a single row of the sampled frame.
data.head(1)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
78293,1.51,Ideal,F,VS2,62.4,57.0,7.35,7.3,4.57,12748


In [22]:
# Explicit level orderings handed to OrdinalEncoder; a level's position in its
# list is the integer it is encoded as.
cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = list("DEFGHIJ")
clarity_categories = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

In [1]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Numerical features: fill missing values with the column median, then
# standardise. Each step is a (name, transformer) tuple and the tuples must be
# comma-separated -- a comma hidden inside a trailing comment makes Python try
# to call the first tuple and raise TypeError.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),  # for missing values
        ('scaler', StandardScaler()),
    ]
)

In [5]:
# Categorical features: impute the most frequent level, encode each level as
# its rank in the explicit category lists, then standardise.
# The category lists are positional, so their order (cut, color, clarity) has
# to match the order of the columns fed into this pipeline.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])

In [7]:
# StandardScaler rescales each feature to zero mean and unit standard deviation.
# Putting all features on a comparable scale helps optimisation-based models converge to the minimum of the loss.


In [23]:
# Send each column group through its own pipeline; the transformed blocks are
# concatenated in the order listed here (numerical first, then categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [24]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [26]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split so no test statistics leak into training.
# NOTE: X_train / X_test are overwritten with their transformed versions, so
# this cell is meant to run exactly once after the train/test split cell.
X_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [29]:
# Preview the first five rows of the transformed training features.
X_train.head(5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.539689,-1.8144,-0.637478,0.830064,0.776282,0.592582,0.877709,0.917464,0.011423
1,-1.030351,0.764255,-0.11767,-1.276362,-1.253245,-1.205251,-1.177128,0.298989,0.011423
2,0.47427,2.606152,-0.11767,0.539523,0.483738,0.725209,-3.231965,-0.319487,-1.321236
3,-1.052157,0.395876,0.402139,-1.267283,-1.298955,-1.264196,-0.14971,0.298989,-0.654907
4,-0.616035,-0.064598,-0.637478,-0.568167,-0.540168,-0.556852,0.877709,0.298989,2.010411


In [31]:
# Preview the first five rows of the transformed test features.
X_test.head(5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.517883,0.764255,-0.637478,0.648476,0.684862,0.739945,-1.177128,-0.319487,-0.654907
1,1.237484,0.487971,-1.157286,1.265877,1.215099,1.314662,0.877709,0.917464,0.011423
2,0.452464,0.303781,0.921947,0.557682,0.648294,0.636791,-1.177128,-0.937962,0.011423
3,-0.637841,1.685204,0.921947,-0.67712,-0.631588,-0.512643,-2.204546,0.298989,0.677752
4,-0.201719,-1.261831,0.921947,-0.005243,0.044921,-0.100026,-1.177128,-0.319487,-0.654907


In [32]:
#Model Training

In [33]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [34]:
# Baseline: ordinary least-squares fit on the preprocessed training features.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [35]:
# Coefficients of the fitted LinearRegression, one per preprocessed feature column.
regression.coef_

array([[ 6462.15911067,   314.64845501,   -69.02713974,   241.65824718,
         2475.00538991, -5067.75968045,    53.61770148,  -472.65307633,
          645.8842759 ]])

In [36]:
# Intercept term of the fitted LinearRegression.
regression.intercept_

array([3960.92171429])

In [71]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values, aligned element-wise with ``true``.

    Returns:
        A tuple ``(mae, mse, rmse, r2_square)``: mean absolute error, mean
        squared error, root mean squared error and the R2 score (as a
        fraction, not a percentage).
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # Derive RMSE from the MSE already computed instead of scoring twice.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [76]:
# Candidate regressors, all with scikit-learn's default hyperparameters.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
}
model_list = []

# Fit every candidate on the training split and report its test-split metrics.
# Iterating over .items() yields each name/estimator pair directly, instead of
# rebuilding list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)
    print("RMSE:", rmse)
    print("mse:", mse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)  # reported as a percentage

    print('='*35)
    print('\n')
    

LinearRegression
RMSE: 1024.4179935105756
mse: 1049432.2254282336
MAE: 678.0753406837131
R2 score 93.7182257199594


Lasso
RMSE: 1023.5519239172431
mse: 1047658.54095469
MAE: 677.5545432163885
R2 score 93.72884278053448


Ridge
RMSE: 1023.8509772977533
mse: 1048270.8237135645
MAE: 678.6777046276638
R2 score 93.72517773004945


Elasticnet
RMSE: 1599.5700769309963
mse: 2558624.4310130333
MAE: 1095.6759021408734
R2 score 84.68438384721557




  model = cd_fast.enet_coordinate_descent(


In [77]:
# Names of the models that were trained and evaluated, in evaluation order.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']

In [79]:
# Select whichever model has the highest R2 score.