In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Gemstone price data, read directly from the raw CSV hosted in the
# FSDSRegression GitHub repository (requires network access).
GEMSTONE_CSV_URL = 'https://raw.githubusercontent.com/krishnaik06/FSDSRegression/main/notebooks/data/gemstone.csv'
df = pd.read_csv(GEMSTONE_CSV_URL)

In [3]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# 'id' is only a row identifier, so it is removed before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
df = df.drop(columns=['id'], errors='ignore')

In [5]:
# Target vector: the price column.
Y = df['price']
# Feature matrix: every remaining column.
X = df.drop(columns=['price'])

In [6]:
# Split the feature names by dtype so each group can be routed to its own
# preprocessing pipeline. select_dtypes is used instead of comparing against
# the literal "object" dtype so that text columns stored with pandas'
# dedicated string dtype are still treated as categorical.
numerical_col = X.select_dtypes(include="number").columns
categorical_col = X.select_dtypes(exclude="number").columns


In [7]:
# Custom ranking for each categorical variable. The position of a label in
# its list is the rank it is given when these lists are passed to the
# ordinal encoder.
cut_category = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_category = list('DEFGHIJ')
clarity_category = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [8]:
from sklearn.impute import SimpleImputer # handling missimg values
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.preprocessing import OrdinalEncoder # handling ordinal data
from sklearn.pipeline import Pipeline #pipline
from sklearn.compose import ColumnTransformer


In [9]:
# Numerical branch: fill missing values with the column median, then
# standardise each feature.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent label,
# map labels to their custom rank, then standardise the resulting codes.
# The category lists are matched to columns by position, so their order must
# follow the order of categorical_col.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    (
        'encoding',
        OrdinalEncoder(
            categories=[cut_category, color_category, clarity_category]
        ),
    ),
    ('scaler', StandardScaler()),
])

# The two branches were built separately; the ColumnTransformer applies each
# one to its own set of columns and concatenates the results.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_col),
    ('cat_pipeline', cat_pipeline, categorical_col),
])


In [10]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)

In [11]:
#transforming the data
# preprocessor.fit_transform(x_train)

In [12]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split (no statistics are learned from test data).
# The original row index is passed through so the transformed frames stay
# label-aligned with y_train / y_test; without it the frames get a fresh
# 0..n-1 index and any index-based join with the targets would mismatch.
# NOTE: this cell rebinds x_train / x_test, so re-running it requires
# re-running the train/test split cell first.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
    index=x_train.index,
)
x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
    index=x_test.index,
)

In [14]:
## Model training 
from sklearn.linear_model import LinearRegression, Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

In [15]:
# Baseline model: a plain linear regression fitted on the preprocessed
# training features.
regressor=LinearRegression()
regressor.fit(x_train,y_train)

In [16]:
# Fitted coefficients, one per column of the transformed training frame.
regressor.coef_

array([ 6432.59272318,  -133.11853452,   -70.36485019, -1713.18964719,
        -490.48291102,   -68.02812257,    68.36709467,  -464.25812278,
         651.94096231])

In [17]:
# Fitted intercept term of the baseline model.
regressor.intercept_

3979.27372333125

In [18]:
def evaluation_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)``: mean absolute error, mean squared
        error, root mean squared error and the R^2 score.
    """
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    # Derive RMSE from the MSE already computed instead of scoring twice.
    rmse=np.sqrt(mse)
    r2_square=r2_score(true,predicted)
    return mae,mse,rmse,r2_square

In [22]:
# Fit several linear models on the same preprocessed split and report how
# each one scores on the held-out test set.
models = {
    "LinearRegression": LinearRegression(),
    'Lasso': Lasso(),
    "Ridge": Ridge(),
    'ElasticNet': ElasticNet(),
}
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Predictions are made on x_test, so the metrics below are test-set
    # metrics.
    y_pred = model.predict(x_test)
    mae, mse, rmse, r2_square = evaluation_model(y_test, y_pred)

    # Keep the name and R^2 for later comparison.
    model_list.append(model_name)
    r2_list.append(r2_square)

    print(model_name)
    # NOTE(review): this label says "training" although the numbers come from
    # the test split; kept unchanged to match the recorded output.
    print("Model training performance")
    print("RMSE", rmse)
    print('MAE', mae)
    print("R2 Score", r2_square * 100)
    print("=" * 30)
    print('\n')

LinearRegression
Model training performance
RMSE 1014.2749330568641
MAE 674.7352796098304
R2 Score 93.63893549824441


Lasso
Model training performance
RMSE 1014.3366158273099
MAE 675.8986621286323
R2 Score 93.63816178295377


Ridge
Model training performance
RMSE 1014.2792052203256
MAE 674.7687088427485
R2 Score 93.63888191205454


ElasticNet
Model training performance
RMSE 1534.0746306294475
MAE 1061.3169023914195
R2 Score 85.4483784776376


