In [96]:
import pandas as pd
df = pd.read_csv("GemStone.csv")

In [97]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [98]:
df = df.drop(columns = ['Unnamed: 0'],axis = 1)

In [99]:
df.duplicated().sum()

np.int64(34)

In [100]:
df = df.drop_duplicates()

In [66]:
df.duplicated().sum()

np.int64(0)

In [101]:
df = df.drop(columns =['x','y','z','depth'])

In [102]:
num_features = df.select_dtypes(exclude = 'O').columns.drop('price')
print('num_features:',list(num_features))

num_features: ['carat', 'table']


In [103]:
cat_features = [features for features in df.columns if df[features].dtype == 'O']
print("cat features:",cat_features)

cat features: ['cut', 'color', 'clarity']


In [104]:
from sklearn.impute import SimpleImputer #uses to fill null values
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  #To Combine Multtiple Steps

In [105]:
#To perform ordinal encoding we need to perform based on ranking 
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [106]:
#Numeric Categories
numeric_pipeline = Pipeline(
    [
        ("SimpleImputer:",SimpleImputer(strategy='median')),
        ("Scaler:",StandardScaler())
    ]
)

In [107]:
categoric_pipeline = Pipeline(
    [
        ('SimpleImputer:',SimpleImputer(strategy = 'most_frequent')),
        ('OrdinalEncoder:',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ("StandardScaler:",StandardScaler()) #we are Performing scaling also because if i have 10 different data's in my 
        #categorical variable ten i will have ranking from 1 to 10 then to increase accuracy we use scaling to make it from 0 to 1
    ]
)

In [112]:
preprocessor = ColumnTransformer(
    [
        ("numeric_pipeline:",numeric_pipeline,num_features),
        ("categoric_pipeline:", categoric_pipeline,cat_features)
        ]
)

In [113]:
print(preprocessor)

ColumnTransformer(transformers=[('numeric_pipeline:',
                                 Pipeline(steps=[('SimpleImputer:',
                                                  SimpleImputer(strategy='median')),
                                                 ('Scaler:',
                                                  StandardScaler())]),
                                 Index(['carat', 'table'], dtype='object')),
                                ('categoric_pipeline:',
                                 Pipeline(steps=[('SimpleImputer:',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('OrdinalEncoder:',
                                                  OrdinalEncoder(categories=[['Fair',
                                                                              'Good',
                                                                              'Very '
                                         

In [109]:
x = df.drop(columns=["price"],axis = 1)
y = df['price']

In [44]:
x

Unnamed: 0,carat,cut,color,clarity,table
0,0.30,Ideal,E,SI1,58.0
1,0.33,Premium,G,IF,58.0
2,0.90,Very Good,E,VVS2,60.0
3,0.42,Ideal,F,VS1,56.0
4,0.31,Ideal,F,VVS1,59.0
...,...,...,...,...,...
26962,1.11,Premium,G,SI1,58.0
26963,0.33,Ideal,H,IF,55.0
26964,0.51,Premium,E,VS2,58.0
26965,0.27,Very Good,F,VVS2,56.0


In [45]:
x.isnull().sum()

carat      0
cut        0
color      0
clarity    0
table      0
dtype: int64

In [114]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25,random_state = 42)

In [115]:
x_train.shape

(20199, 5)

In [119]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train),columns = preprocessor.get_feature_names_out())
x_train

Unnamed: 0,numeric_pipeline:__carat,numeric_pipeline:__table,categoric_pipeline:__cut,categoric_pipeline:__color,categoric_pipeline:__clarity
0,-0.543717,-0.657759,0.983195,0.230035,1.781373
1,-0.585546,-1.105275,0.983195,0.230035,-1.249145
2,0.209209,-0.210243,0.983195,1.992736,2.387476
3,0.020978,-1.105275,-1.712795,-1.532666,-1.249145
4,-1.045668,-0.657759,0.983195,0.230035,1.781373
...,...,...,...,...,...
20194,-0.606461,-1.105275,0.983195,-0.945099,-0.643042
20195,-0.773778,-1.552792,0.983195,0.230035,2.387476
20196,-0.857436,-0.657759,0.983195,0.230035,1.781373
20197,-1.003838,-0.210243,0.983195,-0.357532,1.175269


In [120]:
x_test

Unnamed: 0,carat,cut,color,clarity,table
3916,0.41,Premium,G,VS2,59.0
10439,0.31,Very Good,I,VVS1,60.0
24237,0.41,Ideal,J,VS2,55.0
9590,0.27,Very Good,E,VVS2,61.0
7451,1.24,Premium,D,SI1,59.0
...,...,...,...,...,...
13977,0.37,Ideal,D,VS2,56.0
6426,0.32,Premium,H,VS2,58.0
3168,0.51,Ideal,D,VS1,54.0
23159,0.31,Ideal,G,VS2,57.0


In [122]:
x_test = pd.DataFrame(preprocessor.transform(x_test),columns = preprocessor.get_feature_names_out())

In [123]:
x_test.head()

Unnamed: 0,numeric_pipeline:__carat,numeric_pipeline:__table,categoric_pipeline:__cut,categoric_pipeline:__color,categoric_pipeline:__clarity
0,-0.815607,0.684789,0.084532,0.230035,-0.036938
1,-1.024753,1.132305,-0.814131,1.405169,1.781373
2,-0.815607,-1.105275,0.983195,1.992736,-0.036938
3,-1.108412,1.579821,-0.814131,-0.945099,1.175269
4,0.920306,0.684789,0.084532,-1.532666,-0.643042


In [89]:
x_train.isnull().sum()

carat      0
cut        0
color      0
clarity    0
table      0
dtype: int64

In [70]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [71]:
linear = LinearRegression()
ridge = Ridge()
lasso = Lasso()
elastic = ElasticNet()

In [81]:
linear.fit(x_train,y_train)
print("Slope:",linear.coef_, sep = ',')
print("intercept:",linear.intercept_)

Slope:,[4211.03201747  -36.0706491   160.48486712 -557.67741318  858.22306872]
intercept: 3948.8576167136994


In [124]:
y_pred = linear.predict(x_test)

In [127]:
import numpy as np
def evalute_model(y_test,y_pred):
    mae = mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,y_pred)
    return mae,mse,rmse,r2

In [142]:
#Training Multiple Models
models = {
    "ridge" : ridge,
    "lasso" : lasso,
    "elasticnet" : elastic,
}
for model_name,model_evalation in models.items():
    print(model_name)
    model_evalation.fit(x_train,y_train)
    y_pred = model_evalation .predict(x_test)
    mae,mse,rmse,r2 = evalute_model(y_test,y_pred)
    print("rmse:",rmse)
    print("mae:",mae)
    print("mse:",mse)
    print("r2_score:",round(r2*100,4), "percentage")
    print('\n' + '-'*30 + '\n')

ridge
rmse: 1249.9142772066668
mae: 869.1441329672422
mse: 1562285.7003650642
r2_score: 90.4161 percentage

------------------------------

lasso
rmse: 1249.935411110729
mae: 868.7717876580706
mse: 1562338.531948547
r2_score: 90.4158 percentage

------------------------------

elasticnet
rmse: 1961.86204500826
mae: 1315.5978132874639
mse: 3848902.6836439925
r2_score: 76.3888 percentage

------------------------------

