In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [10]:
df=pd.read_csv(r'C:\Users\ys136\Desktop\Data Science\Diamond Price Prediction\Notebooks\data\gemstone.csv')

In [11]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [12]:
df.drop(labels='id',axis=1,inplace=True)

In [13]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


Lets divide our data into Dependent And Independent Feature

In [14]:
X=df.drop(labels='price',axis=1)
y=df['price']

In [15]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [16]:
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [17]:
# segeregate numerical and categorical data

In [18]:
numerical_col=X.columns[X.dtypes!='O']
categorical_col=X.columns[X.dtypes=='O']

In [19]:
numerical_col

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [20]:
categorical_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [21]:
df.cut.unique()

array(['Premium', 'Very Good', 'Ideal', 'Good', 'Fair'], dtype=object)

In [22]:
cut_categories=['Fair','Good','Very Good','Premium','Ideal']

In [23]:
df.color.unique()

array(['F', 'J', 'G', 'E', 'D', 'H', 'I'], dtype=object)

In [24]:
color_categories=['D','E','F','G','H','I','J']

In [25]:
df.clarity.unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [26]:
clarity_categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

In [28]:
numerical_pipeline=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())

    ]

)
categorical_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)

In [29]:
preprocessor=ColumnTransformer(
    transformers=[
        ('numerical_pipeline',numerical_pipeline,numerical_col),
        ('categorical_pipeline',categorical_pipeline,categorical_col)
    ]
)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [32]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [33]:
X_train.head()

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.14467,1.352731


In [34]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet


In [35]:
models={
    
    'LinearRegression':LinearRegression(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'ridgeregreesion':Ridge(),
    'lassoregression':Lasso(),
    'ElasticNet':ElasticNet()
}

In [36]:
from sklearn.metrics import r2_score

In [37]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [38]:
def evaluate_model(true,predicted):
    r2score=r2_score(true,predicted)
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    return r2score,mse,mae,rmse



In [39]:
model_list=[]
test_model=[]
r2_list=[]
for i in range(len(list(models))):
     model=list(models.values())[i]
     model.fit(X_train, y_train)
     y_pred=model.predict(X_test)
     r2score,mae,mse,rmse=evaluate_model(y_test,y_pred)
     print(list(models.values())[i])
     model_list.append(list(models.values())[i])
     print('Model training performance')
     print('Accuracy',r2score)
     print('mse',mse)
     print('mae',mae)
     print('rmse',rmse)
     r2_list.append(r2score)
     print("="*35)




     

    

LinearRegression()
Model training performance
Accuracy 0.9368908248567511
mse 674.0255115796833
mae 1028002.7598132562
rmse 1013.9047094344005


DecisionTreeRegressor()
Model training performance
Accuracy 0.9572341768325102
mse 422.96004672360743
mae 696624.2886565146
rmse 834.6402150966095
Ridge()
Model training performance
Accuracy 0.9368906732505947
mse 674.055580079838
mae 1028005.2293677513
rmse 1013.9059272771568
Lasso()
Model training performance
Accuracy 0.9368940971841704
mse 675.071692336216
mae 1027949.4559693959
rmse 1013.8784226767013
ElasticNet()
Model training performance
Accuracy 0.8556494831165181
mse 1060.7368759154729
mae 2351365.3822896425
rmse 1533.416245606405
