In [20]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import sys



In [21]:
path=os.path.join('data','gemstone.csv')
df=pd.read_csv(path)
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [22]:
df=df.drop(labels=['id'],axis=1)

In [23]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [24]:
X=df.drop(labels=['price'],axis=1)
y=df['price']

In [25]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [26]:
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import  Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [28]:
categorical_col= X.select_dtypes(include='object').columns
numerical_col= X.select_dtypes(exclude='object').columns


In [29]:
categorical_col,numerical_col

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))

In [30]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']
    

In [31]:
Num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())

    ]
)


Cat_pipeline=Pipeline(

    steps=[

        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencodr',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)


processor=ColumnTransformer(

    [

        ('num_pipeline',Num_pipeline,numerical_col),
        ('cat_pipeline',Cat_pipeline,categorical_col)
    ]
)

In [33]:
print(processor)

ColumnTransformer(transformers=[('num_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')),
                                ('cat_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ordinalencodr',
                                                  OrdinalEncoder(categories=[['Fair',
                                                                              'Good',
                                                                              'Very '
                                                                              'Good',
          

In [34]:
from sklearn.model_selection import  train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=30)

In [35]:

X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
70432,0.53,Premium,E,VS2,60.8,56.0,5.24,5.21,3.19
64839,0.71,Very Good,H,SI1,62.9,57.0,5.67,5.69,3.56
185316,0.30,Ideal,H,IF,62.1,57.0,4.27,4.29,2.66
84658,1.24,Premium,G,VS2,61.6,61.0,6.88,6.82,4.21
31953,0.36,Premium,E,VS1,60.4,58.0,4.60,4.63,2.80
...,...,...,...,...,...,...,...,...,...
83708,0.41,Ideal,F,VS2,61.4,57.0,4.76,4.79,2.93
66755,0.31,Ideal,E,VVS2,62.0,56.0,4.32,4.35,2.68
10324,0.33,Ideal,G,IF,60.6,56.0,4.51,4.54,2.74
157076,0.54,Ideal,F,VS1,61.4,56.0,5.27,5.25,3.23


In [36]:
X_train=pd.DataFrame(processor.fit_transform(X_train),columns=processor.get_feature_names_out())
X_test=pd.DataFrame(processor.transform(X_test),columns=processor.get_feature_names_out())

In [38]:
print(X_test)

       num_pipeline__carat  num_pipeline__depth  num_pipeline__table  \
0                -0.563818            -0.940974            -0.641861   
1                -0.174800             1.002333            -0.120719   
2                -1.060896             0.262026            -0.120719   
3                 0.970643            -0.200666             1.963849   
4                -0.931224            -1.311128             0.400423   
...                    ...                  ...                  ...   
38710            -0.823163            -0.385743            -0.120719   
38711            -1.039284             0.169487            -0.641861   
38712            -0.996060            -1.126051            -0.641861   
38713            -0.542206            -0.385743            -0.641861   
38714            -1.060896             0.076949            -0.641861   

       num_pipeline__x  num_pipeline__y  num_pipeline__z  cat_pipeline__cut  \
0            -0.428923        -0.463247        -0.499542

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [None]:
import numpy as np
def Evaluate_model(true,y_prdict):

    r2score=r2_score(true,y_prdict)
    MAE=mean_absolute_error(true,y_prdict)

    RMSE=np.sqrt(mean_squared_error(true,y_prdict))


    return r2score,MAE,RMSE


In [None]:
Model={"linearreagression":LinearRegression,'ridge':Ridge,'lasso':Lasso,'elasticnet':ElasticNet}
def ModelTraining(Model,X_train,X_test,y_train,y_test):
    for k,v in Model.items():
        report={}
        model=v()
        model.fit(X_train,y_train)

        y_predic_test=model.predict(X_test)

        r2s,mae,rmse=Evaluate_model(y_test,y_predic_test)

        report[k]=r2s


        print(report)
        # print("Model Name",k)
        # print("R2 score",r2s*100)
        # print("MAE",mae)
        # print("RMS",rmse)
        print()


In [None]:
import pandas as pd
import numpy as np
from src.logger import logging
from src.exception import CustomException
import os
import sys
# from src.utils import load_object
import pickle


In [39]:
data=pd.DataFrame([{'carat':[0.52],'depth':[60.2],'table':[50.6],'x':[3.2],'y':[2.3],'z':[4.5],'cut':['Premium'],'color':['E'],'clarity':['VS2']}])

In [40]:
# def load_object(file_path):
    # try:
    #     with open(file_path,'rb') as file_obj:
    #         return pickle.load(file_obj)
    # except Exception as e:
    #     # logging.info('Exception Occured in load_object function utils')
    #     raise CustomException(e,sys)

data

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,[0.52],[60.2],[50.6],[3.2],[2.3],[4.5],[Premium],[E],[VS2]


In [None]:
proccesr_path=os.path.join('artifacts','preprocesssor.pkl')
print(proccesr_path)#
model = pickle.load(open(proccesr_path, 'rb'))

artifacts\preprocesssor.pkl


FileNotFoundError: [Errno 2] No such file or directory: 'artifacts\\preprocesssor.pkl'

In [None]:
a=[[ -0.54071387, -34.94241387,   1.44481904 , -1.54534864,  -0.65211629,   0.6807403,   -3.14709146,  -1.54909596,  -1.98038114]]

In [None]:
Model_Path=os.path.join('artifacts','model.pkl')
# proccesr_path=os.path.join('artifacts','preprocesssor.pkl')

# processor=load_object(proccesr_path)
Models=pickle.load(open(Model_Path, 'rb'))

# scaled_data=processor.transform(data)
result=Models.predict(a)
print(result)

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts\\model.pkl'

In [None]:
Model={"linearreagression":LinearRegression,'ridge':Ridge,'lasso':Lasso,'elasticnet':ElasticNet}
def ModelTraining1(Model,X_train,X_test,y_train,y_test):
    report={}
    for k,v in Model.items():
       
        model=v()
        model.fit(X_train,y_train)

        y_predic_test=model.predict(X_test)

        r2s=r2_score(y_test,y_predic_test)
      
        report[k]=r2s

    return report    


In [None]:


model_dic:dict=ModelTraining1(Model,X_train,X_test,y_train,y_test)
model_dic

{'linearreagression': 0.9367464279030936,
 'ridge': 0.9367463537477555,
 'lasso': 0.936726371023356,
 'elasticnet': 0.8546982963140234}

In [None]:
model_dic

{'linearreagression': 0.9367464279030936,
 'ridge': 0.9367463537477555,
 'lasso': 0.936726371023356,
 'elasticnet': 0.8546982963140234}

In [None]:
best_model_report=max(sorted(model_dic.values()))
best_model_report

0.9367464279030936

In [None]:
best_model_name=list(model_dic.keys())[list(model_dic.values()).index(best_model_report)]
best_model_name

'linearreagression'

In [None]:
best_model=Model[best_model_name]
best_model

sklearn.linear_model._base.LinearRegression

In [None]:
max_report=max(sorted(ModelTraining(Model,X_train,X_test,y_train,y_test)))
#save this model in pikkle file

Model Name linearreagression
R2 score 93.67464279030936
MAE 675.5483623703053
RMS 1015.7752655619774

Model Name ridge
R2 score 93.67463537477555
MAE 675.5744982013293
RMS 1015.7758609839717

Model Name lasso
R2 score 93.6726371023356
MAE 676.7001806379913
RMS 1015.9362973179491

Model Name elasticnet
R2 score 85.46982963140233
MAE 1063.9940993761822
RMS 1539.5381615531726



TypeError: 'NoneType' object is not iterable

In [None]:
   models={
            'LinearRegression':LinearRegression(),
            'Lasso':Lasso(),
            'Ridge':Ridge(),
            'Elasticnet':ElasticNet()
           
                   }

In [None]:
best_model=models['Lasso']

In [None]:
best_model