In [2]:
import pandas as pd

In [3]:
# Load the raw gemstone table from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer="gemstone.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [4]:
# 'Unnamed: 0' appears to be a leftover row counter (1, 2, 3, ... in the
# preview above), so it is removed before modelling.
# `columns=` already selects the axis, so the extra `axis=1` was redundant.
# errors="ignore" keeps this cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Preview the first rows again to check the frame after the column drop.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [6]:
# Features: every column except the target.
X = df.drop(columns=["price"])
# Target: kept as a one-column DataFrame (not a Series).
y = df.loc[:, ["price"]]

In [7]:
# Summarise dtypes and non-null counts of the feature frame; columns with fewer
# non-null entries than rows are the ones that will need imputation.
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26967 entries, 0 to 26966
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    26967 non-null  float64
 1   cut      26967 non-null  object 
 2   color    26967 non-null  object 
 3   clarity  26967 non-null  object 
 4   depth    26270 non-null  float64
 5   table    26967 non-null  float64
 6   x        26967 non-null  float64
 7   y        26967 non-null  float64
 8   z        26967 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.9+ MB


In [8]:
# Plain Python list of the feature column names, shown for reference.
cols = X.columns.tolist()
cols


['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']

In [9]:
# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

In [10]:
# Show which columns were selected as numerical (all non-object dtype columns of X).
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [11]:
# Explicit rank order for each ordinal feature; the position in a list is the
# integer code the ordinal encoder assigns to that label.
# NOTE(review): `cut` looks ordered worst -> best while `color` and `clarity`
# look ordered best -> worst. That only flips coefficient signs for a linear
# model, but confirm the mixed direction is intended.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = ["D", "E", "F", "G", "H", "I", "J"]
clarity_categories = [
    "IF",
    "VVS1",
    "VVS2",
    "VS1",
    "VS2",
    "SI1",
    "SI2",
    "I1",
]


In [12]:
#automating missing values
from sklearn.impute import SimpleImputer  
#feature scaling
from sklearn.preprocessing import StandardScaler 
#ordinal encoding the values in categorical columns 
from sklearn.preprocessing import OrdinalEncoder 

## create pipelines so that values get adjusted in their respective columns 
from sklearn.pipeline import Pipeline

## Now we are grouping all the above steps into one 
from sklearn.compose import ColumnTransformer


In [13]:
#creating numerical pipeline 

import numpy as np
# Numerical features: fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent label, map
# each label to its position in the custom rankings, then standardise the codes.
# The rankings are listed in the order cut, color, clarity, which must match the
# order of the categorical columns handed to this pipeline.
ordinal_rankings = [cut_categories, color_categories, clarity_categories]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinalencoder", OrdinalEncoder(categories=ordinal_rankings)),
    ("scaler", StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, list(numerical_cols)),
        ("cat_pipeline", cat_pipeline, list(categorical_cols)),
    ]
)

In [14]:
## train test split 
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [15]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformation to the test split, so no test-set statistics leak into
# the imputers or scalers.
# The original row labels are passed as `index=` so the transformed frames stay
# aligned with y_train / y_test; without it they would get a fresh 0..n-1 index
# that no longer matches the shuffled target index.
# NOTE: this cell overwrites X_train / X_test, so it is not re-runnable on its
# own; re-run the train/test split cell first.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [18]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [20]:
# Baseline: ordinary least squares on the preprocessed training features.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [21]:
# Fitted weights of the linear model, one per preprocessed feature column
# (2-D here, presumably because the target was passed as a one-column DataFrame).
regression.coef_

array([[ 5134.94904902,   -67.41049312,   -63.44838633, -2608.5353019 ,
         1988.31634422,  -361.0516354 ,   138.46524479,  -572.15521544,
         -817.73546177]])

In [22]:
# Fitted intercept (bias) term of the linear model.
regression.intercept_

array([3918.29561348])

In [25]:
#function to find errors between predicted and true values 
import numpy as np
def evaluate_model(true, predicted):
    """Score predictions against ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions for the same rows as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2score)``: mean absolute error, root mean squared
        error, and coefficient of determination, in that order. Note that the
        first element is the MAE, not the MSE.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it rather than calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2score = r2_score(true, predicted)
    return mae, rmse, r2score
    

In [35]:
# Train several linear models on the same preprocessed data and compare them
# on the held-out test split.
models = {
    'Linear regression': LinearRegression(),
    'lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}
model_list = []     # model names, in evaluation order
r2_score_list = []  # matching test-set R² values, as percentages

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on the test split, which the model has not seen during fitting.
    y_pred = model.predict(X_test)

    # evaluate_model returns (mae, rmse, r2): the first value is the mean
    # ABSOLUTE error, so it is named and reported as "mae" (it was previously
    # unpacked and printed as "mse", mislabeling the metric).
    mae, rmse, r2score = evaluate_model(y_test, y_pred)

    model_list.append(model_name)
    r2_score_list.append(r2score * 100)

    print("model performance with the respective errors:")
    print(model)
    print("mae:", mae)
    print("rmse:", rmse)
    print("r2_Score:", r2score * 100)
    print('*' * 50)

model performance with the respective errors:
LinearRegression()
mse: 835.8862454561088
rmse: 1587.1167009084086
r2_Score: 84.78189531683755
**************************************************
model performance with the respective errors:
Lasso()
mse: 830.6779148344891
rmse: 1367.9530690614838
r2_Score: 88.69461803291462
**************************************************
model performance with the respective errors:
Ridge()
mse: 835.4727156231977
rmse: 1561.732000685184
r2_Score: 85.26480584831465
**************************************************
model performance with the respective errors:
ElasticNet()
mse: 1091.7710931670827
rmse: 1704.2038945761692
r2_Score: 82.45368382630797
**************************************************


  model = cd_fast.enet_coordinate_descent(


In [36]:
# Model names in the order they were trained and evaluated.
model_list

['Linear regression', 'lasso', 'Ridge', 'ElasticNet']

In [37]:
# Test-set R² of each model as a percentage, in the same order as model_list.
r2_score_list

[84.78189531683755, 88.69461803291462, 85.26480584831465, 82.45368382630797]