In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read the diamonds CSV from its relative path and preview the first rows
# as the cell's output.
df = pd.read_csv(filepath_or_buffer='data/diamonds.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Remove the columns that are not carried forward into the feature matrix.
# The explicit `columns=` keyword states the intent directly (a boolean axis
# flag only works because True == 1), and rebinding `df` avoids mutating the
# frame in place.
df = df.drop(columns=['Unnamed: 0', 'z', 'y'])

In [4]:
# Split the frame into the target and the feature matrix.
# `y` is selected with a list so it stays a one-column DataFrame.
y = df.loc[:, ['price']]
X = df.drop(columns=['price'])

In [5]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_feature = X.select_dtypes(exclude=['object']).columns
categorical_feature = X.select_dtypes(include=['object']).columns


In [6]:
# Custom ranking for each ordinal variable. When a list is passed as
# `categories` to an ordinal encoder, a value's position in the list is the
# integer code it is mapped to.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Numeric columns: fill missing values with the column median, then
# standardise each column.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, map every
# category to its rank, then standardise the resulting codes.
#
# OrdinalEncoder pairs `categories[i]` with the i-th input column, so the
# list is built from `categorical_feature` itself instead of relying on the
# columns happening to arrive in cut/color/clarity order. A categorical column
# without a declared ranking fails fast with a KeyError.
ordinal_rankings = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}

categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(
            categories=[ordinal_rankings[column] for column in categorical_feature]
        )),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns through its own pipeline. The transformer names
# given here show up as prefixes in the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', numerical_pipeline, numerical_feature),
        ('categorical_pipeline', categorical_pipeline, categorical_feature),
    ],
)

In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=27,
)

In [10]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# NOTE: this cell rebinds X_train / X_test to the transformed frames, so
# re-run the split cell before running it a second time.
train_features = preprocessor.fit_transform(X_train)
test_features = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

# Keep the original row labels so the transformed features stay aligned with
# y_train / y_test for any label-based operation (join, concat, .loc).
X_train = pd.DataFrame(train_features, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(test_features, columns=feature_names, index=X_test.index)

In [11]:
# Preview the first rows of the transformed training features.
X_train.head(n=5)

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,1.524119,-0.391051,0.245894,1.514417,0.086544,1.410779,-1.849266
1,-0.186265,1.719162,-0.203084,-0.126521,-1.705952,1.410779,-0.636992
2,-0.608582,-0.039349,-1.101039,-0.536755,0.982792,-0.351143,0.575283
3,0.468326,-0.883434,0.245894,0.720703,-0.809704,-0.938451,-1.243129
4,0.468326,1.4378,-1.101039,0.569094,-0.809704,0.236164,-1.243129


In [15]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
import xgboost as xgb
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [17]:
import numpy as np
def evaluate_model(y_test, y_pred):
    """Score predictions against the true target values.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, in the same row order as ``y_test``.

    Returns
    -------
    tuple
        ``(mae, rmse, score)``: the mean absolute error, the root mean squared
        error, and the value returned by ``r2_score`` (a fraction, not a
        percentage).
    """
    mae = mean_absolute_error(y_test, y_pred)
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    score = r2_score(y_test, y_pred)
    return mae, rmse, score

In [23]:
# Candidate regressors, each constructed with its default arguments.
# The keys are the display names used when results are reported.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
    'XGboost': xgb.XGBRegressor(),
}

# Parallel lists: model_list[i] is the name whose R^2 score is r2_list[i].
model_list = []
r2_list = []

# Fit every candidate on the training split and score it on the held-out
# test split.
for key, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae, rmse, score = evaluate_model(y_test, y_pred)
    model_list.append(key)
    r2_list.append(score)

    # The metrics are computed on X_test / y_test, so the header is labelled
    # as test performance rather than training performance.
    print('Model Test Performance')
    print('Model', key)
    print("RMSE:", rmse)
    print("MAE:", mae)
    # R^2 is reported as a percentage here; r2_list keeps the raw fraction.
    print("R2 score", score * 100)
    print('=' * 35)
    print('\n')



Model Training Performance
Model LinearRegression
RMSE: 1239.7550756820315
MAE: 800.3430119074743
R2 score 90.24786278772336


Model Training Performance
Model Lasso
RMSE: 1239.1349594507426
MAE: 801.5146069946622
R2 score 90.25761624023487


Model Training Performance
Model Ridge
RMSE: 1239.6978926849868
MAE: 800.445545020238
R2 score 90.24876239052966


Model Training Performance
Model Elasticnet
RMSE: 1671.480063903356
MAE: 1056.847662054455
R2 score 82.27319712848427


Model Training Performance
Model XGboost
RMSE: 552.5451038721293
MAE: 283.07699724328694
R2 score 98.06284896443692




In [24]:
# Display the model names in the order they were evaluated.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet', 'XGboost']

In [25]:
# Display the R^2 score recorded for each model, as fractions rather than
# percentages.
r2_list

[0.9024786278772337,
 0.9025761624023487,
 0.9024876239052966,
 0.8227319712848428,
 0.9806284896443692]