In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("./data/diamonds.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
df.drop(labels=["Unnamed: 0"],axis=1,inplace=True)

In [5]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [6]:
df.select_dtypes(include="object")

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2
...,...,...,...
53935,Ideal,D,SI1
53936,Good,D,SI1
53937,Very Good,D,SI1
53938,Premium,H,SI2


In [7]:
X=df.drop(labels=['price'],axis=1)
y=df["price"]

In [8]:
categorical_col=X.select_dtypes(include="object").columns

In [9]:
numerical_col=X.select_dtypes(exclude="object").columns

In [10]:
categorical_col,numerical_col

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))

In [11]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [12]:
cut_category=['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
color_category=['E', 'I', 'J', 'H', 'F', 'G', 'D']
clarity_category=['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']

Now we have to import some sklearn library for pipleline and imputer and standardscaling,Ordinalencoding ,transformer,pipleline

In [13]:
from sklearn.impute import SimpleImputer # for impute the missing values
from sklearn.preprocessing import StandardScaler # for scaling purpose
from sklearn.preprocessing import OrdinalEncoder # for encode the categorical data

# now it for pipleline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
# now we have to create a piple line for that we use the object of pipleline class/library
# In a project there will be two pipleline first for numberical data and second for categorical data
num_pipeline=Pipeline(

        [
            ("imputer",SimpleImputer()),
            ("scaler",StandardScaler())
        ]


)
cat_pipleline=Pipeline(
    [
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoding",OrdinalEncoder(categories=[cut_category,color_category,clarity_category]))

    ]
)

In [15]:
column_trans=ColumnTransformer(
    [
        ("numpipeline",num_pipeline,numerical_col),
        ("cat_pipleline",cat_pipleline,categorical_col)
    ]
)

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [17]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
19497,1.21,Ideal,H,VVS2,61.3,57.0,6.92,6.87,4.23
31229,0.31,Ideal,E,VS2,62.0,56.0,4.38,4.36,2.71
22311,1.21,Ideal,E,VS1,62.4,57.0,6.75,6.83,4.24
278,0.81,Ideal,F,SI2,62.6,55.0,5.92,5.96,3.72
6646,0.79,Ideal,I,VVS2,61.7,56.0,5.94,5.95,3.67
...,...,...,...,...,...,...,...,...,...
11284,1.05,Very Good,I,VS2,62.4,59.0,6.48,6.51,4.05
44732,0.47,Ideal,D,VS1,61.0,55.0,5.03,5.01,3.06
38158,0.33,Very Good,F,IF,60.3,58.0,4.49,4.46,2.70
860,0.90,Premium,J,SI1,62.8,59.0,6.13,6.03,3.82


In [18]:
column_trans.fit_transform(X_train)

array([[ 0.86265905, -0.31143736, -0.20709927, ...,  0.        ,
         3.        ,  4.        ],
       [-1.02988861,  0.17854897, -0.65621302, ...,  0.        ,
         0.        ,  3.        ],
       [ 0.86265905,  0.45854116, -0.20709927, ...,  0.        ,
         0.        ,  2.        ],
       ...,
       [-0.987832  , -1.01141784,  0.24201448, ...,  3.        ,
         4.        ,  7.        ],
       [ 0.21078152,  0.73853335,  0.69112823, ...,  1.        ,
         2.        ,  1.        ],
       [ 0.7154609 , -0.94141979,  0.24201448, ...,  1.        ,
         4.        ,  1.        ]])

In [19]:
column_trans.transform(X_test)

array([[-1.17708677,  0.24854702, -0.65621302, ...,  0.        ,
         5.        ,  5.        ],
       [-0.46212432, -1.22141198, -0.20709927, ...,  3.        ,
         4.        ,  4.        ],
       [-0.84063385,  0.24854702, -1.10532676, ...,  0.        ,
         0.        ,  4.        ],
       ...,
       [ 1.49350827, -0.6614276 ,  1.14024197, ...,  1.        ,
         5.        ,  3.        ],
       [-0.18875632, -1.9913905 ,  1.58935572, ...,  2.        ,
         4.        ,  4.        ],
       [-0.84063385, -0.73142565, -0.65621302, ...,  0.        ,
         4.        ,  5.        ]])

In [20]:
column_trans.get_feature_names_out()

array(['numpipeline__carat', 'numpipeline__depth', 'numpipeline__table',
       'numpipeline__x', 'numpipeline__y', 'numpipeline__z',
       'cat_pipleline__cut', 'cat_pipleline__color',
       'cat_pipleline__clarity'], dtype=object)

In [21]:
# now we have to create the dataframe because when we transform and fit_transform data then it convert into the data into numpy like array,so we have to create the dataframe.
X_train=pd.DataFrame(column_trans.fit_transform(X_train),columns=column_trans.get_feature_names_out())
X_test=pd.DataFrame(column_trans.fit_transform(X_test),columns=column_trans.get_feature_names_out())


In [22]:
X_test

Unnamed: 0,numpipeline__carat,numpipeline__depth,numpipeline__table,numpipeline__x,numpipeline__y,numpipeline__z,cat_pipleline__cut,cat_pipleline__color,cat_pipleline__clarity
0,-1.177152,0.235906,-0.642780,-1.562934,-1.523982,-1.536090,0.0,5.0,5.0
1,-0.454302,-1.220570,-0.198865,-0.251053,-0.269090,-0.394598,3.0,4.0,4.0
2,-0.836987,0.235906,-1.086695,-0.857909,-0.870024,-0.842525,0.0,0.0,4.0
3,-0.773206,-0.665722,-0.198865,-0.715120,-0.737465,-0.799177,1.0,0.0,4.0
4,1.607947,0.374617,-1.086695,1.533821,1.454177,1.556054,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
16177,-0.815727,-0.388298,0.688966,-0.840061,-0.799326,-0.856974,1.0,0.0,1.0
16178,0.566192,0.860109,-1.086695,0.686006,0.632311,0.790242,0.0,3.0,1.0
16179,1.522906,-0.665722,1.132881,1.515972,1.463014,1.397112,1.0,5.0,3.0
16180,-0.177918,-1.983486,1.576796,0.070225,0.155099,-0.134511,2.0,4.0,4.0


In [23]:
X_test.shape

(16182, 9)

In [24]:
X_train.shape

(37758, 9)

After making the DataFrame in term of fit_transformer and transformer , Now we are able to train our model for that we have to choose a specific machine learning algorithm and on the behalf of algorithm then we check accuracy... and so another things.
lets start with importing statements and train the model

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
reg=LinearRegression()

In [27]:
reg.fit(X_train,y_train)

In [28]:
Y_predict=reg.predict(X_test)

In [29]:
Y_predict

array([  816.18300917,  2649.27570532,  1125.70410625, ...,
       10168.35664551,  3680.06482443,  1837.28977169])

In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [31]:
r2_scores=r2_score(y_test,Y_predict)

In [32]:
r2_scores

0.8736204489069922

In [33]:
mae=mean_absolute_error(y_test,Y_predict)
mae

855.7543809306109

In [34]:
mse=mean_squared_error(y_test,Y_predict,squared=False)
mse

1403.9269382386321

for perfect and modular coding for linear regression and ridge regression and lasso regression,and elasticnet regression ,,we will write  a code 

In [35]:
import numpy as np

In [43]:
def accuracy_check(true_value,predict_value):
    mae=mean_absolute_error(true_value,predict_value)
    mse=mean_squared_error(true_value,predict_value)
    rmse=np.sqrt(mean_squared_error(true_value,predict_value))
    r2score=r2_score(true_value,predict_value)*100
    return (mae,mse,rmse,r2score)

In [37]:
"""# for train different different module we have to import first,
for train all modules at a time then we have to make a dictionary and gives parameters as name and modules
and we have also makes some empty list for further print statements"""

'# for train different different module we have to import first,\nfor train all modules at a time then we have to make a dictionary and gives parameters as name and modules\nand we have also makes some empty list for further print statements'

In [38]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet

In [39]:
modules={
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "ElasticNet":ElasticNet()
}

In [40]:
module_list=[]
r2_score_list=[]

In [45]:
for a in range(len(list(modules))):
    module=list(modules.values())[a]
    module.fit(X_train,y_train)
    # make prediction along with prediction feature
    Y_predict=module.predict(X_test)

    # now for check accuracy we have to call accuracy_check method and stored individual in the variables
    mae,mse,rmse,r2score=accuracy_check(y_test,Y_predict)

    print(list(modules.keys())[a])

    module_list.append(module)
    print("mae is ",mae)
    print("mse is ",mse)
    print("rmse is ",rmse)
    print("r2 score is ",r2score)

    r2_score_list.append(r2_score)
    print("="*40)
    print("\n")


LinearRegression
mae is  855.7543809306109
mse is  1971010.8479121001
rmse is  1403.9269382386321
r2 score is  87.36204489069922


Ridge
mae is  855.8993027222566
mse is  1971039.856319798
rmse is  1403.9372693677585
r2 score is  87.36185889123877


Lasso
mae is  857.2153835274129
mse is  1971820.4695099283
rmse is  1404.2152504192254
r2 score is  87.3568536653847


ElasticNet
mae is  1114.4619236873639
mse is  2789459.9119238206
rmse is  1670.1676298874377
r2 score is  82.11421860847135


