In [None]:
!pip install scikit-learn==1.3.0

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reading the Dataset
diamond_data = pd.read_csv("Data/Diamonds Prices2022.csv")

In [3]:
diamond_data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
diamond_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
diamond_data.shape

(53943, 10)

In [7]:
diamond_data.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [19]:
diamond_data['cut'].value_counts(normalize=True)

Ideal        0.399514
Premium      0.255696
Very Good    0.223996
Good         0.090948
Fair         0.029846
Name: cut, dtype: float64

In [8]:
# Label Encoding
cut_dict = {'Fair':0, "Good":1,"Very Good":2,\
           'Ideal':3,"Premium":4}
diamond_data['cut'].map(cut_dict)

0        3
1        4
2        1
3        4
4        1
        ..
53938    4
53939    3
53940    4
53941    4
53942    2
Name: cut, Length: 53943, dtype: int64

In [11]:
# One Hot Encoding
cat_columns = diamond_data.select_dtypes(include=['object']).columns

In [13]:
pd.get_dummies(diamond_data,columns=cat_columns).columns

Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'cut_Fair',
       'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D',
       'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J',
       'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')

In [14]:
pd.get_dummies(diamond_data,columns=cat_columns, drop_first=True).columns

Index(['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
       'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
       'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
       'clarity_VVS2'],
      dtype='object')

In [20]:
diamond_data['color'].value_counts()

G    11292
E     9799
F     9543
H     8304
D     6775
I     5422
J     2808
Name: color, dtype: int64

In [21]:
diamond_data['clarity'].value_counts()

SI1     13067
VS2     12259
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: clarity, dtype: int64

In [22]:
# Checking for the missing
diamond_data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [4]:
# Divide the dataset into train and test. 
# Target: price (output)
from sklearn.model_selection import train_test_split

In [5]:
# train_test_split => shuffles the data first, then it will split the data into train and test. 
X_train, X_test, Y_train, Y_test = train_test_split(diamond_data.drop('price', axis=1),\
                                                   diamond_data['price'], test_size=0.3, random_state=100)

In [25]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
20981,1.14,Ideal,D,VS1,59.8,57.0,6.83,6.74,4.06
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15577,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19651,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57
34795,0.4,Ideal,G,VS1,62.2,56.0,4.72,4.74,2.94


In [26]:
pd.get_dummies(X_train, columns=['cut','color','clarity'])

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
51798,0.70,62.6,60.0,5.61,5.57,3.50,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
6914,0.90,62.4,61.0,6.17,6.13,3.84,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
30755,0.25,61.6,57.0,4.10,4.08,2.52,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
12513,1.16,62.2,55.7,6.68,6.74,4.18,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
24869,1.52,62.7,58.0,7.38,7.29,4.60,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773,0.70,61.6,57.0,5.70,5.73,3.53,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
296,0.70,58.8,66.0,5.81,5.90,3.44,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
575,0.80,62.2,56.0,5.94,5.87,3.67,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
10357,1.11,62.3,58.0,6.60,6.65,4.13,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0


In [27]:
# One Hot Encoding => Method -1
pd.get_dummies(X_train, columns=['cut','color','clarity'], drop_first=True)
# Recommended for Data Analysis
# Not recommended for model building. 

Unnamed: 0,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
51798,0.70,62.6,60.0,5.61,5.57,3.50,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
6914,0.90,62.4,61.0,6.17,6.13,3.84,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
30755,0.25,61.6,57.0,4.10,4.08,2.52,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
12513,1.16,62.2,55.7,6.68,6.74,4.18,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
24869,1.52,62.7,58.0,7.38,7.29,4.60,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773,0.70,61.6,57.0,5.70,5.73,3.53,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
296,0.70,58.8,66.0,5.81,5.90,3.44,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
575,0.80,62.2,56.0,5.94,5.87,3.67,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
10357,1.11,62.3,58.0,6.60,6.65,4.13,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0


In [6]:
# Second Method
# scikit-learn or sklearn
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Filtering all categorical type data
X_train_obj = X_train.select_dtypes(['object'])

In [8]:
# Filtering all Numerical data type
X_train_numeric = X_train.select_dtypes(['int','float'])

In [9]:
oe = OneHotEncoder(drop='first', sparse_output=False)

In [32]:
####
"""
Very Very Very Imp Note:
On training data apply fit/fit_transform only. 
On testing data apply transform method only.
fit => Learn from the Data. 
transform => Apply the learning on the data. 
fit_transform => Learn from the data, Apply on the Data. 
"""

'\nVery Very Very Imp Note:\nOn training data apply fit/fit_transform only. \nOn testing data apply transform method only.\nfit => Learn from the Data. \ntransform => Apply the learning on the data. \nfit_transform => Learn from the data, Apply on the Data. \n'

In [33]:
diamond_data['cut'].value_counts()

Ideal        21551
Premium      13793
Very Good    12083
Good          4906
Fair          1610
Name: cut, dtype: int64

In [34]:
# Label Encoded 1
le = {'Fair':1,'Good':2,'Very Good':3,'Ideal':4, "Premium":5}
diamond_data['cut'].apply(lambda x:le[x])

0        4
1        5
2        2
3        5
4        2
        ..
53938    5
53939    4
53940    5
53941    5
53942    3
Name: cut, Length: 53943, dtype: int64

In [35]:
# Label Encoded 2
le = {'Fair':1,'Good':2,'Very Good':3,'Ideal':4, "Premium":5}
diamond_data['cut'].map(le) 

0        4
1        5
2        2
3        5
4        2
        ..
53938    5
53939    4
53940    5
53941    5
53942    3
Name: cut, Length: 53943, dtype: int64

In [34]:
X_train_obj

Unnamed: 0,cut,color,clarity
20981,Ideal,D,VS1
8725,Very Good,D,SI1
15577,Very Good,H,VVS2
19651,Premium,H,SI2
34795,Ideal,G,VS1
...,...,...,...
16304,Very Good,G,VS2
79,Very Good,E,VVS1
12119,Premium,G,VVS2
14147,Premium,I,SI1


In [40]:
pd.DataFrame(oe.fit_transform(X_train_obj), index=X_train_obj.index, columns=oe.get_feature_names_out())

Unnamed: 0,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
20981,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8725,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15577,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19651,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
34795,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
79,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12119,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14147,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [39]:
oe.get_feature_names_out()

array(['cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E',
       'color_F', 'color_G', 'color_H', 'color_I', 'color_J',
       'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'], dtype=object)

In [41]:
# It will remove column names, and index also removed.
X_train_obj_oe = pd.DataFrame(oe.fit_transform(X_train_obj), index=X_train_obj.index,\
            columns=oe.get_feature_names_out(X_train_obj.columns))
# Use for Modelling

In [10]:
from sklearn import set_config

In [11]:
set_config(transform_output='pandas') # Output as Pandas

In [12]:
oe.fit_transform(X_train_obj)

In [45]:
set_config(transform_output='default')# Default output of sklearn methods

In [46]:
oe.fit_transform(X_train_obj)

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [47]:
set_config(transform_output='pandas')

In [48]:
X_train_obj_processed = oe.fit_transform(X_train_obj)

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardization => StandardScaler
# MinMaxScaler => Normalization (Min Max normalization)

In [14]:
sc = StandardScaler()
mx = MinMaxScaler()

In [15]:
X_train_num_df = pd.DataFrame(sc.fit_transform(X_train_numeric), index=X_train_numeric.index,\
             columns=X_train_numeric.columns)

In [16]:
X_train_num_df = sc.fit_transform(X_train_numeric)

In [17]:
X_train_num_mx = mx.fit_transform(X_train_numeric)

In [18]:
X_train_num_df

Unnamed: 0,carat,depth,table,x,y,z
20981,0.715908,-1.366001,-0.199516,0.974713,0.872464,0.728356
8725,-0.883507,0.167319,-1.096033,-0.948477,-0.887412,-0.888909
15577,0.421279,0.585498,-0.647774,0.556241,0.567535,0.629914
19651,1.894423,-1.366001,1.593520,1.722621,1.639143,1.445578
34795,-0.841417,0.306712,-0.647774,-0.903959,-0.869987,-0.846720
...,...,...,...,...,...,...
16304,0.568593,0.794587,0.697002,0.671989,0.680794,0.770546
79,-1.136046,0.585498,0.697002,-1.491600,-1.436284,-1.395184
12119,0.231874,0.027927,0.248743,0.449397,0.367153,0.404903
14147,0.947402,0.446105,0.248743,1.028135,0.968299,1.051809


In [19]:
X_train_num_mx

Unnamed: 0,carat,depth,table,x,y,z
20981,0.218605,0.466667,0.269231,0.667644,0.114431,0.127673
8725,0.041860,0.527778,0.230769,0.456500,0.080136,0.091509
15577,0.186047,0.544444,0.250000,0.621701,0.108489,0.125472
19651,0.348837,0.466667,0.346154,0.749756,0.129372,0.143711
34795,0.046512,0.533333,0.250000,0.461388,0.080475,0.092453
...,...,...,...,...,...,...
16304,0.202326,0.552778,0.307692,0.634409,0.110696,0.128616
79,0.013953,0.544444,0.307692,0.396872,0.069440,0.080189
12119,0.165116,0.522222,0.288462,0.609971,0.104584,0.120440
14147,0.244186,0.538889,0.288462,0.673509,0.116299,0.134906


In [20]:
#X_train_obj_oe, X_train_num_df
X_train_processed = X_train_num_df.merge(X_train_obj_oe, left_index=True, right_index=True)

In [21]:
X_train_processed

Unnamed: 0,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
20981,0.715908,-1.366001,-0.199516,0.974713,0.872464,0.728356,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8725,-0.883507,0.167319,-1.096033,-0.948477,-0.887412,-0.888909,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15577,0.421279,0.585498,-0.647774,0.556241,0.567535,0.629914,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19651,1.894423,-1.366001,1.593520,1.722621,1.639143,1.445578,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
34795,-0.841417,0.306712,-0.647774,-0.903959,-0.869987,-0.846720,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,0.568593,0.794587,0.697002,0.671989,0.680794,0.770546,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
79,-1.136046,0.585498,0.697002,-1.491600,-1.436284,-1.395184,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12119,0.231874,0.027927,0.248743,0.449397,0.367153,0.404903,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14147,0.947402,0.446105,0.248743,1.028135,0.968299,1.051809,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X_test_obj = X_test.select_dtypes(['object'])
X_test_num = X_test.select_dtypes(['int','float'])

In [45]:
#X_test_obj_oe = pd.DataFrame(oe.transform(X_test_obj), index=X_test_obj.index,\
#            columns=oe.get_feature_names_out(X_test_obj.columns))

In [46]:
#X_test_num_df = pd.DataFrame(sc.transform(X_test_num), index=X_test_num.index,\
#             columns=X_test_num.columns)

In [23]:
X_test_obj_oe = oe.transform(X_test_obj)
X_test_num_df = sc.transform(X_test_num)

In [24]:
X_test_processed = X_test_num_df.merge(X_test_obj_oe, left_index=True, right_index=True)

In [48]:
# Infrequent to Other. 
s1 = pd.DataFrame(diamond_data['cut'].value_counts()>5000).reset_index()
l = s1[s1['cut']==False]['index'].values.tolist()
ol = s1[s1['cut']==True]['index'].values.tolist()
d1 = {i:'Other' for i in l}
d2 = {i:i for i in ol}
d1.update(d2)
diamond_data['cut'].map(d1)

0            Ideal
1          Premium
2            Other
3          Premium
4            Other
           ...    
53938      Premium
53939        Ideal
53940      Premium
53941      Premium
53942    Very Good
Name: cut, Length: 53943, dtype: object

In [63]:
X = pd.concat([X_train,Y_train], axis=1)

In [66]:
X['color'].value_counts()

G    7966
E    6872
F    6710
H    5752
D    4697
I    3768
J    1995
Name: color, dtype: int64

In [67]:
X.groupby('color')['price'].mean()

color
D    3190.024910
E    3129.151630
F    3745.744709
G    4015.309315
H    4518.201495
I    5146.659501
J    5259.990977
Name: price, dtype: float64

In [80]:
color_df = X.groupby('color')['price'].apply('mean').reset_index()

In [82]:
color_df.rename(columns={'price':'color_te_apply'}, inplace=True)
X_df = X.merge(color_df, on='color')

In [84]:
X_df['color_te_apply']

0        3190.024910
1        3190.024910
2        3190.024910
3        3190.024910
4        3190.024910
            ...     
37755    5259.990977
37756    5259.990977
37757    5259.990977
37758    5259.990977
37759    5259.990977
Name: color_te_apply, Length: 37760, dtype: float64

In [70]:
X['color_price'] = X.groupby('color')['price'].apply('mean')

In [71]:
X['color_price']

20981   NaN
8725    NaN
15577   NaN
19651   NaN
34795   NaN
         ..
16304   NaN
79      NaN
12119   NaN
14147   NaN
38408   NaN
Name: color_price, Length: 37760, dtype: float64

In [78]:
X.groupby('color')['price'].transform('mean')

20981    3190.024910
8725     3190.024910
15577    4518.201495
19651    4518.201495
34795    4015.309315
            ...     
16304    4015.309315
79       3129.151630
12119    4015.309315
14147    5146.659501
38408    4015.309315
Name: price, Length: 37760, dtype: float64

In [72]:
X['color_price'] = X.groupby('color')['price'].transform('mean')

In [74]:
X['color_price'].isnull().sum()

0

In [75]:
# Target Encoding. 
X['color_te'] = X.groupby('color')['price'].transform('mean')

In [51]:
X.groupby('color')['price'].apply('mean')

color
D    3153.037943
E    3078.827175
F    3756.779653
G    3992.923968
H    4470.039293
I    5128.865354
J    5322.400919
Name: price, dtype: float64

In [52]:
X.groupby('color')['price'].transform('mean')

51798    3992.923968
6914     3992.923968
30755    3078.827175
12513    4470.039293
24869    3992.923968
            ...     
2773     3992.923968
296      3992.923968
575      3992.923968
10357    5128.865354
7982     4470.039293
Name: price, Length: 37760, dtype: float64

In [76]:
X[['color','color_te']]

Unnamed: 0,color,color_te
20981,D,3190.024910
8725,D,3190.024910
15577,H,4518.201495
19651,H,4518.201495
34795,G,4015.309315
...,...,...
16304,G,4015.309315
79,E,3129.151630
12119,G,4015.309315
14147,I,5146.659501


In [85]:
X['color_lte_sum'] = X.groupby('color')['price'].transform('sum')

In [88]:
X[['color','price','color_lte_sum']]

Unnamed: 0,color,price,color_lte_sum
20981,D,9193,14983547
8725,D,586,14983547
15577,H,6249,25988695
19651,H,8263,25988695
34795,G,877,31985954
...,...,...,...
16304,G,6525,31985954
79,E,554,21503530
12119,G,5167,31985954
14147,I,5740,19392613


In [89]:
X['color_lte_sum'] = X['color_lte_sum']-X['price']

In [90]:
X['color_lte_sum']

20981    14974354
8725     14982961
15577    25982446
19651    25980432
34795    31985077
           ...   
16304    31979429
79       21502976
12119    31980787
14147    19386873
38408    31984930
Name: color_lte_sum, Length: 37760, dtype: int64

In [91]:
X['color_lte_count'] = X.groupby('color')['price'].transform('count')

In [92]:
X['color_lte'] = X['color_lte_sum']/(X['color_lte_count']-1)

In [95]:
X['color_lte']

20981    3188.746593
8725     3190.579429
15577    4517.900539
19651    4517.550339
34795    4015.703327
            ...     
16304    4014.994225
79       3129.526415
12119    4015.164721
14147    5146.501991
38408    4015.684871
Name: color_lte, Length: 37760, dtype: float64

In [93]:
# Leave One Out Target Encoding. 
X['color_le_ts'] = X.groupby('color')['price'].transform('sum')
X['color_le_ts'] = X['color_le_ts']-X['price']
X['color_le_tl'] = X.groupby('color')['price'].transform('count')-1
X['color_le'] = X['color_le_ts']/X['color_le_tl']

In [94]:
X['color_le']

20981    3188.746593
8725     3190.579429
15577    4517.900539
19651    4517.550339
34795    4015.703327
            ...     
16304    4014.994225
79       3129.526415
12119    4015.164721
14147    5146.501991
38408    4015.684871
Name: color_le, Length: 37760, dtype: float64

In [96]:
X['color_ke_tk'] = X.groupby('color')['price'].apply(lambda x:pd.qcut(x, 5, labels=range(1,6)))

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  X['color_ke_tk'] = X.groupby('color')['price'].apply(lambda x:pd.qcut(x, 5, labels=range(1,6)))


In [97]:
X['color_ke_tk']

20981    5
8725     1
15577    4
19651    5
34795    2
        ..
16304    4
79       1
12119    4
14147    4
38408    2
Name: color_ke_tk, Length: 37760, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]

In [98]:
X['color_ke_ts'] = X.groupby('color')['price'].transform('sum')

In [99]:
X['color_ke_ts']

20981    14983547
8725     14983547
15577    25988695
19651    25988695
34795    31985954
           ...   
16304    31985954
79       21503530
12119    31985954
14147    19392613
38408    31985954
Name: color_ke_ts, Length: 37760, dtype: int64

In [100]:
X.groupby(['color','color_ke_tk'])['price'].apply('sum')

color  color_ke_tk
D      1                616574
       2                964972
       3               1784632
       4               3433315
       5               8184054
E      1                876999
       2               1362623
       3               2497615
       4               4774532
       5              11991761
F      1                905141
       2               1538441
       3               3138367
       4               5881206
       5              13670792
G      1               1046274
       2               1767757
       3               3719931
       4               8107571
       5              17344421
H      1                701381
       2               1612547
       3               4015282
       4               6377450
       5              13282035
I      1                441328
       2               1288305
       3               2793233
       4               4845784
       5              10023963
J      1                276360
       2            

In [56]:
# K Fold Target Encoding. 
X['color_ke_tk'] = X.groupby('color')['price'].apply(lambda x:pd.qcut(x, 5, labels=range(1,6)))
X['color_ke_ts'] = X.groupby('color')['price'].transform('sum')
X['color_ke_tks'] = X.groupby(['color','color_ke_tk'])['price'].transform('sum')
X['color_ke_ts'] = X['color_ke_ts']-X['color_ke_tks']
X['color_ke_tl'] = X.groupby('color')['price'].transform('count')
X['color_ke_tkl'] = X.groupby(['color','color_ke_tk'])['price'].transform('count')
X['color_ke_tl'] = X['color_ke_tl']-X['color_ke_tkl']
X['color_ke_te'] = X['color_ke_ts']/X['color_ke_tl']

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  X['color_ke_tk'] = X.groupby('color')['price'].apply(lambda x:pd.qcut(x, 5, labels=range(1,6)))


In [57]:
!pip install category_encoders



In [102]:
from category_encoders import TargetEncoder, LeaveOneOutEncoder

In [111]:
te = TargetEncoder(cols=['color'])

In [112]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,color_price,color_te,color_lte_sum,color_lte_count,color_lte,color_le_ts,color_le_tl,color_le,color_ke_tk,color_ke_ts
20981,1.14,Ideal,D,VS1,59.8,57.0,6.83,6.74,4.06,9193,3190.024910,3190.024910,14974354,4697,3188.746593,14974354,4696,3188.746593,5,14983547
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91,586,3190.024910,3190.024910,14982961,4697,3190.579429,14982961,4696,3190.579429,1,14983547
15577,1.00,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99,6249,4518.201495,4518.201495,25982446,5752,4517.900539,25982446,5751,4517.900539,4,25988695
19651,1.70,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57,8263,4518.201495,4518.201495,25980432,5752,4517.550339,25980432,5751,4517.550339,5,25988695
34795,0.40,Ideal,G,VS1,62.2,56.0,4.72,4.74,2.94,877,4015.309315,4015.309315,31985077,7966,4015.703327,31985077,7965,4015.703327,2,31985954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,1.07,Very Good,G,VS2,62.9,59.0,6.49,6.52,4.09,6525,4015.309315,4015.309315,31979429,7966,4014.994225,31979429,7965,4014.994225,4,31985954
79,0.26,Very Good,E,VVS1,62.6,59.0,4.06,4.09,2.55,554,3129.151630,3129.151630,21502976,6872,3129.526415,21502976,6871,3129.526415,1,21503530
12119,0.91,Premium,G,VVS2,61.8,58.0,6.24,6.16,3.83,5167,4015.309315,4015.309315,31980787,7966,4015.164721,31980787,7965,4015.164721,4,31985954
14147,1.25,Premium,I,SI1,62.4,58.0,6.89,6.85,4.29,5740,5146.659501,5146.659501,19386873,3768,5146.501991,19386873,3767,5146.501991,4,19392613


In [101]:
X['color_te'].head(3)

20981    3190.024910
8725     3190.024910
15577    4518.201495
Name: color_te, dtype: float64

In [113]:
te.fit_transform(X.drop('price', axis=1), X['price'])#['color'].head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,color_price,color_te,color_lte_sum,color_lte_count,color_lte,color_le_ts,color_le_tl,color_le,color_ke_tk,color_ke_ts
20981,1.14,Ideal,3190.024910,VS1,59.8,57.0,6.83,6.74,4.06,3190.024910,3190.024910,14974354,4697,3188.746593,14974354,4696,3188.746593,5,14983547
8725,0.38,Very Good,3190.024910,SI1,62.0,55.0,4.67,4.72,2.91,3190.024910,3190.024910,14982961,4697,3190.579429,14982961,4696,3190.579429,1,14983547
15577,1.00,Very Good,4518.201495,VVS2,62.6,56.0,6.36,6.39,3.99,4518.201495,4518.201495,25982446,5752,4517.900539,25982446,5751,4517.900539,4,25988695
19651,1.70,Premium,4518.201495,SI2,59.8,61.0,7.67,7.62,4.57,4518.201495,4518.201495,25980432,5752,4517.550339,25980432,5751,4517.550339,5,25988695
34795,0.40,Ideal,4015.309315,VS1,62.2,56.0,4.72,4.74,2.94,4015.309315,4015.309315,31985077,7966,4015.703327,31985077,7965,4015.703327,2,31985954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,1.07,Very Good,4015.309315,VS2,62.9,59.0,6.49,6.52,4.09,4015.309315,4015.309315,31979429,7966,4014.994225,31979429,7965,4014.994225,4,31985954
79,0.26,Very Good,3129.151630,VVS1,62.6,59.0,4.06,4.09,2.55,3129.151630,3129.151630,21502976,6872,3129.526415,21502976,6871,3129.526415,1,21503530
12119,0.91,Premium,4015.309315,VVS2,61.8,58.0,6.24,6.16,3.83,4015.309315,4015.309315,31980787,7966,4015.164721,31980787,7965,4015.164721,4,31985954
14147,1.25,Premium,5146.659501,SI1,62.4,58.0,6.89,6.85,4.29,5146.659501,5146.659501,19386873,3768,5146.501991,19386873,3767,5146.501991,4,19392613


In [106]:
X['color_le'].head(3)

20981    3188.746593
8725     3190.579429
15577    4517.900539
Name: color_le, dtype: float64

In [114]:
loe = LeaveOneOutEncoder(cols=['color'])
loe.fit_transform(X.drop('price', axis=1), X['price'])['color'].head(3)

20981    3188.746593
8725     3190.579429
15577    4517.900539
Name: color, dtype: float64

In [25]:
# Building Machine Learning Models.  
from sklearn.linear_model import LinearRegression

In [26]:
lr = LinearRegression()

In [27]:
lr.fit(X_train_processed, Y_train)

In [28]:
set_config(display='diagram')

In [29]:
y_pred = lr.predict(X_test_processed)

In [125]:
lr.score(X_test_processed, Y_test) #

0.9198906066729085

In [None]:
# Fit => Training the model
# predict => predicting on the data
# score => for evaluating
# score => for regression will give R-squared
# for classification will give Accuracy

In [30]:
pd.DataFrame({'Real_Price':Y_test,"Predicted_Price":y_pred})

Unnamed: 0,Real_Price,Predicted_Price
52267,2491,2392.016775
47746,1892,2560.059108
42162,1284,1405.918845
35975,921,1664.415348
7641,4268,5230.812187
...,...,...
1637,3017,3582.343774
41851,1255,1284.649820
44784,1623,1725.298761
1319,2958,3270.194680


In [31]:
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
dr.fit(X_train_processed, Y_train)
y_pred_d = dr.predict(X_test_processed)
dr.score(X_test_processed, Y_test)
#mean_absolute_error(Y_test, y_pred_d)

0.9534204228980123

In [32]:
from sklearn.ensemble import RandomForestRegressor
rrr = RandomForestRegressor()
rrr.fit(X_train_processed, Y_train)
y_pred_rf = rrr.predict(X_test_processed)
rrr.score(X_test_processed, Y_test)

0.9750215004933199

In [34]:
pd.DataFrame({"Predicted":y_pred_rf,"Actual":Y_test})

Unnamed: 0,Predicted,Actual
52267,2271.97,2491
47746,1862.84,1892
42162,1447.33,1284
35975,1029.14,921
7641,3974.53,4268
...,...,...
1637,2552.63,3017
41851,1409.75,1255
44784,1608.94,1623
1319,3558.09,2958


In [35]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr.fit(X_train_processed, Y_train)
y_pred_knr = knr.predict(X_test_processed)
knr.score(X_test_processed, Y_test)

0.9542597496337532

In [None]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train_processed, Y_train)
y_pred_svm = svr.predict(X_test_processed)
svr.score(X_test_processed, Y_test)