Dataset link: https://www.kaggle.com/datasets/shivam2503/diamonds

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
#index_col=[0]
#While you read csv file, if you set index_col=[0] you're explicitly stating to treat the first column as the index.

df = pd.read_csv('data\diamonds.csv')

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
df.shape

(53940, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [6]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

### Machine Learning Problem
Build a system which can take features of diamond like carat, cut, color, clarity, x, y, z, etc.. and predicts the price of diamond.

In [7]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [8]:
df = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']]

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


#### a. Identify the Target Variable and Splitting the Data into Train and Test

In [9]:
# Identifying the inputs (X) and output (y)

y = df['price']

X = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

In [10]:
# split into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [11]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29477,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52542,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15577,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19651,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [12]:
print(X_train.shape, y_train.shape)

print(X_test.shape, y_test.shape)

(37758, 9) (37758,)
(16182, 9) (16182,)


#### b. Separating Categorical and Numerical Columns:

In [13]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29477,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52542,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8725,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15577,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19651,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [14]:

X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [15]:
X_train_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,cut,color,clarity
29477,Ideal,F,VS1
52542,Very Good,I,VVS2
8725,Very Good,D,SI1
15577,Very Good,H,VVS2
19651,Premium,H,SI2


In [16]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
29477,0.32,61.3,56.0,4.4,4.44,2.71
52542,0.72,61.7,55.0,5.76,5.81,3.57
8725,0.38,62.0,55.0,4.67,4.72,2.91
15577,1.0,62.6,56.0,6.36,6.39,3.99
19651,1.7,59.8,61.0,7.67,7.62,4.57


#### c. Scaling the Numerical Features

In [17]:
X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
29477,0.32,61.3,56.0,4.4,4.44,2.71
52542,0.72,61.7,55.0,5.76,5.81,3.57
8725,0.38,62.0,55.0,4.67,4.72,2.91
15577,1.0,62.6,56.0,6.36,6.39,3.99
19651,1.7,59.8,61.0,7.67,7.62,4.57


In [18]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_num_rescaled = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

X_train_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
29477,-1.007784,-0.316595,-0.647912,-1.187006,-1.122457,-1.191717
52542,-0.166198,-0.038325,-1.095448,0.023744,0.062997,0.043156
8725,-0.881547,0.170377,-1.095448,-0.946636,-0.880174,-0.904537
15577,0.422912,0.587782,-0.647912,0.557898,0.564868,0.646234
19651,1.895688,-1.360107,1.589767,1.724135,1.62918,1.479055


In [19]:

X_train_cat['color'].value_counts(normalize=True)

G    0.210419
E    0.181736
F    0.178479
H    0.153583
D    0.122888
I    0.100244
J    0.052651
Name: color, dtype: float64

In [20]:
X_train_cat['clarity'].value_counts(normalize=True)

SI1     0.242094
VS2     0.226283
SI2     0.169765
VS1     0.152153
VVS2    0.095212
VVS1    0.068515
IF      0.032602
I1      0.013375
Name: clarity, dtype: float64

#### e. Applying Label Encoding on Categorical Columns

In [21]:
X_train_cat_le = pd.DataFrame(index=X_train_cat.index)

X_train_cat_le.head()

29477
52542
8725
15577
19651


In [22]:

X_train_cat.cut.unique()

array(['Ideal', 'Very Good', 'Premium', 'Good', 'Fair'], dtype=object)

In [23]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

X_train_cat_le['cut'] = X_train_cat['cut'].apply(lambda x : cut_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut
29477,4
52542,3
8725,3
15577,3
19651,5


In [24]:
X_train_cat.color.unique()

array(['F', 'I', 'D', 'H', 'G', 'E', 'J'], dtype=object)

In [25]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

X_train_cat_le['color'] = X_train_cat['color'].apply(lambda x : color_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color
29477,4,5
52542,3,2
8725,3,7
15577,3,3
19651,5,3


In [26]:
X_train_cat.clarity.unique()

array(['VS1', 'VVS2', 'SI1', 'SI2', 'VVS1', 'VS2', 'I1', 'IF'],
      dtype=object)

In [27]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

X_train_cat_le['clarity'] = X_train_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color,clarity
29477,4,5,5
52542,3,2,6
8725,3,7,3
15577,3,3,6
19651,5,3,2


#### f. Concatinating the Encoded Categorical Features and Rescaled Numerical Features:

In [28]:
X_train_transformed = pd.concat([X_train_num_rescaled, X_train_cat_le], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
29477,-1.007784,-0.316595,-0.647912,-1.187006,-1.122457,-1.191717,4,5,5
52542,-0.166198,-0.038325,-1.095448,0.023744,0.062997,0.043156,3,2,6
8725,-0.881547,0.170377,-1.095448,-0.946636,-0.880174,-0.904537,3,7,3
15577,0.422912,0.587782,-0.647912,0.557898,0.564868,0.646234,3,3,6
19651,1.895688,-1.360107,1.589767,1.724135,1.62918,1.479055,5,3,2


#### g. Preparing Test Data

In [29]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
52264,0.57,Ideal,E,VS2,61.5,57.0,5.35,5.32,3.28
21073,1.16,Ideal,G,VS1,61.5,55.0,6.75,6.81,4.17
42161,0.51,Ideal,G,SI1,63.2,58.0,5.05,5.08,3.2
35974,0.42,Ideal,F,VS1,60.6,56.0,4.83,4.87,2.94
7641,0.8,Premium,G,IF,62.6,58.0,5.89,5.93,3.7


In [30]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16182 entries, 52264 to 1319
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    16182 non-null  float64
 1   cut      16182 non-null  object 
 2   color    16182 non-null  object 
 3   clarity  16182 non-null  object 
 4   depth    16182 non-null  float64
 5   table    16182 non-null  float64
 6   x        16182 non-null  float64
 7   y        16182 non-null  float64
 8   z        16182 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.2+ MB


In [31]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
52264,Ideal,E,VS2
21073,Ideal,G,VS1
42161,Ideal,G,SI1
35974,Ideal,F,VS1
7641,Premium,G,IF


In [32]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,0.57,61.5,57.0,5.35,5.32,3.28
21073,1.16,61.5,55.0,6.75,6.81,4.17
42161,0.51,63.2,58.0,5.05,5.08,3.2
35974,0.42,60.6,56.0,4.83,4.87,2.94
7641,0.8,62.6,58.0,5.89,5.93,3.7


In [33]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
52264,-0.481793,-0.17746,-0.200377,-0.341261,-0.360998,-0.373255
21073,0.759547,-0.17746,-1.095448,0.905098,0.928292,0.904696
42161,-0.608031,1.005187,0.247159,-0.608339,-0.568668,-0.488127
35974,-0.797388,-0.803567,-0.647912,-0.804195,-0.75038,-0.86146
7641,0.002119,0.587782,0.247159,0.139477,0.166832,0.229823


In [34]:
X_test_cat_le = pd.DataFrame(index = X_test_cat.index)

X_test_cat_le.head()

52264
21073
42161
35974
7641


In [35]:
X_test_cat_le['cut'] = X_test_cat['cut'].apply(lambda x : cut_encoder[x])

X_test_cat_le['color'] = X_test_cat['color'].apply(lambda x : color_encoder[x])

X_test_cat_le['clarity'] = X_test_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
52264,4,6,4
21073,4,4,5
42161,4,4,3
35974,4,5,5
7641,5,4,8


In [36]:
X_test_transformed = pd.concat([X_test_num_rescaled, X_test_cat_le], axis=1)

X_test_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
52264,-0.481793,-0.17746,-0.200377,-0.341261,-0.360998,-0.373255,4,6,4
21073,0.759547,-0.17746,-1.095448,0.905098,0.928292,0.904696,4,4,5
42161,-0.608031,1.005187,0.247159,-0.608339,-0.568668,-0.488127,4,4,3
35974,-0.797388,-0.803567,-0.647912,-0.804195,-0.75038,-0.86146,4,5,5
7641,0.002119,0.587782,0.247159,0.139477,0.166832,0.229823,5,4,8


#### Linear Regression

In [37]:
from sklearn.linear_model import LinearRegression
li_regressor = LinearRegression()
li_regressor.fit(X_train_transformed, y_train)

In [38]:
y_test_pred = li_regressor.predict(X_test_transformed)

In [39]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,2363.83474
21073,9248,7469.644228
42161,1284,643.298938
35974,921,1516.80821
7641,4268,5721.128606


In [40]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  795.4560135879767
Mean Squared Error:  1471939.1560760003
Root Mean Squared Error:  1213.2349962295023


#### KNN Regression

In [41]:
from sklearn.neighbors import KNeighborsRegressor
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train_transformed, y_train)

In [42]:
y_test_pred = knn_regressor.predict(X_test_transformed)

In [43]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1779.6
21073,9248,9000.6
42161,1284,1136.6
35974,921,960.4
7641,4268,5510.6


In [44]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  384.03119515511065
Mean Squared Error:  567200.9668570017
Root Mean Squared Error:  753.127457245453


#### Decision Tree Regression

In [45]:
from sklearn.tree import DecisionTreeRegressor
dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(X_train_transformed, y_train)

In [46]:
y_test_pred = dt_regressor.predict(X_test_transformed)

In [47]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1625.0
21073,9248,8020.0
42161,1284,1237.0
35974,921,1031.0
7641,4268,4844.0


In [48]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  365.8589791125942
Mean Squared Error:  561274.6101223582
Root Mean Squared Error:  749.182628016933


#### Random Forest Regression

In [49]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train_transformed, y_train)

In [50]:
y_test_pred = rf_regressor.predict(X_test_transformed)

In [51]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52264,2491,1843.27
21073,9248,8634.55
42161,1284,1243.63
35974,921,1033.73
7641,4268,4320.63


In [52]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  269.1487469437431
Mean Squared Error:  290912.1054739687
Root Mean Squared Error:  539.3626845397897


#### Saving the Model (Serialization)

In [53]:
from pickle import dump

dump(scaler, open('models/standard_scaler.pkl', 'wb'))
dump(li_regressor, open('models/linearregression.pkl', 'wb'))
dump(knn_regressor, open('models/knnregression.pkl', 'wb'))
dump(dt_regressor, open('models/decisiontreeregression.pkl', 'wb'))
dump(rf_regressor, open('models/randomforestregression.pkl', 'wb'))


#### Deserialization

In [54]:
from pickle import load

In [55]:
rf_regressor = load(open('models/randomforestregression.pkl', 'rb'))
scaler = load(open('models/standard_scaler.pkl', 'rb'))

In [56]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

In [57]:
clarity_encoder.keys()

dict_keys(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'])

In [58]:
num_scal=scaler.transform([[1.16 ,61.5 ,55.0 ,6.75 ,6.81 ,4.17]])
num_scal.flatten()



array([ 0.7595468 , -0.1774599 , -1.09544817,  0.90509846,  0.92829164,
        0.90469557])

In [59]:
cat_encod=np.array([clarity_encoder["I1"],color_encoder["J"],cut_encoder["Good"]])
cat_encod

array([1, 1, 2])

In [60]:
np.concatenate((cat_encod, num_scal.flatten()), axis=None).reshape(1,-1)

array([[ 1.        ,  1.        ,  2.        ,  0.7595468 , -0.1774599 ,
        -1.09544817,  0.90509846,  0.92829164,  0.90469557]])

In [61]:
rf_regressor.predict(np.concatenate((cat_encod, num_scal.flatten()), axis=None).reshape(1,-1)).item()



2435.64

### Running the Experiment

In [62]:
import mlflow

In [63]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment("Diamond price prediction logs")

<Experiment: artifact_location='./mlruns/1', creation_time=1664031693590, experiment_id='1', last_update_time=1664031693590, lifecycle_stage='active', name='Diamond price prediction logs', tags={}>

### Experiment 1 - Training KNN Regressor

In [64]:
from sklearn import metrics

In [65]:
with mlflow.start_run():
    mlflow.set_tag("developer","NILKANTHA")
    mlflow.set_tag("Algorithm","KNN")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path","data/diamonds.csv")
    k=30
    mlflow.log_param("n_neighbors",k)
    knn_regressor = KNeighborsRegressor(n_neighbors=k)
    knn_regressor.fit(X_train_transformed, y_train)
    y_test_pred = knn_regressor.predict(X_test_transformed)
    MAE=metrics.mean_absolute_error(y_test, y_test_pred)
    #acc=metrics.accuracy_score(y_test,y_test_pred)
    mlflow.log_metric("Mean Absolute Error",MAE)
    mlflow.sklearn.log_model(knn_regressor,artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")



### Experiment 2 - Training Decision Tree Regression

In [66]:
from pprint import pprint
pprint(dt_regressor.get_params())

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}


In [67]:
with mlflow.start_run():
    mlflow.set_tag("developer","NILKANTHA")
    mlflow.set_tag("Algorithm","DT")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path","data/diamonds.csv")
    d=5
    f='auto'
    l=50
    mlflow.log_param("max_depth",k)
    mlflow.log_param("max_features",f)
    mlflow.log_param("max_leaf_nodes",l)
    dt_regressor =DecisionTreeRegressor(max_depth=k,max_features=f,max_leaf_nodes=l)#,min_samples_leaf=2,min_weight_fraction_leaf=0.1,splitter='random')
    dt_regressor.fit(X_train_transformed, y_train)
    y_test_pred = dt_regressor.predict(X_test_transformed)
    MAE=metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error",MAE)
    mlflow.sklearn.log_model(dt_regressor,artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")



### Experiment 3 - Training Random Forest Regression

In [68]:
with mlflow.start_run():
    mlflow.set_tag("developer","NILKANTHA")
    mlflow.set_tag("Algorithm","RT")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path","data/diamonds.csv")
    d=5
    f='auto'
    l=50
    n=100
    mlflow.log_param("max_depth",k)
    mlflow.log_param("max_features",f)
    mlflow.log_param("max_leaf_nodes",l)
    mlflow.log_param("n_estimators",n)
    rf_regressor =RandomForestRegressor(max_depth=k,max_features=f,max_leaf_nodes=l,n_estimators=n)
    rf_regressor.fit(X_train_transformed, y_train)
    y_test_pred = dt_regressor.predict(X_test_transformed)
    MAE=metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error",MAE)
    mlflow.sklearn.log_model(dt_regressor,artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

  warn(


### Experiment 4- Training KNN Regressor with Hyperparameter Tuning

In [69]:
from sklearn.model_selection import GridSearchCV

In [70]:
# Enabling automatic MLflow logging for scikit-learn runs
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run():
    tuned_parameters = [{'n_neighbors':[i for i in range(1, 51)], 'p':[1, 2]}]

    reg = GridSearchCV(
        estimator=KNeighborsRegressor(), 
        param_grid=tuned_parameters, 
        scoring='neg_mean_absolute_error',
        cv=5,
        return_train_score=True,
        verbose=1
    )
    reg.fit(X_train_transformed, y_train)
    
    # Disabling autologging
    mlflow.sklearn.autolog(disable=True)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


