In [33]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats 
import pickle

In [2]:
# Load data 
df = pd.read_csv("diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [5]:
df.shape

(53940, 10)

# Remove Outliers

In [10]:
df_clean = df.copy()

cols = ["carat", "depth", "table", "x", "y", "z"]   # only these

for col in cols:
    low = df_clean[col].quantile(0.01)   # 1%
    high = df_clean[col].quantile(0.99)  # 99%
    df_clean = df_clean[(df_clean[col] >= low) & (df_clean[col] <= high)]

print("Original Shape :", df.shape)
print("Cleaned Shape  :", df_clean.shape)


Original Shape : (53940, 10)
Cleaned Shape  : (48734, 10)


# Split Features ANd Target

In [16]:
X = df_clean.drop("price",axis = 1)
y = df_clean["price"]
print(X.shape)
print(y.shape)

(48734, 9)
(48734,)


# train test split 

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(36550, 9)
(36550,)
(12184, 9)
(12184,)


In [18]:
# let us use column transformer to apply feature scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ]
)

In [21]:
# let us apply fit_transofrm on train data \
X_train_trans = preprocessor.fit_transform(X_train)

# let us apply transform on test data
X_test_trans=preprocessor.transform(X_test)

In [22]:
# let us convert X_train_trans into data Frame
X_train_trans=pd.DataFrame(X_train_trans)
X_train_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.554871,-1.21045,0.334837,-0.356132,-0.422255,-0.513333,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.833881,-3.112743,-0.156463,-0.716004,-0.673617,-0.999851,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.163619,0.112884,-0.156463,-1.395762,-1.377432,-1.372848,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.840175,0.195592,-1.630364,0.93341,0.97532,0.978656,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.859245,0.195592,0.334837,-0.855954,-0.834489,-0.821461,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# let us convert X_test_trans into data frame
X_train_trans=pd.DataFrame(X_train_trans)
X_train_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.554871,-1.21045,0.334837,-0.356132,-0.422255,-0.513333,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.833881,-3.112743,-0.156463,-0.716004,-0.673617,-0.999851,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.163619,0.112884,-0.156463,-1.395762,-1.377432,-1.372848,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.840175,0.195592,-1.630364,0.93341,0.97532,0.978656,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.859245,0.195592,0.334837,-0.855954,-0.834489,-0.821461,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# KNN Regression Model Building 

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor 
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", KNeighborsRegressor(n_neighbors=5))
])

In [29]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


# Evaluation

In [31]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

def evaluate(y_true, y_pred, name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n{name} Results")
    print("-" * 30)
    print("MAE  :", round(mae, 2))
    print("RMSE :", round(rmse, 2))
    print("R2   :", round(r2, 4))


evaluate(y_train, y_pred_train, "Train")
evaluate(y_test, y_pred_test, "Test")



Train Results
------------------------------
MAE  : 286.61
RMSE : 538.54
R2   : 0.9748

Test Results
------------------------------
MAE  : 354.65
RMSE : 674.01
R2   : 0.96


# Model

In [34]:
with open("diamond_knn_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("\nModel saved successfully as diamond_knn_model.pkl")
print("Training Completed ✅")


Model saved successfully as diamond_knn_model.pkl
Training Completed ✅
