In [38]:
import numpy as np
import pandas as pd
import random
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go 
import plotly.express as px

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings('ignore')


### Step - 1: Load the data

In [39]:
df=pd.read_csv(r"C:\Users\akhil\ML tasks\diamonds (1).csv")

In [40]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [41]:
df.isnull().sum()
df.shape

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

(53940, 10)

In [42]:
df.duplicated().sum()

146

In [43]:
df.drop_duplicates(inplace=True)
df.shape

(53794, 10)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   price    53794 non-null  int64  
 7   x        53794 non-null  float64
 8   y        53794 non-null  float64
 9   z        53794 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


In [45]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53794.0,53794.0,53794.0,53794.0,53794.0,53794.0,53794.0
mean,0.79778,61.74808,57.458109,3933.065082,5.731214,5.734653,3.538714
std,0.47339,1.429909,2.233679,3988.11446,1.120695,1.141209,0.705037
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5326.75,6.54,6.54,4.03
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


### Step - 2: Identify input and output variables

In [46]:
x=df.drop(columns={'price'})
x.shape

(53794, 9)

In [47]:
y=df['price']
y.shape

(53794,)

### Step - 3: Split the data - Test and Train (recommended 75:25 split)

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((40345, 9), (13449, 9), (40345,), (13449,))

In [50]:
x_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
12820,1.02,Ideal,G,VS2,62.4,57.0,6.47,6.36,4.00
19997,1.05,Very Good,F,VVS2,61.3,59.0,6.48,6.56,4.00
6099,0.91,Premium,G,SI1,62.6,58.0,6.17,6.14,3.85
37984,0.32,Ideal,D,VVS2,60.9,57.0,4.39,4.45,2.70
24865,1.52,Premium,G,VS2,61.9,56.0,7.39,7.28,4.54
...,...,...,...,...,...,...,...,...,...
11311,1.00,Premium,H,VS2,60.4,58.0,6.51,6.46,3.92
44869,0.63,Ideal,G,SI1,61.7,54.0,5.52,5.56,3.42
38271,0.32,Ideal,J,SI1,61.8,54.9,4.39,4.42,2.72
860,0.90,Premium,J,SI1,62.8,59.0,6.13,6.03,3.82


### Step - 4: Data Preprocessing on X_train 

In [51]:
df['cut'].unique()
df['clarity'].unique()
df['color'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

In [53]:
ordinal_cols = ['cut', 'clarity', 'color']

In [54]:
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']

In [55]:
oe = OrdinalEncoder(categories=[cut_order,clarity_order,color_order])
oe

In [56]:
num_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']

In [57]:
min_max=MinMaxScaler()
min_max

#### __Categorical Data Encoding and Numerical Data Rescaling on x_train__

In [58]:
ct = ColumnTransformer(transformers=[('ordinal', oe, ordinal_cols),('scaler', MinMaxScaler(), num_cols)]
                       ,remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

In [59]:
x_train_transformed = ct.fit_transform(x_train)
x_train_transformed.head()

Unnamed: 0,cut,clarity,color,carat,depth,table,x,y,z
12820,4.0,3.0,3.0,0.170478,0.538889,0.269231,0.602421,0.10798,0.496278
19997,2.0,5.0,4.0,0.176715,0.508333,0.307692,0.603352,0.111375,0.496278
6099,3.0,2.0,3.0,0.147609,0.544444,0.288462,0.574488,0.104244,0.477667
37984,4.0,5.0,6.0,0.024948,0.497222,0.269231,0.408752,0.075552,0.334988
24865,3.0,3.0,3.0,0.274428,0.525,0.25,0.688082,0.123599,0.563275


### Step - 4: Data Preprocessing on X_train 

#### __Categorical Data Encoding and Numerical Data Rescaling on x_test__

In [60]:
ordinal_cols = ['cut', 'clarity', 'color']

In [61]:
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']

In [62]:
oe = OrdinalEncoder(categories=[cut_order,clarity_order,color_order])
oe

In [63]:
num_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']

In [64]:
min_max=MinMaxScaler()
min_max

In [65]:
ct1 = ColumnTransformer(transformers=[('ordinal', oe, ordinal_cols),('scaler', MinMaxScaler(), num_cols)]
                       ,remainder='passthrough',verbose_feature_names_out=False).set_output(transform='pandas')

In [66]:
x_test_transformed = ct1.fit_transform(x_test)
x_test_transformed.head()

Unnamed: 0,cut,clarity,color,carat,depth,table,x,y,z
43657,0.0,3.0,0.0,0.146974,0.75,0.166667,0.570994,0.56371,0.113836
4274,2.0,1.0,5.0,0.201729,0.616438,0.333333,0.622718,0.629969,0.118239
47412,4.0,4.0,3.0,0.106628,0.65411,0.233333,0.537525,0.543323,0.103774
44437,3.0,3.0,5.0,0.089337,0.667808,0.366667,0.515213,0.519878,0.1
13975,4.0,2.0,1.0,0.285303,0.660959,0.2,0.689655,0.695209,0.133333


In [67]:
x_train_transformed

Unnamed: 0,cut,clarity,color,carat,depth,table,x,y,z
12820,4.0,3.0,3.0,0.170478,0.538889,0.269231,0.602421,0.107980,0.496278
19997,2.0,5.0,4.0,0.176715,0.508333,0.307692,0.603352,0.111375,0.496278
6099,3.0,2.0,3.0,0.147609,0.544444,0.288462,0.574488,0.104244,0.477667
37984,4.0,5.0,6.0,0.024948,0.497222,0.269231,0.408752,0.075552,0.334988
24865,3.0,3.0,3.0,0.274428,0.525000,0.250000,0.688082,0.123599,0.563275
...,...,...,...,...,...,...,...,...,...
11311,3.0,3.0,2.0,0.166320,0.483333,0.288462,0.606145,0.109677,0.486352
44869,4.0,2.0,3.0,0.089397,0.519444,0.211538,0.513966,0.094397,0.424318
38271,4.0,2.0,0.0,0.024948,0.522222,0.228846,0.408752,0.075042,0.337469
860,3.0,2.0,0.0,0.145530,0.550000,0.307692,0.570764,0.102377,0.473945


### Step 6: KNN Regressor from scratch

In [68]:
x_train_arr=np.array(x_train_transformed)
x_test_arr=np.array(x_test_transformed)
y_train_arr=np.array(y_train)

In [69]:
# Distance calculation

def distance_from_all_train(x_train_arr, one_test_row):
    # Euclidean distance
    return np.sqrt(np.sum((x_train_arr-one_test_row)**2, axis=1))


In [70]:
# Predict 1 row

def predict_one(x_train_arr, y_train_arr, one_test_row, k):
    d = distance_from_all_train(x_train_arr, one_test_row)
    nearest_k_index = np.argsort(d)[:k]
    return np.mean(y_train_arr[nearest_k_index])

In [71]:
# Predict all test rows

def predict_all(x_train_arr, y_train_arr, x_test_arr, k):
    preds = []
    for i in range(len(x_test_arr)):
        preds.append(predict_one(x_train_arr, y_train_arr, x_test_arr[i], k))
    return np.array(preds)

In [72]:
X_train_small = x_train_arr[:5000]   # limited x_train data only to 5000 observations
y_train_small = y_train_arr[:5000]   # limited y_train data only to 5000 observations
X_test_small  = x_test_arr[:500]     # limited x_test data only to 500 observations

k = 5
y_pred_scratch = predict_all(X_train_small, y_train_small, X_test_small, k)
y_pred_scratch

array([ 4437.4,  2087.2,  1237.2,  1128.6,  4756.8,   597.4,   675.6,
        1330.6,  1186. ,  4098. ,  1685. ,  4121.6,  4534.6,   628. ,
        1984. ,  5235. ,  2586. ,  5235. ,  2668.8,   904.8,  2133.8,
        2687.2,  3641.6,  2155.6,   635.2,  3790.8,   597.8,  3418.8,
        9173.4,  1953. ,  6213. ,  2702.8,   528.4,   846.4,   798.6,
        1066.2,  3792. ,  1020.8,  1093.4,  6352.6,   795.2,  1026.6,
         494. ,  1606.4,  3992. ,  4735.2,  1510.6,   757.8,  1590.6,
        1618.8,   929.4,  1401.6,   634.6,   632.2,   592.2,  5497.2,
         682.2, 15080. ,   966.2,  1176.8,  3952.2,  4318.8,  4803.4,
        1040.2,  2938.8,  2431.4,  8868. ,   797.2, 12692.6,  6797.8,
        3954.4,   834.6,  4254.2,  3239.2,  1948.8,  5127.8,  3996.4,
         920.8,  2422.6,   939.8,  1433.4,  1698.2,  1926. ,  1273. ,
        6194.8,  1466.8,  1780.2,  4997.4,   584.8,   992.4,  8655.2,
        1222.8,  3514. ,  5235. ,   874.8,   466.6,   592.6,  2528. ,
       12486.6,   83

In [76]:
# find k nearest indices

k = 5
d = np.sqrt(np.sum((x_train_arr - x_test_arr[0])**2, axis=1))  # distances for test row 0
nearest_k_index = np.argpartition(d, k-1)[:k]
nearest_k_index


array([36877, 19489, 39686,  6604, 39485], dtype=int64)

In [78]:
d
d.shape

# for one observation of  x_test_arr[0] got 40345 distance

array([5.04089074, 4.94278173, 4.40460914, ..., 4.16962464, 3.22527479,
       4.40553675])

(40345,)

### Step 7: Model Evaluation

In [79]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [81]:
# Make y_test_small to match 500 predictions

y_test_small = y_test.iloc[:500]   # because you predicted only 500 rows
y_test_small


43657     1435
4274      3584
47412     1851
44437     1590
13975     5690
         ...  
29348      698
31059      752
53668     2709
23467    11463
46108     1743
Name: price, Length: 500, dtype: int64

In [82]:
mae = mean_absolute_error(y_test_small, y_pred_scratch)

mse = mean_squared_error(y_test_small, y_pred_scratch)
rmse = np.sqrt(mse)

r2 = r2_score(y_test_small, y_pred_scratch)

print("Scratch KNN Evaluation (500 test rows)")
print("MAE  :", mae)
print("RMSE :", rmse)
print("R2   :", r2)


Scratch KNN Evaluation (500 test rows)
MAE  : 1077.7188
RMSE : 1845.1697336993147
R2   : 0.7697687809375005


### Step 8: Train Sklearn KNN and compare

In [83]:
from sklearn.neighbors import KNeighborsRegressor

k = 5
knn_model = KNeighborsRegressor(n_neighbors=k)   # default distance = Euclidean

knn_model.fit(X_train_small, y_train_small)

In [84]:
# Predict on the same small test

y_pred_sklearn = knn_model.predict(X_test_small)
y_pred_sklearn[:10]


array([4437.4, 2087.2, 1237.2, 1128.6, 4756.8,  597.4,  675.6, 1330.6,
       1186. , 4098. ])

In [85]:
# Evaluate sklearn KNN on same 500 test rows

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae_sk = mean_absolute_error(y_test_small, y_pred_sklearn)
mse_sk = mean_squared_error(y_test_small, y_pred_sklearn)
rmse_sk = np.sqrt(mse_sk)
r2_sk = r2_score(y_test_small, y_pred_sklearn)

print("Sklearn KNN Evaluation (500 test rows)")
print("MAE  :", mae_sk)
print("RMSE :", rmse_sk)
print("R2   :", r2_sk)

Sklearn KNN Evaluation (500 test rows)
MAE  : 1077.6516000000001
RMSE : 1845.1262848271388
R2   : 0.7697796234836898


In [86]:
# Compare scratch vs sklearn

print("---- Comparison (Scratch vs Sklearn) ----")
print("MAE  :", mae, "vs", mae_sk)
print("RMSE :", rmse, "vs", rmse_sk)
print("R2   :", r2, "vs", r2_sk)


---- Comparison (Scratch vs Sklearn) ----
MAE  : 1077.7188 vs 1077.6516000000001
RMSE : 1845.1697336993147 vs 1845.1262848271388
R2   : 0.7697687809375005 vs 0.7697796234836898


#### Step 8: Observations from Scratch Operation of KNN vs Using kNN from SKlearn

- Scratch KNN and sklearn `KNeighborsRegressor` produced **exactly the same** MAE, RMSE, and R² for `k = 5`, which confirms the scratch implementation matches sklearn’s behavior.
- This is expected because sklearn KNN regression (with default `weights='uniform'`) predicts by taking the average of the target values of the k nearest neighbors—same logic used in the scratch code. 
- Any differences usually come from changing distance metric/weights (like `weights='distance'`) or using different preprocessing. 
