# *Importing Modules*

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# *Reading DF*

In [2]:
df=pd.read_csv(r'D:\CSV_Files\iris.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa


# *Preprocessing Data for KNN*

In [3]:
# check for nulls values 
# scale "std scaler"
# number of dimention SMALL
df.isnull().sum()

Unnamed: 0      0
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0,150.0
mean,74.5,5.843333,3.057333,3.758,1.199333
std,43.445368,0.828066,0.435866,1.765298,0.762238
min,0.0,4.3,2.0,1.0,0.1
25%,37.25,5.1,2.8,1.6,0.3
50%,74.5,5.8,3.0,4.35,1.3
75%,111.75,6.4,3.3,5.1,1.8
max,149.0,7.9,4.4,6.9,2.5


In [5]:
for col in df.select_dtypes(include=np.number).columns:
    df[col]=df[col].apply(lambda x : (x-np.mean(df[col]))/np.std(df[col]))
df.describe()

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0,150.0
mean,0.0,-4.736952e-16,-8.052818e-16,-2.842171e-16,-4.736952e-16
std,1.00335,1.00335,1.00335,1.00335,1.00335
min,-1.720542,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.860271,-0.9006812,-0.592373,-1.226552,-1.183812
50%,0.0,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.860271,0.6745011,0.5586108,0.7627583,0.7906707
max,1.720542,2.492019,3.090775,1.785832,1.712096


In [6]:
df.iloc[:,:-1]

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-1.720542,-0.900681,1.019004,-1.340227,-1.315444
1,-1.697448,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.674353,-1.385353,0.328414,-1.397064,-1.315444
3,-1.651258,-1.506521,0.098217,-1.283389,-1.315444
4,-1.628164,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...,...
145,1.628164,1.038005,-0.131979,0.819596,1.448832
146,1.651258,0.553333,-1.282963,0.705921,0.922303
147,1.674353,0.795669,-0.131979,0.819596,1.053935
148,1.697448,0.432165,0.788808,0.933271,1.448832


# *Getting Best Parameter K*

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
#create new a knn model
knn2 = KNeighborsClassifier()
# Create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}

# Use GridSearchCV to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)

x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df['species'], test_size=0.2, shuffle=True, random_state=0)

# Fit model to data
knn_gscv.fit(x_train, y_train)

# Get the best parameters and the best score
print("Best Parameters:", knn_gscv.best_params_)
print("Best Score:", knn_gscv.best_score_)

Best Parameters: {'n_neighbors': 3}
Best Score: 0.9916666666666668


# *Applying Model*

In [8]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
knn.predict(x_train)

array(['virginica', 'versicolor', 'setosa', 'virginica', 'virginica',
       'versicolor', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'setosa', 'virginica', 'setosa', 'setosa',
       'versicolor', 'virginica', 'virginica', 'virginica', 'virginica',
       'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'virginica', 'virginica', 'versicolor', 'virginica',
       'versicolor', 'setosa', 'virginica', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'setosa', 'setosa',
       'virginica', 'versicolor', 'setosa', 'setosa', 'versicolor',
       'setosa', 'virginica', 'versicolor', 'setosa', 'versicolor',
       'virginica', 'versicolor', 'setosa', 'virginica', 'virginica',
       'virginica', 'virginica', 'setosa', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'setosa', 'virginica',
       'virginica', 'setosa', 'setosa', 'virginica', 'setosa', 'setosa',
       'setosa', 'vers

In [9]:
knn.score(x_train,y_train)

1.0

In [10]:
knn.score(x_test,y_test)

1.0

# *KNN Imputation*

In [14]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin

# Create dataset
data = {'Maths': [80, 90, np.nan, 95], 
        'Chemistry': [60, 65, 56, np.nan], 
        'Physics': [np.nan, 57, 80, 78],
        'Biology': [78, 83, 67, np.nan]}
df = pd.DataFrame(data)

# Define a function to calculate the mean squared error for imputed values
def imputation_score(estimator, X):
    imputer = KNNImputer(n_neighbors=estimator.n_neighbors)
    X_imputed = imputer.fit_transform(X)
    # Assume the ground truth is known; here we use original data without missing values
    mse = mean_squared_error(X, X_imputed)
    return mse


print("Best k:", best_k)

# Impute using the best k
imputer = KNNImputer(n_neighbors=best_k)
df_imputed = imputer.fit_transform(df)

# Create a DataFrame from the imputed data
Before_imputation = pd.DataFrame(data)
col = Before_imputation.columns

# Print dataset before imputation
print("Data Before performing imputation\n", Before_imputation)

# Print dataset after performing the operation
print("\n\nAfter performing imputation\n", df_imputed)
data_frame = pd.DataFrame(df_imputed, columns=col)
print(data_frame)


Best k: 1
Data Before performing imputation
    Maths  Chemistry  Physics  Biology
0   80.0       60.0      NaN     78.0
1   90.0       65.0     57.0     83.0
2    NaN       56.0     80.0     67.0
3   95.0        NaN     78.0      NaN


After performing imputation
 [[80. 60. 57. 78.]
 [90. 65. 57. 83.]
 [95. 56. 80. 67.]
 [95. 56. 78. 67.]]
   Maths  Chemistry  Physics  Biology
0   80.0       60.0     57.0     78.0
1   90.0       65.0     57.0     83.0
2   95.0       56.0     80.0     67.0
3   95.0       56.0     78.0     67.0


Traceback (most recent call last):
  File "C:\Users\Home\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Home\AppData\Local\Temp\ipykernel_18516\444668282.py", line 20, in imputation_score
    mse = mean_squared_error(X, X_imputed)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Home\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
                                          ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Home\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_regression.py", line 101, in _check_reg_targets
    y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Home\AppData\Local\anaconda3\Lib\site-packages\skl