# Importing Libraries:-

In [1]:
import pandas as pd
import numpy as np
import random

# Reading CSV File:-

In [2]:
X = pd.read_csv("data.csv")
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,0,33.6,0.627,50
1,1,85,66,0,26.6,0.351,31
2,8,183,64,0,23.3,0.672,32
3,1,89,66,94,28.1,0.167,21
4,0,137,40,168,43.1,2.288,33


In [3]:
columns = X.columns # get columns name
columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

# Min-Max Normalization:-

In [4]:
def normalize(df):
    normalized = df.copy()
    for features in columns:
        max_value = df[features].max()
        min_value = df[features].min()
        normalized[features] = (df[features] - min_value) / (max_value - min_value)
    return normalized

In [5]:
normalize_X = normalize(X)
normalize_X.to_csv("data_scaled.csv")
normalize_X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.533333,0.928934,0.581818,0.0,0.468813,0.26075,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.0,0.695431,0.363636,0.198582,0.867203,1.0,0.307692


# Randomly Creating Missingness:-

In [6]:
def missing_data(dataframe, normalize_dataframe):
    df1 = dataframe.copy()
    df2 = normalize_dataframe.copy()
    
    features = columns
    
    for col in dataframe[features]:
        df_split = dataframe.sample(frac=0.5,random_state=200)  # select 50% rows randomly
        df_split.reset_index()
        indices = df_split.sample(frac=0.5, replace=True).index # take random rows of those 50%
        df1.loc[indices,col] = np.nan # set nan at this positions
        df2.loc[indices, col] = np.nan
    return df1, df2                                                              

In [7]:
X_original_missing,normalize_X_missing = missing_data(X, normalize_X)
X_original_missing.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,,183.0,64.0,0.0,,,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,,,40.0,,43.1,2.288,33.0


In [8]:
normalize_X_missing.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,,0.928934,0.581818,0.0,,,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,,,0.363636,,0.867203,1.0,0.307692


In [9]:
'''
dataframe convert into array.
'''
ori_data = X_original_missing.values
normdata = normalize_X_missing.values

In [10]:
attributes = ori_data.shape[1] # 7 cols
instances = ori_data.shape[0] # 101 rows

In [11]:
null_indexes = np.argwhere(np.isnan(ori_data)) # row_col combinations who has null values

# Imputation:-

## Mean Method:-

### mean imputation function

In [12]:
def mean(dataframe):
    features = columns
    for col in features:
        avg = dataframe[col].mean()
        dataframe[col].replace(np.nan, avg, inplace = True)
    return dataframe

#### ==> Original dataframe's imputation with mean

In [13]:
X_mean_impute = X_original_missing.copy()
mean(X_mean_impute) # mean imputation
X_mean_impute.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,4.384615,183.0,64.0,0.0,31.587654,0.502859,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,4.384615,118.469136,40.0,81.566265,43.1,2.288,33.0


#### ==> Scaled dataframe's imputation with mean

In [14]:
normalize_X_mean_impute = normalize_X_missing.copy()
mean(normalize_X_mean_impute) # normalize-mean imputation
normalize_X_mean_impute.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.292308,0.928934,0.581818,0.0,0.635566,0.183376,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.292308,0.601366,0.363636,0.096414,0.867203,1.0,0.307692


### Calculating Euclidean Distance For KNN:-

In [15]:
def euclidean_distance(i,j): 
    column = len(i)
    distance = 0
    for a in range(column):
        if np.isnan(i[a]): # if there are nan in missing index then skip it
            continue
        distance = distance + (i[a]-j[a])**2
    return np.sqrt(distance)

### KNN:-


In [16]:
'''
Reference of KNN :- https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/

'''

def sortSecond(val): 
    return val[1] # get value of 1st position

def knn(missingdata,k):
    fill_value = missingdata.copy()
    for j in null_indexes: # null_indexes contains all the row_col combinations of missing value
        distances = []
        for i in range(instances):
            if(i==j[0]): # if two rows are same then skip it
                continue
            else:
                if np.isnan(missingdata[i]).any(): # if row contains null anywhere then it skips
                    continue
                d = euclidean_distance(missingdata[j[0]], missingdata[i])  # distance only calculate for complete case
                distances.append([missingdata[i][j[1]], d] ) 
        distances.sort(key = sortSecond) # call sortSecond function and sort the list on the basis of value of 1st position in this case it is "distance"
        distances = np.array(distances)
        count = len(distances)
        nearest = [] # store the nearest neighbor in list
        for l in range(count):
            nearest.append(distances[l,0])    
        x = nearest[:k] #  slice list on the value of k
        mean = sum(x) / len(x)
        fill_value[j[0]][j[1]] = mean  # replace the nan with mean value  
    return fill_value

#### ==> Original's dataframe imputation with k=1,3,5

In [17]:
knn1 = knn(ori_data, 1)
knn1_df = pd.DataFrame(knn1)
knn1_df.columns = columns
knn1_df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,0.0,183.0,64.0,0.0,42.0,1.893,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,3.0,78.0,40.0,88.0,43.1,2.288,33.0
5,5.0,116.0,44.0,0.0,22.4,0.201,22.0
6,3.0,78.0,50.0,88.0,31.0,0.248,26.0
7,10.0,115.0,0.0,0.0,35.3,0.134,29.0
8,2.0,197.0,70.0,543.0,30.5,0.158,53.0
9,8.0,99.0,84.0,0.0,35.4,0.388,50.0


In [18]:
knn3 = knn(ori_data, 3)
knn3_df = pd.DataFrame(knn3)
knn3_df.columns = columns
knn3_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,3.666667,183.0,64.0,0.0,39.666667,1.217333,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,2.333333,97.333333,40.0,41.333333,43.1,2.288,33.0


In [19]:
knn5 = knn(ori_data, 5)
knn5_df = pd.DataFrame(knn5)
knn5_df.columns = columns
knn5_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,5.0,183.0,64.0,0.0,39.64,0.872,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,2.0,91.4,40.0,24.8,43.1,2.288,33.0


#### ==> Scaled dataframe imputation with k=1,3,5

In [20]:
scaled_knn1 = knn(normdata, 1)
scaled_knn1_df = pd.DataFrame(scaled_knn1)
scaled_knn1_df.columns = columns
scaled_knn1_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.066667,0.928934,0.581818,0.0,0.784708,0.512351,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.0,0.913706,0.363636,0.0,0.867203,1.0,0.307692


In [21]:
scaled_knn3 = knn(normdata, 3)
scaled_knn3_df = pd.DataFrame(scaled_knn3)
scaled_knn3_df.columns = columns
scaled_knn3_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.244444,0.928934,0.581818,0.0,0.798122,0.510217,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.022222,0.827411,0.363636,0.0,0.867203,1.0,0.307692


In [22]:
scaled_knn5 = knn(normdata, 5)
scaled_knn5_df = pd.DataFrame(scaled_knn5)
scaled_knn5_df.columns = columns
scaled_knn5_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.306667,0.928934,0.581818,0.0,0.727565,0.32946,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.093333,0.702538,0.363636,0.066667,0.867203,1.0,0.307692


### Calculating Euclidean Distance For Weighted KNN:-

In [23]:
def weighted_euclidean_distance(i,j):
    attr = len(i)
    distance = 0
    for k in range(attr):
        if np.isnan(i[k]):
            continue
        distance = distance + np.square(i[k]-j[k])    
    return np.sqrt(distance)*(1-distance)

### Weighted KNN:-

In [24]:
def sortSecond(val): 
    return val[1] # get value of 1st position

def weight_knn(missingdata,k):
    fill_value = missingdata.copy()
    for j in null_indexes:  # null_indexes contains all the row_col combinations of missing value
        distances = [] 
        for i in range(instances):
            if(i==j[0]): # if two rows are same then skip it
                continue
            else:
                if np.isnan(missingdata[i]).any(): # if row contains null anywhere then it skips
                    continue
                d = weighted_euclidean_distance(missingdata[j[0]], missingdata[i]) # distance only calculate for complete case
                distances.append([missingdata[i][j[1]], d])
        distances.sort(key = sortSecond) # call sortSecond function and sort the list on the basis of value of 1st position in this case it is "distance"
        distances = np.array(distances)
        count = len(distances)
        nearest = [] # store the nearest neighbor in list
        for l in range(count):
            nearest.append(distances[l,0])    
        x = nearest[:k] # slice list on the value of k
        mean = sum(x) / len(x)
        fill_value[j[0]][j[1]] = mean    # replace the nan with mean value
    return fill_value

#### ==> Original's dataframe imputation with weighted k=1,3,5

In [25]:
weight_knn1 = weight_knn(ori_data, 1)
weight_knn1_df = pd.DataFrame(weight_knn1)
weight_knn1_df.columns = columns
weight_knn1_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,1.0,183.0,64.0,0.0,30.1,0.398,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,5.0,137.0,40.0,0.0,43.1,2.288,33.0


In [26]:
weight_knn3 = weight_knn(ori_data, 3)
weight_knn3_df = pd.DataFrame(weight_knn3)
weight_knn3_df.columns = columns
weight_knn3_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,3.333333,183.0,64.0,0.0,31.766667,0.424667,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,7.666667,128.666667,40.0,48.666667,43.1,2.288,33.0


In [27]:
weight_knn5 = weight_knn(ori_data, 5)
weight_knn5_df = pd.DataFrame(weight_knn5)
weight_knn5_df.columns = columns
weight_knn5_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,0.0,26.6,0.351,31.0
2,4.2,183.0,64.0,0.0,33.62,0.3518,32.0
3,1.0,89.0,66.0,94.0,28.1,0.167,21.0
4,6.2,117.4,40.0,29.2,43.1,2.288,33.0


#### Scaled dataframe imputation with weighted k=1,3,5

In [28]:
weight_scaled_knn1 = weight_knn(normdata, 1)
weight_scaled_knn1_df = pd.DataFrame(weight_scaled_knn1)
weight_scaled_knn1_df.columns = columns
weight_scaled_knn1_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.066667,0.928934,0.581818,0.0,0.605634,0.135407,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.266667,0.680203,0.363636,0.0,0.867203,1.0,0.307692


In [29]:
weight_scaled_knn3 = weight_knn(normdata, 3)
weight_scaled_knn3_df = pd.DataFrame(weight_scaled_knn3)
weight_scaled_knn3_df.columns = columns
weight_scaled_knn3_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.266667,0.928934,0.581818,0.0,0.71831,0.282251,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.377778,0.56176,0.363636,0.0,0.867203,1.0,0.307692


In [30]:
weight_scaled_knn5 = weight_knn(normdata, 5)
weight_scaled_knn5_df = pd.DataFrame(weight_scaled_knn5)
weight_scaled_knn5_df.columns = columns
weight_scaled_knn5_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.4,0.751269,0.654545,0.0,0.676056,0.240165,0.74359
1,0.066667,0.431472,0.6,0.0,0.535211,0.113907,0.25641
2,0.186667,0.928934,0.581818,0.0,0.722736,0.338335,0.282051
3,0.066667,0.451777,0.6,0.111111,0.565392,0.029735,0.0
4,0.4,0.682234,0.363636,0.162884,0.867203,1.0,0.307692


# MSE:-

In [31]:
def mean_square_error_calculation(x, y):
    total = 0
    for row in range(len(x)):
        error = np.square(x[row] - y[row])
        total += error
    mse = total/len(x)
    return mse

def MSE(actual_df, impute_df):
    mse = []
    for i in columns:
        cal = mean_square_error_calculation(actual_df[i], impute_df[i])
        mse.append(cal)
    return mse

#### MSE for mean imputation original data:-

In [32]:
actual_mse_mean = MSE(X, X_mean_impute)

#### MSE for k=1,3,5 imputation original data:-

In [33]:
actual_mse_knn1 = MSE(X, knn1_df)


In [34]:
actual_mse_knn3 = MSE(X, knn3_df)


In [35]:
actual_mse_knn5 = MSE(X, knn5_df)


#### MSE for weighted k=1,3,5 imputation original data:-

In [36]:
actual_mse_Wknn1 = MSE(X, weight_knn1_df)

In [37]:
actual_mse_Wknn3 = MSE(X, weight_knn3_df)

In [38]:
actual_mse_Wknn5 = MSE(X, weight_knn5_df)

In [39]:
col = ["Mean", "KNN-1", "KNN-3", "KNN-5", "WEIGHTED-KNN-1", "WEIGHTED-KNN-3", "WEIGHTED-KNN-5"]
results_original_data = pd.DataFrame([actual_mse_mean, actual_mse_knn1, actual_mse_knn3, actual_mse_knn5, actual_mse_Wknn1, actual_mse_Wknn3, actual_mse_Wknn5])
results_original_data = results_original_data.transpose()
results_original_data.columns = col
results_original_data.index = columns
results_original_data.head(7)

Unnamed: 0,Mean,KNN-1,KNN-3,KNN-5,WEIGHTED-KNN-1,WEIGHTED-KNN-3,WEIGHTED-KNN-5
Pregnancies,3.919445,5.782178,4.354235,3.856634,7.80198,4.871287,4.098218
Glucose,257.450134,332.267327,243.244224,224.93901,1011.910891,799.170517,677.657822
BloodPressure,111.895935,207.188119,154.682068,133.999604,117.059406,115.10121,115.844356
Insulin,831.236319,2395.50495,2624.115512,1695.857822,22794.465347,15481.167217,10482.970693
BMI,25.491674,42.097228,31.913993,29.747739,24.847426,26.503047,30.273648
DiabetesPedigreeFunction,0.010814,0.038866,0.013147,0.01084,0.009407,0.007191,0.012139
Age,23.832963,34.316832,24.776678,25.301782,113.782178,71.062706,47.007525


In [40]:
results_original_data.to_csv("results_original_data.csv")

#### MSE for mean imputation normalize data:-

In [41]:
normalize_mse_mean = MSE(normalize_X, normalize_X_mean_impute)

#### MSE for k=1,3,5 imputation normalize data:-

In [42]:
normalize_mse_knn1 = MSE(normalize_X, scaled_knn1_df)


In [43]:
normalize_mse_knn3 = MSE(normalize_X, scaled_knn3_df)


In [44]:
normalize_mse_knn5 = MSE(normalize_X, scaled_knn5_df)


#### MSE for weighted k=1,3,5 imputation normalize data:-

In [45]:
normalize_mse_Wknn1 = MSE(normalize_X, weight_scaled_knn1_df)


In [46]:
normalize_mse_Wknn3 = MSE(normalize_X, weight_scaled_knn3_df)


In [47]:
normalize_mse_Wknn5 = MSE(normalize_X, weight_scaled_knn5_df)


In [48]:
norm_col = ["Mean", "KNN-1", "KNN-3", "KNN-5", "WEIGHTED-KNN-1", "WEIGHTED-KNN-3", "WEIGHTED-KNN-5"]
results_scaled_data = pd.DataFrame([normalize_mse_mean, normalize_mse_knn1, normalize_mse_knn3, normalize_mse_knn5, normalize_mse_Wknn1, normalize_mse_Wknn3, normalize_mse_Wknn5])
results_scaled_data = results_scaled_data.transpose()
results_scaled_data.columns = norm_col
results_scaled_data.index = columns
results_scaled_data.head(7)

Unnamed: 0,Mean,KNN-1,KNN-3,KNN-5,WEIGHTED-KNN-1,WEIGHTED-KNN-3,WEIGHTED-KNN-5
Pregnancies,0.01742,0.023014,0.018365,0.016053,0.026271,0.017983,0.018389
Glucose,0.006634,0.008965,0.006511,0.005229,0.025159,0.017028,0.010843
BloodPressure,0.009248,0.013533,0.009877,0.007963,0.012551,0.009123,0.009137
Insulin,0.001161,0.006078,0.004228,0.002905,0.02461,0.010176,0.004905
BMI,0.01032,0.017573,0.013329,0.012204,0.012186,0.012206,0.012025
DiabetesPedigreeFunction,0.002263,0.004274,0.001668,0.001594,0.011094,0.001674,0.001331
Age,0.015669,0.027685,0.022843,0.016729,0.080015,0.029974,0.021293


In [49]:
results_scaled_data.to_csv("results_scaled_data.csv")