In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.neighbors import KNNi

In [17]:
df = pd.read_csv('/Users/andrescrucettanieto/Documents/GitHub/missing_data_project/data/toy_data.csv')

In [34]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import nan_euclidean_distances

'''
To find out the weights following steps have to be taken:

1) Choose missing value to fill in the data.
2) Select the values in a row
3) Choose the number of neighbors you want to work with (ideally 2-5)
4)Calculate Euclidean distance from all other data points corresponding to each other in the row.
5) Select the smallest 2 and average out.

Sources used:
- https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/
- https://www.numpyninja.com/post/mice-and-knn-missing-value-imputations-through-python
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.nan_euclidean_distances.html
- https://chrisalbon.com/code/machine_learning/preprocessing_structured_data/imputing_missing_class_labels_using_k-nearest_neighbors/

'''

def normalize_data(df):
    '''
    Normalizes the data in X
    
    Input:
        - df: dataframe
    Output:
        - df: normalized dataframe
    '''
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    return df

def kSmallest(arr, k):
    '''
    Gets the k smallest values in an array
    
    Input:
        - arr: array
        - k: number of smallest values to return
    Output:
        - k_smallest: k smallest values in the array
    '''
    # Sort the given array 
    arr_i = arr.copy()
    arr_i.sort()
  
    # Return k'th element in the sorted array 
    # Skips the first element since it contains 
    # the value 0
    return arr_i[1:k+1]

def KNN_imputation(df, k=5):
    '''
    Recieves a dataframe df and imputes the missing values
    using a K Nearest Neighbors imputer algorithm.
    
    Input:
        - df: dataframe
        - k: number of neighbors to use
    Output:
        - df: dataframe with missing values imputed
    '''
    # Extracting the numerical columns
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    
    # Normalizing the data
    numeric_df = normalize_data(numeric_df)
    
    # Iterating over each column
    for j in range(len(numeric_df.columns)):
        lst_missing = numeric_df.iloc[:,j][numeric_df.iloc[:,j].isnull()].index.to_list()
        
        # Iterating over missing rows
        for i in lst_missing:
            # Get the comparison row
            euclidean_row = numeric_df.iloc[i]
            
            # Take euclidean distances of the other columns
            euclidean_distances = nan_euclidean_distances(numeric_df, [euclidean_row.to_list()])
            
            # Flatten the euclidean distance array
            euc_flattened = euclidean_distances.flatten(order='F')
            
            # Get the k smallest distances
            k_nearest = kSmallest(euc_flattened, k)
            k_nearest_indices = np.where(np.in1d(euc_flattened, k_nearest))[0]
            
            # Get mean of the k nearest neighbors
            k_mean = numeric_df.iloc[k_nearest_indices].iloc[:,j].mean()
            
            # Assigning the value to the nan row cell
            print(i,j)
            numeric_df.iloc[i].iloc[j]=k_mean
    return numeric_df
    

In [37]:
numeric_df = df.select_dtypes(include=['float64', 'int64'])

In [40]:
normalize_data(numeric_df)

Unnamed: 0,Numerical A,Numerical B,Output Value 1
0,0.040629,0.296947,0.0
1,0.0,0.489844,0.046875
2,,0.280872,
3,0.314548,0.665919,0.585938
4,0.716907,,0.035156
5,1.0,0.182928,
6,,0.0,0.878906
7,0.730013,0.524486,0.0
8,0.828309,1.0,1.0


In [36]:
KNN_imputation(df, k=4)

2 0
6 0
4 1
2 2
5 2


Unnamed: 0,Numerical A,Numerical B,Output Value 1
0,0.040629,0.296947,0.0
1,0.0,0.489844,0.046875
2,0.442661,0.280872,0.228516
3,0.314548,0.665919,0.585938
4,0.716907,0.321308,0.035156
5,1.0,0.182928,0.285645
6,0.449459,0.0,0.878906
7,0.730013,0.524486,0.0
8,0.828309,1.0,1.0


In [41]:
import numpy as np
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=4)
imputer.fit_transform(normalize_data(numeric_df))

array([[0.0406291 , 0.29694704, 0.        ],
       [0.        , 0.48984424, 0.046875  ],
       [0.44266055, 0.28087227, 0.23144531],
       [0.31454784, 0.665919  , 0.5859375 ],
       [0.71690695, 0.41757009, 0.03515625],
       [1.        , 0.18292835, 0.47851562],
       [0.33879423, 0.        , 0.87890625],
       [0.73001311, 0.52448598, 0.        ],
       [0.82830931, 1.        , 1.        ]])

In [30]:
import numpy as np
from sklearn.impute import KNNImputer
X = np.array([[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]])
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

In [29]:
print(X)

[[ 1.  2. nan]
 [ 3.  4.  3.]
 [nan  6.  5.]
 [ 8.  8.  7.]]


AttributeError: 'numpy.ndarray' object has no attribute 'to_pandas'