In [1]:
import pandas as pd
import numpy as np
import math
import statistics
import shutil
import os
import warnings
import sys
import time
# Notebook-wide configuration.
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')
# Let pandas display frames without truncating rows or columns.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

In [2]:
# Folder that contains the (possibly nested) CSV files whose missing values are imputed.
input_folder = r"C:\Users\arshi\Desktop\Presentation\Input"

# Folder that receives a copy of every input CSV; the copies are imputed in place.
output_folder = r"C:\Users\arshi\Desktop\Presentation\Output"

# Complete dataset used as the reference when computing the NRMS value.
# Leave empty to skip the NRMS / run-time report.
original_dataset = r'C:\Users\arshi\Desktop\Project DM\BlackBoard\Complete datasets\Data_1.csv'

# CSV file that collects one result row per imputed file (specify the file name
# with extension - the file and its parent folder are created automatically).
analysis_file = r"C:\Users\arshi\Desktop\Presentation\Analysis\Analysis.csv"

# Write the header row of the analysis file. The file is opened in append mode so
# results of earlier runs are kept; the header is therefore written only when the
# file is new or empty, otherwise every run would add another header line.
if analysis_file and original_dataset:
    analysis_dir = os.path.dirname(analysis_file)
    if analysis_dir:
        os.makedirs(analysis_dir, exist_ok=True)
    header_needed = (not os.path.isfile(analysis_file)) or os.path.getsize(analysis_file) == 0
    with open(analysis_file, "a") as f:
        if header_needed:
            f.write("Dataset File,NRMS,Run time,Key Parameters" + "\n")

# Collect every CSV file located (recursively) under input_folder.
all_files = [
    os.path.join(path, file)
    for path, _, files in os.walk(input_folder)
    for file in files
    if file.endswith(".csv")
]

# Copy the collected files into output_folder. The folder is created first:
# when the destination does not exist, shutil.copy2 treats it as a file name
# and every source would overwrite the same single file.
os.makedirs(output_folder, exist_ok=True)
for source_file in all_files:
    shutil.copy2(source_file, output_folder)

# List the CSV files now present under output_folder (absolute paths).
# Non-CSV files are skipped because every listed file is later read with
# pd.read_csv and overwritten with the imputed data.
output_folder_files = [
    os.path.abspath(os.path.join(dirpath, filename))
    for dirpath, _, filenames in os.walk(output_folder)
    for filename in filenames
    if filename.endswith(".csv")
]

# Execute the class-center imputation for each CSV file in output_folder_files,
# overwrite the file with the imputed data and report NRMS, run time and counters.


def _class_statistics(df_complete_class):
    """Compute the reference statistics of one class from its complete rows (MODULE A).

    Parameters
    ----------
    df_complete_class : pandas.DataFrame
        Rows of a single class that contain no NaN. Must have at least one row.

    Returns
    -------
    tuple
        ``(class_center, class_std, threshold)`` where ``class_center`` is the
        list of column means (all columns, class-label column included),
        ``class_std`` is the list of ``np.std`` results for every column except
        the last one, and ``threshold`` is the median Euclidean distance of the
        complete rows to ``class_center``.
    """
    column_count = len(df_complete_class.columns)

    # calculate class center
    class_center = [df_complete_class.iloc[:, c].mean() for c in range(column_count)]

    # calculate standard deviation (np.std is applied to the pandas column on
    # purpose so the numbers stay identical to earlier runs of this notebook).
    # NOTE(review): confirm whether population or sample std is intended.
    class_std = [np.std(df_complete_class.iloc[:, c]) for c in range(column_count - 1)]

    # calculate euclidean distance of every complete row to the class center
    distance = [
        np.linalg.norm(class_center - df_complete_class.iloc[r, :].to_numpy())
        for r in range(len(df_complete_class.index))
    ]

    # calculate threshold
    threshold = np.median(distance)
    return class_center, class_std, threshold


def _impute_incomplete_rows(df_incomplete_class, class_center, class_std, threshold):
    """Fill every NaN cell of ``df_incomplete_class`` in place (MODULE B).

    Every missing cell is first replaced by the class center of its column and
    the Euclidean distance of the filled row to the class center is compared
    with ``threshold``:

    * a row with exactly one missing value whose distance is >= ``threshold``
      gets ``class_center - class_std`` for that cell instead;
    * a row with several missing values keeps the class-center fill.

    Values are written with a single ``.iloc[row, col]`` assignment. Chained
    indexing (``.iloc[row, :].iloc[col] = value``) may write to a temporary
    copy, in which case the imputed value is silently lost.

    Parameters
    ----------
    df_incomplete_class : pandas.DataFrame
        Rows of one class that contain at least one NaN; modified in place.
    class_center : list
        Column means of the complete rows of the class.
    class_std : list
        Column standard deviations of the complete rows (no entry for the
        last, class-label, column).
    threshold : float
        Distance limit as returned by ``_class_statistics``.

    Returns
    -------
    int
        Number of rows whose distance after the class-center fill was
        >= ``threshold``.
    """
    overhead_rows = 0
    for i in range(len(df_incomplete_class.index)):
        # positions of the missing values of this sample
        missing = np.flatnonzero(df_incomplete_class.iloc[i, :].isna().to_numpy())
        if len(missing) == 0:
            continue

        # fill every missing value with the class center
        for j in missing:
            df_incomplete_class.iloc[i, j] = class_center[j]

        # euclidean distance of the filled sample to the class center
        filled_distance = np.linalg.norm(class_center - df_incomplete_class.iloc[i, :].to_numpy())

        # compare distance with the threshold
        if filled_distance >= threshold:
            overhead_rows += 1
            if len(missing) == 1:
                j = missing[0]
                df_incomplete_class.iloc[i, j] = class_center[j] - class_std[j]
            # NOTE(review): for samples with more than one missing value this
            # notebook previously contained a search over "center + std" /
            # "center - std" candidates, but it never produced a candidate (it
            # ran only after all NaN were already filled and the results of
            # DataFrame.append were discarded), so the class-center fill was
            # always what got saved. That effective behavior is kept here;
            # implement the candidate search deliberately if the method needs it.
    return overhead_rows


for n in range(len(output_folder_files)):
    csv_path = output_folder_files[n]

    # counters reported as "key parameters" for this file
    incomplete_class_iteration = 0
    complete_class_iteration = 0
    threshold_overhead_iterations = 0
    # becomes 1 when at least one class has no complete sample (NRMS is then reported as NA)
    validity = 0
    print(str(n+1) + ". Imputing data in " + csv_path + "...")

    # read the CSV file with no header (because the data provided has no header)
    df = pd.read_csv(csv_path, header=None)
    start_time = time.time()

    # the last column of the data holds the class labels
    lastcolumn = len(df.columns) - 1
    class_column = df.iloc[:, lastcolumn]

    # store unique class labels
    unique_list = sorted(pd.unique(class_column))
    # labels are compared as integers
    class_as_int = class_column.astype(int)

    file_changed = False

    # divide dataset into unique classes
    for class_label in unique_list:
# ----------------------------------------------MODULE A starts----------------------------------------------
        # dataset of one class, split into complete and incomplete samples
        df_class = df[class_as_int == int(class_label)]
        has_nan = df_class.isna().any(axis=1)
        df_complete_class = df_class[~has_nan]
        df_incomplete_class = df_class[has_nan].copy()
        incomplete_class_iteration += int(has_nan.sum())
        complete_class_iteration += int((~has_nan).sum())

        if len(df_complete_class.index) == 0:
            # set validity flag if there is no data in complete class;
            # the class cannot be imputed and its NaN values are left untouched
            validity = 1
            continue

        class_center, class_std, threshold = _class_statistics(df_complete_class)
#----------------------------------------------MODULE A ends----------------------------------------------

#----------------------------------------------MODULE B starts----------------------------------------------
        threshold_overhead_iterations += _impute_incomplete_rows(
            df_incomplete_class, class_center, class_std, threshold
        )

        # copy the imputed values back into the full dataframe (aligned on the row index)
        df.update(df_incomplete_class)
        file_changed = True
#----------------------------------------------MODULE B ends----------------------------------------------

    # save the imputed dataframe (once per file, only if at least one class was processed)
    if file_changed:
        df.to_csv(csv_path, index = False, header = False)

    # calculate NRMS, Execution time and Key parameters
    if original_dataset:
        imputed = pd.read_csv(csv_path, header=None).to_numpy()
        # the first row of the original dataset is skipped before the comparison
        original = pd.read_csv(original_dataset, header=None).iloc[1:].astype(float).to_numpy()
        num_nrms = np.linalg.norm(imputed - original)
        deno_nrms = np.linalg.norm(original)
        nrms = (num_nrms/deno_nrms)

        # measured here (not inside the analysis_file branch) so the console
        # report below also works when no analysis file is configured
        execution_time = time.time() - start_time

        # NRMS is not meaningful when some class could not be imputed
        nrms_text = "NA" if validity == 1 else str(nrms)

        if analysis_file:
            string_to_write = str(csv_path) + "," + nrms_text + "," + str(execution_time) + ",Complete Class Iteration: " + str(complete_class_iteration) + ". Incomplete class iterations: " + str(incomplete_class_iteration) + ". Threshold overhead iterations: " + str(threshold_overhead_iterations) + "\n"
            with open(analysis_file, "a") as f:
                f.write(string_to_write)

        console_output = "NRMS: " + nrms_text + "\n" + "Run time: " + str(execution_time) + "\nKey Parameteres:\n-Complete class iteration: " + str(complete_class_iteration) + "\n-Incomplete class iterations: " + str(incomplete_class_iteration) + "\n-Threshold overhead iterations: " + str(threshold_overhead_iterations) + "\n"
        print(console_output)

print("Total " + str(len(output_folder_files)) + " files imputed.")

1. Imputing data in C:\Users\arshi\Desktop\Presentation\Output\Data_1_AW_10%.csv...
NRMS: 0.278360811807391
Run time: 1.4304254055023193
Key Parameteres:
-Complete class iteration: 800
-Incomplete class iterations: 200
-Threshold overhead iterations: 57

Total 1 files imputed.
