# Importing necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import miceforest as mf
from sklearn.impute import SimpleImputer

# MICE (Random Forest)

In [None]:
def mice_forest(merged):
    """Impute the selected columns of ``merged`` with a miceforest kernel.

    The columns returned by ``df_columns_to_impute`` are fed to a
    ``MultipleImputedKernel`` (fixed ``random_state`` of 1987, every
    iteration saved), three MICE iterations are run, and the completed data
    for dataset index 2 is written back into ``merged``. Diagnostic plots
    and the per-iteration accuracy list are produced along the way.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame containing the columns to impute.

    Returns
    -------
    pandas.DataFrame
        Whatever ``merge_to_merged_csv`` returns for the completed data and
        ``merged``.
    """
    to_impute = df_columns_to_impute(merged)

    # Build the kernel with a fixed seed; all iterations are kept so they
    # can be inspected by the diagnostics below.
    imputation_kernel = mf.MultipleImputedKernel(
        to_impute,
        save_all_iterations=True,
        random_state=1987,
    )
    imputation_kernel.mice(3)

    # Completed data for dataset index 2 is the one that gets merged back.
    completed = imputation_kernel.complete_data(2)

    visualize_imputed(imputation_kernel)
    print_accuracy(imputation_kernel, completed)

    return merge_to_merged_csv(completed, merged)




In [None]:
#Visualises the imputed data

def visualize_imputed(kernel):
    """Draw the diagnostic plots exposed by the imputation kernel.

    Calls, in order, the kernel's ``plot_imputed_distributions``,
    ``plot_correlations``, ``plot_feature_importance`` and
    ``plot_mean_convergence`` methods. Nothing is returned.

    Parameters
    ----------
    kernel : object
        Imputation kernel providing the four plotting methods above.
    """
    # Density distribution of imputed variables against original distribution
    distribution_spacing = {"wspace": 0.3, "hspace": 0.3}
    kernel.plot_imputed_distributions(**distribution_spacing)

    # Correlation plot, followed by the feature-importance heatmap
    kernel.plot_correlations()
    heatmap_options = {"annot": True, "cmap": "YlGnBu", "vmin": 0, "vmax": 1}
    kernel.plot_feature_importance(**heatmap_options)

    # Convergence plot of all the imputed variables
    convergence_spacing = {"wspace": 0.3, "hspace": 0.4}
    kernel.plot_mean_convergence(**convergence_spacing)
    

In [None]:
#Prints accuracy of the imputed variable set

def print_accuracy(kernel, pm10):
    """Print, per iteration, how closely dataset 2 matches ``pm10`` on 'PM2.5'.

    For every iteration from 0 up to and including
    ``kernel.iteration_count()``, the completed data of dataset index 2 is
    compared element-wise with ``pm10`` on the 'PM2.5' column. The score is
    ``1 - mismatches / kernel.na_counts['PM2.5']`` rounded to two decimals.
    The resulting list is printed; nothing is returned.

    Parameters
    ----------
    kernel : object
        Imputation kernel exposing ``iteration_count()``, ``na_counts`` and
        ``complete_data(dataset=..., iteration=...)``.
    pm10 : pandas.DataFrame
        Reference frame with a 'PM2.5' column to compare against.
    """
    def _accuracy_at(step):
        # Number of missing target values recorded by the kernel.
        missing_total = kernel.na_counts['PM2.5']
        completed = kernel.complete_data(dataset=2, iteration=step)

        # Record the accuracy of the imputations of the target.
        mismatches = sum(completed['PM2.5'] != pm10['PM2.5'])
        return round(1 - mismatches / missing_total, 2)

    total_steps = kernel.iteration_count() + 1

    # One accuracy value per iteration, in iteration order.
    accuracy_by_iteration = [_accuracy_at(step) for step in range(total_steps)]
    print(accuracy_by_iteration)

# Using simple imputer with mean strategy

In [None]:
def simple_imputer(merged):
    """Fill missing values in the selected columns with each column's mean.

    The columns returned by ``df_columns_to_impute`` are imputed with
    scikit-learn's ``SimpleImputer`` (``strategy='mean'``) and written back
    into ``merged`` through ``merge_to_merged_csv``.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame containing the columns to impute.

    Returns
    -------
    pandas.DataFrame
        Whatever ``merge_to_merged_csv`` returns for the imputed data and
        ``merged``.

    Notes
    -----
    NOTE(review): per the scikit-learn documentation, a column that is
    entirely NaN is dropped by the imputer with the mean strategy; rebuilding
    the frame below would then fail on a shape mismatch — confirm the inputs
    never contain such a column.
    """
    pm10 = df_columns_to_impute(merged)

    impNumeric = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_values = impNumeric.fit_transform(pm10)

    # The imputer output is rebuilt with the original column labels and index:
    # without the labels the by-name lookups in merge_to_merged_csv raise
    # KeyError, and without the index the write-back cannot align on rows.
    pm10 = pd.DataFrame(imputed_values, columns=pm10.columns, index=pm10.index)

    # Return the result so callers can use it like mice_forest's.
    return merge_to_merged_csv(pm10, merged)
    

# Substituting imputed values in the merged csv

In [None]:
def merge_to_merged_csv(pm10, merged):
    """Copy the imputed columns from ``pm10`` into ``merged``.

    The columns 'PM10', 'PM2.5', 'NO2' and 'NOX as NO2' of ``merged`` are
    overwritten (or created) in place, in that order, with the same-named
    columns of ``pm10``.

    Parameters
    ----------
    pm10 : pandas.DataFrame
        Frame holding the imputed values for the four columns.
    merged : pandas.DataFrame
        Frame to update; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``merged`` object that was passed in.
    """
    for column in ('PM10', 'PM2.5', 'NO2', 'NOX as NO2'):
        merged[column] = pm10[column]
    return merged

# Selecting columns to impute

In [None]:
def df_columns_to_impute(merged):
    """Report null counts and return the columns that will be imputed.

    Prints the number of missing values for every column of ``merged`` that
    contains at least one null, then returns the fixed column selection
    'PM10', 'PM2.5', 'NO2' and 'NOX as NO2'.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame that must contain the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        ``merged`` restricted to the four columns, in the order listed.
    """
    # Count the missing values in each column that has any.
    has_nulls = merged.isnull().any()
    columns_with_nulls = merged.columns[has_nulls]
    null_counts = merged[columns_with_nulls].isnull().sum()
    print("Null Values per column", null_counts)

    # The selection is a fixed list; it does not depend on the null counts.
    columns_to_impute = ['PM10', 'PM2.5', 'NO2', 'NOX as NO2']
    return merged[columns_to_impute]
