In [62]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import itertools
import scipy.stats as stats 

def turn_df_numeric(df, one_hot_fts=None, random_state=43):
    """One-hot encode the categorical columns of ``df`` and impute missing values.

    The frame is passed through a ``ColumnTransformer`` (``OneHotEncoder`` on
    ``one_hot_fts``, every other column passed through unchanged) followed by
    an ``IterativeImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain every column named in ``one_hot_fts``.
    one_hot_fts : list of str, optional
        Columns to one-hot encode. Defaults to the housing categorical columns
        this notebook has always used.
    random_state : int, optional
        Seed forwarded to ``IterativeImputer`` (default 43, the original value).

    Returns
    -------
    pandas.DataFrame
        Transformed data with columns named by ``pipe.get_feature_names_out()``
        and the same index as ``df``, so boolean masks computed on ``df`` can be
        used directly with ``.loc`` on the result.
    """
    if one_hot_fts is None:
        one_hot_fts = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

    preprocessor = ColumnTransformer(
        [('one_hot', OneHotEncoder(), list(one_hot_fts))],
        remainder='passthrough',
        # Always hand the imputer a dense array, regardless of how sparse the
        # one-hot block turns out to be.
        sparse_threshold=0,
        verbose_feature_names_out=False
    )

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('imputer', IterativeImputer(random_state=random_state))
    ])

    imputed_values = pipe.fit_transform(df)

    # Keep the caller's index: the default RangeIndex would break alignment with
    # masks built from ``df`` whenever ``df`` carries a non-default index.
    return pd.DataFrame(
        imputed_values,
        columns=pipe.get_feature_names_out(),
        index=df.index)

# Goal: for MAR, compare the average number of bedrooms, parking, and airconditioning
# over the rows where the value was missing.
# For MNAR, compare the average area after predicting with the iterative imputer
# against the original values.

# Results DataFrame columns:
#   Run # (0-9)
#   Feature Name (bedrooms, parking, etc.)
#   Type of missing (none, MAR, MNAR; "none" is stored as 'original')
#   Type of imputation method (none, Synthpop, Synthesizer; "none" is stored as 'original')
# Run 0 of the housing data: the complete version plus its MAR and MNAR variants.
nomissing_data = pd.read_csv("Datasets/housing_original_train_0.csv")
original_MAR = pd.read_csv("Datasets/housing_mar_train_0.csv")
original_MNAR = pd.read_csv("Datasets/housing_mnar_train_0.csv")

# Encode and impute the MAR variant so imputed values can be compared with the truth.
original_MAR_imputed = turn_df_numeric(original_MAR)

# bedrooms: mean over the rows that are missing in the MAR file,
# taken once from the complete data and once from the imputed data.
bedrooms_missing = original_MAR['bedrooms'].isna()
true_mean_bedrooms = nomissing_data.loc[bedrooms_missing, 'bedrooms'].mean()
imputed_mean_bedrooms = original_MAR_imputed.loc[bedrooms_missing, 'bedrooms'].mean()

# parking: same comparison.
parking_missing = original_MAR['parking'].isna()
true_mean_parking = nomissing_data.loc[parking_missing, 'parking'].mean()
imputed_mean_parking = original_MAR_imputed.loc[parking_missing, 'parking'].mean()

In [22]:
runs = range(10)  # 0-9
features = ['bedrooms', 'parking', 'stories']
missing_types = ['mar', 'mnar']
imputation_methods = ['original', 'synthesizer', 'synthpop']

# Every (run, feature, missingness, method) combination; 'original' missingness
# is the slot used for the reference value taken from the complete data.
combinations = list(itertools.product(runs, features, missing_types + ['original'], imputation_methods))

# One row per combination. 'Mean Value' starts as NaN and is filled in below;
# combinations that are never computed keep NaN.
results_df = pd.DataFrame(combinations, columns=['Run #', 'Feature Name', 'Type of missing', 'Type of imputation method'])
results_df['Mean Value'] = np.nan


def _row_index(results, run, feature, missing_type, imputation):
    """Return the index labels of the rows of ``results`` matching one combination."""
    mask = (
        (results['Run #'] == run) &
        (results['Feature Name'] == feature) &
        (results['Type of missing'] == missing_type) &
        (results['Type of imputation method'] == imputation)
    )
    return results.index[mask.to_numpy()]


for run in runs:
    for imputation in imputation_methods:
        for missing_type in missing_types:
            # Files from a synthetic-data method carry the method name as a prefix.
            prefix = "" if imputation == 'original' else imputation + "_"
            file_path = f"Datasets/{prefix}housing_{missing_type}_train_{run}.csv"
            synth_df = pd.read_csv(file_path)

            # The imputation does not depend on the feature being summarised,
            # so fit it once per file instead of once per feature.
            imputed_synth_df = turn_df_numeric(synth_df)

            # When imputing the real MAR data we also record the true mean of the
            # values that went missing, to see how close the imputed mean gets.
            is_mar_baseline = (imputation == "original" and missing_type == "mar")
            if is_mar_baseline:
                true_df = pd.read_csv("Datasets/housing_original_train_" + str(run) + ".csv")

            for feature in features:
                missing_idx = pd.isna(synth_df[feature])

                if is_mar_baseline:
                    true_mean = np.mean(true_df.loc[missing_idx, feature])
                    true_row_index = _row_index(results_df, run, feature, "original", "original")
                    results_df.loc[true_row_index, 'Mean Value'] = true_mean

                # Mean of the imputed values over the rows that were missing in the file.
                mean_feature = np.mean(imputed_synth_df.loc[missing_idx, feature])
                row_index = _row_index(results_df, run, feature, missing_type, imputation)
                results_df.loc[row_index, 'Mean Value'] = mean_feature



In [46]:
# ANALYSIS
# Reference rows: everything computed with the "original" imputation method, and
# within those the rows whose missingness is also "original".
is_original_method = results_df['Type of imputation method'] == "original"
original_with_missing = results_df.loc[is_original_method].drop(columns='Type of imputation method')
original_no_missing = results_df.loc[
    is_original_method & (results_df['Type of missing'] == "original")
].drop(columns=['Type of imputation method', 'Type of missing'])

# Attach both reference means to every row:
#   Mean Value_original_missing     <- same run / feature / missingness, "original" method
#   Mean Value_original_no_missing  <- same run / feature, "original" missingness and method
new_results = (
    results_df
    .merge(original_with_missing, how="left", on=['Run #', 'Feature Name', 'Type of missing'], suffixes=("", "_original_missing"))
    .merge(original_no_missing, how="left", on=['Run #', 'Feature Name'], suffixes=('', '_original_no_missing'))
)

# Keep only rows that have a mean value. The explicit copy makes the column
# assignments below write to an independent frame rather than a filtered slice
# (avoids SettingWithCopyWarning).
new_results = new_results[new_results["Mean Value"].notna()].copy()
new_results["Difference Between No Missing"] = np.abs(new_results["Mean Value"] - new_results["Mean Value_original_no_missing"])
new_results["Difference Between Original (with MAR/MNAR)"] = np.abs(new_results["Mean Value"] - new_results["Mean Value_original_missing"])

# Aggregate across runs: mean and standard deviation per feature / missingness / method.
group_keys = ['Feature Name', 'Type of missing', 'Type of imputation method']
grouped_results = new_results.groupby(by=group_keys)
new_results_mean = grouped_results.mean()
new_results_std = grouped_results.std()

new_results_mean.to_csv("AverageMissing.csv")
new_results_std.to_csv("AverageMissingSTD.csv")

# Summary of the distance to the "original"-method result. The group keys are kept
# both as the index and as ordinary columns, matching the layout previously written
# to Both.csv.
summary_index = new_results_mean.index
both = pd.DataFrame(
    {
        "Feature Name": summary_index.get_level_values(0),
        "Type of missing": summary_index.get_level_values(1),
        "Type of imputation method": summary_index.get_level_values(2),
        "Mean Value": new_results_mean["Difference Between Original (with MAR/MNAR)"],
        "STD": new_results_std["Difference Between Original (with MAR/MNAR)"],
    },
    index=summary_index,
)

both.to_csv("Both.csv")

In [64]:
# Compare the "original"-method means against the reference from the complete data.
# NOTE: relies on `results_df` and `original_no_missing` from the earlier cells.
only_original = results_df.loc[(results_df['Type of imputation method'] == "original")].drop(columns='Type of imputation method')
imputation_comparison = only_original.merge(original_no_missing, how="left", on=['Run #', 'Feature Name'], suffixes=("", "_original_no_missing"))
imputation_comparison["Difference"] = np.abs(imputation_comparison["Mean Value"] - imputation_comparison["Mean Value_original_no_missing"])

# Aggregate across runs, per feature and missingness type.
comparison_groups = imputation_comparison.groupby(by=['Feature Name', 'Type of missing'])
imputation_comparison_mean = comparison_groups.mean()
imputation_comparison_std = comparison_groups.std()
# Number of runs that contributed a difference to each group; used instead of a
# hard-coded sample size so the test stays correct if the number of runs changes.
runs_per_group = comparison_groups["Difference"].count()

# The group keys are kept both as the index and as ordinary columns, matching the
# layout this cell has always displayed.
comparison_index = imputation_comparison_mean.index
imputation_comparison_all = pd.DataFrame(
    {
        "Feature Name": comparison_index.get_level_values(0),
        "Type of missing": comparison_index.get_level_values(1),
        "Mean Value": imputation_comparison_mean["Difference"],
        "STD": imputation_comparison_std["Difference"],
    },
    index=comparison_index,
)

# One-sample t statistic of the mean absolute difference against zero. Groups with
# zero spread (the "original" rows) give 0/0 and therefore NaN.
imputation_comparison_all["T-Statistic"] = imputation_comparison_all["Mean Value"] / (imputation_comparison_all["STD"] / np.sqrt(runs_per_group))
# One-sided p-value. The survival function equals 1 - cdf but keeps precision for
# very small tail probabilities.
imputation_comparison_all['P-value'] = stats.t.sf(imputation_comparison_all['T-Statistic'], runs_per_group - 1)
imputation_comparison_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Feature Name,Type of missing,Mean Value,STD,T-Statistic,P-value
Feature Name,Type of missing,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bedrooms,mar,bedrooms,mar,0.226642,0.056912,12.593202,2.54981e-07
bedrooms,mnar,bedrooms,mnar,0.150747,0.052596,9.063495,4.03029e-06
bedrooms,original,bedrooms,original,0.0,0.0,,
parking,mar,parking,mar,0.097115,0.067041,4.58081,0.0006631429
parking,mnar,parking,mnar,0.419007,0.050508,26.23391,4.101931e-10
parking,original,parking,original,0.0,0.0,,
stories,mar,stories,mar,0.035174,0.023623,4.708508,0.0005534166
stories,mnar,stories,mnar,0.132316,0.07184,5.824364,0.0001258418
stories,original,stories,original,0.0,0.0,,


In [None]:


# One-sample t statistic for a mean against zero.
# NOTE(review): `mean_value`, `std_dev` and `n` are not defined anywhere in the
# visible notebook; they must be assigned before this cell can run.
t_statistic = mean_value / (std_dev / (n**0.5))

# Two-sided p-value. The survival function equals 1 - cdf but does not round to
# zero for very large |t|.
p_value = 2 * stats.t.sf(abs(t_statistic), df=n-1)
