In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
# label encoding
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw glass measurements from disk.
df = pd.read_csv("./dataset/Glass.csv")

# Replace the class label with integer codes. The encoder is kept in `le`
# so the original labels stay recoverable through `le.classes_`.
le = LabelEncoder()
df["Type of glass"] = le.fit_transform(df["Type of glass"].astype("category"))
df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,0
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,5
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,5
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,5
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,5


In [3]:
# Check column dtypes; after label encoding the target column should be integer-typed.
df.dtypes

RI               float64
Na               float64
Mg               float64
Al               float64
Si               float64
K                float64
Ca               float64
Ba               float64
Fe               float64
Type of glass      int64
dtype: object

In [4]:
# List the column names: the feature columns followed by the "Type of glass" target.
df.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass'], dtype='object')

In [5]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin
from Impute import fill_with_et

# Plugin registry: used below to register the custom imputer and to instantiate imputers by name.
imputers = Imputers()


class EtImputer(ImputerPlugin):
    """Imputer plugin that fills incomplete columns one at a time with ``fill_with_et``.

    Nothing is learned in ``_fit``; all the work happens in ``_transform``,
    which visits the incomplete columns from the least to the most incomplete
    and replaces each one with the column returned by the helper.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Project-local helper, called as ``fill_with_et(frame, column)``; its
        # result is indexed by ``column`` to obtain the filled values.
        # NOTE(review): presumably an extra-trees based filler, judging by the
        # name - confirm in the ``Impute`` module.
        self._model = fill_with_et

    @staticmethod
    def name():
        """Return the key that identifies this plugin ("et")."""
        return "et"

    @staticmethod
    def hyperparameter_space():
        """Return the tunable hyperparameters; this plugin exposes none."""
        return []

    def _fit(self, *args, **kwargs):
        """No-op fit: arguments are ignored and the instance is returned unchanged."""
        return self

    def _transform(self, df):
        """Return a copy of ``df`` in which every incomplete column has been filled.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame that may contain missing values. It is not modified.

        Returns
        -------
        pandas.DataFrame
            A new frame with the same columns; each column that had missing
            values is replaced by the corresponding column of the helper's output.
        """
        # Work on a copy so the caller's frame is never altered by the
        # column assignments below.
        df = df.copy()

        # Order the columns by their fraction of missing values (ascending),
        # then keep only those that actually have something to fill.
        miss_rate = df.isnull().sum() / df.shape[0]
        cols = [col for col in miss_rate.sort_values().index.tolist() if miss_rate[col] > 0]

        for col in cols:
            # Each call sees the columns already filled in earlier iterations.
            df[col] = self._model(df, col)[col]
        return df


# Make the custom plugin available under the key "et", alongside the built-in imputers.
imputers.add("et", EtImputer)

# One imputer instance per method compared in the experiments below.
# NOTE(review): the small iteration/epoch counts (n_inner_iter=1, max_iter=1,
# n_epochs=10) presumably keep the runtime low at some cost in imputation
# quality - confirm these are the intended settings for the reported results.
hyper = imputers.get("hyperimpute", n_inner_iter=1)
et = imputers.get("et")  # the custom EtImputer registered above
missforest = imputers.get("missforest", max_iter=1)
gain = imputers.get("gain", n_epochs=10)
sinkhorn = imputers.get("sinkhorn", n_epochs=10)
mean = imputers.get("mean")




In [6]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.ensemble import RandomForestClassifier
# from hyperimpute.plugins.utils.simulate import simulate_nan
# from sklearn.metrics import accuracy_score, f1_score
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import scienceplots
# from sklearn.preprocessing import LabelEncoder
# from hyperimpute.plugins.utils.simulate import simulate_nan
# import warnings

# warnings.filterwarnings("ignore")

# plt.style.use(['science','no-latex'])
# # set font as times new roman
# plt.rcParams["font.family"] = "Times New Roman"
# plt.figure(figsize=(12, 8))

# target_col = "Type of glass"
# X = df.drop(target_col, axis=1)
# y = df[target_col]
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# et_all_test_preds = []
# et_all_scores = []
# et_all_f1_scores = []

# hyper_all_test_preds = []
# hyper_all_scores = []
# hyper_all_f1_scores = []

# missforest_all_test_preds = []
# missforest_all_scores = []
# missforest_all_f1_scores = []

# ori_all_test_preds = []
# ori_all_scores = []
# ori_all_f1_scores = []

# gain_all_test_preds = []
# gain_all_scores = []
# gain_all_f1_scores = []

# sinkhorn_all_test_preds = []
# sinkhorn_all_scores = []
# sinkhorn_all_f1_scores = []

# mean_all_test_preds = []
# mean_all_scores = []
# mean_all_f1_scores = []

# y_test_all = []

# for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
#     X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#     y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
#     df_train = pd.concat([X_train, y_train], axis=1)
#     df_test = pd.concat([X_test, y_test], axis=1)
#     # reset the index
#     df_train = df_train.reset_index(drop=True)
#     df_test = df_test.reset_index(drop=True)
#     X_train = df_train.drop(target_col, axis=1)
#     y_train = df_train[target_col]
#     X_test = df_test.drop(target_col, axis=1)
#     y_test = df_test[target_col]

#     cols = X_train.columns
#     X_train = X_train.to_numpy()
#     X_train_nan = simulate_nan(X_train, 0.1)['X_incomp']
#     X_train_nan = pd.DataFrame(X_train_nan, columns=cols)

#     X_train_imputed_hyper = hyper.fit_transform(X_train_nan.copy())
#     X_train_imputed_et = et.fit_transform(X_train_nan.copy())
#     X_train_imputed_missforest = missforest.fit_transform(X_train_nan.copy())
#     X_train_imputed_gain = gain.fit_transform(X_train_nan.copy())
#     X_train_imputed_sinkhorn = sinkhorn.fit_transform(X_train_nan.copy())
#     X_train_imputed_mean = mean.fit_transform(X_train_nan.copy())

#     clf_hyper = RandomForestClassifier(n_estimators=500)
#     clf_hyper.fit(X_train_imputed_hyper, y_train)
#     y_pred_hyper = clf_hyper.predict(X_test)
#     print("Accuracy (hyperimpute):", accuracy_score(y_test, y_pred_hyper))
#     hyper_all_scores.append(accuracy_score(y_test, y_pred_hyper))
#     hyper_all_test_preds.extend(y_pred_hyper)
#     hyper_all_f1_scores.append(f1_score(y_test, y_pred_hyper, average='weighted'))

#     clf_et = RandomForestClassifier(n_estimators=500)
#     clf_et.fit(X_train_imputed_et, y_train)
#     y_pred_et = clf_et.predict(X_test)
#     print("Accuracy (et):", accuracy_score(y_test, y_pred_et))
#     et_all_scores.append(accuracy_score(y_test, y_pred_et))
#     et_all_test_preds.extend(y_pred_et)
#     et_all_f1_scores.append(f1_score(y_test, y_pred_et, average='weighted'))

#     clf_missforest = RandomForestClassifier(n_estimators=500)
#     clf_missforest.fit(X_train_imputed_missforest, y_train)
#     y_pred_missforest = clf_missforest.predict(X_test)
#     print("Accuracy (missforest):", accuracy_score(y_test, y_pred_missforest))
#     missforest_all_scores.append(accuracy_score(y_test, y_pred_missforest))
#     missforest_all_test_preds.extend(y_pred_missforest)
#     missforest_all_f1_scores.append(f1_score(y_test, y_pred_missforest, average='weighted'))

#     clf_original = RandomForestClassifier(n_estimators=500)
#     clf_original.fit(X_train, y_train)
#     y_pred_original = clf_original.predict(X_test)
#     print("Accuracy (original):", accuracy_score(y_test, y_pred_original))
#     ori_all_scores.append(accuracy_score(y_test, y_pred_original))
#     ori_all_test_preds.extend(y_pred_original)
#     ori_all_f1_scores.append(f1_score(y_test, y_pred_original, average='weighted'))

#     clf_gain = RandomForestClassifier(n_estimators=500)
#     clf_gain.fit(X_train_imputed_gain, y_train)
#     y_pred_gain = clf_gain.predict(X_test)
#     print("Accuracy (gain):", accuracy_score(y_test, y_pred_gain))
#     gain_all_scores.append(accuracy_score(y_test, y_pred_gain))
#     gain_all_test_preds.extend(y_pred_gain)
#     gain_all_f1_scores.append(f1_score(y_test, y_pred_gain, average='weighted'))

#     clf_sinkhorn = RandomForestClassifier(n_estimators=500)
#     clf_sinkhorn.fit(X_train_imputed_sinkhorn, y_train)
#     y_pred_sinkhorn = clf_sinkhorn.predict(X_test)
#     print("Accuracy (sinkhorn):", accuracy_score(y_test, y_pred_sinkhorn))
#     sinkhorn_all_scores.append(accuracy_score(y_test, y_pred_sinkhorn))
#     sinkhorn_all_test_preds.extend(y_pred_sinkhorn)
#     sinkhorn_all_f1_scores.append(f1_score(y_test, y_pred_sinkhorn, average='weighted'))

#     clf_mean = RandomForestClassifier(n_estimators=500)
#     clf_mean.fit(X_train_imputed_mean, y_train)
#     y_pred_mean = clf_mean.predict(X_test)
#     print("Accuracy (mean):", accuracy_score(y_test, y_pred_mean))
#     mean_all_scores.append(accuracy_score(y_test, y_pred_mean))
#     mean_all_test_preds.extend(y_pred_mean)
#     mean_all_f1_scores.append(f1_score(y_test, y_pred_mean, average='weighted'))

#     y_test_all.extend(y_test)
#     print("=====================================")

In [7]:
# # plot confusion matrix
# from sklearn.metrics import confusion_matrix

# fig, axs = plt.subplots(4, 2, figsize=(10, 12), sharex=True, sharey=True)
# sns.set_theme(style="whitegrid",font='Times New Roman',font_scale=1.5)

# axs = axs.flatten()

# confusion_matrix_et = confusion_matrix(y_test_all, et_all_test_preds)
# confusion_matrix_original = confusion_matrix(y_test_all, ori_all_test_preds)
# confusion_matrix_hyper = confusion_matrix(y_test_all, hyper_all_test_preds)
# confusion_matrix_missforest = confusion_matrix(y_test_all, missforest_all_test_preds)
# confusion_matrix_gain = confusion_matrix(y_test_all, gain_all_test_preds)
# confusion_matrix_sinkhorn = confusion_matrix(y_test_all, sinkhorn_all_test_preds)
# confusion_matrix_mean = confusion_matrix(y_test_all, mean_all_test_preds)
# cms = [confusion_matrix_original, confusion_matrix_et, confusion_matrix_hyper, confusion_matrix_missforest,
#        confusion_matrix_gain, confusion_matrix_sinkhorn, confusion_matrix_mean]

# methods = ["Original", "MatImpute", "HyperImpute", "MissForest", "Gain", "Sinkhorn", "Mean"]
# scores = [ori_all_scores, et_all_scores, hyper_all_scores, missforest_all_scores, gain_all_scores, sinkhorn_all_scores,
#           mean_all_scores]
# f1_scores = [ori_all_f1_scores, et_all_f1_scores, hyper_all_f1_scores, missforest_all_f1_scores, gain_all_f1_scores,
#              sinkhorn_all_f1_scores, mean_all_f1_scores]
# classes = ["building_float", "building_non_float", "vehicle_float","containers",
#            "tableware", "headlamps"]
# for i, ax in enumerate(axs[:len(cms)]):
#     sns.heatmap(cms[i], annot=True, fmt='d', cmap='RdPu', ax=axs[i], xticklabels=classes,
#                 yticklabels=classes, cbar=False, annot_kws={"size": 20})
#     axs[i].set_title(
#         "{} ({:.2f}, {:.2f})".format(methods[i], np.mean(scores[i]), np.mean(f1_scores[i])),fontsize=22)
#     axs[i].tick_params(axis='x', labelsize=20)
#     axs[i].tick_params(axis='y', labelsize=20, rotation=0)
# # delete the remaining axes
# for i in range(len(cms), len(axs)):
#     fig.delaxes(axs[i])

# fig.tight_layout()
# plt.savefig("results/pipeline_cls_glass.png", dpi=300)

In [8]:
# print("Accuracy (hyperimpute): {:.2f} ± {:.2f}".format(np.mean(hyper_all_scores), np.std(hyper_all_scores)))
# print("Accuracy (et): {:.2f} ± {:.2f}".format(np.mean(et_all_scores), np.std(et_all_scores)))
# print("Accuracy (missforest): {:.2f} ± {:.2f}".format(np.mean(missforest_all_scores), np.std(missforest_all_scores)))
# print("Accuracy (original): {:.2f} ± {:.2f}".format(np.mean(ori_all_scores), np.std(ori_all_scores)))
# print("Accuracy (gain): {:.2f} ± {:.2f}".format(np.mean(gain_all_scores), np.std(gain_all_scores)))
# print("Accuracy (sinkhorn): {:.2f} ± {:.2f}".format(np.mean(sinkhorn_all_scores), np.std(sinkhorn_all_scores)))
# print("Accuracy (mean): {:.2f} ± {:.2f}".format(np.mean(mean_all_scores), np.std(mean_all_scores)))

In [9]:
# print("F1 Score (hyperimpute): {:.2f} ± {:.2f}".format(np.mean(hyper_all_f1_scores), np.std(hyper_all_f1_scores)))
# print("F1 Score (et): {:.2f} ± {:.2f}".format(np.mean(et_all_f1_scores), np.std(et_all_f1_scores)))
# print("F1 Score (missforest): {:.2f} ± {:.2f}".format(np.mean(missforest_all_f1_scores),
#                                                       np.std(missforest_all_f1_scores)))
# print("F1 Score (original): {:.2f} ± {:.2f}".format(np.mean(ori_all_f1_scores), np.std(ori_all_f1_scores)))
# print("F1 Score (gain): {:.2f} ± {:.2f}".format(np.mean(gain_all_f1_scores), np.std(gain_all_f1_scores)))
# print("F1 Score (sinkhorn): {:.2f} ± {:.2f}".format(np.mean(sinkhorn_all_f1_scores), np.std(sinkhorn_all_f1_scores)))
# print("F1 Score (mean): {:.2f} ± {:.2f}".format(np.mean(mean_all_f1_scores), np.std(mean_all_f1_scores)))

In [10]:
# print("Accuracy (hyperimpute):", hyper_all_scores)
# print("Accuracy (et):", et_all_scores)
# print("Accuracy (missforest):", missforest_all_scores)
# print("Accuracy (original):", ori_all_scores)
# print("Accuracy (gain):", gain_all_scores)
# print("Accuracy (sinkhorn):", sinkhorn_all_scores)
# print("Accuracy (mean):", mean_all_scores)
# print("=========================================")
# print("F1 Score (hyperimpute):", hyper_all_f1_scores)
# print("F1 Score (et):", et_all_f1_scores)
# print("F1 Score (missforest):", missforest_all_f1_scores)
# print("F1 Score (original):", ori_all_f1_scores)
# print("F1 Score (gain):", gain_all_f1_scores)
# print("F1 Score (sinkhorn):", sinkhorn_all_f1_scores)
# print("F1 Score (mean):", mean_all_f1_scores)

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from hyperimpute.plugins.utils.simulate import simulate_nan
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from hyperimpute.plugins.utils.simulate import simulate_nan
import warnings
from tqdm import tqdm

# NOTE: this silences every warning for the rest of the session (kept from the
# original cell); narrow the filter if a specific warning needs to be seen.
warnings.filterwarnings("ignore")

# --- Experiment configuration -------------------------------------------------
target_col = "Type of glass"
MISSING_RATIOS = [0.1, 0.2, 0.3, 0.4, 0.5]  # missing rate passed to simulate_nan
N_SPLITS = 5        # stratified cross-validation folds per missing ratio
N_ESTIMATORS = 500  # trees in every downstream random forest
SEED = 0            # shared by the CV splitter and every classifier

X = df.drop(target_col, axis=1)
y = df[target_col]

# Imputers under comparison, keyed by method name. Insertion order is the
# order in which they are fitted on every fold.
imputers_by_method = {
    "hyperimpute": hyper,
    "matimpute": et,
    "missforest": missforest,
    "gain": gain,
    "sinkhorn": sinkhorn,
    "mean": mean,
}


def _fold_accuracy(train_features, train_labels, test_features, test_labels):
    """Fit a random forest on one training variant and return its test accuracy.

    Features are converted to plain arrays so that fitting and prediction are
    purely positional, whatever container (ndarray or DataFrame) they arrive
    in. The fixed ``random_state`` makes every classifier reproducible and
    gives all methods the same forest randomness.
    """
    clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
    clf.fit(np.asarray(train_features), train_labels)
    predictions = clf.predict(np.asarray(test_features))
    return accuracy_score(test_labels, predictions)


# scores_by_method[method] holds one list per missing ratio, each containing
# the N_SPLITS fold accuracies obtained at that ratio.
scores_by_method = {method: [] for method in ("original", *imputers_by_method)}

# The splitter is seeded, so every missing ratio is evaluated on the same folds.
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for ratio in tqdm(MISSING_RATIOS):
    ratio_scores = {method: [] for method in scores_by_method}

    for train_idx, test_idx in kf.split(X, y):
        X_train = X.iloc[train_idx].to_numpy()
        X_test = X.iloc[test_idx].to_numpy()
        y_train = y.iloc[train_idx].to_numpy()
        y_test = y.iloc[test_idx].to_numpy()

        # Remove entries from the training features only; the test fold stays complete.
        # NOTE(review): simulate_nan is called without an explicit seed, so the
        # missingness mask can differ between runs - seed the RNG it relies on
        # if bit-for-bit reproducibility is required.
        X_train_nan = pd.DataFrame(
            simulate_nan(X_train, ratio)["X_incomp"], columns=X.columns
        )

        # Baseline: classifier trained on the complete training fold.
        ratio_scores["original"].append(
            _fold_accuracy(X_train, y_train, X_test, y_test)
        )

        # Every imputer receives its own copy of the same incomplete fold, so
        # no method can see (or alter) what another one produced.
        for method, imputer in imputers_by_method.items():
            X_train_imputed = imputer.fit_transform(X_train_nan.copy())
            ratio_scores[method].append(
                _fold_accuracy(X_train_imputed, y_train, X_test, y_test)
            )

    for method, fold_scores in ratio_scores.items():
        scores_by_method[method].append(fold_scores)

# Expose the results under the per-method names that the following cells read.
original_scores = scores_by_method["original"]
matimpute_scores = scores_by_method["matimpute"]
hyperimpute_scores = scores_by_method["hyperimpute"]
missforest_scores = scores_by_method["missforest"]
gain_scores = scores_by_method["gain"]
sinkhorn_scores = scores_by_method["sinkhorn"]
mean_scores = scores_by_method["mean"]

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [06:17<00:00, 75.50s/it]


In [12]:
# Save the results.
# The per-method score lists are collected in a fixed order; that order becomes
# the first axis of the saved array.
results = [
    original_scores,
    matimpute_scores,
    hyperimpute_scores,
    missforest_scores,
    gain_scores,
    sinkhorn_scores,
    mean_scores
]

# Create the output directory if it is missing (e.g. on a fresh checkout);
# np.save does not create parent directories itself.
os.makedirs("results", exist_ok=True)

# Save to an .npy file. np.save converts the nested list to an array; with five
# missing ratios and five folds per method that array has shape (7, 5, 5).
np.save("results/results_glass.npy", results)

In [13]:
# # plot the results
# import scienceplots

# plt.style.use('default')
# plt.style.use(['science','no-latex'])
# # set font as times new roman
# plt.rcParams["font.family"] = "Times New Roman"
# plt.figure(figsize=(8, 6))

# # # set no grid
# plt.grid(False)

# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=original_scores, label='Original', marker='o',color='#7ED9D9',lw=2, markersize=8)
# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=matimpute_scores, label='MatImpute', marker='*',color='#F35F5F',lw=2, markersize=12)
# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=hyperimpute_scores, label='HyperImpute', marker='v',color='#9467BD',lw=2, markersize=8)
# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=missforest_scores, label='MissForest', marker='^',color='#B3DE69',lw=2, markersize=8)
# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=gain_scores, label='Gain', marker='>',color='#FFC0D9',lw=2, markersize=8)
# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=sinkhorn_scores, label='Sinkhorn', marker='<',color='#5FBDFF',lw=2, markersize=8)
# sns.lineplot(x=[0.1, 0.2, 0.3, 0.4, 0.5], y=mean_scores, label='Mean', marker='s',color='#FDBF6E',lw=2, markersize=7)

# plt.xlabel('Missing Ratio', fontsize=24)
# plt.ylabel('Accuracy',fontsize=24)
# plt.tick_params(labelsize=24)
# plt.legend([])
# plt.tight_layout(pad=1.5)
# plt.savefig("results/missing_ratio_cls_glass.png", dpi=300)