In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error

def create_missing_values(data, missing_rate=0.20):
    np.random.seed(42) 
    modified_data = data.copy()
    mask = pd.DataFrame(np.zeros(data.shape), columns=data.columns)

    for column in data.columns:
        n_missing = int(missing_rate * len(data))
        missing_indices = np.random.choice(data.index, size=n_missing, replace=False)
        modified_data.loc[missing_indices, column] = np.nan
        mask.loc[missing_indices, column] = 1

    return modified_data, mask

def impute_missing_values_with_mean(data):
    imputed_data = data.copy()
    for column in data.columns[:-1]: 
        if data[column].dtype in ['float64', 'int64']:
            imputed_data[column].fillna(data[column].mean(), inplace=True)

    imputed_data['species'].fillna(data['species'].mode()[0], inplace=True)
    return imputed_data

def calculate_rmse_nrmse_variance_normalized(original_data, imputed_data, mask):
    metrics = {}
    for column in original_data.columns:
        if original_data[column].dtype in ['float64', 'int64'] and mask[column].sum() > 0:
            original_values = original_data[column][mask[column] == 1]
            imputed_values = imputed_data[column][mask[column] == 1]
            mse = mean_squared_error(original_values, imputed_values)
            rmse = np.sqrt(mse)
            variance = np.var(original_data[column], ddof=1)
            nrmse_var = rmse / np.sqrt(variance)
            metrics[column] = {'RMSE': rmse, 'NRMSE_var': nrmse_var}
        elif original_data[column].dtype == 'object' and mask[column].sum() > 0:
            original_values = original_data[column][mask[column] == 1]
            imputed_values = imputed_data[column][mask[column] == 1]
            error_rate = (original_values != imputed_values).sum() / len(original_values)
            metrics[column] = {'Error Rate': error_rate}
    return metrics

iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)


data_with_missing, missing_mask = create_missing_values(iris_df)
data_imputed = impute_missing_values_with_mean(data_with_missing)
evaluation_metrics = calculate_rmse_nrmse_variance_normalized(iris_df, data_imputed, missing_mask)


evaluation_metrics_output = []
for column, metrics in evaluation_metrics.items():
    if 'NRMSE_var' in metrics:
        evaluation_metrics_output.append(f"{column} - RMSE: {metrics['RMSE']:.2f}, Variance-Normalized NRMSE: {metrics['NRMSE_var']:.3f}")
    else:
        evaluation_metrics_output.append(f"{column} - Error Rate: {metrics['Error Rate']:.3f}")

evaluation_metrics_output


['sepal length (cm) - RMSE: 0.85, Variance-Normalized NRMSE: 1.024',
 'sepal width (cm) - RMSE: 0.53, Variance-Normalized NRMSE: 1.220',
 'petal length (cm) - RMSE: 1.63, Variance-Normalized NRMSE: 0.925',
 'petal width (cm) - RMSE: 0.79, Variance-Normalized NRMSE: 1.034']

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris


iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

np.random.seed(42)  
missing_rate = 0.20  
n_missing = int(missing_rate * len(iris_df))
missing_indices = np.random.choice(iris_df.index, size=n_missing, replace=False)
iris_df.loc[missing_indices, 'species'] = np.nan

original_species = iris_df['species'].copy()

most_frequent_species = iris_df['species'].mode()[0]
iris_df['species'].fillna(most_frequent_species, inplace=True)

mismatches = (iris_df.loc[missing_indices, 'species'] != original_species[missing_indices]).sum()
error_rate = mismatches / n_missing

print(f"Error Rate for 'species' column: {error_rate:.3f}")


Error Rate for 'species' column: 1.000
