In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

def create_missing_values(data, missing_rate=0.10, seed=42):
    """Return a copy of ``data`` with random cells set to NaN, plus a mask.

    For each column independently, ``int(missing_rate * len(data))`` row
    labels are drawn without replacement and the matching cells are blanked.

    Parameters
    ----------
    data : pandas.DataFrame
        Complete table. It is copied, never modified.
    missing_rate : float, default 0.10
        Fraction of rows to blank out in every column.
    seed : int or None, default 42
        Value passed to ``np.random.seed`` before sampling so the same cells
        are removed on every call. Pass ``None`` to leave NumPy's global
        random state untouched.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``modified_data`` (the copy containing NaNs) and ``mask``, a frame
        with the same index and columns as ``data`` holding 1.0 where a value
        was removed and 0.0 elsewhere.
    """
    if seed is not None:
        # NOTE: this reseeds NumPy's *global* generator, so random draws made
        # by the caller afterwards are affected as well.
        np.random.seed(seed)

    modified_data = data.copy()
    # The mask must share data's index: the label-based writes below (and any
    # boolean selection a caller does with the mask) only line up when the
    # row labels match, which a default 0..n-1 index does not guarantee.
    mask = pd.DataFrame(np.zeros(data.shape), index=data.index, columns=data.columns)

    # Same count for every column, so compute it once.
    n_missing = int(missing_rate * len(data))

    for column in data.columns:
        missing_indices = np.random.choice(data.index, size=n_missing, replace=False)
        modified_data.loc[missing_indices, column] = np.nan
        mask.loc[missing_indices, column] = 1

    return modified_data, mask

def impute_missing_values_with_mean(data):
    """Fill missing values column by column and return the filled copy.

    Integer and floating-point columns (of any width) are filled with the
    column mean; every other column is filled with its most frequent value
    (the first entry of ``Series.mode()``). Columns that have nothing to fill,
    or no observed value to fill with, are returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain missing values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing values replaced.
    """
    imputed_data = data.copy()
    for column in data.columns:
        series = data[column]
        if not series.isna().any():
            continue

        # Plain NumPy int/uint/float columns get the mean. Checking the dtype
        # kind (rather than the names 'float64'/'int64') also covers float32,
        # int32, etc., while leaving bool and extension dtypes to the mode.
        is_numeric = isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf"
        if is_numeric:
            fill_value = series.mean()
            if pd.isna(fill_value):
                # Column is entirely missing: there is no mean to impute.
                continue
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to impute.
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back instead of calling
        # ``imputed_data[column].fillna(..., inplace=True)``: that chained
        # in-place form operates on a temporary and does not update the frame
        # when pandas Copy-on-Write is active.
        imputed_data[column] = series.fillna(fill_value)
    return imputed_data

def calculate_rmse_nrmse_variance_normalized(original_data, imputed_data, mask):
    """Score imputed numeric columns against the original values.

    Only the cells flagged in ``mask`` (value ``1``) are compared. Non-numeric
    columns and columns with no flagged cell are left out of the result.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Ground-truth table.
    imputed_data : pandas.DataFrame
        Table after imputation; same columns and row labels as
        ``original_data``.
    mask : pandas.DataFrame
        Same columns and row labels as ``original_data``; ``1`` marks a cell
        that was removed and then imputed.

    Returns
    -------
    dict
        ``{column: {'RMSE': ..., 'NRMSE_var': ...}}`` where ``NRMSE_var`` is
        the RMSE divided by the sample standard deviation of the whole
        original column. ``NRMSE_var`` is NaN when that standard deviation is
        zero or undefined.

    Raises
    ------
    ValueError
        If a compared original or imputed value is NaN or infinite.
    """
    metrics = {}
    for column in original_data.columns:
        original_column = original_data[column]
        # Any NumPy int/uint/float width counts as numeric (not just the
        # dtypes named 'float64' and 'int64'); bool is deliberately excluded.
        is_numeric = (
            isinstance(original_column.dtype, np.dtype)
            and original_column.dtype.kind in "iuf"
        )
        selected = mask[column] == 1
        if not is_numeric or not selected.any():
            continue

        # Cast to float before subtracting so unsigned integers cannot wrap.
        original_values = original_column[selected].to_numpy(dtype=float)
        imputed_values = imputed_data[column][selected].to_numpy(dtype=float)
        errors = original_values - imputed_values
        if not np.isfinite(errors).all():
            raise ValueError(
                f"Column {column!r} contains NaN or infinite values at masked positions"
            )

        rmse = np.sqrt(np.mean(errors ** 2))
        # ddof=1 -> sample variance (n - 1 in the denominator).
        std = np.sqrt(original_column.var(ddof=1))
        # A constant (or single-row) column has no spread to normalise by.
        nrmse = rmse / std if std > 0 else np.nan
        metrics[column] = {'RMSE': rmse, 'NRMSE_var': nrmse}
    return metrics


import os

# Location of insurance.csv. Set the INSURANCE_CSV environment variable to
# point at your own copy; the fallback is the machine-specific path this
# notebook was originally run with.
file_path = os.environ.get(
    'INSURANCE_CSV',
    '/Users/emondemoniac/Desktop/TabNet_PyTorch/Medical Cost Personal Datasets/insurance.csv',
)
data = pd.read_csv(file_path)

# Blank out a random share of every column, remembering which cells were removed.
data_with_missing, missing_mask = create_missing_values(data)

# Fill the gaps again (mean for numeric columns, mode otherwise).
data_imputed = impute_missing_values_with_mean(data_with_missing)

# Compare the imputed cells with the values that were originally there.
evaluation_metrics_variance_normalized = calculate_rmse_nrmse_variance_normalized(data, data_imputed, missing_mask)

for column, metrics in evaluation_metrics_variance_normalized.items():
    print(f"{column} - RMSE: {metrics['RMSE']:.2f}, NRMSE_var: {metrics['NRMSE_var']:.3f}")


age - RMSE: 13.81, NRMSE_var: 0.983
bmi - RMSE: 5.83, NRMSE_var: 0.957
children - RMSE: 1.31, NRMSE_var: 1.083
charges - RMSE: 10748.84, NRMSE_var: 0.888


In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

def create_missing_values(data, missing_rate=0.10, seed=42):
    """Return a copy of ``data`` with random cells set to NaN, plus a mask.

    For each column independently, ``int(missing_rate * len(data))`` row
    labels are drawn without replacement and the matching cells are blanked.

    Parameters
    ----------
    data : pandas.DataFrame
        Complete table. It is copied, never modified.
    missing_rate : float, default 0.10
        Fraction of rows to blank out in every column.
    seed : int or None, default 42
        Value passed to ``np.random.seed`` before sampling so the same cells
        are removed on every call. Pass ``None`` to leave NumPy's global
        random state untouched.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``modified_data`` (the copy containing NaNs) and ``mask``, a frame
        with the same index and columns as ``data`` holding 1.0 where a value
        was removed and 0.0 elsewhere.
    """
    if seed is not None:
        # NOTE: this reseeds NumPy's *global* generator, so random draws made
        # by the caller afterwards are affected as well.
        np.random.seed(seed)

    modified_data = data.copy()
    # The mask must share data's index: the label-based writes below (and any
    # boolean selection a caller does with the mask) only line up when the
    # row labels match, which a default 0..n-1 index does not guarantee.
    mask = pd.DataFrame(np.zeros(data.shape), index=data.index, columns=data.columns)

    # Same count for every column, so compute it once.
    n_missing = int(missing_rate * len(data))

    for column in data.columns:
        missing_indices = np.random.choice(data.index, size=n_missing, replace=False)
        modified_data.loc[missing_indices, column] = np.nan
        mask.loc[missing_indices, column] = 1

    return modified_data, mask

def impute_missing_values_with_mean(data):
    """Fill missing values column by column and return the filled copy.

    Integer and floating-point columns (of any width) are filled with the
    column mean; every other column is filled with its most frequent value
    (the first entry of ``Series.mode()``). Columns that have nothing to fill,
    or no observed value to fill with, are returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain missing values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing values replaced.
    """
    imputed_data = data.copy()
    for column in data.columns:
        series = data[column]
        if not series.isna().any():
            continue

        # Plain NumPy int/uint/float columns get the mean. Checking the dtype
        # kind (rather than the names 'float64'/'int64') also covers float32,
        # int32, etc., while leaving bool and extension dtypes to the mode.
        is_numeric = isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf"
        if is_numeric:
            fill_value = series.mean()
            if pd.isna(fill_value):
                # Column is entirely missing: there is no mean to impute.
                continue
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to impute.
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back instead of calling
        # ``imputed_data[column].fillna(..., inplace=True)``: that chained
        # in-place form operates on a temporary and does not update the frame
        # when pandas Copy-on-Write is active.
        imputed_data[column] = series.fillna(fill_value)
    return imputed_data

def calculate_rmse_nrmse_variance_normalized(original_data, imputed_data, mask):
    """Score imputed columns against the original values.

    Only the cells flagged in ``mask`` (value ``1``) are compared; columns
    with no flagged cell are left out of the result. Integer and
    floating-point columns get RMSE-based scores, every other column gets a
    misclassification rate.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Ground-truth table.
    imputed_data : pandas.DataFrame
        Table after imputation; same columns and row labels as
        ``original_data``.
    mask : pandas.DataFrame
        Same columns and row labels as ``original_data``; ``1`` marks a cell
        that was removed and then imputed.

    Returns
    -------
    dict
        Per column, either ``{'RMSE': ..., 'NRMSE_var': ...}`` (numeric) or
        ``{'Error Rate': ...}`` (non-numeric). ``NRMSE_var`` is the RMSE
        divided by the sample standard deviation of the whole original column
        and is NaN when that standard deviation is zero or undefined.
        ``Error Rate`` is the share of compared cells whose imputed value
        differs from the original.

    Raises
    ------
    ValueError
        If a compared numeric original or imputed value is NaN or infinite.
    """
    metrics = {}
    for column in original_data.columns:
        original_column = original_data[column]
        selected = mask[column] == 1
        if not selected.any():
            continue

        # Any NumPy int/uint/float width counts as numeric (not just the
        # dtypes named 'float64' and 'int64'); bool is deliberately excluded.
        is_numeric = (
            isinstance(original_column.dtype, np.dtype)
            and original_column.dtype.kind in "iuf"
        )
        if is_numeric:
            # Cast to float before subtracting so unsigned integers cannot wrap.
            original_values = original_column[selected].to_numpy(dtype=float)
            imputed_values = imputed_data[column][selected].to_numpy(dtype=float)
            errors = original_values - imputed_values
            if not np.isfinite(errors).all():
                raise ValueError(
                    f"Column {column!r} contains NaN or infinite values at masked positions"
                )

            rmse = np.sqrt(np.mean(errors ** 2))
            # ddof=1 -> sample variance (n - 1 in the denominator).
            std = np.sqrt(original_column.var(ddof=1))
            # A constant (or single-row) column has no spread to normalise by.
            nrmse_var = rmse / std if std > 0 else np.nan
            metrics[column] = {'RMSE': rmse, 'NRMSE_var': nrmse_var}
        else:
            # Every non-numeric column is scored here, whatever its dtype
            # (object, string, category, bool), not only dtype == 'object'.
            # Comparing as object arrays avoids dtype-specific comparison rules.
            original_values = original_column[selected].to_numpy(dtype=object)
            imputed_values = imputed_data[column][selected].to_numpy(dtype=object)
            error_rate = np.mean(original_values != imputed_values)
            metrics[column] = {'Error Rate': error_rate}
    return metrics

import os

# Load the dataset. Set the INSURANCE_CSV environment variable to point at
# your own copy of insurance.csv; the fallback is the machine-specific path
# this notebook was originally run with.
file_path = os.environ.get(
    'INSURANCE_CSV',
    '/Users/emondemoniac/Desktop/TabNet_PyTorch/Medical Cost Personal Datasets/insurance.csv',
)
data = pd.read_csv(file_path)

# Blank out a random share of every column, remembering which cells were removed.
data_with_missing, missing_mask = create_missing_values(data)

# Fill the gaps again (mean for numeric columns, mode otherwise).
data_imputed = impute_missing_values_with_mean(data_with_missing)

# Compare the imputed cells with the values that were originally there.
evaluation_metrics = calculate_rmse_nrmse_variance_normalized(data, data_imputed, missing_mask)

# Numeric columns report RMSE / NRMSE; the remaining columns report an error rate.
for column, metrics in evaluation_metrics.items():
    if 'NRMSE_var' in metrics:
        print(f"{column} - RMSE: {metrics['RMSE']:.2f}, Variance-Normalized NRMSE: {metrics['NRMSE_var']:.3f}")
    else:
        print(f"{column} - Error Rate: {metrics['Error Rate']:.3f}")


age - RMSE: 13.81, Variance-Normalized NRMSE: 0.983
sex - Error Rate: 0.541
bmi - RMSE: 5.83, Variance-Normalized NRMSE: 0.957
children - RMSE: 1.31, Variance-Normalized NRMSE: 1.083
smoker - Error Rate: 0.203
region - Error Rate: 0.692
charges - RMSE: 10748.84, Variance-Normalized NRMSE: 0.888


In [40]:
# Mean variance-normalized NRMSE over the numeric columns, taken from the
# computed metrics rather than from rounded numbers retyped by hand, so it
# stays correct if the data, seed or missing rate changes.
nrmse_values = [m['NRMSE_var'] for m in evaluation_metrics.values() if 'NRMSE_var' in m]
float(sum(nrmse_values) / len(nrmse_values))

0.9777499999999999