In [151]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [152]:
data2009 = pd.read_csv('dataset_2009.csv')
data2011 = pd.read_csv('dataset_2011.csv')
data2013 = pd.read_csv('dataset_2013.csv')
data2015 = pd.read_csv('dataset_2015.csv')
data2017 = pd.read_csv('dataset_2017.csv')


In [153]:
# Cheking unique columns
first = data2015.columns
second = data2009.columns

only_in_2015_2016 = set(first) - set(second)

print(" Unique coloumns 2015-2016:", only_in_2015_2016)


 Unique coloumns 2015-2016: {'SLQ120', 'LBDFERSI', 'LBXHSCRP'}


In [154]:
df = pd.concat([data2009,data2011, data2013, data2015, data2017], axis=0, ignore_index=True)
print(df.shape)


(55976, 80)


In [155]:
df.head()
df = df.dropna(subset=['LBDGLUSI', 'LBXGH', 'LBXIN'])
#df = df.dropna(subset=['LBDGLUSI'])
print(df.shape)
df[['LBDGLUSI', 'LBXGH', 'LBXIN']].isna().sum()


(16806, 80)


LBDGLUSI    0
LBXGH       0
LBXIN       0
dtype: int64

In [156]:
# Creating HOMA-IR 
# Formula: HOMA-IR = (Fasting Glucose * Insulin) / 22.5

df['HOMA_IR'] = (df['LBDGLUSI'] * df['LBXIN']) / 22.5

#Diabetes_Status

def determine_diabetes_status(row):
    if row['LBDGLUSI'] >= 7.0 or row['LBXGH'] >= 6.5:
        return 2  # Diabetes
    elif 5.6 <= row['LBDGLUSI'] < 7.0 or 5.7 <= row['LBXGH'] < 6.5:
        return 1  # Prediabetes
    else:
        return 0  # Normal

# creation target 1 
df['Diabetes_Status'] = df.apply(determine_diabetes_status, axis=1)

# Assign Insulin_Resistance based on HOMA-IR > 2.9
df['Insulin_Resistance'] = (df['HOMA_IR'] > 2.9).astype(int)

print(df['Diabetes_Status'].value_counts())
print(df['Insulin_Resistance'].value_counts())


0    8461
1    6140
2    2205
Name: Diabetes_Status, dtype: int64
0    9666
1    7140
Name: Insulin_Resistance, dtype: int64


"""
# Dataset Variables Description

## 1. Identifier
1. **SEQN**: Unique respondent identifier.

## 2. Body Measures (P_BMX)
NUM  2. **BMXWT**: Weight (kg).
NUM  3. **BMXHT**: Height (cm).
NUM  4. **BMXBMI**: Body Mass Index (kg/m²).
NUM  5. **BMXWAIST**: Waist circumference (cm).
NUM  6. **WHtR**: Waist-to-height ratio (BMXWAIST / BMXHT).

## 3. Blood Pressure (P_BPXO)
NUM  7. **AvgSystolicBP**: Average systolic blood pressure (mmHg).
NUM  8. **AvgDiastolicBP**: Average diastolic blood pressure (mmHg).
NUM  9. **PulsePressure**: Pulse pressure (mmHg).
NUM  10. **AvgPulseRate**: Average pulse rate (beats per minute).

## 4. Demographics Data (P_DEMO)
obj  11. **SDDSRVYR**: Survey cycle (year).
obj  12. **RIAGENDR**: Gender (1 = Male, 2 = Female).
NUM  13. **RIDAGEYR**: Age (years).
obj  14. **RIDRETH3**: Race/Ethnicity.
obj  15. **DMDBORN4**: Country of birth.
obj  16. **DMDEDUC2**: Education level.
obj  17. **RIDEXPRG**: Pregnancy status.
NUM  18. **INDFMPIR**: Income-to-poverty ratio.
obj  19. **DMDMARTZ**: Marital status.

## 5. Lab Tests (P_LAB)
NUM  20. **WTINTPRP**: Weighting factor for interview.
NUM  21. **WTMECPRP**: Weighting factor for physical examination.
NUM  22. **WTSAFPRP**: Weighting factor for fasting measures.
NUM  23. **LBDGLUSI**: Fasting glucose (mmol/L).
NUM  24. **LBXIN**: Insulin (µU/mL).
NUM  25. **LBXGH**: Glycated hemoglobin (HbA1c, %).
NUM  26. **LBDTCSI**: Total cholesterol (mmol/L).
NUM  27. **LBDHDDSI**: HDL cholesterol (mmol/L).
NUM  28. **LBXTR**: Triglycerides (mg/dL).
NUM  29. **LBDTRSI**: Triglycerides (mmol/L).
NUM  30. **LBDLDL**: LDL cholesterol (mmol/L).
NUM  31. **LBXHSCRP**: High-sensitivity C-reactive protein (mg/L).
NUM  32. **LBDFERSI**: Ferritin (µg/L).
NUM  33. **HOMA_IR**: Homeostatic Model Assessment for Insulin Resistance.

## 6. Alcohol Use (P_ALQ)
obj  34. **ALQ111**: Ever consumed alcohol.
obj  35. **ALQ121**: Alcohol consumption frequency in the past year.
NUM  36. **ALQ130**: Average number of drinks per day.
obj  37. **ALQ142**: Number of drinking days in the past 12 months.
obj  38. **ALQ151**: Binge drinking episodes. 4/5 days 

## 7. Blood Pressure & Cholesterol (P_BPQ)
obj  39. **BPQ020**: Diagnosed with hypertension.
obj  40. **BPQ040A**: Taking hypertension medication.
obj  41. **BPQ050A**: Taking medication for high blood pressure.
obj  42. **BPQ080**: Diagnosed with high cholesterol.
obj  43. **BPQ090D**: Taking medication for high cholesterol.

## 8. Diabetes (P_DIQ)
obj  44. **DIQ160**: Ever told you have prediabetes.
obj  45. **DIQ010**: Doctor told you have diabetes.
NUM  46. **DID040**: Age when first diagnosed with diabetes.

## 9. Diet Behavior & Nutrition (P_DBQ)
NUM  47. **DBD900**: Number of fast food meals in the past 30 days.
NUM  48. **DBD905**: Number of ready-to-eat meals in the past 30 days.
NUM  49. **DBD910**: Number of frozen meals in the past 30 days.
obj  50. **DBQ197**: Milk product consumption in the past 30 days.
NUM  51. **DBD895**: Number of meals eaten away from home.
NUM  52. **DBD030**: Age stopped breastfeeding (days).
NUM  53. **DBD041**: Age first fed formula (days).
obj  54. **DBQ700**: How healthy is the diet

## 10. Health Insurance (P_HIQ) & Income (P_INQ)
obj  55. **HIQ011**: Has health insurance (Yes/No).
obj  56. **INDFMMPC**: Family monthly income-to-poverty ratio.

## 11. Medical Conditions (P_MCQ)
obj  57. **MCQ080**: Doctor advised about being overweight.
obj  58. **MCQ160B**: History of heart failure.
obj  59. **MCQ160C**: History of ischemic heart disease.
obj  60. **MCQ160M**: History of thyroid problems.
obj  61. **MCQ300C**: Family history of diabetes.

## 12. Mental Health - Depression Screener (P_DPQ)
obj  62. **DPQ020**: Feeling down, depressed, or hopeless.
obj  63. **DPQ030**: Sleep problems.
obj  64. **DPQ040**: Feeling tired or lacking energy.
obj  65. **DPQ060**: Low self-esteem.
obj  66. **DPQ070**: Difficulty concentrating.

## 13. Physical Activity (P_PAQ)
obj  67. **PAQ605**: Active job.
obj  68. **PAQ620**: Moderate work activity.
obj  69. **PAQ635**: Walking or cycling.
obj  70. **PAQ650**: Active leisure time.
obj  71. **PAQ665**: Moderate exercise frequency.
NUM  72. **PAD680**: Minutes of sedentary activity per day.

## 14. Sleep Disorders (P_SLQ)
NUM  73. **SLD012**: Average sleep duration on weekdays.
obj  74. **SLQ120**: Frequency of excessive daytime sleepiness.

## 15. Smoking - Cigarette Use (P_SMQ)
obj  75. **SMQ020**: Ever smoked 100 or more cigarettes.
NUM  76. **SMD650**: Average cigarettes per day in the past 30 days.
NUM  77. **SMD030**: Age started smoking regularly.
obj  78. **SMQ040**: Do you currently smoke cigarettes?

## 16. Weight History (P_WHQ)
NUM  79. **WHD110**: Weight 10 years ago (lbs).
NUM  80. **WHD120**: Weight at age 25 (lbs).
NUM  81. **WHD140**: Maximum lifetime weight (lbs).

## 17. Target Variables
obj  82. **Diabetes_Status**: Diabetes diagnosis (Yes/No).
obj  83. **Insulin_Resistance**: Diagnosed insulin resistance (Yes/No).
"""

In [157]:
desired_order = [
    "SEQN",  # Identifier
    # Body Measures
    "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST", "WHtR",
    # Blood Pressure
    "AvgSystolicBP", "AvgDiastolicBP", "PulsePressure", "AvgPulseRate",
    # Demographics
    "SDDSRVYR", "RIAGENDR", "RIDAGEYR", "RIDRETH3", "DMDBORN4", "DMDEDUC2", "RIDEXPRG",
    "INDFMPIR", "DMDMARTZ",
    # Lab Tests
    "WTINTPRP", "WTMECPRP", "WTSAFPRP", "LBDGLUSI", "LBXIN", "LBXGH", "LBDTCSI",
    "LBDHDDSI", "LBXTR", "LBDTRSI", "LBDLDL", "LBXHSCRP", "LBDFERSI", "HOMA_IR",
    # Alcohol Use
    "ALQ111", "ALQ121", "ALQ130", "ALQ142", "ALQ151",
    # Blood Pressure & Cholesterol
    "BPQ020", "BPQ040A", "BPQ050A", "BPQ080", "BPQ090D",
    # Diabetes
    "DIQ160", "DIQ010", "DID040",
    # Diet Behavior & Nutrition
    "DBD900", "DBD905", "DBD910", "DBQ197", "DBD895", "DBD030", "DBD041", "DBQ700",
    # Health Insurance & Income
    "HIQ011", "INDFMMPC",
    # Medical Conditions
    "MCQ080", "MCQ160B", "MCQ160C", "MCQ160M", "MCQ300C",
    # Mental Health
    "DPQ020", "DPQ030", "DPQ040", "DPQ060", "DPQ070",
    # Physical Activity
    "PAQ605", "PAQ620", "PAQ635", "PAQ650", "PAQ665", "PAD680",
    # Smoking
    "SMQ020", "SMD650", "SMD030", "SMQ040",
    # Weight History
    "WHD110", "WHD120", "WHD140",
    # Sleep Disorders
    "SLD012", "SLQ120",
    # Target Variables
    "Diabetes_Status", "Insulin_Resistance"
]

existing_columns = [col for col in desired_order if col in df.columns]

df = df[existing_columns]

print(df.shape)
df.isna().sum()

(16806, 83)


SEQN                      0
BMXWT                   184
BMXHT                   189
BMXBMI                  211
BMXWAIST                697
WHtR                    719
AvgSystolicBP           894
AvgDiastolicBP          894
PulsePressure           894
AvgPulseRate           1188
SDDSRVYR                  0
RIAGENDR                  0
RIDAGEYR                  0
RIDRETH3                  0
DMDBORN4                  0
DMDEDUC2               2902
RIDEXPRG              13793
INDFMPIR               1695
DMDMARTZ               2902
WTINTPRP                  0
WTMECPRP                  0
WTSAFPRP                  0
LBDGLUSI                  0
LBXIN                     0
LBXGH                     0
LBDTCSI                   7
LBDHDDSI                  7
LBXTR                   240
LBDTRSI                 240
LBDLDL                  431
LBXHSCRP               9287
LBDFERSI              11328
HOMA_IR                   0
ALQ111                10237
ALQ121                 5435
ALQ130              

In [158]:
def calculate_missing_percentage(df,categorical_features,numerical_features):
    missing_counts = df.isnull().sum()
    total_rows = len(df)
    missing_percentage = round((missing_counts / total_rows) * 100, 1)
    
    feature_types = [
        'Categorical' if col in categorical_features else 'Numerical' 
        for col in df.columns
    ]
    
    missing_df = pd.DataFrame({
        'Feature': df.columns,
        'Missing_Count': missing_counts.values,
        'Missing_Percentage': missing_percentage.values,
        'Feature_Type': feature_types
    })
    
    # Сортировка по проценту пропусков
    missing_df = missing_df.sort_values(by='Missing_Percentage', ascending=True).reset_index(drop=True)
    
    return missing_df


categorical_features = [
        'SDDSRVYR','RIAGENDR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2','RIDEXPRG', 'DMDMARTZ',
        'ALQ111', 'ALQ121', 'ALQ142','ALQ151','BPQ020','BPQ040A','BPQ050A','BPQ080','BPQ090D',
        'DIQ160','DIQ010','DBQ197','DBQ700','HIQ011','INDFMMPC','MCQ080','MCQ160B','MCQ160C','MCQ160M','MCQ300C',
        'DPQ020','DPQ030','DPQ040','DPQ060','DPQ070','PAQ605','PAQ620','PAQ635','PAQ650','PAQ665','SLQ120',
        'SMQ020','SMQ040','Diabetes_Status','Insulin_Resistance'
    ]
    
numerical_features = [col for col in df.columns if col not in categorical_features]

missing_data_summary = calculate_missing_percentage(df,categorical_features,numerical_features)
missing_data_summary[missing_data_summary["Missing_Percentage"] > 5]

#missing_data_summary.to_excel("missing_data_summary.xlsx", index=False)

Unnamed: 0,Feature,Missing_Count,Missing_Percentage,Feature_Type
37,INDFMMPC,863,5.1,Categorical
38,PulsePressure,894,5.3,Numerical
39,AvgSystolicBP,894,5.3,Numerical
40,AvgDiastolicBP,894,5.3,Numerical
41,AvgPulseRate,1188,7.1,Numerical
42,BPQ020,1475,8.8,Categorical
43,MCQ080,1475,8.8,Categorical
44,DBQ700,1475,8.8,Categorical
45,SLD012,1526,9.1,Numerical
46,INDFMPIR,1695,10.1,Numerical


In [159]:
#df = df.dropna(subset=['BMXWT', 'BMXHT', 'BMXBMI','BMXWAIST','AvgSystolicBP','AvgDiastolicBP','DMDEDUC2','INDFMPIR','ALQ121','WHD120','SLD012','DBD900'])


columns_to_drop = [
    "LBXHSCRP", "SLQ120", "ALQ111", "SMD030", "SMQ040", "LBDFERSI",
    "BPQ040A", "BPQ050A", "RIDEXPRG", "SMD650", "DID040", "DBD041", "DBD030","INDFMPIR","LBXTR","BPQ090D",
    "WTSAFPRP","WTMECPRP",'WTINTPRP'
]


df.drop(columns=columns_to_drop, inplace=True, errors="ignore")


In [160]:
#df['DIQ010'].value_counts()
#df['Diabetes_Status'].value_counts()

In [161]:
df = df.dropna(subset=['MCQ300C'])
#df = df.dropna(subset=['BMXWT', 'BMXHT', 'BMXBMI','BMXWAIST','AvgSystolicBP','AvgDiastolicBP','AvgPulseRate','LBDTRSI','LBDLDL'])
print(df.shape)
df.isna().sum()

(13904, 64)


SEQN                     0
BMXWT                  140
BMXHT                  145
BMXBMI                 166
BMXWAIST               600
WHtR                   621
AvgSystolicBP          738
AvgDiastolicBP         738
PulsePressure          738
AvgPulseRate           996
SDDSRVYR                 0
RIAGENDR                 0
RIDAGEYR                 0
RIDRETH3                 0
DMDBORN4                 0
DMDEDUC2                 0
DMDMARTZ                 0
LBDGLUSI                 0
LBXIN                    0
LBXGH                    0
LBDTCSI                  4
LBDHDDSI                 4
LBDTRSI                189
LBDLDL                 375
HOMA_IR                  0
ALQ121                2862
ALQ130                5198
ALQ142                5207
ALQ151                2856
BPQ020                   0
BPQ080                 795
DIQ160                2277
DIQ010                   0
DBD900                3242
DBD905                  15
DBD910                  14
DBQ197                   0
D

In [162]:
missing_data_summary[missing_data_summary["Missing_Percentage"] > 0]

Unnamed: 0,Feature,Missing_Count,Missing_Percentage,Feature_Type
20,DBD895,71,0.4,Numerical
21,DBD910,90,0.5,Numerical
22,DBD905,88,0.5,Numerical
23,BMXWT,184,1.1,Numerical
24,BMXHT,189,1.1,Numerical
25,BMXBMI,211,1.3,Numerical
26,LBXTR,240,1.4,Numerical
27,LBDTRSI,240,1.4,Numerical
28,LBDLDL,431,2.6,Numerical
29,PAQ605,599,3.6,Categorical


In [163]:
categorical_features = [col for col in categorical_features if col not in columns_to_drop]
numerical_features = [col for col in df.columns if col not in categorical_features]
df[categorical_features] = df[categorical_features].astype("str")

missing_data = calculate_missing_percentage(df,categorical_features,numerical_features)

low_missing = missing_data[(missing_data["Missing_Percentage"] > 0) & (missing_data["Missing_Percentage"] <= 5)]
medium_missing = missing_data[(missing_data["Missing_Percentage"] > 5) & (missing_data["Missing_Percentage"] <= 20)]
high_missing = missing_data[missing_data["Missing_Percentage"] > 20]

# Делим числовые и категориальные переменные в каждой категории
low_missing_numerical = low_missing[low_missing["Feature_Type"] == "Numerical"]["Feature"].tolist()
low_missing_categorical = low_missing[low_missing["Feature_Type"] == "Categorical"]["Feature"].tolist()

medium_missing_numerical = medium_missing[medium_missing["Feature_Type"] == "Numerical"]["Feature"].tolist()
medium_missing_categorical = medium_missing[medium_missing["Feature_Type"] == "Categorical"]["Feature"].tolist()

high_missing_numerical = high_missing[high_missing["Feature_Type"] == "Numerical"]["Feature"].tolist()
high_missing_categorical = high_missing[high_missing["Feature_Type"] == "Categorical"]["Feature"].tolist()

# Вывод 
print("Nan (0-5%):")
print("\n Numerical", low_missing_numerical)
print("Categorical:", low_missing_categorical)

print("\nNan (5-20%):")

print("\n Numerical:", medium_missing_numerical)
print("Categorical:", medium_missing_categorical)

print("\n Nan (>20%):")

print("\nNumerical:", high_missing_numerical)
print("Categorical:", high_missing_categorical)

Nan (0-5%):

 Numerical ['DBD910', 'PAD680', 'DBD905', 'WHD140', 'SLD012', 'BMXWT', 'BMXHT', 'BMXBMI', 'LBDTRSI', 'LBDLDL', 'BMXWAIST', 'WHtR']
Categorical: []

Nan (5-20%):

 Numerical: ['AvgSystolicBP', 'PulsePressure', 'AvgDiastolicBP', 'AvgPulseRate', 'WHD120']
Categorical: []

 Nan (>20%):

Numerical: ['DBD900', 'WHD110', 'ALQ130']
Categorical: []


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler


def impute_missing_values(df, target_column, features_to_use):
    df_copy = df.copy()

    # Проверяем, что целевой признак есть в датасете
    if target_column not in df.columns:
        raise ValueError(f"Признак {target_column} отсутствует в датасете.")

    # Проверяем, что все признаки для предсказания есть в датасете
    missing_features = [col for col in features_to_use if col not in df.columns]
    if missing_features:
        raise ValueError(f"Признаки {missing_features} отсутствуют в датасете.")

    # только строки без nan в features_to_use
    df_subset = df_copy[features_to_use + [target_column]]
    df_subset = df_subset.dropna(subset=features_to_use, how="all")

    # таргет категориальный/числовой
    is_categorical = target_column in categorical_features

    # Кодировка категориальных признаков 
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    if is_categorical:
        df_subset[categorical_features] = encoder.fit_transform(df_subset[categorical_features])
        estimator = DecisionTreeClassifier()
    else:
        estimator = DecisionTreeRegressor()

    # Применяем MICE ТОЛЬКО к целевому признаку
    imputer = IterativeImputer(estimator=estimator, max_iter=200, random_state=42)
    df_subset[target_column] = imputer.fit_transform(df_subset[features_to_use + [target_column]])[:, -1]  # Обновляем только target

    # Декодировка категориальных
    if is_categorical:
        df_subset[categorical_features] = encoder.inverse_transform(df_subset[categorical_features])

    # Обновляем только `target_column` в исходном датасете
    df_copy[target_column] = df_subset[target_column]

    return df_copy

# Пример использования:
column_to_impute = "BMXWT"  # Какой признак заполняем
features_to_use = ["BMXHT", "RIAGENDR", "RIDAGEYR"]  # Какие признаки используем для заполнения

df_imputed = impute_missing_values(df, column_to_impute, features_to_use)

# Проверяем количество пропусков
print(f"Пропуски после заполнения в {column_to_impute}: {df_imputed[column_to_impute].isnull().sum()}")

Пропуски после заполнения в BMXWT: 0


In [164]:
df.dtypes 

SEQN                   object
BMXWT                 float64
BMXHT                 float64
BMXBMI                float64
BMXWAIST              float64
WHtR                  float64
AvgSystolicBP         float64
AvgDiastolicBP        float64
PulsePressure         float64
AvgPulseRate          float64
SDDSRVYR               object
RIAGENDR               object
RIDAGEYR              float64
RIDRETH3               object
DMDBORN4               object
DMDEDUC2               object
DMDMARTZ               object
LBDGLUSI              float64
LBXIN                 float64
LBXGH                 float64
LBDTCSI               float64
LBDHDDSI              float64
LBDTRSI               float64
LBDLDL                float64
HOMA_IR               float64
ALQ121                 object
ALQ130                float64
ALQ142                 object
ALQ151                 object
BPQ020                 object
BPQ080                 object
DIQ160                 object
DIQ010                 object
DBD900    

In [165]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

def evaluate_imputation(df, target_column, features_to_use, test_fraction=0.2):
    df_copy = df.copy()

    
    # Отбираем только строки без пропусков в target_column (чтобы проверить заполнение)
    df_clean = df_copy.dropna(subset=[target_column])
    
    # Создаём искусственные пропуски (берём часть данных)
    test_sample = df_clean.sample(frac=test_fraction, random_state=42)
    df_copy.loc[test_sample.index, target_column] = np.nan  # Делаем пропуски

    # Определяем, является ли признак категориальным
    is_categorical = target_column in categorical_features

    # Кодируем категориальные признаки
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    if is_categorical:
        df_copy[categorical_features] = encoder.fit_transform(df_copy[categorical_features])
        estimator = DecisionTreeClassifier()
    else:
        estimator = DecisionTreeRegressor()

    # Применяем MICE
    imputer = IterativeImputer(estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42), max_iter= 50, initial_strategy= "most_frequent", tol=1e-4, random_state=42)
    df_copy[target_column] = imputer.fit_transform(df_copy[features_to_use + [target_column]])[:, -1]

    # Декодируем обратно категориальные признаки
    if is_categorical:
        df_copy[target_column] = encoder.inverse_transform(df_copy[[target_column]])

    # Сравниваем восстановленные значения с реальными
    true_values = test_sample[target_column].dropna()
    predicted_values = df_copy.loc[test_sample.index, target_column]

    # Выбираем метрику
    if is_categorical:
        accuracy = accuracy_score(true_values, predicted_values)
        print(f"Accuracy  {target_column}: {accuracy:.4f}")
    else:

        r2 = r2_score(true_values, predicted_values)
        print(f"R² --- : {r2:.4f}")

    return df_copy

# Пример использования:
#column_to_impute = "BMXWT"  # Какой признак заполняем
#features_to_use = ["BMXHT", "RIAGENDR", "RIDAGEYR","BMXWAIST","DMDMARTZ"]  # Какие признаки используем для предсказания

column_to_impute = "INDFMMPC"  # Какой признак заполняем
features_to_use = [ "RIAGENDR", "RIDAGEYR","BMXWAIST","DMDMARTZ","HIQ011"]  # Какие признаки используем для предсказания



df_imputed = evaluate_imputation(df, column_to_impute, features_to_use)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [121]:
df.columns

Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'WHtR', 'AvgSystolicBP',
       'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate', 'SDDSRVYR',
       'RIAGENDR', 'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'DMDMARTZ',
       'LBDGLUSI', 'LBXIN', 'LBXGH', 'LBDTCSI', 'LBDHDDSI', 'LBDTRSI',
       'LBDLDL', 'HOMA_IR', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ151', 'BPQ020',
       'BPQ080', 'DIQ160', 'DIQ010', 'DBD900', 'DBD905', 'DBD910', 'DBQ197',
       'DBD895', 'DBQ700', 'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B',
       'MCQ160C', 'MCQ160M', 'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060',
       'DPQ070', 'PAQ605', 'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680',
       'SMQ020', 'WHD110', 'WHD120', 'WHD140', 'SLD012', 'Diabetes_Status',
       'Insulin_Resistance'],
      dtype='object')

In [147]:
df.dtypes

SEQN                   object
BMXWT                 float64
BMXHT                 float64
BMXBMI                float64
BMXWAIST              float64
WHtR                  float64
AvgSystolicBP         float64
AvgDiastolicBP        float64
PulsePressure         float64
AvgPulseRate          float64
SDDSRVYR               object
RIAGENDR               object
RIDAGEYR              float64
RIDRETH3               object
DMDBORN4               object
DMDEDUC2               object
DMDMARTZ               object
LBDGLUSI              float64
LBXIN                 float64
LBXGH                 float64
LBDTCSI               float64
LBDHDDSI              float64
LBDTRSI               float64
LBDLDL                float64
HOMA_IR               float64
ALQ121                 object
ALQ130                float64
ALQ142                 object
ALQ151                 object
BPQ020                 object
BPQ080                 object
DIQ160                 object
DIQ010                 object
DBD900    

In [149]:
print(df["INDFMMPC"].value_counts())
print(df["INDFMMPC"].dtype)

3.0    6350
1.0    4537
2.0    1916
9.0     281
7.0      91
Name: INDFMMPC, dtype: int64
object


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

def evaluate_imputation(df, target_column, features_to_use, test_fraction=0.2):
    df_copy = df.copy()

    # Определяем, является ли признак категориальным
    is_categorical = target_column in categorical_features

    # Преобразуем категориальные признаки в str
    if is_categorical:
        df_copy[target_column] = df_copy[target_column].astype(str)

    # Отбираем только строки без пропусков в target_column (чтобы проверить заполнение)
    df_clean = df_copy.dropna(subset=[target_column])
    
    # Создаём искусственные пропуски (берём часть данных)
    test_sample = df_clean.sample(frac=test_fraction, random_state=42)
    df_copy.loc[test_sample.index, target_column] = np.nan  # Делаем пропуски

    # Кодируем категориальные признаки (кроме target_column)
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    df_copy[features_to_use] = encoder.fit_transform(df_copy[features_to_use])

    # Кодируем target_column отдельно (если он категориальный)
    target_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    if is_categorical:
        df_copy[[target_column]] = target_encoder.fit_transform(df_copy[[target_column]])

    # Выбираем модель
    if is_categorical:
        estimator = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42)
        #estimator = DecisionTreeClassifier(max_depth=50, random_state=42)  # для категориальн
        
    else:
        #estimator = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
        estimator = DecisionTreeRegressor(max_depth=50, random_state=42)  # для числовых

    # Применяем MICE
    imputer = IterativeImputer(estimator=estimator, max_iter=10, initial_strategy="most_frequent", tol=1e-4, random_state=42)
    # Оставляем только строки без пропусков в features_to_use
    df_impute = df_copy[features_to_use + [target_column]].dropna()

    # Запускаем IterativeImputer только на этих данных
    imputed_values = imputer.fit_transform(df_impute)

    # Вставляем предсказанные значения обратно в df_copy
    df_copy.loc[df_impute.index, [target_column]] = imputed_values[:, -1]

    # Декодируем обратно target_column
    if is_categorical:
        df_copy[[target_column]] = target_encoder.inverse_transform(df_copy[[target_column]])

    # Сравниваем восстановленные значения с реальными
    true_values = test_sample[target_column].dropna()
    predicted_values = df_copy.loc[test_sample.index, target_column]

    # Выбираем метрику
    if is_categorical:
        true_values = true_values.astype(str)
        predicted_values = predicted_values.astype(str)
        accuracy = accuracy_score(true_values, predicted_values)
        print(f"Accuracy  {target_column}: {accuracy:.4f}")
    else:
        r2 = r2_score(true_values, predicted_values)
        print(f"R² --- : {r2:.4f}")

    return df_copy

# Пример использования:
column_to_impute = "INDFMMPC"  # Какой признак заполняем
features_to_use = ["RIAGENDR", "RIDAGEYR", "BMXWAIST", "DMDMARTZ", "HIQ011"]  # Какие признаки используем

df_imputed = evaluate_imputation(df, column_to_impute, features_to_use)

KeyboardInterrupt: 