In [45]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score, f1_score
from lightgbm import LGBMClassifier, LGBMRegressor


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [46]:
df = pd.read_csv('../data/processed/df_2009-2020Total.csv')

In [47]:
categorical_features = [
        'SDDSRVYR','RIAGENDR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2','RIDEXPRG', 'DMDMARTZ',
        'ALQ111', 'ALQ121', 'ALQ142','ALQ151','BPQ020','BPQ040A','BPQ050A','BPQ080','BPQ090D',
        'DIQ160','DIQ010','DBQ197','DBQ700','HIQ011','INDFMMPC','MCQ080','MCQ160B','MCQ160C','MCQ160M','MCQ300C',
        'DPQ020','DPQ030','DPQ040','DPQ060','DPQ070','PAQ605','PAQ620','PAQ635','PAQ650','PAQ665','SLQ120',
        'SMQ020','SMQ040','Diabetes_Status','Insulin_Resistance'
    ]

columns_to_drop_1 = [
    "LBXHSCRP", "SLQ120", "ALQ111", "SMD030", "SMQ040", "LBDFERSI",
    "BPQ040A", "BPQ050A", "RIDEXPRG", "SMD650", "DID040", "DBD041", "DBD030","INDFMPIR","LBXTR","BPQ090D",
    "WTSAFPRP","WTMECPRP",'WTINTPRP'
]

df.drop(columns=columns_to_drop_1, inplace=True)

categorical_features = [col for col in categorical_features if col not in columns_to_drop_1]
numerical_features = [col for col in df.columns if col not in categorical_features]

In [48]:
def calculate_missing_percentage(df,categorical_features,numerical_features):
    missing_counts = df.isnull().sum()
    total_rows = len(df)
    missing_percentage = round((missing_counts / total_rows) * 100, 1)
    
    feature_types = [
        'Categorical' if col in categorical_features else 'Numerical' 
        for col in df.columns
    ]
    
    missing_df = pd.DataFrame({
        'Feature': df.columns,
        'Missing_Count': missing_counts.values,
        'Missing_Percentage': missing_percentage.values,
        'Feature_Type': feature_types
    })
    
    # Сортировка по проценту пропусков
    missing_df = missing_df.sort_values(by='Missing_Percentage', ascending=True).reset_index(drop=True)
    
    return missing_df

calculate_missing_percentage(df,categorical_features,numerical_features)

Unnamed: 0,Feature,Missing_Count,Missing_Percentage,Feature_Type
0,SEQN,0,0.0,Numerical
1,HIQ011,1,0.0,Categorical
2,DBQ197,0,0.0,Categorical
3,DIQ010,0,0.0,Categorical
4,Diabetes_Status,0,0.0,Categorical
5,HOMA_IR,0,0.0,Numerical
6,LBDHDDSI,7,0.0,Numerical
7,LBDTCSI,7,0.0,Numerical
8,LBXGH,0,0.0,Numerical
9,LBXIN,0,0.0,Numerical


In [49]:
categorical_features = [col for col in categorical_features if col not in columns_to_drop_1]
numerical_features = [col for col in df.columns if col not in categorical_features]

missing_data = calculate_missing_percentage(df,categorical_features,numerical_features)

low_missing = missing_data[(missing_data["Missing_Percentage"] > 0) & (missing_data["Missing_Percentage"] <= 5)]
medium_missing = missing_data[(missing_data["Missing_Percentage"] > 5) & (missing_data["Missing_Percentage"] <= 20)]
high_missing = missing_data[missing_data["Missing_Percentage"] > 20]

# 
low_missing_numerical = low_missing[low_missing["Feature_Type"] == "Numerical"]["Feature"].tolist()
low_missing_categorical = low_missing[low_missing["Feature_Type"] == "Categorical"]["Feature"].tolist()

medium_missing_numerical = medium_missing[medium_missing["Feature_Type"] == "Numerical"]["Feature"].tolist()
medium_missing_categorical = medium_missing[medium_missing["Feature_Type"] == "Categorical"]["Feature"].tolist()

high_missing_numerical = high_missing[high_missing["Feature_Type"] == "Numerical"]["Feature"].tolist()
high_missing_categorical = high_missing[high_missing["Feature_Type"] == "Categorical"]["Feature"].tolist()

### 
print("Nan (0-5%):")
print("\n Numerical", low_missing_numerical)
print("Categorical:", low_missing_categorical)

print("\nNan (5-20%):")

print("\n Numerical:", medium_missing_numerical)
print("Categorical:", medium_missing_categorical)

print("\n Nan (>20%):")

print("\nNumerical:", high_missing_numerical)
print("Categorical:", high_missing_categorical)

Nan (0-5%):

 Numerical ['DBD895', 'DBD910', 'DBD905', 'BMXHT', 'BMXWT', 'BMXBMI', 'LBDTRSI', 'LBDLDL', 'PAD680', 'BMXWAIST', 'WHtR']
Categorical: ['PAQ620', 'PAQ605', 'PAQ635', 'PAQ650', 'PAQ665']

Nan (5-20%):

 Numerical: ['AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate', 'SLD012', 'WHD140']
Categorical: ['INDFMMPC', 'MCQ080', 'DBQ700', 'BPQ020', 'DIQ160', 'SMQ020', 'BPQ080', 'MCQ160B', 'MCQ160C', 'MCQ300C', 'DMDMARTZ', 'DMDEDUC2', 'MCQ160M']

 Nan (>20%):

Numerical: ['DBD900', 'WHD120', 'WHD110', 'ALQ130']
Categorical: ['DPQ020', 'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'ALQ121', 'ALQ151', 'ALQ142']


In [None]:
ALQ142, ALQ151, BPQ020,DIQ010, DMDEDUC2

In [None]:
def evaluate_imputation(df, target_column, features_to_use, test_fraction=0.2):
    df_copy = df.copy()

    # 
    df_clean = df_copy.dropna(subset=[target_column])

    # тестовая выборка для контроля
    test_sample = df_clean.sample(frac=test_fraction, random_state=42)
    df_copy.loc[test_sample.index, target_column] = np.nan  # Делаем пропуски

    is_categorical = target_column in categorical_features

    # Кодировка target
    target_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    if is_categorical:
        df_copy[target_column] = target_encoder.fit_transform(df_copy[[target_column]])

    # Кодировка `features`
    feature_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    categorical_subset = list(set(features_to_use) & set(categorical_features))
    
    if categorical_subset:
        df_copy[categorical_subset] = feature_encoder.fit_transform(df_copy[categorical_subset])

    # Модель MICE
    if is_categorical:
        estimator = LGBMClassifier(n_estimators=100, random_state=42, num_leaves=20, verbose=-1)
    else:
        estimator = LGBMRegressor(n_estimators=100, random_state=42, num_leaves=20, verbose=-1)

    # MICE
    imputer = IterativeImputer(estimator=estimator, max_iter=20, initial_strategy="most_frequent", random_state=42)
    df_copy[target_column] = imputer.fit_transform(df_copy[features_to_use + [target_column]])[:, -1]

    # Обратное декодирование target_column
    if is_categorical:
        df_copy[target_column] = target_encoder.inverse_transform(df_copy[[target_column]])

    # `true_values`,`predicted_values` к строкам 
    true_values = test_sample[target_column].dropna().astype(str)
    predicted_values = df_copy.loc[test_sample.index, target_column].astype(str)

    # 
    if is_categorical:
        f1 = f1_score(true_values, predicted_values, average='macro')  # "macro" - учитывает все классы
        print(f"F1-score (категориальный) {target_column}: {f1:.4f}")
    else:
        r2 = r2_score(true_values.astype(float), predicted_values.astype(float))
        print(f"R² (числовой) {target_column}: {r2:.4f}")

    return df_copy

# 
#column_to_impute = "ALQ121"  # Какой признак заполняем (Категориальный!)
#features_to_use = ["ALQ142", "ALQ151", "BPQ020", "Diabetes_Status", "DIQ010"]


column_to_impute = "BMXWAIST"  
features_to_use = ["BMXWT", "BMXHT", "PAQ605", "PAQ620", "PAQ635"]

df_imputed = evaluate_imputation(df, column_to_impute, features_to_use)

✅ R² (числовой) BMXWAIST: 0.8788


