In [41]:
import os
import pyreadstat
import pandas as pd
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [42]:
# Paths for different periods
base_path_2015 = "../../data/raw/2015-2016"

# Relative paths (common for all periods)
file_paths = {
    "body_measures": "P_BMX.xpt",
    "demographics": "P_DEMO.xpt",
    "blood_pressure_2015": "P_BPXO.xpt",
    "LabData_fast_glucose": "P_GLU.xpt",
    "LabData_glycohemog": "P_GHB.xpt",
    "LabData_insulin": "P_INS.xpt",
    "LabData_Cholest_total": "P_TCHOL.xpt",
    "LabData_Cholest_HDL": "P_HDL.xpt",
    "LabData_Cholest_LDL": "P_TRIGLY.xpt",
    "LabData_C_protein": "P_HSCRP.xpt",
    "LabData_Feretin": "P_FERTIN.xpt",
    "SurveyData_Alcohol_2015": "P_ALQ.xpt",
    "SurveyData_Pressure_Cholest": "P_BPQ.xpt",
    "SurveyData_Diabetes": "P_DIQ.xpt",
    "SurveyData_Diet_Behavior": "P_DBQ.xpt",
    "SurveyData_Insurance": "P_HIQ.xpt",
    "SurveyData_Income": "P_INQ.xpt",
    "SurveyData_Medical": "P_MCQ.xpt",
    "SurveyData_MentalHealth": "P_DPQ.xpt",
    "SurveyData_PhysicalActivity": "P_PAQ.xpt",
    "SurveyData_SleepDisorders": "P_SLQ.xpt",
    "SurveyData_Smoking": "P_SMQ.xpt",
    "SurveyData_WeightHistory": "P_WHQ.xpt"
}


columns_to_keep_2015 = {
    "body_measures": ["SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST"],
    "demographics": ["SEQN", "SDDSRVYR", "RIAGENDR", "RIDAGEYR","RIDRETH3","DMDBORN4","DMDEDUC2","RIDEXPRG","INDFMPIR","DMDMARTL","WTINT2YR","WTMEC2YR"],
    #"blood_pressure": ["SEQN", "BPXOSY1", "BPXOSY2", "BPXOSY3", "BPXODI1", "BPXODI2", "BPXODI3","BPXOPLS1", "BPXOPLS2", "BPXOPLS3"],
    "blood_pressure_2015": ["SEQN", "BPXSY1", "BPXSY2", "BPXSY3", "BPXDI1", "BPXDI2", "BPXDI3", "BPXPLS"],
    "LabData_fast_glucose": ["SEQN","WTSAF2YR" ,"LBDGLUSI"],
    "LabData_glycohemog": ["SEQN","LBXGH"],
    "LabData_insulin": ["SEQN","LBXIN"],      
    "LabData_Cholest_total": ["SEQN","LBDTCSI"],
    "LabData_Cholest_HDL": ["SEQN","LBDHDDSI"],
    "LabData_Cholest_LDL": ["SEQN","LBXTR", "LBDTRSI","LBDLDL"],         
    "LabData_C_protein": ["SEQN","LBXHSCRP"],
    "LabData_Feretin": ["SEQN","LBDFERSI"], 
    "SurveyData_Alcohol_2015" : ["SEQN","ALQ110","ALQ120Q","ALQ130","ALQ141Q","ALQ151"],

    "SurveyData_Pressure_Cholest" : ["SEQN","BPQ020","BPQ040A","BPQ050A","BPQ080", "BPQ090D"],
    "SurveyData_Diabetes" : ["SEQN","DIQ160", "DIQ010", "DID040"],
    "SurveyData_Diet_Behavior" : ["SEQN","DBD900","DBD905","DBD910","DBQ197","DBD895","DBD030", "DBD041", "DBQ700"],
    "SurveyData_Insurance" : ["SEQN","HIQ011"],
    "SurveyData_Income" : ["SEQN","INDFMMPC"],
    "SurveyData_Medical" : ["SEQN","MCQ080","MCQ160B","MCQ160C","MCQ160M","MCQ300C"],
    "SurveyData_MentalHealth" : ["SEQN","DPQ020","DPQ030","DPQ040","DPQ060","DPQ070"],
    "SurveyData_PhysicalActivity" : ["SEQN","PAQ605","PAQ620","PAQ635","PAQ650","PAQ665","PAD680"],
    "SurveyData_SleepDisorders" : ["SEQN","SLD012","SLQ120"],
    "SurveyData_Smoking" : ["SEQN","SMQ020","SMD650","SMD030","SMQ040"],
    "SurveyData_WeightHistory" : ["SEQN","WHD110","WHD120","WHD140"]  
}




In [43]:
def load_data(base_path, file_paths, columns_to_keep):
    dataframes = {}
    for key, relative_path in file_paths.items():
        # full file path
        file_path = os.path.join(base_path, relative_path)
        try:
            # Check
            if not os.path.exists(file_path):
                print(f"Error: File '{file_path}' does not exist!")
                continue

            # Handle specific encoding
            if key == "LabData_insulin":
                data, _ = pyreadstat.read_xport(file_path, encoding="latin1")
            else:
                
                data, _ = pyreadstat.read_xport(file_path)

            # Filter 
            dataframes[key] = data[columns_to_keep[key]]
            print(f"The file '{key}' has been successfully loaded.")
        except Exception as e:
            print(f"Error loading the file '{key}': {e}")
    return dataframes

In [44]:
# Load data 
dataframes_2015 = load_data(base_path_2015, file_paths, columns_to_keep_2015)

print("Loaded datasets:")
for key, df in dataframes_2015.items():
    print(f"{key}: {len(df)} rows, {len(df.columns)} columns")

The file 'body_measures' has been successfully loaded.
The file 'demographics' has been successfully loaded.
The file 'blood_pressure_2015' has been successfully loaded.
The file 'LabData_fast_glucose' has been successfully loaded.
The file 'LabData_glycohemog' has been successfully loaded.
The file 'LabData_insulin' has been successfully loaded.
The file 'LabData_Cholest_total' has been successfully loaded.
The file 'LabData_Cholest_HDL' has been successfully loaded.
The file 'LabData_Cholest_LDL' has been successfully loaded.
The file 'LabData_C_protein' has been successfully loaded.
The file 'LabData_Feretin' has been successfully loaded.
The file 'SurveyData_Alcohol_2015' has been successfully loaded.
The file 'SurveyData_Pressure_Cholest' has been successfully loaded.
The file 'SurveyData_Diabetes' has been successfully loaded.
The file 'SurveyData_Diet_Behavior' has been successfully loaded.
The file 'SurveyData_Insurance' has been successfully loaded.
The file 'SurveyData_Income

In [45]:
final_dataset_2015 = reduce(
    lambda left, right: left.merge(right, on="SEQN", how="outer"), 
    dataframes_2015.values())

print(final_dataset_2015.shape)
final_dataset_2015.head()

(9971, 82)


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,BPXSY1,BPXSY2,BPXSY3,BPXDI1,BPXDI2,BPXDI3,BPXPLS,WTSAF2YR,LBDGLUSI,LBXGH,LBXIN,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,LBXHSCRP,LBDFERSI,ALQ110,ALQ120Q,ALQ130,ALQ141Q,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD012,SLQ120,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140
0,83732.0,94.8,184.5,27.8,101.1,9.0,1.0,62.0,3.0,1.0,5.0,,4.39,1.0,134671.370419,135629.507405,128.0,124.0,116.0,70.0,64.0,62.0,76.0,,,7.0,,4.47,1.19,,,,0.6,,,1.0,1.0,0.0,2.0,2.0,,,2.0,2.0,,1.0,46.0,0.0,2.0,0.0,3.0,1.0,,,3.0,1.0,3.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,480.0,5.5,3.0,1.0,,22.0,3.0,230.0,205.0,260.0
1,83733.0,90.4,171.4,30.8,107.9,9.0,1.0,53.0,3.0,2.0,3.0,,1.32,3.0,24328.560239,25282.425927,146.0,140.0,134.0,88.0,88.0,82.0,72.0,54722.34333,5.59,5.5,17.26,6.85,1.63,147.0,1.66,173.0,1.4,,,7.0,6.0,7.0,1.0,2.0,,,2.0,,2.0,2.0,,,0.0,0.0,0.0,0.0,,,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,300.0,8.0,0.0,1.0,20.0,20.0,1.0,180.0,160.0,205.0
2,83734.0,83.4,170.1,28.8,116.5,9.0,1.0,78.0,3.0,1.0,3.0,,1.51,1.0,12400.008522,12575.838818,138.0,132.0,136.0,46.0,44.0,46.0,56.0,25471.093699,4.66,5.8,11.77,5.92,0.78,269.0,3.037,145.0,0.6,,,0.0,,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,52.0,4.0,0.0,0.0,2.0,4.0,,,4.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,480.0,7.0,3.0,1.0,,14.0,3.0,224.0,165.0,240.0
3,83735.0,109.8,160.9,42.4,110.1,9.0,2.0,56.0,3.0,1.0,5.0,,5.0,6.0,102717.995647,102078.634508,132.0,134.0,136.0,72.0,68.0,70.0,78.0,,,5.6,,4.5,1.58,,,,9.0,,1.0,3.0,1.0,0.0,2.0,2.0,,,2.0,2.0,1.0,2.0,,1.0,11.0,15.0,0.0,3.0,,,4.0,1.0,3.0,1.0,2.0,2.0,2.0,9.0,1.0,2.0,2.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,480.0,6.5,4.0,2.0,,,,160.0,120.0,240.0
4,83736.0,55.2,164.9,20.3,80.4,9.0,2.0,42.0,4.0,1.0,4.0,1.0,1.23,3.0,17627.674984,18234.736219,100.0,114.0,98.0,70.0,54.0,56.0,76.0,38179.51087,4.66,5.6,5.42,5.28,1.37,47.0,0.531,142.0,0.5,67.2,1.0,1.0,1.0,0.0,2.0,2.0,,,2.0,2.0,2.0,2.0,,1.0,3.0,1.0,0.0,3.0,,,5.0,1.0,2.0,2.0,2.0,2.0,2.0,9.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,540.0,,1.0,2.0,,,,127.0,127.0,135.0


In [46]:
#
final_dataset_2015['SDDSRVYR'] = '2015-2016'

# Add the period to SEQN 
final_dataset_2015['SEQN'] = final_dataset_2015['SEQN'].astype(str) + '_' + final_dataset_2015['SDDSRVYR']

# Validate  
print(final_dataset_2015[['SEQN', 'SDDSRVYR']].head())

                SEQN   SDDSRVYR
0  83732.0_2015-2016  2015-2016
1  83733.0_2015-2016  2015-2016
2  83734.0_2015-2016  2015-2016
3  83735.0_2015-2016  2015-2016
4  83736.0_2015-2016  2015-2016


In [47]:
# Create the WHtR feature
final_dataset_2015['WHtR'] = round(final_dataset_2015['BMXWAIST'] / final_dataset_2015['BMXHT'],2)
# Checking
#final_dataset_2017[['BMXWAIST', 'BMXHT', 'WHtR']].head(20)

# Calculate derived features
final_dataset_2015['AvgSystolicBP'] = final_dataset_2015[['BPXSY1', 'BPXSY2', 'BPXSY3']].replace(0, np.nan).mean(axis=1)
final_dataset_2015['AvgDiastolicBP'] = final_dataset_2015[['BPXDI1', 'BPXDI2', 'BPXDI3']].replace(0, np.nan).mean(axis=1)
final_dataset_2015['PulsePressure'] = final_dataset_2015['AvgSystolicBP'] - final_dataset_2015['AvgDiastolicBP']
final_dataset_2015['AvgPulseRate'] = final_dataset_2015['BPXPLS']

# Drop original columns
columns_to_drop = ['BPXSY1', 'BPXSY2', 'BPXSY3',
                   'BPXDI1', 'BPXDI2', 'BPXDI3',
                   'BPXPLS']

In [48]:
final_dataset_2015.drop(columns=columns_to_drop, inplace=True)
final_dataset_2015[['AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate']].head()

Unnamed: 0,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,122.666667,65.333333,57.333333,76.0
1,140.0,86.0,54.0,72.0
2,135.333333,45.333333,90.0,56.0
3,134.0,70.0,64.0,78.0
4,104.0,60.0,44.0,76.0


In [49]:

print(final_dataset_2015.shape)
print(final_dataset_2015.columns)

final_dataset_2015.head()

(9971, 80)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'SDDSRVYR', 'RIAGENDR',
       'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
       'DMDMARTL', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'LBDGLUSI', 'LBXGH',
       'LBXIN', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL',
       'LBXHSCRP', 'LBDFERSI', 'ALQ110', 'ALQ120Q', 'ALQ130', 'ALQ141Q',
       'ALQ151', 'BPQ020', 'BPQ040A', 'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160',
       'DIQ010', 'DID040', 'DBD900', 'DBD905', 'DBD910', 'DBQ197', 'DBD895',
       'DBD030', 'DBD041', 'DBQ700', 'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B',
       'MCQ160C', 'MCQ160M', 'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060',
       'DPQ070', 'PAQ605', 'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680',
       'SLD012', 'SLQ120', 'SMQ020', 'SMD650', 'SMD030', 'SMQ040', 'WHD110',
       'WHD120', 'WHD140', 'WHtR', 'AvgSystolicBP', 'AvgDiastolicBP',
       'PulsePressure', 'AvgPulseRate'],
      dtype='object')


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,WTSAF2YR,LBDGLUSI,LBXGH,LBXIN,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,LBXHSCRP,LBDFERSI,ALQ110,ALQ120Q,ALQ130,ALQ141Q,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD012,SLQ120,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140,WHtR,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,83732.0_2015-2016,94.8,184.5,27.8,101.1,2015-2016,1.0,62.0,3.0,1.0,5.0,,4.39,1.0,134671.370419,135629.507405,,,7.0,,4.47,1.19,,,,0.6,,,1.0,1.0,0.0,2.0,2.0,,,2.0,2.0,,1.0,46.0,0.0,2.0,0.0,3.0,1.0,,,3.0,1.0,3.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,480.0,5.5,3.0,1.0,,22.0,3.0,230.0,205.0,260.0,0.55,122.666667,65.333333,57.333333,76.0
1,83733.0_2015-2016,90.4,171.4,30.8,107.9,2015-2016,1.0,53.0,3.0,2.0,3.0,,1.32,3.0,24328.560239,25282.425927,54722.34333,5.59,5.5,17.26,6.85,1.63,147.0,1.66,173.0,1.4,,,7.0,6.0,7.0,1.0,2.0,,,2.0,,2.0,2.0,,,0.0,0.0,0.0,0.0,,,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,300.0,8.0,0.0,1.0,20.0,20.0,1.0,180.0,160.0,205.0,0.63,140.0,86.0,54.0,72.0
2,83734.0_2015-2016,83.4,170.1,28.8,116.5,2015-2016,1.0,78.0,3.0,1.0,3.0,,1.51,1.0,12400.008522,12575.838818,25471.093699,4.66,5.8,11.77,5.92,0.78,269.0,3.037,145.0,0.6,,,0.0,,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,52.0,4.0,0.0,0.0,2.0,4.0,,,4.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,480.0,7.0,3.0,1.0,,14.0,3.0,224.0,165.0,240.0,0.68,135.333333,45.333333,90.0,56.0
3,83735.0_2015-2016,109.8,160.9,42.4,110.1,2015-2016,2.0,56.0,3.0,1.0,5.0,,5.0,6.0,102717.995647,102078.634508,,,5.6,,4.5,1.58,,,,9.0,,1.0,3.0,1.0,0.0,2.0,2.0,,,2.0,2.0,1.0,2.0,,1.0,11.0,15.0,0.0,3.0,,,4.0,1.0,3.0,1.0,2.0,2.0,2.0,9.0,1.0,2.0,2.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,480.0,6.5,4.0,2.0,,,,160.0,120.0,240.0,0.68,134.0,70.0,64.0,78.0
4,83736.0_2015-2016,55.2,164.9,20.3,80.4,2015-2016,2.0,42.0,4.0,1.0,4.0,1.0,1.23,3.0,17627.674984,18234.736219,38179.51087,4.66,5.6,5.42,5.28,1.37,47.0,0.531,142.0,0.5,67.2,1.0,1.0,1.0,0.0,2.0,2.0,,,2.0,2.0,2.0,2.0,,1.0,3.0,1.0,0.0,3.0,,,5.0,1.0,2.0,2.0,2.0,2.0,2.0,9.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,540.0,,1.0,2.0,,,,127.0,127.0,135.0,0.49,104.0,60.0,44.0,76.0


### **Transformation** 
### **ALQ120Q → ALQ121 (Fréquence consommation alcool)**
### **ALQ141Q → ALQ142 (Fréquence 4/5 verres consommés)**

In [50]:
def convert_alq120q_to_alq121(value):
    if pd.isna(value):  # Valeurs manquantes restent NaN
        return None
    if value == 0:
        return 0  # Never in the last year
    elif value == 365:
        return 1  # Every day
    elif 300 <= value < 365:
        return 2  # Nearly every day
    elif 156 <= value < 300:
        return 3  # 3 to 4 times a week
    elif 104 <= value < 156:
        return 4  # 2 times a week
    elif 52 <= value < 104:
        return 5  # Once a week
    elif 24 <= value < 52:
        return 6  # 2 to 3 times a month
    elif 12 <= value < 24:
        return 7  # Once a month
    elif 7 <= value < 12:
        return 8  # 7 to 11 times a year
    elif 3 <= value < 7:
        return 9  # 3 to 6 times a year
    elif 1 <= value < 3:
        return 10  # 1 to 2 times a year
    else:
        return None  # Valeur inconnue

final_dataset_2015["ALQ121"] = final_dataset_2015["ALQ120Q"].apply(convert_alq120q_to_alq121)
final_dataset_2015["ALQ142"] = final_dataset_2015["ALQ141Q"].apply(convert_alq120q_to_alq121)

#ALQ110 (2015-2016) devient ALQ111 pour être compatible avec 2017-2021.
final_dataset_2015.rename(columns={"ALQ110": "ALQ111"}, inplace=True)

# "DMDMARTL" (before 2017): -> "DMDMARTZ" (after 2017)
marital_status_mapping = {
    1: 1,  # Married -> Married/Living with Partner
    2: 2,  # Widowed -> Widowed/Divorced/Separated
    3: 2,  # Divorced -> Widowed/Divorced/Separated
    4: 2,  # Separated -> Widowed/Divorced/Separated
    5: 3,  # Never married -> Never married
    6: 1,  # Living with partner -> Married/Living with Partner
    77: 77,  # Refused -> Refused
    99: 99   # Don't know -> Don't know
}
if "DMDMARTL" in final_dataset_2015.columns:
    final_dataset_2015["DMDMARTZ"] = final_dataset_2015["DMDMARTL"].map(marital_status_mapping)
    final_dataset_2015.drop(columns=["DMDMARTL"], inplace=True)  # Удаляем старый столбец

rename_dict = {
    "WTINT2YR": "WTINTPRP",
    "WTSAF2YR": "WTSAFPRP",
    "WTMEC2YR": "WTMECPRP"
}

# Переименуем колонки в 2015-2016
final_dataset_2015.rename(columns=rename_dict, inplace=True)


final_dataset_2015.drop(columns=["ALQ120Q", "ALQ141Q"], inplace=True)


In [51]:

print(final_dataset_2015.shape)
print(final_dataset_2015.columns)
print(final_dataset_2015.shape)

final_dataset_2015.head()



# Sauvegarde du dataset transformé et fusionné
final_dataset_2015.to_csv("dataset_2015.csv", index=False, encoding="utf-8")

final_dataset_2015["DMDMARTZ"].value_counts()

(9971, 80)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'SDDSRVYR', 'RIAGENDR',
       'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
       'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'LBDGLUSI', 'LBXGH', 'LBXIN',
       'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL', 'LBXHSCRP',
       'LBDFERSI', 'ALQ111', 'ALQ130', 'ALQ151', 'BPQ020', 'BPQ040A',
       'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160', 'DIQ010', 'DID040', 'DBD900',
       'DBD905', 'DBD910', 'DBQ197', 'DBD895', 'DBD030', 'DBD041', 'DBQ700',
       'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B', 'MCQ160C', 'MCQ160M',
       'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605',
       'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680', 'SLD012', 'SLQ120',
       'SMQ020', 'SMD650', 'SMD030', 'SMQ040', 'WHD110', 'WHD120', 'WHD140',
       'WHtR', 'AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure',
       'AvgPulseRate', 'ALQ121', 'ALQ142', 'DMDMARTZ'],
      dtype='object')
(9971, 80)


1.0     3441
2.0     1227
3.0     1048
77.0       2
99.0       1
Name: DMDMARTZ, dtype: int64

In [52]:
final_dataset_2015 = final_dataset_2015.dropna(subset=['LBDGLUSI', 'LBXGH', 'LBXIN'])
print(final_dataset_2015.shape)
final_dataset_2015[['LBDGLUSI', 'LBXGH', 'LBXIN']].isna().sum()

(2917, 80)


LBDGLUSI    0
LBXGH       0
LBXIN       0
dtype: int64

In [154]:
columns_order = [
        'SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'WHtR', 'AvgSystolicBP', 
        'AvgDiastolicBP','PulsePressure', 'AvgPulseRate', 'SDDSRVYR', 'RIAGENDR',
        'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
        'DMDMARTL', 'LBDGLUSI', 'LBXGH', 'LBXIN','LBDTCSI', 'LBDHDDSI', 'LBXTR', 
        'LBDTRSI', 'LBDLDL', 'LBXHSCRP','LBDFERSI', 'ALQ110', 'ALQ120Q', 'ALQ130', 
        'ALQ141Q', 'ALQ151','BPQ020', 'BPQ040A', 'BPQ050A', 'BPQ080', 'BPQ090D', 
        'DIQ160', 'DBD900','DBD905', 'DBD910', 'DBQ197', 'DBD895', 'HIQ011', 
        'INDFMMPC', 'MCQ080','MCQ160B', 'MCQ160C', 'MCQ160M', 'MCQ300C', 'DPQ020', 
        'DPQ030','DPQ040', 'DPQ060', 'DPQ070', 'PAQ605', 'PAQ620', 'PAQ635', 
        'PAQ650','PAQ665', 'PAD680', 'SLD012', 'SLQ120', 'SMQ020', 'SMD650', 
        'WHD110','WHD120', 'WHD140','HOMA_IR', 
        # Таргетные переменные
       'Diabetes_Status','Insulin_Resistance',
       'WTINT2YR', 'WTMEC2YR',"WTSAF2YR"

]




final_dataset_2015 = final_dataset_2015[columns_order]