In [46]:
import os
import pyreadstat
import pandas as pd
from functools import reduce
import seaborn as sns
import numpy as np


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [47]:
# Paths for different periods
base_path_2017 = "../../data/raw/2017-2020"

# Relative paths (common for all periods)
file_paths = {
    "body_measures": "P_BMX.xpt",
    "demographics": "P_DEMO.xpt",
    "blood_pressure": "P_BPXO.xpt",
    "LabData_fast_glucose": "P_GLU.xpt",
    "LabData_glycohemog": "P_GHB.xpt",
    "LabData_insulin": "P_INS.xpt",
    "LabData_Cholest_total": "P_TCHOL.xpt",
    "LabData_Cholest_HDL": "P_HDL.xpt",
    "LabData_Cholest_LDL": "P_TRIGLY.xpt",
    "LabData_C_protein": "P_HSCRP.xpt",
    "LabData_Feretin": "P_FERTIN.xpt",
    "SurveyData_Alcohol": "P_ALQ.xpt",
    "SurveyData_Pressure_Cholest": "P_BPQ.xpt",
    "SurveyData_Diabetes": "P_DIQ.xpt",
    "SurveyData_Diet_Behavior": "P_DBQ.xpt",
    "SurveyData_Insurance": "P_HIQ.xpt",
    "SurveyData_Income": "P_INQ.xpt",
    "SurveyData_Medical": "P_MCQ.xpt",
    "SurveyData_MentalHealth": "P_DPQ.xpt",
    "SurveyData_PhysicalActivity": "P_PAQ.xpt",
    "SurveyData_SleepDisorders": "P_SLQ.xpt",
    "SurveyData_Smoking": "P_SMQ.xpt",
    "SurveyData_WeightHistory": "P_WHQ.xpt"
}


columns_to_keep_2017 = {
    "body_measures": ["SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST"],
    "demographics": ["SEQN", "SDDSRVYR", "RIAGENDR", "RIDAGEYR","RIDRETH3","DMDBORN4","DMDEDUC2","RIDEXPRG","INDFMPIR","DMDMARTZ","WTINTPRP","WTMECPRP"],
    "blood_pressure": ["SEQN", "BPXOSY1", "BPXOSY2", "BPXOSY3", "BPXODI1", "BPXODI2", "BPXODI3","BPXOPLS1", "BPXOPLS2", "BPXOPLS3"],
    "LabData_fast_glucose": ["SEQN","WTSAFPRP","LBDGLUSI"],
    "LabData_glycohemog": ["SEQN","LBXGH"],
    "LabData_insulin": ["SEQN","LBXIN"],      
    "LabData_Cholest_total": ["SEQN","LBDTCSI"],
    "LabData_Cholest_HDL": ["SEQN","LBDHDDSI"],
    "LabData_Cholest_LDL": ["SEQN","LBXTR", "LBDTRSI","LBDLDL"],         
    "LabData_C_protein": ["SEQN","LBXHSCRP"],
    "LabData_Feretin": ["SEQN","LBDFERSI"], 

    "SurveyData_Alcohol" : ["SEQN","ALQ111","ALQ121","ALQ130","ALQ142","ALQ151"],
    "SurveyData_Pressure_Cholest" : ["SEQN","BPQ020","BPQ040A","BPQ050A","BPQ080", "BPQ090D"],
    "SurveyData_Diabetes" : ["SEQN","DIQ160", "DIQ010", "DID040"],
    "SurveyData_Diet_Behavior" : ["SEQN","DBD900","DBD905","DBD910","DBQ197","DBD895","DBD030", "DBD041","DBQ930", "DBQ700"],

    "SurveyData_Insurance" : ["SEQN","HIQ011"],
    "SurveyData_Income" : ["SEQN","INDFMMPC"],

    "SurveyData_Medical" : ["SEQN","MCQ080","MCQ160B","MCQ160C","MCQ160M","MCQ300C"],

    "SurveyData_MentalHealth" : ["SEQN","DPQ020","DPQ030","DPQ040","DPQ060","DPQ070"],

    "SurveyData_PhysicalActivity" : ["SEQN","PAQ605","PAQ620","PAQ635","PAQ650","PAQ665","PAD680"],

    "SurveyData_SleepDisorders" : ["SEQN","SLD012","SLQ120"],

    "SurveyData_Smoking" : ["SEQN","SMQ020","SMD650","SMD030","SMQ040"],
    
    "SurveyData_WeightHistory" : ["SEQN","WHD110","WHD120","WHD140"]  
}


In [48]:
def load_data(base_path, file_paths, columns_to_keep):
    dataframes = {}
    for key, relative_path in file_paths.items():
        # full file path
        file_path = os.path.join(base_path, relative_path)
        try:
            # Check if the file exists
            if not os.path.exists(file_path):
                print(f"Error: File '{file_path}' does not exist!")
                continue

            # Handle specific encoding
            if key == "LabData_insulin":
                data, _ = pyreadstat.read_xport(file_path, encoding="latin1")
            else:
                # Default loading
                data, _ = pyreadstat.read_xport(file_path)

            # Filter columns 
            dataframes[key] = data[columns_to_keep[key]]
            print(f"The file '{key}' has been successfully loaded.")
        except Exception as e:
            print(f"Error loading the file '{key}': {e}")
    return dataframes

In [49]:
# Load data 
dataframes_2017 = load_data(base_path_2017, file_paths, columns_to_keep_2017)

# Check the loaded datasets
print("Loaded datasets:")
for key, df in dataframes_2017.items():
    print(f"{key}: {len(df)} rows, {len(df.columns)} columns")

The file 'body_measures' has been successfully loaded.
The file 'demographics' has been successfully loaded.
The file 'blood_pressure' has been successfully loaded.
The file 'LabData_fast_glucose' has been successfully loaded.
The file 'LabData_glycohemog' has been successfully loaded.
The file 'LabData_insulin' has been successfully loaded.
The file 'LabData_Cholest_total' has been successfully loaded.
The file 'LabData_Cholest_HDL' has been successfully loaded.
The file 'LabData_Cholest_LDL' has been successfully loaded.
The file 'LabData_C_protein' has been successfully loaded.
The file 'LabData_Feretin' has been successfully loaded.
The file 'SurveyData_Alcohol' has been successfully loaded.
The file 'SurveyData_Pressure_Cholest' has been successfully loaded.
The file 'SurveyData_Diabetes' has been successfully loaded.
The file 'SurveyData_Diet_Behavior' has been successfully loaded.
The file 'SurveyData_Insurance' has been successfully loaded.
The file 'SurveyData_Income' has been

In [50]:
final_dataset_2017 = reduce(
    lambda left, right: left.merge(right, on="SEQN", how="outer"), 
    dataframes_2017.values())

print(final_dataset_2017.shape)

final_dataset_2017.head()



(15560, 85)


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTZ,WTINTPRP,WTMECPRP,BPXOSY1,BPXOSY2,BPXOSY3,BPXODI1,BPXODI2,BPXODI3,BPXOPLS1,BPXOPLS2,BPXOPLS3,WTSAFPRP,LBDGLUSI,LBXGH,LBXIN,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,LBXHSCRP,LBDFERSI,ALQ111,ALQ121,ALQ130,ALQ142,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ930,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD012,SLQ120,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140
0,109263.0,,,,,66.0,1.0,2.0,6.0,1.0,,,4.66,,7891.762435,8951.815567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,0.0,0.0,5.0,3.0,1.0,0.0,365.0,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,
1,109264.0,42.2,154.7,17.6,63.8,66.0,2.0,13.0,1.0,1.0,,,0.83,,11689.747264,12271.157043,109.0,109.0,106.0,67.0,68.0,66.0,94.0,95.0,91.0,27533.174559,5.38,5.3,6.05,4.29,1.86,40.0,0.452,86.0,0.11,15.7,,,,,,,,,,,2.0,2.0,,0.0,1.0,1.0,3.0,1.0,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,
2,109265.0,12.0,89.3,15.0,41.2,66.0,1.0,2.0,3.0,1.0,,,3.06,,16273.825939,16658.764203,,,,,,,,,,,,,,,,,,,0.31,42.1,,,,,,,,,,,,2.0,,2.0,0.0,2.0,3.0,3.0,,999999.0,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,
3,109266.0,97.1,160.2,37.8,117.9,66.0,2.0,29.0,6.0,2.0,5.0,2.0,5.0,3.0,7825.646112,8154.968193,99.0,99.0,99.0,56.0,55.0,52.0,68.0,66.0,66.0,,,5.2,,5.04,1.45,,,,0.72,11.6,1.0,10.0,1.0,0.0,2.0,2.0,,,1.0,2.0,1.0,2.0,,0.0,0.0,5.0,2.0,7.0,,,1.0,3.0,1.0,3.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,1.0,480.0,7.5,0.0,2.0,,,,,210.0,218.0
4,109269.0,13.6,,,,66.0,1.0,2.0,2.0,1.0,,,0.96,,5906.250521,6848.271782,,,,,,,,,,,,,,,,,,,0.73,41.7,,,,,,,,,,,,2.0,,3.0,0.0,0.0,2.0,3.0,,1.0,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,


In [51]:
# Update the SDDSRVYR column to represent the period
final_dataset_2017['SDDSRVYR'] = '2017-2020'

# Add the period to SEQN by concatenating the values
final_dataset_2017['SEQN'] = final_dataset_2017['SEQN'].astype(str) + '_' + final_dataset_2017['SDDSRVYR']

# Validate the changes
print(final_dataset_2017[['SEQN', 'SDDSRVYR']].head())

                 SEQN   SDDSRVYR
0  109263.0_2017-2020  2017-2020
1  109264.0_2017-2020  2017-2020
2  109265.0_2017-2020  2017-2020
3  109266.0_2017-2020  2017-2020
4  109269.0_2017-2020  2017-2020


In [52]:
# Create the WHtR feature
final_dataset_2017['WHtR'] = round(final_dataset_2017['BMXWAIST'] / final_dataset_2017['BMXHT'],2)
# Checking
#final_dataset_2017[['BMXWAIST', 'BMXHT', 'WHtR']].head(20)

# Calculate derived features
final_dataset_2017['AvgSystolicBP'] = final_dataset_2017[['BPXOSY1', 'BPXOSY2', 'BPXOSY3']].replace(0, np.nan).mean(axis=1)
final_dataset_2017['AvgDiastolicBP'] = final_dataset_2017[['BPXODI1', 'BPXODI2', 'BPXODI3']].replace(0, np.nan).mean(axis=1)
final_dataset_2017['PulsePressure'] = final_dataset_2017['AvgSystolicBP'] - final_dataset_2017['AvgDiastolicBP']
final_dataset_2017['AvgPulseRate'] = final_dataset_2017[['BPXOPLS1', 'BPXOPLS2', 'BPXOPLS3']].replace(0, np.nan).mean(axis=1)

# Drop original columns
columns_to_drop = ['BPXOSY1', 'BPXOSY2', 'BPXOSY3',
                   'BPXODI1', 'BPXODI2', 'BPXODI3',
                   'BPXOPLS1', 'BPXOPLS2', 'BPXOPLS3']

In [53]:
final_dataset_2017.drop(columns=columns_to_drop, inplace=True)
final_dataset_2017[['AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate']].head()

Unnamed: 0,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,,,,
1,108.0,67.0,41.0,93.333333
2,,,,
3,99.0,54.333333,44.666667,66.666667
4,,,,


In [54]:
final_dataset_2017.head()

Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTZ,WTINTPRP,WTMECPRP,WTSAFPRP,LBDGLUSI,LBXGH,LBXIN,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,LBXHSCRP,LBDFERSI,ALQ111,ALQ121,ALQ130,ALQ142,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ930,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD012,SLQ120,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140,WHtR,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,109263.0_2017-2020,,,,,2017-2020,1.0,2.0,6.0,1.0,,,4.66,,7891.762435,8951.815567,,,,,,,,,,,,,,,,,,,,,,,2.0,,0.0,0.0,5.0,3.0,1.0,0.0,365.0,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,109264.0_2017-2020,42.2,154.7,17.6,63.8,2017-2020,2.0,13.0,1.0,1.0,,,0.83,,11689.747264,12271.157043,27533.174559,5.38,5.3,6.05,4.29,1.86,40.0,0.452,86.0,0.11,15.7,,,,,,,,,,,2.0,2.0,,0.0,1.0,1.0,3.0,1.0,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.41,108.0,67.0,41.0,93.333333
2,109265.0_2017-2020,12.0,89.3,15.0,41.2,2017-2020,1.0,2.0,3.0,1.0,,,3.06,,16273.825939,16658.764203,,,,,,,,,,0.31,42.1,,,,,,,,,,,,2.0,,2.0,0.0,2.0,3.0,3.0,,999999.0,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.46,,,,
3,109266.0_2017-2020,97.1,160.2,37.8,117.9,2017-2020,2.0,29.0,6.0,2.0,5.0,2.0,5.0,3.0,7825.646112,8154.968193,,,5.2,,5.04,1.45,,,,0.72,11.6,1.0,10.0,1.0,0.0,2.0,2.0,,,1.0,2.0,1.0,2.0,,0.0,0.0,5.0,2.0,7.0,,,1.0,3.0,1.0,3.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,1.0,480.0,7.5,0.0,2.0,,,,,210.0,218.0,0.74,99.0,54.333333,44.666667,66.666667
4,109269.0_2017-2020,13.6,,,,2017-2020,1.0,2.0,2.0,1.0,,,0.96,,5906.250521,6848.271782,,,,,,,,,,0.73,41.7,,,,,,,,,,,,2.0,,3.0,0.0,0.0,2.0,3.0,,1.0,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [55]:

final_dataset_2017.drop(columns=["DBQ930"], inplace=True)


# Sauvegarde du dataset transformé et fusionné
final_dataset_2017.to_csv("dataset_2017.csv", index=False, encoding="utf-8")

In [56]:
final_dataset_2017 = final_dataset_2017.dropna(subset=['LBDGLUSI', 'LBXGH', 'LBXIN'])
print(final_dataset_2017.shape)
final_dataset_2017[['LBDGLUSI', 'LBXGH', 'LBXIN']].isna().sum()

(4616, 80)


LBDGLUSI    0
LBXGH       0
LBXIN       0
dtype: int64

In [57]:
columns_order = [
    'SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'WHtR', 'AvgSystolicBP', 
    'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate', 'SDDSRVYR', 'RIAGENDR', 
    'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR', 
    'DMDMARTZ', 'LBDGLUSI', 'LBXGH', 'LBXIN', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 
    'LBDTRSI', 'LBDLDL', 'LBXHSCRP', 'LBDFERSI', 'ALQ111', 'ALQ121', 'ALQ130', 
    'ALQ142', 'ALQ151', 'BPQ020', 'BPQ040A', 'BPQ050A', 'BPQ080', 'BPQ090D', 
    'DIQ160', 'DBD900', 'DBD905', 'DBD910', 'DBQ197', 'DBD895', 'HIQ011', 
    'INDFMMPC', 'MCQ080', 'MCQ160B', 'MCQ160C', 'MCQ160M', 'MCQ300C', 'DPQ020', 
    'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605', 'PAQ620', 'PAQ635', 
    'PAQ650', 'PAQ665', 'PAD680', 'SLD012', 'SMQ020', 'SMD650', 
    'WHD110', 'WHD120', 'WHD140', #'HOMA_IR',
    # Таргетные переменные
    #'Diabetes_Status', 'Insulin_Resistance',
    # Весовые коэффициенты
    'WTINTPRP', 'WTMECPRP', 'WTSAFPRP'
]


final_dataset_2017 = final_dataset_2017[columns_order]

In [51]:

categorical_features = [
    'SEQN', 'SDDSRVYR', 'RIAGENDR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2',
    'RIDEXPRG', 'DMDMARTZ', 'ALQ111', 'ALQ121', 'ALQ151', 'BPQ020',
    'BPQ040A', 'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160', 'HIQ011',
    'INDFMMPC', 'MCQ080', 'MCQ160B', 'MCQ160C', 'MCQ160M', 'MCQ300C',
    'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605', 'PAQ620',
    'PAQ635', 'PAQ650', 'PAQ665', 'SLD013', 'SMQ020', 'Diabetes_Status',
    'Insulin_Resistance'
]
numeric_features = [
    'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'WHtR', 'AvgSystolicBP',
    'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate', 'RIDAGEYR', 'INDFMPIR',
    'LBDGLUSI', 'LBXGH', 'LBXIN', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI',
    'LBDLDL', 'LBXHSCRP', 'LBDFERSI', 'ALQ130', 'ALQ142', 'DBD900', 'DBD905',
    'DBD910', 'DBQ197', 'DBD895', 'PAD680', 'SLD012', 'SMD650', 'WHD110',
    'WHD120', 'WHD140', 'HOMA_IR', 'WTINTPRP', 'WTMECPRP', 'WTSAFPRP'
]
