In [39]:
import os
import pyreadstat
import pandas as pd
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt

# Lift pandas' display limits so printed/displayed frames are never truncated
# (None = no limit on rows / columns).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [40]:
# Paths for different periods
base_path_2009 = "../data/raw/2009-2010"

# Relative paths (common for all periods)
file_paths = {
    "body_measures": "BMX_F.xpt",
    "demographics_2009": "DEMO_F.xpt",
    "blood_pressure_2015": "BPX_F.xpt",
    "LabData_fast_glucose": "GLU_F.xpt",
    "LabData_glycohemog": "GHB_F.xpt",
    #"LabData_insulin": "INS_G.xpt",
    "LabData_Cholest_total": "TCHOL_F.xpt",
    "LabData_Cholest_HDL": "HDL_F.xpt",
    "LabData_Cholest_LDL": "TRIGLY_F.xpt",
    "LabData_C_protein": "HSCRP_F.xpt",
    "LabData_Feretin": "FERTIN_F.xpt",
    "SurveyData_Alcohol_2015": "ALQ_F.xpt",
    "SurveyData_Pressure_Cholest": "BPQ_F.xpt",
    "SurveyData_Diabetes": "DIQ_F.xpt",
    "SurveyData_Diet_Behavior": "DBQ_F.xpt",
    "SurveyData_Insurance": "HIQ_F.xpt",
    "SurveyData_Income": "INQ_F.xpt",
    "SurveyData_Medical": "MCQ_F.xpt",
    "SurveyData_MentalHealth": "DPQ_F.xpt",
    "SurveyData_PhysicalActivity": "PAQ_F.xpt",
    "SurveyData_SleepDisorders_2013": "SLQ_F.xpt",
    "SurveyData_Smoking": "SMQ_F.xpt",
    "SurveyData_WeightHistory": "WHQ_F.xpt"
}


columns_to_keep_2009 = {
    "body_measures": ["SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST"],
    "demographics_2009": ["SEQN", "SDDSRVYR", "RIAGENDR", "RIDAGEYR","RIDRETH1","DMDBORN2","DMDEDUC2","RIDEXPRG","INDFMPIR","DMDMARTL","WTINT2YR","WTMEC2YR"],
    "blood_pressure_2015": ["SEQN", "BPXSY1", "BPXSY2", "BPXSY3", "BPXDI1", "BPXDI2", "BPXDI3", "BPXPLS"],
    "LabData_fast_glucose": ["SEQN","WTSAF2YR" ,"LBDGLUSI",'LBXIN'],
    "LabData_glycohemog": ["SEQN","LBXGH"],
   #"LabData_insulin": ["SEQN","LBXIN"],      
    "LabData_Cholest_total": ["SEQN","LBDTCSI"],
    "LabData_Cholest_HDL": ["SEQN","LBDHDDSI"],
    "LabData_Cholest_LDL": ["SEQN","LBXTR", "LBDTRSI","LBDLDL"],         
    "LabData_C_protein": ["SEQN","LBXHSCRP"],
    "LabData_Feretin": ["SEQN","LBDFERSI"], 
    "SurveyData_Alcohol_2015" : ["SEQN","ALQ110","ALQ120Q","ALQ130","ALQ140Q","ALQ150"],

    "SurveyData_Pressure_Cholest" : ["SEQN","BPQ020","BPQ040A","BPQ050A","BPQ080", "BPQ090D"],
    "SurveyData_Diabetes" : ["SEQN","DIQ160", "DIQ010", "DID040"],
    "SurveyData_Diet_Behavior" : ["SEQN","DBD900","DBD905","DBD910","DBQ197","DBD895","DBD030", "DBD041", "DBQ700"],
    "SurveyData_Insurance" : ["SEQN","HIQ011"],
    "SurveyData_Income" : ["SEQN","INDFMMPC"],
    "SurveyData_Medical" : ["SEQN","MCQ080","MCQ160B","MCQ160C","MCQ160M","MCQ300C"],
    "SurveyData_MentalHealth" : ["SEQN","DPQ020","DPQ030","DPQ040","DPQ060","DPQ070"],
    "SurveyData_PhysicalActivity" : ["SEQN","PAQ605","PAQ620","PAQ635","PAQ650","PAQ665","PAD680"],

    "SurveyData_SleepDisorders_2013" : ["SEQN","SLD010H"],
    #"SurveyData_SleepDisorders" : ["SEQN","SLD012","SLQ120"],
    "SurveyData_Smoking" : ["SEQN","SMQ020","SMD650","SMD030","SMQ040"],
    "SurveyData_WeightHistory" : ["SEQN","WHD110","WHD120","WHD140"]  
}




In [41]:
def load_data(base_path, file_paths, columns_to_keep):
    """
    Loads data from the specified file paths and filters the columns.

    Parameters
    ----------
    base_path : str
        Directory that holds the XPT files of one survey period.
    file_paths : dict
        Maps a dataset key to its file name relative to ``base_path``.
    columns_to_keep : dict
        Maps the same dataset keys to the list of columns to retain.

    Returns
    -------
    dict
        ``{key: DataFrame}`` for every dataset that was loaded and filtered
        successfully. Loading is best-effort: a dataset whose file is missing,
        cannot be read, has no configured column list, or lacks one of the
        requested columns is reported on stdout and left out of the result.
    """
    dataframes = {}
    for key, relative_path in file_paths.items():
        # Construct the full file path
        file_path = os.path.join(base_path, relative_path)

        # Check if the file exists
        if not os.path.exists(file_path):
            print(f"Error: File '{file_path}' does not exist!")
            continue

        # A configuration problem is reported as such instead of surfacing
        # as an opaque KeyError from inside a generic "load" handler.
        if key not in columns_to_keep:
            print(f"Error loading the file '{key}': no column list configured for this dataset.")
            continue

        # 'LabData_insulin' is read with an explicit latin1 encoding;
        # every other dataset uses the reader's default.
        read_options = {"encoding": "latin1"} if key == "LabData_insulin" else {}
        try:
            data, _ = pyreadstat.read_xport(file_path, **read_options)
        except Exception as e:
            # Best-effort by design: one unreadable file must not abort the rest.
            print(f"Error loading the file '{key}': {e}")
            continue

        # Filter columns based on the specified list, naming any that are absent
        requested_columns = list(columns_to_keep[key])
        missing_columns = [col for col in requested_columns if col not in data.columns]
        if missing_columns:
            print(f"Error loading the file '{key}': columns not found in file: {missing_columns}")
            continue

        dataframes[key] = data[requested_columns]
        print(f"The file '{key}' has been successfully loaded.")
    return dataframes

In [42]:
# Read every configured 2009-2010 source into its own DataFrame
dataframes_2009 = load_data(base_path_2009, file_paths, columns_to_keep_2009)

# Summarise what was actually loaded (sources that failed are absent from the dict)
print("Loaded datasets:")
for key, df in dataframes_2009.items():
    row_count, column_count = df.shape
    print(f"{key}: {row_count} rows, {column_count} columns")

The file 'body_measures' has been successfully loaded.
The file 'demographics_2009' has been successfully loaded.
The file 'blood_pressure_2015' has been successfully loaded.
The file 'LabData_fast_glucose' has been successfully loaded.
The file 'LabData_glycohemog' has been successfully loaded.
The file 'LabData_Cholest_total' has been successfully loaded.
The file 'LabData_Cholest_HDL' has been successfully loaded.
The file 'LabData_Cholest_LDL' has been successfully loaded.
Error: File '../data/raw/2009-2010/HSCRP_F.xpt' does not exist!
Error: File '../data/raw/2009-2010/FERTIN_F.xpt' does not exist!
The file 'SurveyData_Alcohol_2015' has been successfully loaded.
The file 'SurveyData_Pressure_Cholest' has been successfully loaded.
The file 'SurveyData_Diabetes' has been successfully loaded.
The file 'SurveyData_Diet_Behavior' has been successfully loaded.
The file 'SurveyData_Insurance' has been successfully loaded.
The file 'SurveyData_Income' has been successfully loaded.
The fil

In [43]:
def _outer_join_on_seqn(left, right):
    """Outer-join two frames on SEQN so rows present in only one of them are kept."""
    return left.merge(right, on="SEQN", how="outer")


# Fold all loaded frames, left to right, into one wide table keyed by SEQN
final_dataset_2009 = reduce(_outer_join_on_seqn, dataframes_2009.values())

print(final_dataset_2009.shape)
final_dataset_2009.head()

(10537, 79)


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN2,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,BPXSY1,BPXSY2,BPXSY3,BPXDI1,BPXDI2,BPXDI3,BPXPLS,WTSAF2YR,LBDGLUSI,LBXIN,LBXGH,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,ALQ110,ALQ120Q,ALQ130,ALQ140Q,ALQ150,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD010H,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140
0,51624.0,87.4,164.7,32.22,100.4,6.0,1.0,34.0,3.0,1.0,3.0,,1.36,1.0,80100.543512,81528.772006,114.0,114.0,112.0,88.0,88.0,82.0,70.0,,,,5.2,3.49,1.29,,,,,0.0,,,1.0,2.0,,,,,2.0,2.0,,1.0,0.0,8.0,2.0,1.0,,,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,480.0,4.0,1.0,,18.0,3.0,,160.0,200.0
1,51625.0,17.0,105.4,15.3,49.0,6.0,1.0,4.0,5.0,1.0,,,1.07,,53901.104285,56995.035425,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,2.0,0.0,3.0,0.0,273.0,1.0,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,
2,51626.0,72.3,181.3,22.0,74.7,6.0,1.0,16.0,4.0,1.0,,,2.27,,13953.078343,14509.27886,112.0,114.0,104.0,62.0,60.0,58.0,68.0,,,,5.7,4.97,1.55,,,,,,,,,2.0,,,,,2.0,2.0,,,2.0,0.0,2.0,0.0,,,1.0,1.0,2.0,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,1.0,240.0,8.0,,,,,,,
3,51627.0,39.8,147.8,18.22,63.0,6.0,1.0,10.0,4.0,1.0,,,0.81,,11664.899398,12041.635365,92.0,94.0,92.0,36.0,44.0,38.0,68.0,,,,,4.16,1.89,,,,,,,,,,,,,,,2.0,,1.0,0.0,3.0,3.0,2.0,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,
4,51628.0,116.8,166.0,42.39,118.2,6.0,2.0,60.0,4.0,1.0,3.0,,0.69,2.0,20090.339256,21000.338724,154.0,150.0,150.0,70.0,68.0,68.0,72.0,,,,6.0,5.22,1.16,,,,1.0,0.0,,,2.0,1.0,1.0,1.0,1.0,1.0,,1.0,56.0,,0.0,0.0,1.0,0.0,,,4.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,240.0,4.0,1.0,4.0,16.0,1.0,190.0,175.0,250.0


In [24]:
# Update the SDDSRVYR column to represent the period
# (overwrites the value loaded from the demographics file with a readable label).
period_label_2009 = '2009-2010'
final_dataset_2009['SDDSRVYR'] = period_label_2009

# Add the period to SEQN by concatenating the values.
# The suffix is appended only to IDs that do not already end with it, so
# re-running this cell no longer yields '..._2009-2010_2009-2010'.
# NOTE(review): SEQN is a float column here, so IDs read like '51624.0_2009-2010';
# that format is kept as-is - confirm whether other periods use the same form.
seqn_suffix = '_' + period_label_2009
seqn_as_text = final_dataset_2009['SEQN'].astype(str)
already_tagged = seqn_as_text.str.endswith(seqn_suffix)
final_dataset_2009['SEQN'] = seqn_as_text.where(already_tagged, seqn_as_text + seqn_suffix)

# Validate the changes
print(final_dataset_2009[['SEQN', 'SDDSRVYR']].head())

                SEQN   SDDSRVYR
0  51624.0_2009-2010  2009-2010
1  51625.0_2009-2010  2009-2010
2  51626.0_2009-2010  2009-2010
3  51627.0_2009-2010  2009-2010
4  51628.0_2009-2010  2009-2010


In [25]:
# Waist-to-height ratio (waist circumference / standing height), rounded to 2 decimals
final_dataset_2009['WHtR'] = (final_dataset_2009['BMXWAIST'] / final_dataset_2009['BMXHT']).round(2)

# Raw blood-pressure readings that feed the derived features
systolic_readings = ['BPXSY1', 'BPXSY2', 'BPXSY3']
diastolic_readings = ['BPXDI1', 'BPXDI2', 'BPXDI3']

# Row-wise mean of the readings in each group
final_dataset_2009['AvgSystolicBP'] = final_dataset_2009[systolic_readings].mean(axis=1)
final_dataset_2009['AvgDiastolicBP'] = final_dataset_2009[diastolic_readings].mean(axis=1)

# Pulse pressure = mean systolic - mean diastolic
final_dataset_2009['PulsePressure'] = (
    final_dataset_2009['AvgSystolicBP'] - final_dataset_2009['AvgDiastolicBP']
)

# BPXPLS is copied unchanged under the harmonised name
final_dataset_2009['AvgPulseRate'] = final_dataset_2009['BPXPLS']

# Original columns to drop once the derived features exist
columns_to_drop = systolic_readings + diastolic_readings + ['BPXPLS']

In [26]:
# Remove the raw readings now that the derived features exist.
# errors="ignore" keeps this cell re-runnable: a second run finds the columns
# already gone and would otherwise stop with a KeyError.
final_dataset_2009.drop(columns=columns_to_drop, inplace=True, errors="ignore")

# Preview the derived features. Printed explicitly because only the last
# expression of a cell is displayed automatically; the bare expression that
# used to sit here produced no output.
print(final_dataset_2009[['AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate']].head())

print(final_dataset_2009.shape)
print(final_dataset_2009.columns)

final_dataset_2009.head()

(10537, 66)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'WTSAF2YR', 'LBDGLUSI',
       'LBXIN', 'LBXGH', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL',
       'ALQ110', 'ALQ120Q', 'ALQ130', 'ALQ140Q', 'ALQ150', 'BPQ020', 'BPQ040A',
       'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160', 'DIQ010', 'DID040', 'DBD900',
       'DBD905', 'DBD910', 'DBQ197', 'DBD895', 'DBD030', 'DBD041', 'DBQ700',
       'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B', 'MCQ160C', 'MCQ160M',
       'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605',
       'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680', 'SMQ020', 'SMD650',
       'SMD030', 'SMQ040', 'WHD110', 'WHD120', 'WHD140', 'SDDSRVYR', 'WHtR',
       'AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate'],
      dtype='object')


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,WTSAF2YR,LBDGLUSI,LBXIN,LBXGH,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,ALQ110,ALQ120Q,ALQ130,ALQ140Q,ALQ150,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140,SDDSRVYR,WHtR,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,51624.0_2009-2010,87.4,164.7,32.22,100.4,,,,5.2,3.49,1.29,,,,,0.0,,,1.0,2.0,,,,,2.0,2.0,,1.0,0.0,8.0,2.0,1.0,,,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,480.0,1.0,,18.0,3.0,,160.0,200.0,2009-2010,0.61,113.333333,86.0,27.333333,70.0
1,51625.0_2009-2010,17.0,105.4,15.3,49.0,,,,,,,,,,,,,,,,,,,,,2.0,,,2.0,0.0,3.0,0.0,273.0,1.0,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,2009-2010,0.46,,,,
2,51626.0_2009-2010,72.3,181.3,22.0,74.7,,,,5.7,4.97,1.55,,,,,,,,,2.0,,,,,2.0,2.0,,,2.0,0.0,2.0,0.0,,,1.0,1.0,2.0,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,1.0,240.0,,,,,,,,2009-2010,0.41,110.0,60.0,50.0,68.0
3,51627.0_2009-2010,39.8,147.8,18.22,63.0,,,,,4.16,1.89,,,,,,,,,,,,,,,2.0,,1.0,0.0,3.0,3.0,2.0,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,2009-2010,0.43,92.666667,39.333333,53.333333,68.0
4,51628.0_2009-2010,116.8,166.0,42.39,118.2,,,,6.0,5.22,1.16,,,,1.0,0.0,,,2.0,1.0,1.0,1.0,1.0,1.0,,1.0,56.0,,0.0,0.0,1.0,0.0,,,4.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,240.0,1.0,4.0,16.0,1.0,190.0,175.0,250.0,2009-2010,0.71,151.333333,68.666667,82.666667,72.0


In [None]:

# Align 2009-2010 column names with the names used for the 2015-2020 data.
legacy_to_current_names = {
    # ALQ150 -> ALQ151 to match 2015-2020
    "ALQ150": "ALQ151",
    # ALQ140Q -> ALQ141Q
    "ALQ140Q": "ALQ141Q",
    # RIDRETH3 gained a new category "Non-Hispanic Asian" (code 6) starting in 2011.
    # NOTE(review): only the column label changes here; the RIDRETH1 codes themselves
    # are not recoded - confirm that downstream code handles the differing categories.
    "RIDRETH1": "RIDRETH3",
}
final_dataset_2009.rename(columns=legacy_to_current_names, inplace=True)


# Recode DMDBORN2 (2009) values to the DMDBORN4 (2015) scheme
def convert_dmdborn2_to_dmdborn4(value):
    """Translate a single DMDBORN2 code into its DMDBORN4 equivalent.

    Returns None when the input is missing (so it stays NaN in the column)
    and also when the code has no mapping.
    """
    if pd.isna(value):
        return None

    # (DMDBORN4 code, DMDBORN2 codes that collapse into it)
    recoding_table = (
        (1, (1,)),        # Born in US
        (2, (2, 4, 5)),   # every country other than the US is merged into "Others"
        (77, (7,)),       # presumably "Refused" - confirm against the NHANES codebook
        (99, (9,)),       # Don't know
    )
    for dmdborn4_code, dmdborn2_codes in recoding_table:
        if value in dmdborn2_codes:
            return dmdborn4_code
    return None

# Apply the recoding and then remove the old DMDBORN2 column.
# The step is guarded so the cell can be re-run: once DMDBORN2 has been
# converted and dropped, a second run is a no-op instead of a KeyError.
if "DMDBORN2" in final_dataset_2009.columns:
    final_dataset_2009["DMDBORN4"] = final_dataset_2009["DMDBORN2"].apply(convert_dmdborn2_to_dmdborn4)
    final_dataset_2009.drop(columns=["DMDBORN2"], inplace=True)
elif "DMDBORN4" not in final_dataset_2009.columns:
    # Neither the source nor the result column exists, so the demographics data
    # is missing; fail loudly rather than continue without DMDBORN4.
    raise KeyError("DMDBORN2 not found in final_dataset_2009 and DMDBORN4 has not been derived yet")