In [74]:
import os
import pyreadstat
import pandas as pd
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt

# Lift pandas' display limits so wide/long frames are shown without truncation.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [75]:
# Directory holding the raw XPT files of the 2011-2012 cycle.
base_path_2011 = "../data/raw/2011-2012"

# Single source of truth for the tables read by this notebook:
#     dataset key -> (XPT file name relative to base_path_2011, columns to keep)
# Every column list starts with SEQN, the column the tables are merged on.
#
# NOTE(review): some keys carry "_2013"/"_2015" suffixes although all files
# here are the "_G" files of this cycle -- presumably the keys are shared
# with notebooks for other cycles; confirm before renaming them.
# NOTE(review): in the recorded run INS_G.xpt, HSCRP_G.xpt and FERTIN_G.xpt
# were reported as missing, so those three entries contributed no columns.
# LBXIN is requested from both the glucose and the insulin table; if the
# insulin file ever becomes available the merge would see the column twice.
_nhanes_2011_tables = {
    "body_measures": (
        "BMX_G.xpt",
        ["SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST"],
    ),
    "demographics": (
        "DEMO_G.xpt",
        ["SEQN", "SDDSRVYR", "RIAGENDR", "RIDAGEYR", "RIDRETH3", "DMDBORN4",
         "DMDEDUC2", "RIDEXPRG", "INDFMPIR", "DMDMARTL", "WTINT2YR", "WTMEC2YR"],
    ),
    "blood_pressure_2015": (
        "BPX_G.xpt",
        ["SEQN", "BPXSY1", "BPXSY2", "BPXSY3", "BPXDI1", "BPXDI2", "BPXDI3", "BPXPLS"],
    ),
    "LabData_fast_glucose": (
        "GLU_G.xpt",
        ["SEQN", "WTSAF2YR", "LBDGLUSI", "LBXIN"],
    ),
    "LabData_glycohemog": (
        "GHB_G.xpt",
        ["SEQN", "LBXGH"],
    ),
    "LabData_insulin": (
        "INS_G.xpt",
        ["SEQN", "LBXIN"],
    ),
    "LabData_Cholest_total": (
        "TCHOL_G.xpt",
        ["SEQN", "LBDTCSI"],
    ),
    "LabData_Cholest_HDL": (
        "HDL_G.xpt",
        ["SEQN", "LBDHDDSI"],
    ),
    "LabData_Cholest_LDL": (
        "TRIGLY_G.xpt",
        ["SEQN", "LBXTR", "LBDTRSI", "LBDLDL"],
    ),
    "LabData_C_protein": (
        "HSCRP_G.xpt",
        ["SEQN", "LBXHSCRP"],
    ),
    "LabData_Feretin": (
        "FERTIN_G.xpt",
        ["SEQN", "LBDFERSI"],
    ),
    "SurveyData_Alcohol_2015": (
        "ALQ_G.xpt",
        ["SEQN", "ALQ110", "ALQ120Q", "ALQ130", "ALQ141Q", "ALQ151"],
    ),
    "SurveyData_Pressure_Cholest": (
        "BPQ_G.xpt",
        ["SEQN", "BPQ020", "BPQ040A", "BPQ050A", "BPQ080", "BPQ090D"],
    ),
    "SurveyData_Diabetes": (
        "DIQ_G.xpt",
        ["SEQN", "DIQ160", "DIQ010", "DID040"],
    ),
    "SurveyData_Diet_Behavior": (
        "DBQ_G.xpt",
        ["SEQN", "DBD900", "DBD905", "DBD910", "DBQ197", "DBD895", "DBD030",
         "DBD041", "DBQ700"],
    ),
    "SurveyData_Insurance": (
        "HIQ_G.xpt",
        ["SEQN", "HIQ011"],
    ),
    "SurveyData_Income": (
        "INQ_G.xpt",
        ["SEQN", "INDFMMPC"],
    ),
    "SurveyData_Medical": (
        "MCQ_G.xpt",
        ["SEQN", "MCQ080", "MCQ160B", "MCQ160C", "MCQ160M", "MCQ300C"],
    ),
    "SurveyData_MentalHealth": (
        "DPQ_G.xpt",
        ["SEQN", "DPQ020", "DPQ030", "DPQ040", "DPQ060", "DPQ070"],
    ),
    "SurveyData_PhysicalActivity": (
        "PAQ_G.xpt",
        ["SEQN", "PAQ605", "PAQ620", "PAQ635", "PAQ650", "PAQ665", "PAD680"],
    ),
    # Only SLD010H is read here; it is converted to SLD012 further down.
    # (SLD012 / SLQ120 are presumably the sleep columns of later cycles.)
    "SurveyData_SleepDisorders_2013": (
        "SLQ_G.xpt",
        ["SEQN", "SLD010H"],
    ),
    "SurveyData_Smoking": (
        "SMQ_G.xpt",
        ["SEQN", "SMQ020", "SMD650", "SMD030", "SMQ040"],
    ),
    "SurveyData_WeightHistory": (
        "WHQ_G.xpt",
        ["SEQN", "WHD110", "WHD120", "WHD140"],
    ),
}

# Dataset key -> XPT file name (relative to base_path_2011).
file_paths = {
    dataset_key: xpt_name
    for dataset_key, (xpt_name, _kept) in _nhanes_2011_tables.items()
}

# Dataset key -> columns retained from that file.
columns_to_keep_2011 = {
    dataset_key: kept_columns
    for dataset_key, (_xpt, kept_columns) in _nhanes_2011_tables.items()
}




In [76]:
def load_data(base_path, file_paths, columns_to_keep):
    """
    Read every XPT file listed in ``file_paths`` and keep only the wanted columns.

    Parameters
    ----------
    base_path : str
        Directory that contains the XPT files.
    file_paths : dict
        Maps a dataset key to a file name relative to ``base_path``.
    columns_to_keep : dict
        Maps the same dataset keys to the list of columns to retain.

    Returns
    -------
    dict
        Dataset key -> DataFrame restricted to the requested columns.
        A file that is missing, or that fails while being read or filtered,
        is reported on stdout and simply left out of the result.
    """
    loaded = {}
    for key in file_paths:
        xpt_path = os.path.join(base_path, file_paths[key])
        try:
            if os.path.exists(xpt_path):
                # The insulin file is the only one read with an explicit encoding.
                reader_options = {"encoding": "latin1"} if key == "LabData_insulin" else {}
                frame, _metadata = pyreadstat.read_xport(xpt_path, **reader_options)

                # Raises (and is reported below) when a requested column is absent.
                loaded[key] = frame[columns_to_keep[key]]
                print(f"The file '{key}' has been successfully loaded.")
            else:
                print(f"Error: File '{xpt_path}' does not exist!")
        except Exception as error:
            # Best effort: one unreadable table must not abort the whole load.
            print(f"Error loading the file '{key}': {error}")
    return loaded

In [77]:
# Read all 2011-2012 tables described in the configuration cell.
dataframes_2011 = load_data(base_path_2011, file_paths, columns_to_keep_2011)

# Summarise what was actually loaded; files that could not be read are
# absent from the dictionary and therefore from this listing.
print("Loaded datasets:")
for key in dataframes_2011:
    n_rows, n_cols = dataframes_2011[key].shape
    print(f"{key}: {n_rows} rows, {n_cols} columns")

The file 'body_measures' has been successfully loaded.
The file 'demographics' has been successfully loaded.
The file 'blood_pressure_2015' has been successfully loaded.
The file 'LabData_fast_glucose' has been successfully loaded.
The file 'LabData_glycohemog' has been successfully loaded.
Error: File '../data/raw/2011-2012/INS_G.xpt' does not exist!
The file 'LabData_Cholest_total' has been successfully loaded.
The file 'LabData_Cholest_HDL' has been successfully loaded.
The file 'LabData_Cholest_LDL' has been successfully loaded.
Error: File '../data/raw/2011-2012/HSCRP_G.xpt' does not exist!
Error: File '../data/raw/2011-2012/FERTIN_G.xpt' does not exist!
The file 'SurveyData_Alcohol_2015' has been successfully loaded.
The file 'SurveyData_Pressure_Cholest' has been successfully loaded.
The file 'SurveyData_Diabetes' has been successfully loaded.
The file 'SurveyData_Diet_Behavior' has been successfully loaded.
The file 'SurveyData_Insurance' has been successfully loaded.
The file 

In [78]:
def _outer_join_on_seqn(left, right):
    """Outer-merge two tables on the SEQN column."""
    return left.merge(right, on="SEQN", how="outer")


# Fold all loaded tables, in dictionary order, into one wide frame.
# The outer join keeps every SEQN that appears in at least one table.
final_dataset_2011 = reduce(_outer_join_on_seqn, dataframes_2011.values())

print(final_dataset_2011.shape)
final_dataset_2011.head()


(9756, 79)


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,BPXSY1,BPXSY2,BPXSY3,BPXDI1,BPXDI2,BPXDI3,BPXPLS,WTSAF2YR,LBDGLUSI,LBXIN,LBXGH,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,ALQ110,ALQ120Q,ALQ130,ALQ141Q,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD010H,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140
0,62161.0,69.2,172.3,23.3,81.0,7.0,1.0,22.0,3.0,1.0,3.0,,3.15,5.0,102641.406474,104236.582554,110.0,104.0,118.0,82.0,68.0,74.0,82.0,240011.713,5.107,18.65,5.1,4.34,1.06,84.0,0.948,110.0,2.0,,,,,2.0,,,2.0,,2.0,2.0,,2.0,0.0,0.0,3.0,2.0,,,2.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,300.0,8.0,2.0,,,,,,
1,62162.0,12.7,94.7,14.2,45.4,7.0,2.0,3.0,1.0,1.0,,,0.6,,15457.736897,16116.35401,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,0.0,1.0,3.0,0.0,121.0,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,
2,62163.0,49.4,168.9,17.3,64.6,7.0,1.0,14.0,6.0,1.0,,,4.07,,7397.684828,7869.485117,112.0,108.0,106.0,38.0,36.0,38.0,72.0,,,,5.5,3.98,1.14,,,,,,,,,,,,,,2.0,2.0,,,0.0,2.0,3.0,0.0,,,,1.0,3.0,,,,,,,,,,,2.0,2.0,1.0,2.0,2.0,720.0,,,,,,,,
3,62164.0,67.2,170.1,23.2,80.1,7.0,2.0,44.0,3.0,1.0,4.0,2.0,1.67,1.0,127351.373299,127965.226204,116.0,118.0,120.0,56.0,66.0,58.0,82.0,288182.78,4.552,3.51,4.9,4.91,0.72,56.0,0.632,151.0,,,,,,2.0,,,2.0,,2.0,2.0,,0.0,0.0,0.0,0.0,3.0,,,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,,,,,1.0,2.0,2.0,1.0,1.0,300.0,8.0,2.0,,,,145.0,118.0,172.0
4,62165.0,69.1,159.4,27.2,86.7,7.0,2.0,14.0,4.0,1.0,,,0.57,,12209.74498,13384.042162,110.0,104.0,106.0,64.0,72.0,78.0,70.0,32747.025,4.885,15.35,5.9,4.16,1.63,71.0,0.802,84.0,,,,,,,,,,,2.0,2.0,,2.0,5.0,3.0,3.0,2.0,,,,2.0,1.0,,,,,,,,,,,2.0,2.0,1.0,1.0,1.0,600.0,,,,,,,,


In [79]:
# Replace the cycle code in SDDSRVYR with a readable period label.
period_label = '2011-2012'
final_dataset_2011['SDDSRVYR'] = period_label

# Suffix every SEQN with the period so ids stay unique once cycles are combined.
# SEQN is a float column at this point, so the resulting ids keep the ".0"
# (e.g. "62161.0_2011-2012").
# NOTE(review): confirm the other cycles build their ids the same way.
# NOTE(review): this cell is not idempotent -- running it twice appends the
# period a second time.
seqn_as_text = final_dataset_2011['SEQN'].astype(str)
final_dataset_2011['SEQN'] = seqn_as_text + '_' + final_dataset_2011['SDDSRVYR']

# Quick visual check of the two rewritten columns.
print(final_dataset_2011[['SEQN', 'SDDSRVYR']].head())

                SEQN   SDDSRVYR
0  62161.0_2011-2012  2011-2012
1  62162.0_2011-2012  2011-2012
2  62163.0_2011-2012  2011-2012
3  62164.0_2011-2012  2011-2012
4  62165.0_2011-2012  2011-2012


In [80]:
# Waist-to-height ratio (BMXWAIST / BMXHT), rounded to two decimals.
waist_to_height = final_dataset_2011['BMXWAIST'] / final_dataset_2011['BMXHT']
final_dataset_2011['WHtR'] = waist_to_height.round(2)

# Blood-pressure features: row-wise mean over the three readings of each kind
# (pandas skips missing readings), their difference, and the pulse rate.
systolic_readings = ['BPXSY1', 'BPXSY2', 'BPXSY3']
diastolic_readings = ['BPXDI1', 'BPXDI2', 'BPXDI3']

final_dataset_2011['AvgSystolicBP'] = final_dataset_2011[systolic_readings].mean(axis=1)
final_dataset_2011['AvgDiastolicBP'] = final_dataset_2011[diastolic_readings].mean(axis=1)
final_dataset_2011['PulsePressure'] = (
    final_dataset_2011['AvgSystolicBP'] - final_dataset_2011['AvgDiastolicBP']
)
final_dataset_2011['AvgPulseRate'] = final_dataset_2011['BPXPLS']

# Raw columns made redundant by the features above; removed in the next cell.
columns_to_drop = systolic_readings + diastolic_readings + ['BPXPLS']

In [81]:
# Remove the raw blood-pressure readings now that the derived features exist.
# Rebinding the result with errors="ignore" (rather than inplace=True) keeps
# the cell re-runnable: executing it a second time is a no-op instead of a
# KeyError for columns that are already gone.
final_dataset_2011 = final_dataset_2011.drop(columns=columns_to_drop, errors="ignore")

print(final_dataset_2011.shape)
print(final_dataset_2011.columns)

final_dataset_2011.head()

(9756, 77)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'SDDSRVYR', 'RIAGENDR',
       'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
       'DMDMARTL', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'LBDGLUSI', 'LBXIN',
       'LBXGH', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL', 'ALQ110',
       'ALQ120Q', 'ALQ130', 'ALQ141Q', 'ALQ151', 'BPQ020', 'BPQ040A',
       'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160', 'DIQ010', 'DID040', 'DBD900',
       'DBD905', 'DBD910', 'DBQ197', 'DBD895', 'DBD030', 'DBD041', 'DBQ700',
       'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B', 'MCQ160C', 'MCQ160M',
       'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605',
       'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680', 'SLD010H', 'SMQ020',
       'SMD650', 'SMD030', 'SMQ040', 'WHD110', 'WHD120', 'WHD140', 'WHtR',
       'AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate'],
      dtype='object')


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,WTSAF2YR,LBDGLUSI,LBXIN,LBXGH,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,ALQ110,ALQ120Q,ALQ130,ALQ141Q,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD010H,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140,WHtR,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,62161.0_2011-2012,69.2,172.3,23.3,81.0,2011-2012,1.0,22.0,3.0,1.0,3.0,,3.15,5.0,102641.406474,104236.582554,240011.713,5.107,18.65,5.1,4.34,1.06,84.0,0.948,110.0,2.0,,,,,2.0,,,2.0,,2.0,2.0,,2.0,0.0,0.0,3.0,2.0,,,2.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,300.0,8.0,2.0,,,,,,,0.47,110.666667,74.666667,36.0,82.0
1,62162.0_2011-2012,12.7,94.7,14.2,45.4,2011-2012,2.0,3.0,1.0,1.0,,,0.6,,15457.736897,16116.35401,,,,,,,,,,,,,,,,,,,,,2.0,,,0.0,1.0,3.0,0.0,121.0,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,0.48,,,,
2,62163.0_2011-2012,49.4,168.9,17.3,64.6,2011-2012,1.0,14.0,6.0,1.0,,,4.07,,7397.684828,7869.485117,,,,5.5,3.98,1.14,,,,,,,,,,,,,,2.0,2.0,,,0.0,2.0,3.0,0.0,,,,1.0,3.0,,,,,,,,,,,2.0,2.0,1.0,2.0,2.0,720.0,,,,,,,,,0.38,108.666667,37.333333,71.333333,72.0
3,62164.0_2011-2012,67.2,170.1,23.2,80.1,2011-2012,2.0,44.0,3.0,1.0,4.0,2.0,1.67,1.0,127351.373299,127965.226204,288182.78,4.552,3.51,4.9,4.91,0.72,56.0,0.632,151.0,,,,,,2.0,,,2.0,,2.0,2.0,,0.0,0.0,0.0,0.0,3.0,,,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,,,,,1.0,2.0,2.0,1.0,1.0,300.0,8.0,2.0,,,,145.0,118.0,172.0,0.47,118.0,60.0,58.0,82.0
4,62165.0_2011-2012,69.1,159.4,27.2,86.7,2011-2012,2.0,14.0,4.0,1.0,,,0.57,,12209.74498,13384.042162,32747.025,4.885,15.35,5.9,4.16,1.63,71.0,0.802,84.0,,,,,,,,,,,2.0,2.0,,2.0,5.0,3.0,3.0,2.0,,,,2.0,1.0,,,,,,,,,,,2.0,2.0,1.0,1.0,1.0,600.0,,,,,,,,,0.54,106.666667,71.333333,35.333333,70.0


In [82]:
# Convert an SLD010H value to the SLD012 coding
def convert_sld010h_to_sld012(value):
    """
    Map one SLD010H sleep-duration value onto the SLD012 coding.

    Parameters
    ----------
    value : number or missing
        Raw SLD010H value.

    Returns
    -------
    int, float or None
        * ``None`` for missing input, for the special codes 77 and 99, and for
          any value that fits none of the ranges below (e.g. 11.5);
        * ``2`` for values below 3;
        * the value itself (as float) for 3 through 11;
        * ``14`` for values of 12 and above.
    """
    if pd.isna(value):
        return None

    value = float(value)

    # 77 and 99 are the codes this notebook treats as invalid answers (rows
    # carrying them are removed right after the conversion) -- presumably
    # "refused" / "don't know".  They must not fall into the ">= 12" branch
    # and be reported as 14.
    if value in (77.0, 99.0):
        return None

    if value < 3.0:
        return 2
    if value <= 11.0:
        return value
    if value >= 12.0:
        return 14  # 12 and more -> 14
    return None  # anything unexpected (values strictly between 11 and 12)

# Derive the harmonised sleep-duration column from SLD010H.
final_dataset_2011["SLD012"] = final_dataset_2011["SLD010H"].apply(convert_sld010h_to_sld012)

# Remove the rows where SLD010H is 77 or 99, then the source column itself.
# The result is built as an explicit, independent frame (no inplace drop on a
# boolean-filtered slice), so the column assignments made in later cells
# cannot trigger pandas' chained-assignment (SettingWithCopy) warning.
invalid_sleep_answer = final_dataset_2011["SLD010H"].isin([77, 99])
final_dataset_2011 = (
    final_dataset_2011.loc[~invalid_sleep_answer]
    .drop(columns=["SLD010H"])
    .copy()
)

final_dataset_2011["SLD012"].value_counts()

8.0     1706
7.0     1605
6.0     1409
5.0      622
9.0      319
4.0      246
10.0     136
3.0       57
14.0      35
11.0      14
2.0        9
Name: SLD012, dtype: int64

In [83]:
# Convert glucose values (C501 -> C311)
def adjust_glucose_C501_to_C311(glucose_value):
    """
    Apply the linear C501 -> C311 adjustment to a glucose value.

    Returns ``1.023 * glucose_value - 0.5108``.

    NOTE(review): the notebook applies this to the LBDGLUSI column; confirm
    that the slope/intercept were published for the same unit as that column
    (an intercept fitted in another unit would have to be rescaled), and that
    this instrument pair is the right one for the 2011-2012 cycle.
    """
    slope = 1.023
    offset = 0.5108
    return slope * glucose_value - offset

# Apply the adjustment to every LBDGLUSI value, overwriting the column.
# NOTE(review): not idempotent -- re-running this cell adjusts the values again.
glucose_measured = final_dataset_2011["LBDGLUSI"]
final_dataset_2011["LBDGLUSI"] = glucose_measured.apply(adjust_glucose_C501_to_C311)

In [84]:
def convert_alq120q_to_alq121(value):
    """
    Translate a yearly count into the ALQ121-style frequency category.

    Mapping (``value`` is compared as a number of occasions per year):

    ========== ========= =======================
    value      category  meaning
    ========== ========= =======================
    0          0         never in the last year
    365        1         every day
    300 - 364  2         nearly every day
    156 - 299  3         3 to 4 times a week
    104 - 155  4         2 times a week
    52 - 103   5         once a week
    24 - 51    6         2 to 3 times a month
    12 - 23    7         once a month
    7 - 11     8         7 to 11 times a year
    3 - 6      9         3 to 6 times a year
    1 - 2      10        1 to 2 times a year
    ========== ========= =======================

    Missing input and every value outside these ranges (e.g. above 365 or
    strictly between 0 and 1) yield ``None``.

    NOTE(review): the thresholds assume the input is already a per-year
    count; no unit column is loaded in this notebook -- confirm that holds
    for the columns this is applied to.
    """
    if pd.isna(value):
        return None
    if value == 0:
        return 0
    if value == 365:
        return 1
    if not (1 <= value < 365):
        return None  # unknown / out-of-range value

    # (inclusive lower bound, category), from the most to the least frequent;
    # the first bound that the value reaches decides the category.
    frequency_bands = (
        (300, 2),
        (156, 3),
        (104, 4),
        (52, 5),
        (24, 6),
        (12, 7),
        (7, 8),
        (3, 9),
        (1, 10),
    )
    for lower_bound, category in frequency_bands:
        if value >= lower_bound:
            return category
    return None

# Recode both alcohol-frequency columns with the shared converter.
for source_column, target_column in (("ALQ120Q", "ALQ121"), ("ALQ141Q", "ALQ142")):
    final_dataset_2011[target_column] = final_dataset_2011[source_column].apply(
        convert_alq120q_to_alq121
    )

# Weight columns renamed to the names used by the target dataset.
rename_dict = {
    "WTINT2YR": "WTINTPRP",
    "WTSAF2YR": "WTSAFPRP",
    "WTMEC2YR": "WTMECPRP"
}

# Align the 2011-2012 column names with the 2017-2021 naming:
# ALQ110 becomes ALQ111, DMDMARTL becomes DMDMARTZ, plus the weight columns.
# NOTE(review): these are renames only -- values are left untouched.  Confirm
# that the category codes of DMDMARTL and DMDMARTZ coincide; if they do not,
# a recode is needed in addition to the rename.
final_dataset_2011.rename(
    columns={"ALQ110": "ALQ111", "DMDMARTL": "DMDMARTZ", **rename_dict},
    inplace=True,
)

# The raw alcohol columns are superseded by ALQ121 / ALQ142.
final_dataset_2011.drop(columns=["ALQ120Q", "ALQ141Q"], inplace=True)




In [85]:

# Final sanity check of the harmonised table before it is written out.
print(final_dataset_2011.shape)
print(final_dataset_2011.columns)

# Save the merged and transformed dataset as CSV (path is relative to the
# current working directory; the index is not written).
final_dataset_2011.to_csv("dataset_2011.csv", index=False, encoding="utf-8")

(9748, 77)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'SDDSRVYR', 'RIAGENDR',
       'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
       'DMDMARTZ', 'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'LBDGLUSI', 'LBXIN',
       'LBXGH', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL', 'ALQ111',
       'ALQ130', 'ALQ151', 'BPQ020', 'BPQ040A', 'BPQ050A', 'BPQ080', 'BPQ090D',
       'DIQ160', 'DIQ010', 'DID040', 'DBD900', 'DBD905', 'DBD910', 'DBQ197',
       'DBD895', 'DBD030', 'DBD041', 'DBQ700', 'HIQ011', 'INDFMMPC', 'MCQ080',
       'MCQ160B', 'MCQ160C', 'MCQ160M', 'MCQ300C', 'DPQ020', 'DPQ030',
       'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605', 'PAQ620', 'PAQ635', 'PAQ650',
       'PAQ665', 'PAD680', 'SMQ020', 'SMD650', 'SMD030', 'SMQ040', 'WHD110',
       'WHD120', 'WHD140', 'WHtR', 'AvgSystolicBP', 'AvgDiastolicBP',
       'PulsePressure', 'AvgPulseRate', 'SLD012', 'ALQ121', 'ALQ142'],
      dtype='object')
(9748, 77)
