In [1]:
import os
import pyreadstat
import pandas as pd
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Paths for different periods
base_path_2013 = "../../data/raw/2013-2014"

# Relative paths (common for all periods)
file_paths = {
    "body_measures": "BMX_H.xpt",
    "demographics": "DEMO_H.xpt",
    "blood_pressure_2015": "BPX_H.xpt",
    "LabData_fast_glucose": "GLU_H.xpt",
    "LabData_glycohemog": "GHB_H.xpt",
    "LabData_insulin": "INS_H.xpt",
    "LabData_Cholest_total": "TCHOL_H.xpt",
    "LabData_Cholest_HDL": "HDL_H.xpt",
    "LabData_Cholest_LDL": "TRIGLY_H.xpt",
    "LabData_C_protein": "HSCRP_H.xpt",
    "LabData_Feretin": "FERTIN_H.xpt",
    "SurveyData_Alcohol_2015": "ALQ_H.xpt",
    "SurveyData_Pressure_Cholest": "BPQ_H.xpt",
    "SurveyData_Diabetes": "DIQ_H.xpt",
    "SurveyData_Diet_Behavior": "DBQ_H.xpt",
    "SurveyData_Insurance": "HIQ_H.xpt",
    "SurveyData_Income": "INQ_H.xpt",
    "SurveyData_Medical": "MCQ_H.xpt",
    "SurveyData_MentalHealth": "DPQ_H.xpt",
    "SurveyData_PhysicalActivity": "PAQ_H.xpt",
    "SurveyData_SleepDisorders_2013": "SLQ_H.xpt",
    "SurveyData_Smoking": "SMQ_H.xpt",
    "SurveyData_WeightHistory": "WHQ_H.xpt"
}


columns_to_keep_2013 = {
    "body_measures": ["SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST"],
    "demographics": ["SEQN", "SDDSRVYR", "RIAGENDR", "RIDAGEYR","RIDRETH3","DMDBORN4","DMDEDUC2","RIDEXPRG","INDFMPIR","DMDMARTL","WTINT2YR","WTMEC2YR"],
    "blood_pressure_2015": ["SEQN", "BPXSY1", "BPXSY2", "BPXSY3", "BPXDI1", "BPXDI2", "BPXDI3", "BPXPLS"],
    "LabData_fast_glucose": ["SEQN","WTSAF2YR" ,"LBDGLUSI"],
    "LabData_glycohemog": ["SEQN","LBXGH"],
    "LabData_insulin": ["SEQN","LBXIN"],      
    "LabData_Cholest_total": ["SEQN","LBDTCSI"],
    "LabData_Cholest_HDL": ["SEQN","LBDHDDSI"],
    "LabData_Cholest_LDL": ["SEQN","LBXTR", "LBDTRSI","LBDLDL"],         
    "LabData_C_protein": ["SEQN","LBXHSCRP"],
    "LabData_Feretin": ["SEQN","LBDFERSI"], 
    
    "SurveyData_Alcohol_2015" : ["SEQN","ALQ110","ALQ120Q","ALQ130","ALQ141Q","ALQ151"],

    "SurveyData_Pressure_Cholest" : ["SEQN","BPQ020","BPQ040A","BPQ050A","BPQ080", "BPQ090D"],
    "SurveyData_Diabetes" : ["SEQN","DIQ160", "DIQ010", "DID040"],
    "SurveyData_Diet_Behavior" : ["SEQN","DBD900","DBD905","DBD910","DBQ197","DBD895","DBD030", "DBD041", "DBQ700"],
    "SurveyData_Insurance" : ["SEQN","HIQ011"],
    "SurveyData_Income" : ["SEQN","INDFMMPC"],
    "SurveyData_Medical" : ["SEQN","MCQ080","MCQ160B","MCQ160C","MCQ160M","MCQ300C"],
    "SurveyData_MentalHealth" : ["SEQN","DPQ020","DPQ030","DPQ040","DPQ060","DPQ070"],
    "SurveyData_PhysicalActivity" : ["SEQN","PAQ605","PAQ620","PAQ635","PAQ650","PAQ665","PAD680"],

    "SurveyData_SleepDisorders_2013" : ["SEQN","SLD010H"],
    "SurveyData_Smoking" : ["SEQN","SMQ020","SMD650","SMD030","SMQ040"],
    "SurveyData_WeightHistory" : ["SEQN","WHD110","WHD120","WHD140"]  
}




In [3]:
def load_data(base_path, file_paths, columns_to_keep):
    """
    Loads data from the specified file paths and filters the columns.

    """

    dataframes = {}
    for key, relative_path in file_paths.items():
        # Construct the full file path
        file_path = os.path.join(base_path, relative_path)
        try:
            # Check if the file exists
            if not os.path.exists(file_path):
                print(f"Error: File '{file_path}' does not exist!")
                continue

            # Handle specific encoding if needed (e.g., for 'LabData_insulin')
            if key == "LabData_insulin":
                data, _ = pyreadstat.read_xport(file_path, encoding="latin1")
            else:
                # Default loading
                data, _ = pyreadstat.read_xport(file_path)

            # Filter columns based on the specified list
            dataframes[key] = data[columns_to_keep[key]]
            print(f"The file '{key}' has been successfully loaded.")
        except Exception as e:
            print(f"Error loading the file '{key}': {e}")
    return dataframes

In [4]:
# Load data 
dataframes_2013 = load_data(base_path_2013, file_paths, columns_to_keep_2013)

# Check the loaded datasets
print("Loaded datasets:")
for key, df in dataframes_2013.items():
    print(f"{key}: {len(df)} rows, {len(df.columns)} columns")

The file 'body_measures' has been successfully loaded.
The file 'demographics' has been successfully loaded.
The file 'blood_pressure_2015' has been successfully loaded.
The file 'LabData_fast_glucose' has been successfully loaded.
The file 'LabData_glycohemog' has been successfully loaded.
The file 'LabData_insulin' has been successfully loaded.
The file 'LabData_Cholest_total' has been successfully loaded.
The file 'LabData_Cholest_HDL' has been successfully loaded.
The file 'LabData_Cholest_LDL' has been successfully loaded.
Error: File '../../data/raw/2013-2014/HSCRP_H.xpt' does not exist!
Error: File '../../data/raw/2013-2014/FERTIN_H.xpt' does not exist!
The file 'SurveyData_Alcohol_2015' has been successfully loaded.
The file 'SurveyData_Pressure_Cholest' has been successfully loaded.
The file 'SurveyData_Diabetes' has been successfully loaded.
The file 'SurveyData_Diet_Behavior' has been successfully loaded.
The file 'SurveyData_Insurance' has been successfully loaded.
The file

In [5]:
final_dataset_2013 = reduce(
    lambda left, right: left.merge(right, on="SEQN", how="outer"), 
    dataframes_2013.values())

print(final_dataset_2013.shape)

final_dataset_2013.head()

(10175, 79)


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,BPXSY1,BPXSY2,BPXSY3,BPXDI1,BPXDI2,BPXDI3,BPXPLS,WTSAF2YR,LBDGLUSI,LBXGH,LBXIN,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,ALQ110,ALQ120Q,ALQ130,ALQ141Q,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD010H,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140
0,73557.0,78.3,171.3,26.7,100.0,8.0,1.0,69.0,4.0,1.0,3.0,,0.84,4.0,13281.237386,13481.042095,122.0,114.0,102.0,72.0,76.0,74.0,86.0,,,13.9,,4.32,1.68,,,,,1.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,,1.0,62.0,8.0,0.0,4.0,1.0,8.0,,,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,600.0,7.0,1.0,,17.0,3.0,270.0,200.0,270.0
1,73558.0,89.5,176.8,28.6,107.6,8.0,1.0,54.0,3.0,1.0,3.0,,1.78,1.0,23682.057386,24471.769625,156.0,160.0,156.0,62.0,80.0,42.0,74.0,,,9.1,,4.4,1.29,,,,,7.0,4.0,2.0,1.0,1.0,2.0,,1.0,1.0,,1.0,23.0,,0.0,2.0,3.0,0.0,,,3.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,540.0,9.0,1.0,1.0,0.0,2.0,240.0,250.0,250.0
2,73559.0,88.9,175.3,28.9,109.2,8.0,1.0,72.0,3.0,1.0,4.0,,4.51,1.0,57214.803319,57193.285376,140.0,140.0,146.0,90.0,76.0,80.0,68.0,142196.890197,10.713,8.9,5.83,3.26,1.55,51.0,0.576,56.0,,0.0,,,2.0,1.0,1.0,1.0,1.0,1.0,,1.0,57.0,0.0,0.0,0.0,3.0,1.0,,,3.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,300.0,8.0,1.0,,20.0,3.0,180.0,190.0,228.0
3,73560.0,32.2,137.3,17.1,61.0,8.0,1.0,9.0,3.0,1.0,,,2.52,,55201.178592,55766.512438,108.0,102.0,104.0,38.0,34.0,38.0,64.0,,,,,4.34,1.58,,,,,,,,,,,,,,,2.0,,,0.0,6.0,3.0,0.0,,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,
4,73561.0,52.0,162.4,19.7,,8.0,2.0,73.0,3.0,1.0,5.0,,5.0,1.0,63709.667069,65541.871229,136.0,134.0,142.0,86.0,88.0,86.0,92.0,142266.006548,5.94,4.9,6.12,5.2,2.2,75.0,0.847,101.0,,0.0,,,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,,,4.0,1.0,3.0,0.0,,,2.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,3.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,480.0,9.0,2.0,,,,150.0,135.0,170.0


In [6]:
# Update the SDDSRVYR column to represent the period
final_dataset_2013['SDDSRVYR'] = '2013-2013'

# Add the period to SEQN by concatenating the values
final_dataset_2013['SEQN'] = final_dataset_2013['SEQN'].astype(str) + '_' + final_dataset_2013['SDDSRVYR']

# Validate the changes
print(final_dataset_2013[['SEQN', 'SDDSRVYR']].head())

                SEQN   SDDSRVYR
0  73557.0_2013-2013  2013-2013
1  73558.0_2013-2013  2013-2013
2  73559.0_2013-2013  2013-2013
3  73560.0_2013-2013  2013-2013
4  73561.0_2013-2013  2013-2013


In [7]:
# Create the WHtR feature
final_dataset_2013['WHtR'] = round(final_dataset_2013['BMXWAIST'] / final_dataset_2013['BMXHT'],2)
# Checking
#final_dataset_2017[['BMXWAIST', 'BMXHT', 'WHtR']].head(20)

# Calculate derived features
final_dataset_2013['AvgSystolicBP'] = final_dataset_2013[['BPXSY1', 'BPXSY2', 'BPXSY3']].replace(0, np.nan).mean(axis=1)
final_dataset_2013['AvgDiastolicBP'] = final_dataset_2013[['BPXDI1', 'BPXDI2', 'BPXDI3']].replace(0, np.nan).mean(axis=1)
final_dataset_2013['PulsePressure'] = final_dataset_2013['AvgSystolicBP'] - final_dataset_2013['AvgDiastolicBP']
final_dataset_2013['AvgPulseRate'] = final_dataset_2013['BPXPLS']

# Drop original columns
columns_to_drop = ['BPXSY1', 'BPXSY2', 'BPXSY3',
                   'BPXDI1', 'BPXDI2', 'BPXDI3',
                   'BPXPLS']

In [8]:
final_dataset_2013.drop(columns=columns_to_drop, inplace=True)
final_dataset_2013[['AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate']].head()


print(final_dataset_2013.shape)
print(final_dataset_2013.columns)

final_dataset_2013.head()

(10175, 77)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'SDDSRVYR', 'RIAGENDR',
       'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
       'DMDMARTL', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'LBDGLUSI', 'LBXGH',
       'LBXIN', 'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL', 'ALQ110',
       'ALQ120Q', 'ALQ130', 'ALQ141Q', 'ALQ151', 'BPQ020', 'BPQ040A',
       'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160', 'DIQ010', 'DID040', 'DBD900',
       'DBD905', 'DBD910', 'DBQ197', 'DBD895', 'DBD030', 'DBD041', 'DBQ700',
       'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B', 'MCQ160C', 'MCQ160M',
       'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060', 'DPQ070', 'PAQ605',
       'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680', 'SLD010H', 'SMQ020',
       'SMD650', 'SMD030', 'SMQ040', 'WHD110', 'WHD120', 'WHD140', 'WHtR',
       'AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure', 'AvgPulseRate'],
      dtype='object')


Unnamed: 0,SEQN,BMXWT,BMXHT,BMXBMI,BMXWAIST,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH3,DMDBORN4,DMDEDUC2,RIDEXPRG,INDFMPIR,DMDMARTL,WTINT2YR,WTMEC2YR,WTSAF2YR,LBDGLUSI,LBXGH,LBXIN,LBDTCSI,LBDHDDSI,LBXTR,LBDTRSI,LBDLDL,ALQ110,ALQ120Q,ALQ130,ALQ141Q,ALQ151,BPQ020,BPQ040A,BPQ050A,BPQ080,BPQ090D,DIQ160,DIQ010,DID040,DBD900,DBD905,DBD910,DBQ197,DBD895,DBD030,DBD041,DBQ700,HIQ011,INDFMMPC,MCQ080,MCQ160B,MCQ160C,MCQ160M,MCQ300C,DPQ020,DPQ030,DPQ040,DPQ060,DPQ070,PAQ605,PAQ620,PAQ635,PAQ650,PAQ665,PAD680,SLD010H,SMQ020,SMD650,SMD030,SMQ040,WHD110,WHD120,WHD140,WHtR,AvgSystolicBP,AvgDiastolicBP,PulsePressure,AvgPulseRate
0,73557.0_2013-2013,78.3,171.3,26.7,100.0,2013-2013,1.0,69.0,4.0,1.0,3.0,,0.84,4.0,13281.237386,13481.042095,,,13.9,,4.32,1.68,,,,,1.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,,1.0,62.0,8.0,0.0,4.0,1.0,8.0,,,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,600.0,7.0,1.0,,17.0,3.0,270.0,200.0,270.0,0.58,112.666667,74.0,38.666667,86.0
1,73558.0_2013-2013,89.5,176.8,28.6,107.6,2013-2013,1.0,54.0,3.0,1.0,3.0,,1.78,1.0,23682.057386,24471.769625,,,9.1,,4.4,1.29,,,,,7.0,4.0,2.0,1.0,1.0,2.0,,1.0,1.0,,1.0,23.0,,0.0,2.0,3.0,0.0,,,3.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,540.0,9.0,1.0,1.0,0.0,2.0,240.0,250.0,250.0,0.61,157.333333,61.333333,96.0,74.0
2,73559.0_2013-2013,88.9,175.3,28.9,109.2,2013-2013,1.0,72.0,3.0,1.0,4.0,,4.51,1.0,57214.803319,57193.285376,142196.890197,10.713,8.9,5.83,3.26,1.55,51.0,0.576,56.0,,0.0,,,2.0,1.0,1.0,1.0,1.0,1.0,,1.0,57.0,0.0,0.0,0.0,3.0,1.0,,,3.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,300.0,8.0,1.0,,20.0,3.0,180.0,190.0,228.0,0.62,142.0,82.0,60.0,68.0
3,73560.0_2013-2013,32.2,137.3,17.1,61.0,2013-2013,1.0,9.0,3.0,1.0,,,2.52,,55201.178592,55766.512438,,,,,4.34,1.58,,,,,,,,,,,,,,,2.0,,,0.0,6.0,3.0,0.0,,,,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,0.44,104.666667,36.666667,68.0,64.0
4,73561.0_2013-2013,52.0,162.4,19.7,,2013-2013,2.0,73.0,3.0,1.0,5.0,,5.0,1.0,63709.667069,65541.871229,142266.006548,5.94,4.9,6.12,5.2,2.2,75.0,0.847,101.0,,0.0,,,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,,,4.0,1.0,3.0,0.0,,,2.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,3.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,480.0,9.0,2.0,,,,150.0,135.0,170.0,,137.333333,86.666667,50.666667,92.0


In [9]:
# Функция для преобразования SLD010H → SLD012
def convert_sld010h_to_sld012(value):
    if pd.isna(value):  
        return np.nan  # Используем np.nan вместо None
    value = float(value)  
    
    if value < 3.0: 
        return 2  
    elif value <= 11.0:  # Упрощенное условие (3.0 <= value <= 11.0)
        return int(value)  
    elif value >= 12.0:  
        return 14  # 12 часов и больше → 14
    return np.nan  #

# Удаляем строки с кодами 77 и 99
final_dataset_2013 = final_dataset_2013[~final_dataset_2013["SLD010H"].isin([77, 99])]

# Применяем функцию преобразования
final_dataset_2013["SLD012"] = final_dataset_2013["SLD010H"].apply(convert_sld010h_to_sld012)

# Удаляем старый столбец
final_dataset_2013.drop(columns=["SLD010H"], inplace=True)

In [10]:
final_dataset_2013["SLD012"].value_counts()

8.0     1790
7.0     1707
6.0     1507
5.0      557
9.0      366
4.0      240
10.0     168
14.0      46
3.0       41
2.0       22
11.0      10
Name: SLD012, dtype: int64

In [11]:
# convertir les valeurs de glucose (C501 → C311)
def adjust_glucose_C501_to_C311(glucose_value):
    return 1.023 * glucose_value - 0.5108

# Appliquer l'ajustement 
final_dataset_2013["LBDGLUSI"] = final_dataset_2013["LBDGLUSI"].apply(adjust_glucose_C501_to_C311)


### **Transformation** 
### **ALQ120Q → ALQ121 (Fréquence consommation alcool)**
### **ALQ141Q → ALQ142 (Fréquence 4/5 verres consommés)**

In [12]:
def convert_alq120q_to_alq121(value):
    if pd.isna(value):  # Valeurs manquantes restent NaN
        return None
    if value == 0:
        return 0  # Never in the last year
    elif value == 365:
        return 1  # Every day
    elif 300 <= value < 365:
        return 2  # Nearly every day
    elif 156 <= value < 300:
        return 3  # 3 to 4 times a week
    elif 104 <= value < 156:
        return 4  # 2 times a week
    elif 52 <= value < 104:
        return 5  # Once a week
    elif 24 <= value < 52:
        return 6  # 2 to 3 times a month
    elif 12 <= value < 24:
        return 7  # Once a month
    elif 7 <= value < 12:
        return 8  # 7 to 11 times a year
    elif 3 <= value < 7:
        return 9  # 3 to 6 times a year
    elif 1 <= value < 3:
        return 10  # 1 to 2 times a year
    else:
        return None  # Valeur inconnue

final_dataset_2013["ALQ121"] = final_dataset_2013["ALQ120Q"].apply(convert_alq120q_to_alq121)
final_dataset_2013["ALQ142"] = final_dataset_2013["ALQ141Q"].apply(convert_alq120q_to_alq121)

#ALQ110 (2015-2016) devient ALQ111 pour être compatible avec 2017-2021.
final_dataset_2013.rename(columns={"ALQ110": "ALQ111"}, inplace=True)

# "DMDMARTL" (before 2017): -> "DMDMARTZ" (after 2017)
marital_status_mapping = {
    1: 1,  # Married -> Married/Living with Partner
    2: 2,  # Widowed -> Widowed/Divorced/Separated
    3: 2,  # Divorced -> Widowed/Divorced/Separated
    4: 2,  # Separated -> Widowed/Divorced/Separated
    5: 3,  # Never married -> Never married
    6: 1,  # Living with partner -> Married/Living with Partner
    77: 77,  # Refused -> Refused
    99: 99   # Don't know -> Don't know
}
if "DMDMARTL" in final_dataset_2013.columns:
    final_dataset_2013["DMDMARTZ"] = final_dataset_2013["DMDMARTL"].map(marital_status_mapping)
    final_dataset_2013.drop(columns=["DMDMARTL"], inplace=True)  # Удаляем старый столбец

rename_dict = {
    "WTINT2YR": "WTINTPRP",
    "WTSAF2YR": "WTSAFPRP",
    "WTMEC2YR": "WTMECPRP"
}

# Переименуем колонки в 2015-2016
final_dataset_2013.rename(columns=rename_dict, inplace=True)


final_dataset_2013.drop(columns=["ALQ120Q", "ALQ141Q"], inplace=True)


In [13]:

print(final_dataset_2013.shape)
print(final_dataset_2013.columns)
print(final_dataset_2013.shape)

final_dataset_2013.head()

# Sauvegarde du dataset transformé et fusionné
final_dataset_2013.to_csv("dataset_2013.csv", index=False, encoding="utf-8")

(10168, 77)
Index(['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'SDDSRVYR', 'RIAGENDR',
       'RIDAGEYR', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'RIDEXPRG', 'INDFMPIR',
       'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'LBDGLUSI', 'LBXGH', 'LBXIN',
       'LBDTCSI', 'LBDHDDSI', 'LBXTR', 'LBDTRSI', 'LBDLDL', 'ALQ111', 'ALQ130',
       'ALQ151', 'BPQ020', 'BPQ040A', 'BPQ050A', 'BPQ080', 'BPQ090D', 'DIQ160',
       'DIQ010', 'DID040', 'DBD900', 'DBD905', 'DBD910', 'DBQ197', 'DBD895',
       'DBD030', 'DBD041', 'DBQ700', 'HIQ011', 'INDFMMPC', 'MCQ080', 'MCQ160B',
       'MCQ160C', 'MCQ160M', 'MCQ300C', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ060',
       'DPQ070', 'PAQ605', 'PAQ620', 'PAQ635', 'PAQ650', 'PAQ665', 'PAD680',
       'SMQ020', 'SMD650', 'SMD030', 'SMQ040', 'WHD110', 'WHD120', 'WHD140',
       'WHtR', 'AvgSystolicBP', 'AvgDiastolicBP', 'PulsePressure',
       'AvgPulseRate', 'SLD012', 'ALQ121', 'ALQ142', 'DMDMARTZ'],
      dtype='object')
(10168, 77)
