SOCIO ECONOMIC HEALTH DISPARITY

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load NHANES datasets
files = {
    "DEMO_L": "DEMO_L.XPT",     # Demographics
    "DIQ_L": "DIQ_L.XPT",       # Diabetes
    "HEPA_L": "HEPA_L.XPT",     # Hepatitis A
    "HEQ_L": "HEQ_L.XPT",       # Hepatitis B
    "HOQ_L": "HOQ_L.XPT",       # Housing
    "HSQ_L": "HSQ_L.XPT",       # HIV
    # "INQ_L": "INQ_L.XPT",       # Total Savings
    "KIQ_U_L": "KIQ_U_L.XPT",   # Weak Failing Kidneys
    "MCQ_L": "MCQ_L.XPT",       # Disease, ..., etc.
    "OCQ_L": "OCQ_L.XPT",       # Occupational
    "RXQ_RX_L": "RXQ_RX_L.XPT", # Taken Prescription Medicine
}

dataframes = {name: pd.read_sas(f"{filename}", format="xport") for name, filename in files.items()}

# Merge datasets on 'SEQN'
df = dataframes["DEMO_L"]
for name, df_other in dataframes.items():
    if name != "DEMO_L":
        df = df.merge(df_other, on="SEQN", how="inner")

# Select relevant columns
selected_columns = [
    "SEQN", "RIDAGEYR", "RIAGENDR", "DMDEDUC2", "INDFMPIR",  # Demographics (DEMO_L)
    # "IND310", # Total savings (INQ_L)
    "OCD150",  # Occupational (OCQ_L)
    "HOD051", # Housing (HOQ_L)
    "DIQ010", # Diabetes (DIQ_L)
    "MCQ010", "MCQ160B", "MCQ160C", "MCQ160E", "MCQ160M", "MCQ160P", "MCQ160L", "MCQ550", "MCQ220", # Disease (MCQ_L) 
    "HSQ590", # HIV (HSQ_L)
    "LBXHA", # Hepatitis A (HEPA_L)
    "HEQ010", # Hepatitis B (HEQ_L)
    "KIQ022", # Kidney (KIQ_U_L)
    "RXQ033" # Prescription (RXQ_RX_L)
]
df = df[[col for col in selected_columns if col in df.columns]]

# Convert non-numeric columns to NaN before computing median
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle missing values: Fill NaNs with column medians
df.fillna(df.median(), inplace=True)

# # Normalize numerical features for clustering
# numeric_features = ["BMXBMI", "BMXWAIST", "BPXSY1", "BPXDI1", "LBXGLU", "LBXINS", "LBXHSCRP", "LBXTC"]
# numeric_features = [col for col in numeric_features if col in df.columns]
# scaler = StandardScaler()
# df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Save final dataset
df.to_csv("NHANES_SES_RawData.csv", index=False)
print("Final dataset saved as 'NHANES_SES_RawData.csv'")


Final dataset saved as 'NHANES_SES_RawData.csv'


In [2]:
file = "NHANES_SES_RawData.csv"
df = pd.read_csv(file)
df.head()

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,DMDEDUC2,INDFMPIR,OCD150,HOD051,DIQ010,MCQ010,MCQ160B,...,MCQ160M,MCQ160P,MCQ160L,MCQ550,MCQ220,HSQ590,LBXHA,HEQ010,KIQ022,RXQ033
0,130378.0,43.0,1.0,5.0,5.0,1.0,10.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0
1,130379.0,66.0,1.0,5.0,5.0,1.0,9.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0
2,130380.0,44.0,2.0,3.0,1.41,1.0,6.0,1.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0
3,130386.0,34.0,1.0,4.0,1.33,1.0,3.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
4,130387.0,68.0,2.0,5.0,1.32,4.0,7.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0


In [3]:
df.columns

Index(['SEQN', 'RIDAGEYR', 'RIAGENDR', 'DMDEDUC2', 'INDFMPIR', 'OCD150',
       'HOD051', 'DIQ010', 'MCQ010', 'MCQ160B', 'MCQ160C', 'MCQ160E',
       'MCQ160M', 'MCQ160P', 'MCQ160L', 'MCQ550', 'MCQ220', 'HSQ590', 'LBXHA',
       'HEQ010', 'KIQ022', 'RXQ033'],
      dtype='object')

In [4]:
# rename columns
df.columns = ['SEQN',
    'Age', 'Gender', 'Education', 'FamilyIncomeRatio', # Demographics
    'Occupation', # Occupational
    'Housing', # Housing
    'Diabetes', # Diabetes
    'Asthma', 'CongestiveHeartFailure', 'CoronaryHeartDisease', 'HeartAttack', 
    'ThyroidProblem', 'Emphysema', 'LiverCondition', 'Gallstone', 'CancerOrMalignancy', # Disease
    'HIV', # HIV
    'HepatitisA', # Hepatitis A
    'HepatitisB', # Hepatitis B
    'WeakFailingKidney', # Kidney
    'TakenPrescriptionMedicine' # Prescription
]

df.head()

Unnamed: 0,SEQN,Age,Gender,Education,FamilyIncomeRatio,Occupation,Housing,Diabetes,Asthma,CongestiveHeartFailure,...,ThyroidProblem,Emphysema,LiverCondition,Gallstone,CancerOrMalignancy,HIV,HepatitisA,HepatitisB,WeakFailingKidney,TakenPrescriptionMedicine
0,130378.0,43.0,1.0,5.0,5.0,1.0,10.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0
1,130379.0,66.0,1.0,5.0,5.0,1.0,9.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0
2,130380.0,44.0,2.0,3.0,1.41,1.0,6.0,1.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0
3,130386.0,34.0,1.0,4.0,1.33,1.0,3.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
4,130387.0,68.0,2.0,5.0,1.32,4.0,7.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0


In [5]:
df.drop(columns=['SEQN'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6064 entries, 0 to 6063
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        6064 non-null   float64
 1   Gender                     6064 non-null   float64
 2   Education                  6064 non-null   float64
 3   FamilyIncomeRatio          6064 non-null   float64
 4   Occupation                 6064 non-null   float64
 5   Housing                    6064 non-null   float64
 6   Diabetes                   6064 non-null   float64
 7   Asthma                     6064 non-null   float64
 8   CongestiveHeartFailure     6064 non-null   float64
 9   CoronaryHeartDisease       6064 non-null   float64
 10  HeartAttack                6064 non-null   float64
 11  ThyroidProblem             6064 non-null   float64
 12  Emphysema                  6064 non-null   float64
 13  LiverCondition             6064 non-null   float

In [6]:
df.describe()

Unnamed: 0,Age,Gender,Education,FamilyIncomeRatio,Occupation,Housing,Diabetes,Asthma,CongestiveHeartFailure,CoronaryHeartDisease,...,ThyroidProblem,Emphysema,LiverCondition,Gallstone,CancerOrMalignancy,HIV,HepatitisA,HepatitisB,WeakFailingKidney,TakenPrescriptionMedicine
count,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,...,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0,6064.0
mean,53.850099,1.551451,3.83938,2.906971,2.370218,5.695086,1.893305,1.824703,1.967843,1.974604,...,1.875495,1.9375,1.952342,1.905508,1.85653,1.743734,1.618734,2.017645,1.974604,1.307058
std,17.18333,0.497387,1.144338,1.543564,1.448999,4.572527,0.410156,0.457434,0.338017,0.496414,...,0.464295,0.388545,0.365335,0.453487,0.417176,0.510125,0.497149,0.486179,0.355464,0.523292
min,20.0,1.0,1.0,5.397605e-79,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,39.0,1.0,3.0,1.62,1.0,4.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0
50%,57.0,2.0,4.0,2.82,1.0,5.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
75%,68.0,2.0,5.0,4.56,4.0,7.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,80.0,2.0,9.0,5.0,9.0,99.0,3.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,3.0,9.0,9.0,9.0


In [7]:
# display columns with missing values

df.isnull().sum()

Age                          0
Gender                       0
Education                    0
FamilyIncomeRatio            0
Occupation                   0
Housing                      0
Diabetes                     0
Asthma                       0
CongestiveHeartFailure       0
CoronaryHeartDisease         0
HeartAttack                  0
ThyroidProblem               0
Emphysema                    0
LiverCondition               0
Gallstone                    0
CancerOrMalignancy           0
HIV                          0
HepatitisA                   0
HepatitisB                   0
WeakFailingKidney            0
TakenPrescriptionMedicine    0
dtype: int64