In [None]:
# Switch the kernel's working directory to the project checkout so that the
# relative "data/..." paths used by the following cells resolve.
# NOTE(review): this path is hardcoded under the current user's home directory;
# the notebook will not run elsewhere unless the checkout lives at the same place.
%cd ~/Documents/cvd-predictor
import polars as pl
# NOTE(review): numpy is not referenced by any cell visible in this view — confirm
# whether it is still needed.
import numpy as np

In [None]:
from pathlib import Path

# Load the raw survey extract.
# The CSV is converted to parquet once and the parquet copy is reused on later
# runs. Guarding on the cache file (instead of keeping the conversion as
# commented-out code) lets a fresh "Restart & Run All" work whether or not the
# intermediate file has been built yet.
RAW_CSV_PATH = Path("data/raw/LLCP2023-new.csv")
PARQUET_CACHE_PATH = Path("data/intermediate/LLCP2023.parquet")

if PARQUET_CACHE_PATH.exists():
    df: pl.DataFrame = pl.read_parquet(PARQUET_CACHE_PATH)
else:
    df = pl.read_csv(RAW_CSV_PATH)
    # Make sure the target folder exists before writing the cache.
    PARQUET_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(PARQUET_CACHE_PATH)

In [None]:
# Keep only the survey variables listed in selected_variables.csv and give them
# their readable names: the "Variable" column holds the raw column names and
# the "Renamed" column holds the replacement for each, row by row.
variables: pl.DataFrame = pl.read_csv("data/raw/selected_variables.csv")
raw_names = variables["Variable"].to_list()
readable_names = variables["Renamed"].to_list()
df: pl.DataFrame = df.select(raw_names).rename(dict(zip(raw_names, readable_names)))
df

In [None]:
# Lookup tables that translate the numeric answer codes of the raw survey file
# into readable labels. Codes that are absent from a table have no label here.

# State / territory code -> name.
# NOTE(review): the keys look like state FIPS codes (note the gaps at 3, 7, 14,
# 43, 52) — confirm against the survey codebook.
STATE: dict[int, str] = {
    1: "Alabama",
    2: "Alaska",
    4: "Arizona",
    5: "Arkansas",
    6: "California",
    8: "Colorado",
    9: "Connecticut",
    10: "Delaware",
    11: "District of Columbia",
    12: "Florida",
    13: "Georgia",
    15: "Hawaii",
    16: "Idaho",
    17: "Illinois",
    18: "Indiana",
    19: "Iowa",
    20: "Kansas",
    21: "Kentucky",
    22: "Louisiana",
    23: "Maine",
    24: "Maryland",
    25: "Massachusetts",
    26: "Michigan",
    27: "Minnesota",
    28: "Mississippi",
    29: "Missouri",
    30: "Montana",
    31: "Nebraska",
    32: "Nevada",
    33: "New Hampshire",
    34: "New Jersey",
    35: "New Mexico",
    36: "New York",
    37: "North Carolina",
    38: "North Dakota",
    39: "Ohio",
    40: "Oklahoma",
    41: "Oregon",
    42: "Pennsylvania",
    44: "Rhode Island",
    45: "South Carolina",
    46: "South Dakota",
    47: "Tennessee",
    48: "Texas",
    49: "Utah",
    50: "Vermont",
    51: "Virginia",
    53: "Washington",
    54: "West Virginia",
    55: "Wisconsin",
    56: "Wyoming",
    66: "Guam",
    72: "Puerto Rico",
    78: "Virgin Islands",
}

# Respondent sex.
SEX: dict[int, str] = {1: "Male", 2: "Female"}

# Self-rated general health, best (1) to worst (5).
GEN_HEALTH: dict[int, str] = {
    1: "Excellent",
    2: "Very good",
    3: "Good",
    4: "Fair",
    5: "Poor",
}

# Special codes of the "number of days" questions, mapped to integer day counts.
# 88 becomes 0 days; 77 and 99 both become the sentinel -1.
# NOTE(review): presumably 77 = "don't know" and 99 = "refused" — confirm against
# the codebook, and make sure downstream code treats -1 as missing.
PHYS_MEN_HEALTH: dict[int, int] = {77: -1, 88: 0, 99: -1}

# Time since the last routine checkup.
LAST_CHECKUP: dict[int, str] = {
    1: "Within past year (anytime less than 12 months ago)",
    2: "Within past 2 years (1 year but less than 2 years ago)",
    3: "Within past 5 years (2 years but less than 5 years ago)",
    4: "5 or more years ago",
}

# Shared table for every plain yes/no question.
YES_NO_QUESTIONS: dict[int, str] = {1: "Yes", 2: "No"}

# Diabetes question: four substantive answers. Unlike the other tables, the
# "Don't know" (7) and "Refused to answer" (9) codes are kept as text labels.
DIABETES: dict[int, str] = {
    1: "Yes",
    2: "Yes, but only during pregnancy (female)",
    3: "No",
    4: "No, pre-diabetes or borderline diabetes",
    7: "Don't know",
    9: "Refused to answer",
}

# Cigarette smoking status.
SMOKER_STATUS: dict[int, str] = {
    1: "Current smoker - now smokes every day",
    2: "Current smoker - now smokes some days",
    3: "Former smoker",
    4: "Never smoked",
}

# E-cigarette usage.
ECIGARETTES: dict[int, str] = {
    1: "Never used e-cigarettes in my entire life",
    2: "Use them every day",
    3: "Use them some days",
    4: "Not at all (right now)",
}

# Combined race / ethnicity category.
RACE: dict[int, str] = {
    1: "White only, Non-Hispanic",
    2: "Black only, Non-Hispanic",
    3: "Other race only, Non-Hispanic",
    4: "Multiracial, Non-Hispanic",
    5: "Hispanic",
}

# Age bracket, youngest (1) to oldest (13).
AGE_CATEGORY: dict[int, str] = {
    1: "Age 18 to 24",
    2: "Age 25 to 29",
    3: "Age 30 to 34",
    4: "Age 35 to 39",
    5: "Age 40 to 44",
    6: "Age 45 to 49",
    7: "Age 50 to 54",
    8: "Age 55 to 59",
    9: "Age 60 to 64",
    10: "Age 65 to 69",
    11: "Age 70 to 74",
    12: "Age 75 to 79",
    13: "Age 80 or older",
}

In [None]:
# Decode the raw numeric survey codes into analysis-ready values.
#
# Each column is listed once next to the rule that decodes it, and the decoding
# uses polars' vectorised replace expressions rather than a per-row Python
# callback. Codes that are missing from a lookup table become null.

# Categorical columns, each with its own code -> label table.
CODEBOOK_BY_COLUMN: dict[str, dict[int, str]] = {
    "State": STATE,
    "AgeCategory": AGE_CATEGORY,
    "Sex": SEX,
    "GeneralHealth": GEN_HEALTH,
    "LastCheckupTime": LAST_CHECKUP,
    # The diabetes question has four substantive answers, so it must use
    # DIABETES: decoding it with YES_NO_QUESTIONS labels code 2 ("Yes, but only
    # during pregnancy") as "No" and turns the real "No" codes 3 and 4 into null.
    # NOTE(review): DIABETES keeps codes 7/9 as the text labels "Don't know" /
    # "Refused to answer" instead of null — decide whether to null them out.
    "HadDiabetes": DIABETES,
    "ECigaretteUsage": ECIGARETTES,
    "SmokerStatus": SMOKER_STATUS,
}

# Plain yes/no questions that all share YES_NO_QUESTIONS.
YES_NO_COLUMNS: list[str] = [
    "PhysicalActivities",
    "HadAsthma",
    "HadSkinCancer",
    "HadCOPD",
    "HadDepressiveDisorder",
    "HadKidneyDisease",
    "HadArthritis",
    "DeafOrHardOfHearing",
    "BlindOrVisionDifficulty",
    "DifficultyConcentrating",
    "DifficultyWalking",
    "DifficultyDressingBathing",
    "DifficultyErrands",
    "AlcoholDrinkers",
    "FluVaxLast12",
    "PneumoVaxEver",
    "HadHeartAttack",
    "HadAngina",
    "HadStroke",
    "HaveHighCholesterol",
]

# Measurements that are divided by 100 to obtain the final value.
# NOTE(review): presumably the raw file stores these as integers with two
# implied decimal places (or centimetres for height) — confirm with the codebook.
SCALED_COLUMNS: list[str] = ["HeightInMeters", "WeightInKilograms", "BMI"]

# Day-count questions: the special codes in PHYS_MEN_HEALTH are rewritten and
# every other value is kept as an integer number of days.
DAY_COUNT_COLUMNS: list[str] = ["PhysicalHealthDays", "MentalHealthDays"]

# with_columns overwrites columns of the same name in place, so the column
# order of the frame is unaffected by the order of the expressions below.
df: pl.DataFrame = df.with_columns(
    *[
        pl.col(column).replace_strict(codes, default=None, return_dtype=pl.Utf8)
        for column, codes in CODEBOOK_BY_COLUMN.items()
    ],
    pl.col(YES_NO_COLUMNS).replace_strict(
        YES_NO_QUESTIONS, default=None, return_dtype=pl.Utf8
    ),
    pl.col(SCALED_COLUMNS) / 100,
    pl.col(DAY_COUNT_COLUMNS).replace(PHYS_MEN_HEALTH).cast(pl.Int64),
)

In [None]:
# Fold the vision and hearing questions into one "Sensory Impairments" flag:
#   1    -> at least one of the two answers is "Yes"
#   0    -> no "Yes", but at least one answer is "No"
#   null -> neither answer is "Yes" or "No"
# The two source columns are removed afterwards.
# NOTE(review): a respondent with one "No" and one missing answer is scored 0 —
# confirm that is the intended treatment of partially answered records.
sensory_columns = ["BlindOrVisionDifficulty", "DeafOrHardOfHearing"]
sensory_any_yes = pl.any_horizontal([pl.col(name) == "Yes" for name in sensory_columns])
sensory_any_no = pl.any_horizontal([pl.col(name) == "No" for name in sensory_columns])

df: pl.DataFrame = df.with_columns(
    pl.when(sensory_any_yes)
    .then(1)
    .when(sensory_any_no)
    .then(0)
    .otherwise(None)
    .cast(pl.Int8)
    .alias("Sensory Impairments")
).drop(sensory_columns)

In [None]:
# Fold the flu-shot and pneumonia-shot questions into one "Vaccinated" flag:
#   1    -> at least one of the two answers is "Yes"
#   0    -> no "Yes", but at least one answer is "No"
#   null -> neither answer is "Yes" or "No"
# The two source columns are removed afterwards.
# NOTE(review): a respondent with one "No" and one missing answer is scored 0 —
# confirm that is the intended treatment of partially answered records.
vaccine_columns = ["FluVaxLast12", "PneumoVaxEver"]
vaccine_any_yes = pl.any_horizontal([pl.col(name) == "Yes" for name in vaccine_columns])
vaccine_any_no = pl.any_horizontal([pl.col(name) == "No" for name in vaccine_columns])

df: pl.DataFrame = df.with_columns(
    pl.when(vaccine_any_yes)
    .then(1)
    .when(vaccine_any_no)
    .then(0)
    .otherwise(None)
    .cast(pl.Int8)
    .alias("Vaccinated")
).drop(vaccine_columns)

In [None]:
# Fold the walking, dressing/bathing and errands questions into one "Mobility"
# flag:
#   1    -> at least one of the three answers is "Yes"
#   0    -> no "Yes", but at least one answer is "No"
#   null -> none of the answers is "Yes" or "No"
# The three source columns are removed afterwards.
# NOTE(review): a respondent with a "No" and otherwise missing answers is scored
# 0 — confirm that is the intended treatment of partially answered records.
mobility_columns = ["DifficultyWalking", "DifficultyDressingBathing", "DifficultyErrands"]
mobility_any_yes = pl.any_horizontal([pl.col(name) == "Yes" for name in mobility_columns])
mobility_any_no = pl.any_horizontal([pl.col(name) == "No" for name in mobility_columns])

df: pl.DataFrame = df.with_columns(
    pl.when(mobility_any_yes)
    .then(1)
    .when(mobility_any_no)
    .then(0)
    .otherwise(None)
    .cast(pl.Int8)
    .alias("Mobility")
).drop(mobility_columns)

In [None]:
# Build the "CVD" flag from the heart-attack, angina and stroke questions:
#   1    -> at least one of the three answers is "Yes"
#   0    -> no "Yes", but at least one answer is "No"
#   null -> none of the answers is "Yes" or "No"
# Afterwards the three source columns are removed, together with "State" and
# "DifficultyConcentrating".
# NOTE(review): a respondent with a "No" and otherwise missing answers is scored
# 0 — confirm that is the intended treatment of partially answered records.
cvd_columns = ["HadHeartAttack", "HadAngina", "HadStroke"]
cvd_any_yes = pl.any_horizontal([pl.col(name) == "Yes" for name in cvd_columns])
cvd_any_no = pl.any_horizontal([pl.col(name) == "No" for name in cvd_columns])

df: pl.DataFrame = df.with_columns(
    pl.when(cvd_any_yes)
    .then(1)
    .when(cvd_any_no)
    .then(0)
    .otherwise(None)
    .cast(pl.Int8)
    .alias("CVD")
).drop(cvd_columns + ["State", "DifficultyConcentrating"])

In [None]:
# Write two CSV exports: one restricted to complete rows and one with every row.
#
# Missing answers in this frame are polars nulls (codes without a label decode
# to null), not floating-point NaN. drop_nans() only removes NaN values and
# therefore left the "no_nans" file identical to the full export; drop_nulls()
# removes the rows that actually contain a missing value.
# NOTE(review): the -1 sentinel in PhysicalHealthDays / MentalHealthDays is a
# regular integer and is not removed here — filter it separately if those rows
# should count as incomplete.
df.drop_nulls().write_csv("data/intermediate/heart_2023_no_nans.csv")
df.write_csv("data/intermediate/heart_2023.csv")
df