In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
import pickle

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

<h1 style="text-align:center">Dataset Loading</h1>

In [2]:
def read_data():
    df = pd.read_csv("./input/telomere_geography_health.csv")
    df = df.drop(
        columns=[
            "socioeconomic_status",
            "bp",
            "bmi_category",
            "hr_category",
            "rr_category",
            "health_condition",
        ]
    )
    return df


def categorical_to_numeric(df, column, mapping, regex=False):
    df[column] = df[column].replace(mapping, regex=regex)


def fill_na(df, column, default):
    fill_value = np.nan
    if default == "median":
        fill_value = df[column].median()
    elif default == "mode":
        fill_value = df[column].mode()[0]

    df[column] = df[column].fillna(fill_value)

<h1 style="text-align:center">Data Preprocessing</h1>

In [3]:
def preprocess_data(df):

    categorical_to_numeric(
        df,
        "cigarette_smoking",
        {
            "No information": np.nan,
            "Former Smoker": np.nan,
            "Never Smoker": 0,
            "Occasional Smoker": 1,
            "Regular Smoker": 2,
        },
    )

    categorical_to_numeric(
        df,
        "physical_activity_cohort",
        {
            "No information": np.nan,
            "Sedentary (Inactive)": 0,
            "Minimally Active": 1,
            "Lightly Active": 2,
            "Moderately Active": 3,
            "Highly Active": 4,
        },
    )

    categorical_to_numeric(
        df,
        "alcohol_drinking",
        {
            "No information": np.nan,
            "Former Drinker": np.nan,
            "Never Drinker": 0,
            "Occasional Drinker": 1,
            "Moderate Drinker": 2,
            "Heavy Drinker": 3,
        },
    )

    categorical_to_numeric(
        df,
        "education_cohort",
        {
            "No information": np.nan,
            "Elementary Graduate": 0,
            "High School Graduate": 1,
            "College Undergraduate": 2,
            "Vocational Graduate": 3,
            "College Graduate": 4,
            "Postgraduate (Master's or Doctorate)": 5,
        },
    )

    categorical_to_numeric(
        df,
        "bp_category",
        {
            "No information": np.nan,
            "Hypotension (Low BP)": 0,
            "Normal BP": 1,
            "Elevated BP": 2,
            "Hypertension Stage 1": 3,
            "Hypertension Stage 2": 4,
            "Hypertensive Crisis": 5,
        },
    )

    categorical_to_numeric(
        df,
        "cardiovascular_disease_diagnosis",
        {
            "^No known*": 0,
            "^Non-cardiovascular*": 0,
            "^Single.*": 1,
            "^Multi.*": 1,
        },
        regex=True,
    )

    categorical_to_numeric(
        df,
        "allergy_disease_diagnosis",
        {
            "^No Diagnosed*": 0,
            "^Single.*": 1,
            "^Multi.*": 1,
        },
        regex=True,
    )

    df["hr"] = pd.to_numeric(df["hr"], errors="coerce")
    df["rr"] = pd.to_numeric(df["rr"], errors="coerce")

    fill_na(df, "hr", "median")
    fill_na(df, "rr", "median")
    fill_na(df, "bmi", "median")
    fill_na(df, "education_cohort", "mode")
    fill_na(df, "alcohol_drinking", "mode")
    fill_na(df, "cigarette_smoking", "mode")
    fill_na(df, "bp_category", "mode")
    fill_na(df, "physical_activity_cohort", "mode")

    df = pd.get_dummies(df, columns=["rural_or_urban", "sex", "marital_status"])
    return df

<h1 style="text-align:center">Model Training</h1>

In [4]:
def train_model(df):
    y = df.pop("cardiovascular_disease_diagnosis")
    X = df

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    selector = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
    X_train_selected = pd.DataFrame(selector.fit_transform(X_train))
    X_test_selected = selector.transform(X_test)

    selected_features = X_train.columns[selector.get_support()]
    selected_variances = selector.variances_[selector.get_support()]

    base_svc = LinearSVC(tol=1e-5, dual=False).fit(X_train_selected, y_train)
    calibrated_svc = CalibratedClassifierCV(estimator=base_svc, method='sigmoid')

    pipeline = make_pipeline(StandardScaler(), calibrated_svc)
    pipeline.fit(X_train_selected, y_train)

    score = pipeline.score(X_test_selected, y_test)
    probabilties = np.array(pipeline.predict_proba(X_test_selected)[:,1])

    base_model_trained = pipeline.named_steps['calibratedclassifiercv'].estimator
    selected_weights = np.array(base_model_trained.coef_[0])

    y_prediction = pipeline.predict(X_test_selected)

    prediction_df = pd.DataFrame(
        data={
            "Actual": y_test.reset_index(drop=True).map({1: "Present", 0: "Absent"}),
            "Predicted": pd.Series(y_prediction)
            .reset_index(drop=True)
            .map({1: "Present", 0: "Absent"}),
            "Probability": np.round(probabilties*100, decimals=2),
            "Match": pd.Series((y_prediction == y_test).astype(bool))
            .reset_index(drop=True)
            .map({True: "Correct", False: "Incorrect"}),
        }
    )

    feature_vw = pd.DataFrame(
    data={
        "Selected Features": selected_features,
        "Variances": selected_variances,
        "Weights": selected_weights
    }
    )
    prediction_df["Accuracy"] = pd.Series(round(score*100, 2))
    return (pipeline, feature_vw, prediction_df.fillna(""))


<h1 style="text-align:center">Model Assessment</h1>

In [11]:
if __name__ == "__main__":
    df = read_data()
    df_preprocessed = preprocess_data(df)
    pipeline, feature_vw, prediction_df = train_model(df_preprocessed)

In [12]:
feature_vw

Unnamed: 0,Selected Features,Variances,Weights
0,telomere_length,55.308954,-0.016515
1,age,345.948426,0.03082
2,cigarette_smoking,0.409104,-0.046281
3,alcohol_drinking,0.565921,-0.010713
4,physical_activity_cohort,1.627438,-0.139213
5,education_cohort,3.240558,0.067076
6,bmi,530.501276,-0.003619
7,hr,144.899765,0.00271
8,rr,4.563226,0.034286
9,bp_category,1.338707,0.239614


In [13]:
def match_highlight(value):
    color = "green" if value == "Correct" else "red" if value == "Incorrect" else None
    return f"background-color: {color}; color: white"

prediction_df = prediction_df.applymap(lambda x: ('%.10g' % x) if isinstance(x, (int, float)) else x)
styled_prediction_df = prediction_df.style.applymap(match_highlight)

display(styled_prediction_df)

Unnamed: 0,Actual,Predicted,Probability,Match,Accuracy
0,Absent,Present,56.72,Incorrect,83.76
1,Absent,Absent,3.09,Correct,
2,Absent,Absent,6.08,Correct,
3,Absent,Absent,2.98,Correct,
4,Absent,Absent,2.32,Correct,
5,Absent,Absent,10.42,Correct,
6,Absent,Absent,13.77,Correct,
7,Absent,Absent,10.51,Correct,
8,Absent,Absent,8.4,Correct,
9,Absent,Absent,39.57,Correct,
